### Imports
#### Can't go anywhere without those

In [None]:
# Data manipulation/analysis libraries
import dtale
import numpy as np
import pandas as pd
# Plotting/visualization libraries
%matplotlib inline
import seaborn as sns
color_pal = sns.color_palette()
from matplotlib import pyplot as plt
plt.style.use('fivethirtyeight')
from IPython.display import clear_output
# Models/performance/metrics
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV

### Group and Resample
#### Simple logic to create the main feature, which will be our y, or our target value. 

In [None]:
def groupby_hour(base_df):
  """Cleaning/grouping/resampling function to create generically usable data.

  Counts how many log rows fall into each hour.

  Args:
    base_df: DataFrame with a 'ts' column that pd.to_datetime can parse,
      one row per log entry. The frame is read only; it is not modified.

  Returns:
    DataFrame with a single 'log_counts' column, indexed by an hourly
    DatetimeIndex named 'ts'. Hours between the first and last entry that
    contain no rows are present with a count of 0.
  """
  # Work on a parsed copy of the column so the caller's frame stays untouched
  stamps = pd.to_datetime(base_df['ts'])
  # Keep naive wall-clock timestamps: the earlier string-formatting approach
  # discarded any timezone, so downstream code expects a tz-naive index
  if stamps.dt.tz is not None:
    stamps = stamps.dt.tz_localize(None)
  # Rows without a usable timestamp cannot be assigned to an hour
  stamps = stamps.dropna()
  # One unit per log row, keyed by its timestamp
  ones = pd.Series(1, index=pd.DatetimeIndex(stamps, name='ts'), name='log_counts')
  # Summing the units per hourly bin yields the number of rows in that hour
  return ones.resample('h').sum().to_frame()

### Feature Creation
#### Using Pandas datetime functions (thank you pandas) for new features to feed to the model. 

In [None]:
def create_features(df):
  """Function to create specific features for the model.

  Args:
    df: Either raw log rows with a 'ts' column (they are first aggregated
      with groupby_hour), or a frame that is already indexed by a
      DatetimeIndex and has no 'ts' column (used as is).

  Returns:
    A new DataFrame with 'month', 'day', 'hour' and 'day_of_year' columns
    derived from the index. The input frame is not modified.
  """
  if 'ts' in df.columns:
    # Raw log rows: aggregate to hourly counts first
    df = groupby_hour(df)
  else:
    # Already time-indexed (e.g. history concatenated with future hours):
    # re-aggregating would fail on the missing 'ts' column, so only copy
    # to avoid writing the new columns into the caller's frame
    df = df.copy()
  stamps = df.index
  # Make month column
  df['month'] = stamps.month
  # Make day column
  df['day'] = stamps.day
  # Make hour column
  df['hour'] = stamps.hour
  # Make day of year column
  df['day_of_year'] = stamps.day_of_year
  # Return df
  return df

### Lag Features
#### What was the value of our target x days in the past?

In [None]:
def add_lags(df, lags=(1, 7, 14)):
  """Add lagged copies of the target as feature columns.

  Args:
    df: Frame passed straight to create_features.
    lags: Lag distances in days. Each value n produces a column named
      'lag<n>' holding the 'log_counts' value from n days earlier.
      Defaults to 1, 7 and 14 days.

  Returns:
    The frame returned by create_features with one extra column per lag.
    Rows whose lagged timestamp is not in the index get NaN.
  """
  # Call the create features function
  df = create_features(df)
  # Lookup table: timestamp -> target value
  target_map = df['log_counts'].to_dict()
  for days in lags:
    # Shift every timestamp back by the lag and look up the target there
    df[f'lag{days}'] = (df.index - pd.Timedelta(days=days)).map(target_map)
  return df

### Import the data
#### This notebook works a bit differently from the 5-minute model: everything from here on is more experimental and includes visualizations for confirmation/comparison.

In [None]:
# Load the raw log export and run it through the full feature pipeline
# (hourly aggregation -> calendar features -> lag features) in one step.
# NOTE(review): the path is an empty placeholder here - point it at the CSV before running.
df = add_lags(pd.read_csv(''))

### Assign features to X and the target to y
#### Using the convenience of scikit-learn's `train_test_split`, I am going to rearrange the columns, create the X and y values by selecting columns by position, and then create the train/test split variables to feed our model with.

In [1]:
# Rearrange columns so the target is the last column
df = df[['month', 'day', 'day_of_year', 'hour', 'lag1', 'lag7', 'lag14', 'log_counts']]
# Feature column names, reused later when predicting
FEATURES = ['month', 'day', 'day_of_year', 'hour', 'lag1', 'lag7', 'lag14']
# X value: everything but the last column in the DF
X = df.iloc[:, :-1]
# y value: only the last column in the DF
y = df.iloc[:, -1]
# Chronological split: the rows are ordered by time and the lag features are
# built from past targets, so a shuffled split would put hours that come
# after the test hours into the training set. shuffle=False keeps the first
# 80% for training and the most recent 20% for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
# Create the model
reg = xgb.XGBRegressor(base_score=0.5,
                       booster='gbtree',
                       n_estimators=100,
                       objective='reg:squarederror',
                       max_depth=3,
                       learning_rate=0.01,
                       colsample_bytree=0.7,
                       colsample_bylevel=0.4,
                       subsample=0.89)
# Fit on the training rows; report the metric on both sets every 100 rounds
reg.fit(X_train, y_train,
        eval_set=[(X_train, y_train), (X_test, y_test)],
        verbose=100)

In [None]:
# Optional diagnostic: shows which features the fitted model relies on most.
# One row per feature name the model was trained with, one 'importance' column.
fi = pd.DataFrame({'importance': reg.feature_importances_},
                  index=reg.feature_names_in_)
# Horizontal bars, least important feature first
ranked = fi.sort_values('importance')
ranked.plot.barh(title='Feature Importance')
# Present the plot
plt.show()

In [None]:
# Create a future date range to predict on
future = pd.date_range('2022-12-01 00:00:00', '2022-12-31 23:00:00', freq='1h')
# Create a future df with the date range as index, flagged as future rows
future_hourly_df = pd.DataFrame(index=future)
future_hourly_df['isFuture'] = True
# Flag the existing rows as historical for later querying
df['isFuture'] = False
# Concat the two frames: history first, then the empty future hours
df_and_future = pd.concat([df, future_hourly_df])
# Rebuild the features directly from the index. The combined frame is already
# hourly and has no raw 'ts' column, so it must not be sent back through the
# raw-log aggregation step.
stamps = df_and_future.index
df_and_future['month'] = stamps.month
df_and_future['day'] = stamps.day
df_and_future['hour'] = stamps.hour
df_and_future['day_of_year'] = stamps.day_of_year
# Lag features: target value 1, 7 and 14 days before each timestamp
# (NaN when that timestamp has no known target, e.g. far into the future)
history = df_and_future['log_counts'].to_dict()
for days in (1, 7, 14):
    df_and_future[f'lag{days}'] = (stamps - pd.Timedelta(days=days)).map(history)
# Keep only the future rows
future_and_features = df_and_future.query('isFuture').copy()
# Predict for the future rows using the same feature columns the model was trained on
future_and_features['pred'] = reg.predict(future_and_features[FEATURES])
# Attach the predictions to the combined frame (aligned on the index) to compare with history
df_and_future['pred'] = future_and_features['pred']
# Confirm it works
df_and_future

In [None]:
# Quick look at the predicted series on its own before comparing it with history
pred_plot_options = {
    'figsize': (10, 5),
    'color': color_pal[4],
    'ms': 1,
    'lw': 1,
    'title': 'Future Predictions',
}
predicted = future_and_features['pred']
predicted.plot(**pred_plot_options)
# Render the figure
plt.show()

In [None]:
# One wide figure shared by both series
fig, ax = plt.subplots(figsize=(15, 5))
# Historical hourly counts first, then the predicted hours on the same axes
observed = df['log_counts']
predicted = future_and_features['pred']
observed.plot(ax=ax, label='Pre-Existing Data', title='Prediction with Pre-Existing Data')
predicted.plot(ax=ax, label='Prediction')
# Legend entries are given in plotting order: history, then predictions
legend_entries = ['Existing Data', 'Prediction Data']
ax.legend(legend_entries)
# Show plot
plt.show()

### Hyper-parameter tuning
#### This is some simple logic that allows me to hyper-tune the parameters of the model to enhance the results, and prevent overfitting. This isn't a part of the regular logic, as this has already been used to tune this model. However, it is absolutely useful and worth keeping handy. 

In [None]:
"""X and y values for every hour"""
# Feature columns used for tuning. This is the same set the model is trained
# and predicts with, so the search tunes the real feature set and re-running
# the prediction cell after this one still passes the columns `reg` expects.
FEATURES = ['month', 'day', 'day_of_year', 'hour', 'lag1', 'lag7', 'lag14']
# Name of the target column
TARGET = 'log_counts'

# X value with the features
X = df[FEATURES]
# y value with the target
y = df[TARGET]

In [None]:
# Candidate values for each hyper-parameter; every combination is evaluated
params = { 'max_depth': [3,6,10],
           'learning_rate': [0.01, 0.05, 0.1],
           'n_estimators': [100, 500, 1000],
           'colsample_bytree': [0.3, 0.7]}
# Base model; random_state makes the column subsampling reproducible
xgbr = xgb.XGBRegressor(random_state=20)
# Grid search with time-ordered folds: each fold trains on earlier rows and
# validates on the rows that follow them, so no candidate is scored on data
# that is older than what it was trained on
clf = GridSearchCV(estimator=xgbr,
                   param_grid=params,
                   scoring='neg_mean_squared_error',
                   cv=TimeSeriesSplit(n_splits=5),
                   verbose=1)
# Fit the GridSearchCV
clf.fit(X, y)
# Print best parameters
print("Best parameters:", clf.best_params_)
# best_score_ is the negated mean squared error, so flip the sign before taking the root
print("Lowest RMSE: ", (-clf.best_score_)**(1/2.0))