# Model Building and Selection

In [33]:
import os
import time
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from src.cslib import fetch_ts, engineer_features

In [34]:
# Get time series data from source training data files
data_dir = os.path.join("data","cs-train")
# Get time series data frames
time_series_dfs = fetch_ts(data_dir, clean=False)

... loading ts data from files


In [35]:
# Get features from time series data
X,y,dates = engineer_features(time_series_dfs['all'])
        
# Get train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=True, random_state=42)

In [36]:
# Initialize start time
time_start = time.time()

# Create pipeline with Random Forest estimator
pipeline_rf = Pipeline(steps=[('scaler', StandardScaler()), ('rf', RandomForestRegressor())])

# Initialize grid search parameters
param_grid_rf = {
    'rf__criterion': ['mse','mae'],
    'rf__n_estimators': [10,15,20,25]
    }
# Create grid search for Random Forest model
grid = GridSearchCV(pipeline_rf, param_grid=param_grid_rf, cv=5, iid=False, n_jobs=-1)
# Fit the model with training data
grid.fit(X_train, y_train)
# Get model predictions
y_pred = grid.predict(X_test)

# Get mean absolute error
mae_rf = mean_absolute_error(y_test, y_pred)
# Get mean squared error
mse_rf = mean_squared_error(y_test, y_pred)
# Get r2 score
r2_score_rf = r2_score(y_test, y_pred)

# Initialize end time
time_end = time.time()

# Display results
print("Model: Random Forest")
print("r2_score: {:.3f}".format(r2_score_rf))
print("Mean absolute error: {:.0f}".format(mae_rf))
print("Mean squared error: {:.0f}".format(mse_rf))
print("Grid best params: ", grid.best_params_)
print("Train time: ", time.strftime('%H:%M:%S', time.gmtime(time_end-time_start)))

Model: Random Forest
r2_score: 0.932
Mean absolute error: 14689
Mean squared error: 443408770
Grid best params:  {'rf__criterion': 'mae', 'rf__n_estimators': 20}
Train time:  00:00:04


In [37]:
# Initialize start time
time_start = time.time()

# Create pipeline with Decision Tree estimator
pipeline_dt = Pipeline(steps=[('scaler', StandardScaler()), ('dt', DecisionTreeRegressor())])

# Initialize grid search parameters
param_grid_dt = {
    'dt__criterion': ['mse','mae'],
    'dt__max_depth': [5,10,20,50],
    'dt__min_samples_leaf': [1,2,3,4,5]
    }
# Create grid search for Decision Tree model
grid = GridSearchCV(pipeline_dt, param_grid=param_grid_dt, cv=5, iid=False, n_jobs=-1)
# Fit the model with training data
grid.fit(X_train, y_train)
# Get model predictions
y_pred = grid.predict(X_test)

# Get mean absolute error
mae_dt = mean_absolute_error(y_test, y_pred)
# Get mean squared error
mse_dt = mean_squared_error(y_test, y_pred)
# Get r2 score
r2_score_dt = r2_score(y_test, y_pred)

# Initialize end time
time_end = time.time()

# Display results
print("Model: Decision Tree")
print("r2_score: {:.3f}".format(r2_score_dt))
print("Mean absolute error: {:.0f}".format(mae_dt))
print("Mean squared error: {:.0f}".format(mse_dt))
print("Grid best params: ", grid.best_params_)
print("Train time: ", time.strftime('%H:%M:%S', time.gmtime(time_end-time_start)))

Model: Decision Tree
r2_score: 0.931
Mean absolute error: 12665
Mean squared error: 450231606
Grid best params:  {'dt__criterion': 'mse', 'dt__max_depth': 10, 'dt__min_samples_leaf': 2}
Train time:  00:00:00


In [38]:
# Initialize start time
time_start = time.time()

# Create pipeline with Gradient Boosting estimator
pipeline_gb = Pipeline(steps=[('scaler', StandardScaler()), ('gb', GradientBoostingRegressor())])

# Initialize grid search parameters
param_grid_gb = {
    'gb__criterion': ['mse','mae'],
    'gb__n_estimators': [10,15,20,25]
    }
# Create grid search
grid = GridSearchCV(pipeline_gb, param_grid=param_grid_gb, cv=5, iid=False, n_jobs=-1)
# Fit the model with training data
grid.fit(X_train, y_train)
# Get model predictions
y_pred = grid.predict(X_test)

# Get mean absolute error
mae_gb = mean_absolute_error(y_test, y_pred)
# Get mean squared error
mse_gb = mean_squared_error(y_test, y_pred)
# Get r2 score
r2_score_gb = r2_score(y_test, y_pred)

# Initialize end time
time_end = time.time()

# Display results
print("Model: Gradient Boosting")
print("r2_score: {:.3f}".format(r2_score_gb))
print("Mean absolute error: {:.0f}".format(mae_gb))
print("Mean squared error: {:.0f}".format(mse_gb))
print("Grid best params: ", grid.best_params_)
print("Train time: ", time.strftime('%H:%M:%S', time.gmtime(time_end-time_start)))

Model: Gradient Boosting
r2_score: 0.861
Mean absolute error: 24178
Mean squared error: 910075471
Grid best params:  {'gb__criterion': 'mse', 'gb__n_estimators': 25}
Train time:  00:00:01


# Summarize Findings

**Model Building and Selection:**
<br>The following models were built and tested:
* Random Forest
* Decision Tree
* Gradient Boosting

<br>The following performance parameters were evaluated:
* R-squared score
* Mean absolute error
* Mean squared error
* Train time

**Findings:**
* Random Forest model was the top performer