In [1]:
from google.colab import drive  
drive.mount('/content/drive')  

Mounted at /content/drive


In [None]:
# File saved on google drive
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
import io
#os.chdir("/content/drive/MyDrive/Colab Notebooks/Data/")
os.chdir("/content/drive/MyDrive/")

# Load Packages

In [None]:
import numpy as np
import pandas as pd

# To make this notebook's output stable across runs
np.random.seed(123)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict, GridSearchCV, train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, roc_auc_score, roc_curve, plot_roc_curve, confusion_matrix, accuracy_score

from sklearn.pipeline import Pipeline
from scipy.stats import reciprocal, uniform, randint

# Functions

In [None]:
# numerical correlation
def plot_pearson_correlation(df, cols, low_thres, high_thres):
    # customized
    
    corr_df = df[cols].corr()
    # Get only half portion of corr_df to avoid df, so create mask    
    mask = np.triu(np.ones_like(corr_df), k=1)
    
    # plot a heatmap of the values
    plt.figure(figsize=(10,10))
    ax = sns.heatmap(corr_df, vmin=-1, vmax=1, cbar=False,
                     cmap='coolwarm', mask=mask, annot=True)
    
    # format the text in the plot to make it easier to read
    for text in ax.texts:
        t = float(text.get_text())
        if low_thres < t < high_thres:
            text.set_text('')
        else:
            text.set_text(round(t, 2))
        text.set_fontsize('x-large')
    plt.xticks( size='x-large')
    plt.yticks(rotation=0, size='x-large')
    # Save fig uncomment first
    # plt.savefig("Heatmap DF")
    plt.show()
    return corr_df

In [None]:
def get_top_features(model, n):
  feature_names = list(num_var) + list(cat_one_hot)
  feature_imp = model.feature_importances_
  zipped = zip (feature_names, feature_imp)
  sorted_feature = sorted(zipped, key = lambda t: t[1], reverse=True)
  return sorted_feature[:n]

In [None]:
station = pd.read_csv("Project/data/Raw_Data/station_info_w_POI.csv")

In [None]:
station.head()

# Load and Prepare Data for Training

In [None]:
data_path = "Project/data/final_trip_feature_data.csv"
#df = pd.read_csv('final_trip_data_with_distance_duration.csv', index_col=[0], parse_dates=['Start Time'], date_parser=dateparse)
df = pd.read_csv(data_path)

In [None]:
df.head()

In [None]:
df.columns

In [None]:
plot_pearson_correlation(df, df.columns, -0.25, 0.25)

In [None]:
# df.loc[(df['Net Demand']<=-80)]

In [None]:
# df[['Net Demand']].describe()

In [None]:
# df['Net Demand'].quantile(0.9)

In [None]:
# plt.figure(figsize=(20,20))
# df[['Net Demand']].boxplot()

In [None]:
cols_to_drop = ['Start Station Id', 'Year',  'Day of Month'] #'Month',
target = ['Positive Net Demand']

In [None]:
target = ['Positive Net Demand']

In [None]:
df_train, df_test = train_test_split(df, test_size=0.3, random_state=42)

# Modeling

In [None]:
X_train = df_train[[col for col in df.columns if col not in cols_to_drop+target]]
y_train = df_train[target]

In [None]:
X_train.info()

In [None]:
print(X_train.shape, y_train.shape)

In [None]:
cat_var = X_train.select_dtypes(include=['object']).columns

In [None]:
num_var = [col for col in X_train.columns if col not in cat_var]

In [None]:
from sklearn.compose import ColumnTransformer
full_pipeline=ColumnTransformer([("Num", StandardScaler(), num_var),
                                 ("Cat", OneHotEncoder(), cat_var),                              
                                 ])

In [None]:
X_train_prepared=full_pipeline.fit_transform(X_train)

In [None]:
cat_one_hot = full_pipeline.transformers_[1][1].get_feature_names()

In [None]:
len(cat_one_hot)

In [None]:
X_train_prepared.shape

In [None]:
import scipy
scipy.sparse.issparse(X_train_prepared)

In [None]:
X_train_prepared=X_train_prepared.toarray()

## KNN 

In [None]:
KN_clf = KNeighborsClassifier(n_neighbors= 10)

#KN_param_grid= [{'n_neighbors':[10,50,100]}]
KN_clf.fit(X_train_prepared, np.ravel(y_train))

KNeighborsClassifier(n_neighbors=10)

In [None]:
y_pred = KN_clf.predict(X_train_prepared)

In [None]:
accuracy_score(y_train, y_pred)

In [None]:
print("model score: %.3f" % KN_clf.score(X_train_prepared, y_train))

## Decision Tree

In [None]:
# benchmark model using default hyperparameters
tree_clf = DecisionTreeClassifier(random_state=42)
#y_train_pred = cross_val_predict(tree_rg, X_train_prepared, y_train, cv=4)
scores = cross_val_score(tree_clf, X_train_prepared, y_train, cv=4, scoring = 'roc_auc')

In [None]:
scores.mean()

0.5653185541183208

#### Decision Tree Tuning - Individual Params

In [None]:
# tuning hyperparameters
param_1 = {'criterion': ['gini','entropy']}
param_2 = {'max_depth': [None,500,100,50,10]}
param_3 = {'min_samples_split': [2,100,500,1000]}
param_4 = {'min_samples_leaf': [1,10,50,100,500]}
param_5 = {'max_leaf_nodes': [None,100,50,10,5,2]}        

In [None]:
tree_clf_1 = GridSearchCV(tree_clf, param_1, cv=4, scoring='roc_auc')
tree_clf_1.fit(X_train_prepared, y_train)
score_1 = tree_clf_1.cv_results_['mean_test_score']

In [None]:
print(param_1)
print(score_1)

{'criterion': ['gini', 'entropy']}
[0.56531855 0.56585914]


In [None]:
tree_clf_2 = GridSearchCV(tree_clf, param_2, cv=4, scoring='roc_auc')
tree_clf_2.fit(X_train_prepared, y_train)
score_2 = tree_clf_2.cv_results_['mean_test_score']

In [None]:
print(param_2)
print(score_2)

{'max_depth': [None, 500, 100, 50, 10]}
[0.56531855 0.56531855 0.56531855 0.56665279 0.63413672]


In [None]:
tree_clf_3 = GridSearchCV(tree_clf, param_3, cv=4, scoring='roc_auc')
tree_clf_3.fit(X_train_prepared, y_train)
score_3 = tree_clf_3.cv_results_['mean_test_score']

In [None]:
print(param_3)
print(score_3)

{'min_samples_split': [2, 100, 500, 1000]}
[0.56531855 0.65367069 0.67836148 0.67806547]


In [None]:
tree_clf_4 = GridSearchCV(tree_clf, param_4, cv=4, scoring='roc_auc')
tree_clf_4.fit(X_train_prepared, y_train)
score_4 = tree_clf_4.cv_results_['mean_test_score']

In [None]:
print(param_4)
print(score_4)

{'min_samples_leaf': [1, 10, 50, 100, 500]}
[0.56531855 0.62950871 0.66978317 0.67799734 0.66852537]


In [None]:
tree_clf_5 = GridSearchCV(tree_clf, param_5, cv=4, scoring='roc_auc')
tree_clf_5.fit(X_train_prepared, y_train)
score_5 = tree_clf_5.cv_results_['mean_test_score']

In [None]:
print(param_5)
print(score_5)

{'max_leaf_nodes': [None, 100, 50, 10, 5, 2]}
[0.56531855 0.63188022 0.61708118 0.57450919 0.56727609 0.52082953]


#### Decision Tree Tuning - All parameters

In [None]:
hyperparamters_2 = {'criterion': ['gini','entropy'],
                     'max_depth': randint(1,50),
                    'min_samples_split': randint(100,1000),
                    'min_samples_leaf': randint(50,500),
                    'max_leaf_nodes': randint(50,500)}

In [None]:
tree_clf_rand = RandomizedSearchCV(tree_clf, hyperparamters_2, cv=4, scoring='roc_auc')
tree_clf_rand.fit(X_train_prepared, y_train)

RandomizedSearchCV(cv=4, estimator=DecisionTreeClassifier(random_state=42),
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f3b1d10ee50>,
                                        'max_leaf_nodes': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f3b167d9450>,
                                        'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f3b167d98d0>,
                                        'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f3b1697e890>},
                   scoring='roc_auc')

In [None]:
print(tree_clf_rand.best_params_)
print(tree_clf_rand.best_score_)

{'criterion': 'gini', 'max_depth': 46, 'max_leaf_nodes': 432, 'min_samples_leaf': 372, 'min_samples_split': 198}
0.6608666122654997


In [None]:
final_tree_clf = tree_clf_rand.best_estimator_

In [None]:
get_top_features(final_tree_clf,40)

[('POI_distance', 0.32843471763596255),
 ('capacity', 0.07844351187810701),
 ('x0_10AM-2PM', 0.06395060549175349),
 ("x1_Queen's Park", 0.06240643099759838),
 ('x0_6AM-10AM', 0.035661942606273235),
 ('x1_King', 0.034892834722501175),
 ('x1_St Patrick', 0.03489025487223086),
 ('Is_Weekend', 0.02934168684584401),
 ('x1_Dundas', 0.029169843044116912),
 ('x1_St Andrew', 0.02750305571796595),
 ('x1_Queen', 0.02338513629720263),
 ('x1_Osgoode', 0.022914425264749582),
 ('x1_Union Station', 0.02242465349807036),
 ('x0_2PM-6PM', 0.01952159383615947),
 ('x0_2AM-6AM', 0.016365613879997184),
 ('x1_Wellesley', 0.015250336644851713),
 ('x1_Museum Station', 0.015190404147962772),
 ('x1_Sherbourne', 0.014961305167813434),
 ('x1_Bay', 0.011986140533208607),
 ('x1_Exhibition GO', 0.009987969713816795),
 ('x1_Dundas West', 0.007379716589652548),
 ('x1_St. Clair West', 0.007316795912558427),
 ('x1_Lawrence', 0.007094135850992359),
 ('x1_Jane', 0.006680498051827914),
 ('x1_College Station', 0.0066760838416

### Testing with final decision tree model

In [None]:
X_test = df_test[[col for col in df.columns if col not in cols_to_drop+target]]
y_test = df_test[target]

In [None]:

X_test_prepared=full_pipeline.transform(X_test)
y_test_pred = final_tree_clf.predict_proba(X_test_prepared)
roc_auc_score(y_test, y_test_pred[:,1])

0.6619251848050339

## AdaBoost Tree

In [None]:
# benchmark model using default hyperparameters
tree_clf = AdaBoostClassifier(random_state=42)
#y_train_pred = cross_val_predict(tree_rg, X_train_prepared, y_train, cv=4)


In [None]:
tree_clf.fit(X_train_prepared, np.ravel(y_train))

AdaBoostClassifier(random_state=42)

In [None]:
y_pred = tree_clf.predict(X_train_prepared)

In [None]:
accuracy_score(y_train, y_pred)

0.5464911638552453

In [None]:
scores = cross_val_score(tree_clf, X_train_prepared, np.ravel(y_train), cv=4, scoring = 'roc_auc')

In [None]:
scores.mean()

0.5681538519499965

#### Decision Tree Tuning - Individual Params

In [None]:
# tuning hyperparameters
param_1 = {'n_estimators': [1,10,50,100,500]}
param_2 = {'learning_rate': [None,0.1,0.2,0.5,0.9]}
param_3 = {'algorithm': ['SAMME', 'SAMME.R']}


In [None]:
tree_clf.get_params().keys()

dict_keys(['algorithm', 'base_estimator', 'learning_rate', 'n_estimators', 'random_state'])

In [None]:
tree_clf_1 = GridSearchCV(tree_clf, param_1, cv=4, scoring='roc_auc')
tree_clf_1.fit(X_train_prepared, np.ravel(y_train))
score_1 = tree_clf_1.cv_results_['mean_test_score']

In [None]:
print(param_1)
print(score_1)

{'n_estimators': [1, 10, 50, 100, 500]}
[0.52082953 0.55876547 0.56815385 0.56956138 0.57228095]


In [None]:
print(1)

1


In [None]:
tree_clf_2 = GridSearchCV(tree_clf, param_2, cv=4, scoring='roc_auc')
tree_clf_2.fit(X_train_prepared, np.ravel(y_train))
score_2 = tree_clf_2.cv_results_['mean_test_score']

4 fits failed out of a total of 20.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
4 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/ensemble/_weight_boosting.py", line 486, in fit
    return super().fit(X, y, sample_weight)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/ensemble/_weight_boosting.py", line 113, in fit
    if self.learning_rate <= 0:
TypeError: '<=' not supported between instances of 'NoneType' and 'int'



In [None]:
print(param_2)
print(score_2)

{'learning_rate': [None, 0.1, 0.2, 0.5, 0.9]}
[       nan 0.56122344 0.56335246 0.56660822 0.56807602]


In [None]:
tree_clf_3 = GridSearchCV(tree_clf, param_3, cv=4, scoring='roc_auc')
tree_clf_3.fit(X_train_prepared, np.ravel(y_train))
score_3 = tree_clf_3.cv_results_['mean_test_score']

In [None]:
print(param_3)
print(score_3)

{'algorithm': ['SAMME', 'SAMME.R']}
[0.55926638 0.56815385]


In [None]:
tree_clf_4 = GridSearchCV(tree_clf, param_4, cv=4, scoring='roc_auc')
tree_clf_4.fit(X_train_prepared, y_train)
score_4 = tree_clf_4.cv_results_['mean_test_score']

In [None]:
print(param_4)
print(score_4)

{'min_samples_leaf': [1, 10, 50, 100, 500]}
[0.56531855 0.62950871 0.66978317 0.67799734 0.66852537]


In [None]:
tree_clf_5 = GridSearchCV(tree_clf, param_5, cv=4, scoring='roc_auc')
tree_clf_5.fit(X_train_prepared, y_train)
score_5 = tree_clf_5.cv_results_['mean_test_score']

In [None]:
print(param_5)
print(score_5)

{'max_leaf_nodes': [None, 100, 50, 10, 5, 2]}
[0.56531855 0.63188022 0.61708118 0.57450919 0.56727609 0.52082953]


### Ada Boost - With Decision Tree

In [None]:
# Based off Siqi's Decision Tree Classifier training
base_tree_clf = DecisionTreeClassifier(criterion = 'gini', max_depth = 46, max_leaf_nodes = 432, min_samples_leaf = 372, min_samples_split = 198)
ada_tree_clf = AdaBoostClassifier(base_estimator = base_tree_clf, random_state=42, algorithm='SAMME.R')

In [None]:
# tuning hyperparameters
param_1 = {'n_estimators': [500, 1000, 1500]}
# param_2 = {'learning_rate': [None,0.1,0.2,0.5,0.9]}
# param_3 = {'algorithm': ['SAMME', 'SAMME.R']}


In [None]:
ada_tree_clf_1 = GridSearchCV(ada_tree_clf, param_1, cv=4, scoring='roc_auc')
ada_tree_clf_1.fit(X_train_prepared, np.ravel(y_train))
score_1 = ada_tree_clf_1.cv_results_['mean_test_score']

NameError: ignored

In [None]:
print(param_1)
print(score_1)

#### Decision Tree Tuning - All parameters

In [None]:
hyperparamters_2 = {'criterion': ['gini','entropy'],
                     'max_depth': randint(1,50),
                    'min_samples_split': randint(100,1000),
                    'min_samples_leaf': randint(50,500),
                    'max_leaf_nodes': randint(50,500)}

In [None]:
tree_clf_rand = RandomizedSearchCV(tree_clf, hyperparamters_2, cv=4, scoring='roc_auc')
tree_clf_rand.fit(X_train_prepared, y_train)

RandomizedSearchCV(cv=4, estimator=DecisionTreeClassifier(random_state=42),
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f3b1d10ee50>,
                                        'max_leaf_nodes': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f3b167d9450>,
                                        'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f3b167d98d0>,
                                        'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f3b1697e890>},
                   scoring='roc_auc')

In [None]:
print(tree_clf_rand.best_params_)
print(tree_clf_rand.best_score_)

{'criterion': 'gini', 'max_depth': 46, 'max_leaf_nodes': 432, 'min_samples_leaf': 372, 'min_samples_split': 198}
0.6608666122654997


In [None]:
final_tree_clf = tree_clf_rand.best_estimator_

In [None]:
get_top_features(final_tree_clf,40)

[('POI_distance', 0.32843471763596255),
 ('capacity', 0.07844351187810701),
 ('x0_10AM-2PM', 0.06395060549175349),
 ("x1_Queen's Park", 0.06240643099759838),
 ('x0_6AM-10AM', 0.035661942606273235),
 ('x1_King', 0.034892834722501175),
 ('x1_St Patrick', 0.03489025487223086),
 ('Is_Weekend', 0.02934168684584401),
 ('x1_Dundas', 0.029169843044116912),
 ('x1_St Andrew', 0.02750305571796595),
 ('x1_Queen', 0.02338513629720263),
 ('x1_Osgoode', 0.022914425264749582),
 ('x1_Union Station', 0.02242465349807036),
 ('x0_2PM-6PM', 0.01952159383615947),
 ('x0_2AM-6AM', 0.016365613879997184),
 ('x1_Wellesley', 0.015250336644851713),
 ('x1_Museum Station', 0.015190404147962772),
 ('x1_Sherbourne', 0.014961305167813434),
 ('x1_Bay', 0.011986140533208607),
 ('x1_Exhibition GO', 0.009987969713816795),
 ('x1_Dundas West', 0.007379716589652548),
 ('x1_St. Clair West', 0.007316795912558427),
 ('x1_Lawrence', 0.007094135850992359),
 ('x1_Jane', 0.006680498051827914),
 ('x1_College Station', 0.0066760838416

### Testing with final decision tree model

In [None]:
X_test = df_test[[col for col in df.columns if col not in cols_to_drop+target]]
y_test = df_test[target]

In [None]:

X_test_prepared=full_pipeline.transform(X_test)
y_test_pred = final_tree_clf.predict_proba(X_test_prepared)
roc_auc_score(y_test, y_test_pred[:,1])

0.6619251848050339



```
# This is formatted as code
```

## Decision Tree (Regression) -NOT USED-

In [None]:
# benchmark model using default hyperparameters
tree_clf = DecisionTreeClassifier(random_state=42)
#y_train_pred = cross_val_predict(tree_rg, X_train_prepared, y_train, cv=4)
scores = cross_val_score(tree_clf, X_train_prepared, y_train, cv=4, scoring = 'roc_auc')

KeyboardInterrupt: ignored

In [None]:
scores.mean()

0.5639227458187611

In [None]:
# benchmark model using default hyperparameters
tree_rg = DecisionTreeRegressor(random_state=42)
#y_train_pred = cross_val_predict(tree_rg, X_train_prepared, y_train, cv=4)
scores = cross_val_score(tree_rg, X_train_prepared, y_train, cv=4, scoring = 'neg_mean_squared_error')

In [None]:
np.sqrt(-scores).mean()

0.6539123818861498

#### Decision Tree Tuning - Individual Params

In [None]:
# tuning hyperparameters
param_1 = {'criterion': ['squared_error', 'poisson']}
param_2 = {'max_depth': [None,100,50,10]}
param_3 = {'min_samples_split': [2,100,500,1000]}
param_4 = {'min_samples_leaf': [1,10,50,100,500]}
param_5 = {'max_leaf_nodes': [None,100,50,10,5,2]}        

In [None]:
tree_1 = GridSearchCV(tree_rg, param_1, cv=4, scoring='neg_mean_squared_error')
tree_1.fit(X_train_prepared, y_train)
score_1 = tree_1.cv_results_['mean_test_score']

4 fits failed out of a total of 8.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
4 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/tree/_classes.py", line 1320, in fit
    X_idx_sorted=X_idx_sorted,
  File "/usr/local/lib/python3.7/dist-packages/sklearn/tree/_classes.py", line 179, in fit
    "Some value(s) of y are negative which is"
ValueError: Some value(s) of y are negative which is not allowed for Poisson regression.



In [None]:
print(param_1)
print(score_1)

{'criterion': ['squared_error', 'poisson']}
[-9.76318448         nan]


In [None]:
tree_2 = GridSearchCV(tree_rg, param_2, cv=4, scoring='neg_mean_squared_error')
tree_2.fit(X_train_prepared, y_train)
score_2 = tree_2.cv_results_['mean_test_score']

In [None]:
print(param_2)
print(score_2)

{'max_depth': [None, 100, 50, 10]}
[-9.77339682 -9.77339682 -9.61663195 -5.38794306]


In [None]:
tree_3 = GridSearchCV(tree_rg, param_3, cv=4, scoring='neg_mean_squared_error')
tree_3.fit(X_train_prepared, y_train)
score_3 = tree_3.cv_results_['mean_test_score']

In [None]:
print(param_3)
print(score_3)

{'min_samples_split': [2, 100, 500, 1000]}
[-9.77339682 -5.32765251 -4.92494192 -4.9763092 ]


In [None]:
tree_4 = GridSearchCV(tree_rg, param_4, cv=4, scoring='neg_mean_squared_error')
tree_4.fit(X_train_prepared, y_train)
score_4 = tree_4.cv_results_['mean_test_score']

In [None]:
print(param_4)
print(score_4)

{'min_samples_leaf': [1, 10, 50, 100, 500]}
[-9.77339682 -5.82961265 -5.00893704 -4.91045277 -5.03726477]


In [None]:
tree_5 = GridSearchCV(tree_rg, param_5, cv=4, scoring='neg_mean_squared_error')
tree_5.fit(X_train_prepared, y_train)
score_5 = tree_5.cv_results_['mean_test_score']

In [None]:
print(param_5)
print(score_5)

{'max_leaf_nodes': [None, 100, 50, 10, 5, 2]}
[-9.77339682 -5.25392684 -5.36523318 -5.73178569 -5.76333955 -5.80788125]


#### Decision Tree Tuning - All parameters

In [None]:
hyperparamters_2 = {'max_depth': randint(5,50),
                    'min_samples_split': randint(100,1000),
                    'min_samples_leaf': randint(50,500),
                    'max_leaf_nodes': randint(10,100)}

In [None]:
tree_rg_rand = RandomizedSearchCV(tree_rg, hyperparamters_2, cv=4, scoring='neg_mean_squared_error')
tree_rg_rand.fit(X_train_prepared, y_train)

RandomizedSearchCV(cv=4, estimator=DecisionTreeRegressor(random_state=42),
                   param_distributions={'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f749d6740d0>,
                                        'max_leaf_nodes': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f749d685150>,
                                        'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f749d6855d0>,
                                        'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f749d674b90>},
                   scoring='neg_mean_squared_error')

In [None]:
print(tree_rg_rand.best_params_)
print(tree_rg_rand.best_score_)

{'max_depth': 41, 'max_leaf_nodes': 90, 'min_samples_leaf': 118, 'min_samples_split': 917}
-5.287827624856116


In [None]:
final_tree_rg = tree_rg_rand.best_estimator_

In [None]:
get_top_features(final_tree_rg,40)

[('x0_6AM-10AM', 0.22403327269114473),
 ('POI_distance', 0.19147218806713792),
 ('capacity', 0.1048572306492542),
 ('x0_2PM-6PM', 0.058055625979797465),
 ('x1_Queen', 0.05798088208631442),
 ('Is_Weekend', 0.05749935936864228),
 ('x1_St Patrick', 0.048389946711610586),
 ('x1_St Andrew', 0.03676101732362192),
 ('x0_10PM-2AM', 0.030715148575596384),
 ('x1_Dundas', 0.02977448681702831),
 ("x1_Queen's Park", 0.02934843768901777),
 ('x1_Sherbourne', 0.022297027100466153),
 ('x0_6PM-10PM', 0.019440936349516522),
 ('x1_Wellesley', 0.01754830542978994),
 ('x1_Union Station', 0.01675114199915192),
 ('x0_10AM-2PM', 0.014635865870426527),
 ('x1_King', 0.009186263679933688),
 ('x1_Osgoode', 0.008204211340984525),
 ('Lockdown_Stay_at_Home', 0.008061286821555259),
 ('x1_Museum Station', 0.004337411453704768),
 ('x1_Bay', 0.0037611341280126293),
 ('x1_Dufferin', 0.003534582334335173),
 ('x1_Bloor-Yonge', 0.0033542375329568747),
 ('Temp (°C)', 0.0),
 ('Precip. Amount (mm)', 0.0),
 ('Wind Spd (km/h)', 0

### Testing with final decision tree model

In [None]:
X_test = df_test[[col for col in df.columns if col not in cols_to_drop+target]]
y_test = df_test[target]

In [None]:

X_test_prepared=full_pipeline.transform(X_test)
final_tree_rg = tree_rg_rand.best_estimator_
y_test_pred = final_tree_rg.predict(X_test_prepared)

In [None]:
print(np.sqrt(mean_squared_error(y_test, y_test_pred)))

2.2978413236775963


In [None]:
print(mean_squared_error(y_test, y_test_pred))

5.280074748800407
