In [57]:
import pandas as pd
import numpy
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

pd.set_option('chained_assignment',None)

In [None]:
file = pd.read_csv('data.csv')

In the cells below, I setting up my data, so that I can train and test.

In [3]:
sub_df = file[['Ad1', 'Ad1Summ1', 'Ad1Summ2', 'Ad1Keystone', 'Ad1Xp', 'Ad1Gold',
                     'Ad2', 'Ad2Summ1', 'Ad2Summ2', 'Ad2Keystone', 'Ad2Xp', 'Ad2Gold',
                     'Supp1', 'Supp1Summ1', 'Supp1Summ2', 'Supp1Keystone', 'Supp1Xp', 'Supp1Gold',
                     'Supp2', 'Supp2Summ1', 'Supp2Summ2', 'Supp2Keystone', 'Supp2Xp', 'Supp2Gold']]

sub_df['AdXpDiff'] = (sub_df['Ad1Xp'] * 10) - (sub_df['Ad2Xp'] * 10)
sub_df['AdGoldDiff'] = (sub_df['Ad1Gold'] * 10) - (sub_df['Ad2Gold'] * 10)
sub_df['SuppXpDiff'] = (sub_df['Supp1Xp'] * 10) - (sub_df['Supp2Xp'] * 10)
sub_df['SuppGoldDiff'] = (sub_df['Supp1Gold'] * 10) - (sub_df['Supp2Gold'] * 10)

print(sub_df)

      Ad1  Ad1Summ1  Ad1Summ2  Ad1Keystone  Ad1Xp  Ad1Gold  Ad2  Ad2Summ1  \
0     202         4         7         8128  305.8    222.3   29         7   
1      50         3         4         8010  285.7    213.5  202         4   
2     236         7         4         8005  235.3    156.7   74        14   
3     202         7         4         8128  297.2    316.9  222         4   
4      21         7         4         8229  289.1    318.9  145         7   
...   ...       ...       ...          ...    ...      ...  ...       ...   
9743  202         4         7         8128  331.6    317.6  145         7   
9744   21         4         7         8128  288.6    233.7   67         4   
9745   15         7         4         8008  267.4    260.0   21         7   
9746   21         7         4         8005  289.0    248.5  236         7   
9747  523         7         4         8005  275.3    198.2   22         4   

      Ad2Summ2  Ad2Keystone  ...  Supp2  Supp2Summ1  Supp2Summ2  \
0       

In [4]:
x = sub_df[['Ad1', 'Ad1Summ1', 'Ad1Summ2', 'Ad1Keystone', 'Ad2', 'Ad2Summ1', 'Ad2Summ2', 'Ad2Keystone',
           'Supp1', 'Supp1Summ1', 'Supp1Summ2', 'Supp1Keystone', 'Supp2', 'Supp2Summ1', 'Supp2Summ2', 'Supp2Keystone']]

ad_df = sub_df[['Ad1', 'Ad1Summ1', 'Ad1Summ2', 'Ad1Keystone', 'Ad2', 'Ad2Summ1', 'Ad2Summ2', 'Ad2Keystone']]

ad_xp_diff = sub_df['AdXpDiff']
ad_g_diff = sub_df['AdGoldDiff']
supp_xp_diff = sub_df['SuppXpDiff']
supp_g_diff = sub_df['SuppGoldDiff']

Now that my data is ready to be used, I will be comparing a Decision Tree regressor vs a Random Forest Regressor with no parameters to see which performs better. To test my models, I will be training and testing with splits and evaluating the mean squared error.

In [160]:
# ad exp diff
x_train, x_test, y_train, y_test = train_test_split(x, ad_xp_diff, test_size=0.3, random_state=0)

In [162]:
testing_forest = RandomForestRegressor(random_state=0)
testing_forest.fit(x_train, y_train)

RandomForestRegressor(random_state=0)

In [163]:
testing_tree = DecisionTreeRegressor(random_state=0)
testing_tree.fit(x_train, y_train)

DecisionTreeRegressor(random_state=0)

In [164]:
y_forest = testing_forest.predict(x_test)
y_tree = testing_tree.predict(x_test)

In [165]:
forest_mse = mean_squared_error(y_test, y_forest)
print('Random Forest MSE : {}'.format(numpy.sqrt(forest_mse)))

tree_mse = mean_squared_error(y_test, y_tree)
print('Decision Tree MSE : {}'.format(numpy.sqrt(tree_mse)))

Random Forest MSE : 935.2691280702388
Random Tree MSE : 1343.9165511333865


Based on the results above, it appears that the random forest regressor performs much better than the decision tree regressor for this dataset. Therefore, I will continue using the random forest regressor and will begin to tune the parameters to get better results. I will be tuning the parameters of the model through scikit-learn's CVGridSearch. 

In [166]:
forest = RandomForestRegressor(random_state=0)

# example of a paramter grid that I would use for CVGridSearch
#param_grid = {
#    'n_estimators' : [400, 500, 600],
#    'max_depth' : [3,4,5,6,7],
#    'max_features': ['sqrt', 'auto', 'log2'],
#    'max_leaf_nodes': [30, 40, 50],
#    'criterion' : ['mse']
#}

param_grid = {
    'n_estimators' : [410, 420, 430],
    'max_depth' : [9, 10],
    'max_features': ['sqrt'],
    'max_leaf_nodes': [50, 60],
    'criterion' : ['mse']
}

In [167]:
cv_grid = GridSearchCV(estimator=forest, param_grid=param_grid, cv=5, verbose=1, n_jobs=4)
cv_grid.fit(x_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   12.2s
[Parallel(n_jobs=4)]: Done  60 out of  60 | elapsed:   16.6s finished


GridSearchCV(cv=5, estimator=RandomForestRegressor(random_state=0), n_jobs=4,
             param_grid={'criterion': ['mse'], 'max_depth': [9, 10],
                         'max_features': ['sqrt'], 'max_leaf_nodes': [50, 60],
                         'n_estimators': [410, 420, 430]},
             verbose=1)

In [168]:
cv_grid.best_params_

{'criterion': 'mse',
 'max_depth': 10,
 'max_features': 'sqrt',
 'max_leaf_nodes': 60,
 'n_estimators': 430}

In [169]:
model2 = RandomForestRegressor(n_estimators=cv_grid.best_params_['n_estimators'], max_depth=cv_grid.best_params_['max_depth'], max_leaf_nodes=cv_grid.best_params_['max_leaf_nodes'],
                               max_features=cv_grid.best_params_['max_features'], random_state=0)
model2.fit(x_train, y_train)

RandomForestRegressor(max_depth=10, max_features='sqrt', max_leaf_nodes=60,
                      n_estimators=430, random_state=0)

In [170]:
y2 = model2.predict(x_test)

In [171]:
mse2 = mean_squared_error(y_test, y2)
rmse = numpy.sqrt(mse2)
rmse

923.7784235654492

After finding the parameters for my model, you can see that the model's mean squared error only improved by around 10. There is some improvement from the default random forest regressor when validating with the train and test splits. An MSE of about 566 seems to be the limit as any kind of improvement from hypertuning parameters is an insignificant amount. 

Instead of validating with train_test_split, I will try cross validation to see if my validation results would become better. 

In [156]:
scores = -1 * cross_val_score(model2, x, ad_xp_diff,
                        cv=5 , scoring="neg_mean_squared_error")
scores

array([316839.71577186, 323887.46769241, 342037.31446344, 318803.5858504 ,
       318668.43861038])

In [159]:
numpy.sqrt(sum (scores) / len(scores))

569.2515300617989

After finding the average MSE for each fold in our cross validation and square rooting it, there actually isn't that much of a difference compared to the train_test_split.  