In [1]:
import pandas as pd
import numpy
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

pd.set_option('chained_assignment',None)
file = pd.read_csv('data.csv')

In [3]:
sub_df = file[['Ad1', 'Ad1Summ1', 'Ad1Summ2', 'Ad1Keystone', 'Ad1Xp', 'Ad1Gold',
                     'Ad2', 'Ad2Summ1', 'Ad2Summ2', 'Ad2Keystone', 'Ad2Xp', 'Ad2Gold',
                     'Supp1', 'Supp1Summ1', 'Supp1Summ2', 'Supp1Keystone', 'Supp1Xp', 'Supp1Gold',
                     'Supp2', 'Supp2Summ1', 'Supp2Summ2', 'Supp2Keystone', 'Supp2Xp', 'Supp2Gold']]

sub_df['SuppXpDiff'] = (sub_df['Supp1Xp'] * 10) - (sub_df['Supp2Xp'] * 10)

x = sub_df[['Ad1', 'Ad1Summ1', 'Ad1Summ2', 'Ad1Keystone', 'Ad2', 'Ad2Summ1', 'Ad2Summ2', 'Ad2Keystone',
           'Supp1', 'Supp1Summ1', 'Supp1Summ2', 'Supp1Keystone', 'Supp2', 'Supp2Summ1', 'Supp2Summ2', 'Supp2Keystone']]

supp_xp_diff = sub_df['SuppXpDiff']

In [4]:
x_train, x_test, y_train, y_test = train_test_split(x, supp_xp_diff, test_size=0.3, random_state=0)

testing_forest = RandomForestRegressor(random_state=0)
testing_forest.fit(x_train, y_train)

testing_tree = DecisionTreeRegressor(random_state=0)
testing_tree.fit(x_train, y_train)

y_forest = testing_forest.predict(x_test)
y_tree = testing_tree.predict(x_test)

forest_mse = mean_squared_error(y_test, y_forest)
print('Random Forest MSE : {}'.format(numpy.sqrt(forest_mse)))

tree_mse = mean_squared_error(y_test, y_tree)
print('Decision Tree MSE : {}'.format(numpy.sqrt(tree_mse)))

Random Forest MSE : 558.1903685641159
Random Tree MSE : 781.3327104661909


In [7]:
forest = RandomForestRegressor(random_state=0)

param_grid = {
    'n_estimators' : [600],
    'max_depth' : [11],
    'max_features': ['sqrt'],
    'max_leaf_nodes': [60],
    'criterion' : ['mse']
}

cv_grid = GridSearchCV(estimator=forest, param_grid=param_grid, cv=5, verbose=1, n_jobs=6)
cv_grid.fit(x_train, y_train)
cv_grid.best_params_

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 out of  20 | elapsed:    6.7s finished


{'criterion': 'mse',
 'max_depth': 11,
 'max_features': 'sqrt',
 'max_leaf_nodes': 60,
 'n_estimators': 600}

In [10]:
model2 = RandomForestRegressor(n_estimators=cv_grid.best_params_['n_estimators'], max_depth=cv_grid.best_params_['max_depth'], max_leaf_nodes=cv_grid.best_params_['max_leaf_nodes'],
                               max_features=cv_grid.best_params_['max_features'], random_state=0)
model2.fit(x_train, y_train)

y2 = model2.predict(x_test)

mse2 = mean_squared_error(y_test, y2)
rmse = numpy.sqrt(mse2)
rmse

scores = -1 * cross_val_score(model2, x, supp_xp_diff,
                        cv=5 , scoring="neg_mean_squared_error")
scores
numpy.sqrt(sum (scores) / len(scores))

548.4011331825341