In [1]:
import pandas as pd
import numpy
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

# Silence pandas' SettingWithCopyWarning for the whole session.
# The fully-qualified key is used on purpose: pandas resolves abbreviated
# option names by pattern matching, which can start failing if a similarly
# named option is added in a later release.
pd.set_option('mode.chained_assignment', None)

# Load the raw dataset; the path is relative to the kernel's working directory.
file = pd.read_csv('data.csv')

In [2]:
# Columns for the Top1/Top2 pair.
# NOTE(review): presumably the two opposing top-lane players — confirm against the dataset.
top_columns = [
    'Top1', 'Top1Summ1', 'Top1Summ2', 'Top1Keystone', 'Top1Xp', 'Top1Gold',
    'Top2', 'Top2Summ1', 'Top2Summ2', 'Top2Keystone', 'Top2Xp', 'Top2Gold',
]

# Model inputs: everything except the XP/gold columns, so the target is not
# derivable from the features.
feature_columns = [
    'Top1', 'Top1Summ1', 'Top1Summ2', 'Top1Keystone',
    'Top2', 'Top2Summ1', 'Top2Summ2', 'Top2Keystone',
]

# Take an explicit copy so that adding a column below writes to an independent
# frame instead of to something derived from `file` (the pattern that triggers
# SettingWithCopyWarning).
sub_df = file[top_columns].copy()

# Regression target: scaled XP difference between Top1 and Top2.
# NOTE(review): the x10 scale factor is not explained in this notebook — confirm units.
sub_df['TopXpDiff'] = (sub_df['Top1Xp'] * 10) - (sub_df['Top2Xp'] * 10)

x = sub_df[feature_columns]

xp_diff = sub_df['TopXpDiff']

In [4]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, xp_diff, test_size=0.3, random_state=0)

# Baseline models with default hyperparameters (seeded for reproducibility).
testing_forest = RandomForestRegressor(random_state=0)
testing_forest.fit(x_train, y_train)

testing_tree = DecisionTreeRegressor(random_state=0)
testing_tree.fit(x_train, y_train)

y_forest = testing_forest.predict(x_test)
y_tree = testing_tree.predict(x_test)

# The reported number is the square root of the MSE, i.e. the RMSE, so it is
# labelled as such (the previous "MSE" label did not match the printed value).
forest_mse = mean_squared_error(y_test, y_forest)
forest_rmse = numpy.sqrt(forest_mse)
print('Random Forest RMSE : {}'.format(forest_rmse))

tree_mse = mean_squared_error(y_test, y_tree)
tree_rmse = numpy.sqrt(tree_mse)
print('Decision Tree RMSE : {}'.format(tree_rmse))

Random Forest MSE : 998.5760734786164
Decision Tree MSE : 1338.0114896982677


In [12]:
# Base estimator for the search; the seed keeps every candidate fit reproducible.
forest = RandomForestRegressor(random_state=0)

# Hyperparameter grid: 2 x 1 x 1 x 2 x 1 = 4 candidates.
# The criterion is taken from the estimator's own default instead of the
# hard-coded 'mse': that spelling was renamed to 'squared_error' in
# scikit-learn 1.0 and removed in 1.2, so the literal fails on current
# releases. `forest.criterion` yields the squared-error criterion under
# whichever name the installed version uses.
param_grid = {
    'n_estimators' : [650, 675],
    'max_depth' : [10],
    'max_features': ['sqrt'],
    'max_leaf_nodes': [70, 75],
    'criterion' : [forest.criterion]
}

# 5-fold cross-validated search over the training split only, using 6 worker processes.
cv_grid = GridSearchCV(estimator=forest, param_grid=param_grid, cv=5, verbose=1, n_jobs=6)
cv_grid.fit(x_train, y_train)
cv_grid.best_params_

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 out of  20 | elapsed:    4.8s finished


{'criterion': 'mse',
 'max_depth': 10,
 'max_features': 'sqrt',
 'max_leaf_nodes': 75,
 'n_estimators': 675}

In [13]:
# Rebuild the forest from *all* tuned hyperparameters. Unpacking best_params_
# keeps this cell in sync with the grid (the previous key-by-key copy silently
# dropped 'criterion' and would drop any parameter added to the grid later).
model2 = RandomForestRegressor(random_state=0, **cv_grid.best_params_)
model2.fit(x_train, y_train)

# Hold-out RMSE on the 30% test split.
y2 = model2.predict(x_test)

mse2 = mean_squared_error(y_test, y2)
rmse = numpy.sqrt(mse2)
print(rmse)

# 5-fold cross-validated MSE. The scorer returns negated MSE, hence the sign flip.
# Note: this runs over the full dataset (x, xp_diff), which includes the rows
# of the hold-out split used above.
scores = -1 * cross_val_score(model2, x, xp_diff,
                              cv=5, scoring="neg_mean_squared_error")
# A bare `scores` expression in the middle of a cell displays nothing, so the
# per-fold values are printed explicitly.
print('Per-fold CV MSE : {}'.format(scores))

# Overall cross-validated RMSE: square root of the mean per-fold MSE.
numpy.sqrt(scores.mean())

944.1011799548132


943.4626033509493