In [1]:
from datetime import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [2]:
# Loading data
train = pd.read_csv('../../data/class_regression/prepocessed_train_data.csv', index_col='id')
test = pd.read_csv('../../data/class_regression/prepocessed_test_data.csv', index_col='id')
X1_train = train.loc[train['playtime_forever'] > 0, : ]
y1_train = X1_train['playtime_forever']
X1_train = X1_train.drop(columns=['playtime_forever', 'is_play'])
X2_train = train.drop(columns=['playtime_forever', 'is_play'])
y2_train = train['is_play']

In [22]:
# Tuning hyperparameter
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# Use the random grid to search for best hyperparameters
# First create the base model to tune
rfr_init = RandomForestRegressor()
rfc_init = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
# rfr = RandomizedSearchCV(estimator = rfr_init, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# rfr.fit(X1_train, y1_train)

rfc = RandomizedSearchCV(estimator = rfc_init, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
rfc.fit(X2_train, y2_train)



Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   11.0s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   47.9s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  1.6min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    n_estimators='warn',
                                                    n_jobs=None

In [17]:
rfr.best_params_

{'n_estimators': 400,
 'min_samples_split': 2,
 'min_samples_leaf': 4,
 'max_features': 'sqrt',
 'max_depth': 10,
 'bootstrap': True}

In [18]:
rfr = RandomForestRegressor(**{'n_estimators': 400,
 'min_samples_split': 2,
 'min_samples_leaf': 4,
 'max_features': 'sqrt',
 'max_depth': 10,
 'bootstrap': True})

In [23]:
rfc.best_params_

{'n_estimators': 400,
 'min_samples_split': 2,
 'min_samples_leaf': 4,
 'max_features': 'sqrt',
 'max_depth': 10,
 'bootstrap': True}

In [24]:
rfc = RandomForestClassifier(**{'n_estimators': 400,
 'min_samples_split': 2,
 'min_samples_leaf': 4,
 'max_features': 'sqrt',
 'max_depth': 10,
 'bootstrap': True})
rfc.fit(X2_train, y2_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=10, max_features='sqrt', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=4, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=400,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [25]:
cross_val_score(rfc, X2_train, y2_train, cv=10).mean()

0.6965592903828198

In [26]:
cross_val_score(rfr, X1_train, y1_train, cv=10).mean()

0.007878776997807679

In [28]:
rfr.fit(X1_train, y1_train)
rfc.fit(X2_train, y2_train)
is_play = rfc.predict(test)
playtime_forever = rfr.predict(test)
submission = pd.DataFrame(data={
    'playtime_forever': playtime_forever,
    'is_play': is_play
})
submission.loc[submission['is_play'] == 0, 'playtime_forever'] = 0
submission = submission.drop(columns=['is_play'])
submission.index.name = 'id'
submission.to_csv('../../submission/cla_reg_datasplit/rf1.csv')