In [1]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_svmlight_file
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np

In [2]:
# load data in LibSVM sparse data format
X_train, y_train = load_svmlight_file("a9a.txt")
X_test, y_test = load_svmlight_file("a9a.t")

In [3]:
#create Random Forest model with default parameters (to use it in GridSearchCV)
model = RandomForestClassifier(max_features='sqrt')
model.fit(X_train, y_train)
#print(model.get_params())

In [4]:
# define possible hyperparameters
params = {'n_estimators': [10, 50, 100],
          'bootstrap': [True, False],
          'max_depth': [2, 5, 10],
          'min_impurity_decrease': [0.0, 0.1, 0.2],
          'min_samples_leaf': [1, 5, 10]}

In [5]:
#Run GridSearchCV
tuning_rf = GridSearchCV(estimator = model, param_grid = params, scoring = 'accuracy', cv = 5, n_jobs = -1)

#fit model on training data to obtain best hyperparameters
tuning_rf.fit(X_train, y_train)

#obtain best hyperparameters
print(tuning_rf.best_params_)

#initialize tuned final model
tuned_rf=tuning_rf.best_estimator_

{'bootstrap': False, 'max_depth': 10, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 5, 'n_estimators': 50}


In [6]:
#fit tuned model on training data
tuned_rf.fit(X_train,y_train)

#obtain training accuracy/error:
train_pred=tuned_rf.predict(X_train)
train_accuracy=accuracy_score(y_train, train_pred)

print("Training Accuracy:",train_accuracy*100, "%","Training Error:",100-(train_accuracy*100),"%")

Training Accuracy: 84.51521759159732 % Training Error: 15.48478240840268 %


In [7]:
#Run 5-fold Cross Validation on the model with best parameters and get the accuracy
cross_val_accuracy=cross_val_score(tuned_rf,X_train,y_train,cv=5).mean()

print("Cross Vaidation Accuracy:",cross_val_accuracy*100,"%","Cross Validation Error:" ,(100-(cross_val_accuracy*100)),"%")

Cross Vaidation Accuracy: 83.95014403247936 % Cross Validation Error: 16.049855967520642 %


In [8]:
# make predictions for test data
y_pred = model.predict(X_test)

# Obtain tessting accuracy/error:
test_accuracy = accuracy_score(y_test, y_pred)

print("Test Accuracy:",test_accuracy*100,"%","Test Error:",100-(test_accuracy*100),"%")

Test Accuracy: 83.40396781524476 % Test Error: 16.596032184755245 %


In [9]:
#Accuracy of configuration of parameters
accuracy_config_pd=pd.concat([pd.DataFrame(tuning_rf.cv_results_["params"]),
           pd.DataFrame(tuning_rf.cv_results_["mean_test_score"], columns=["Accuracy"])],axis=1)
print(accuracy_config_pd)

     bootstrap  max_depth  min_impurity_decrease  min_samples_leaf  \
0         True          2                    0.0                 1   
1         True          2                    0.0                 1   
2         True          2                    0.0                 1   
3         True          2                    0.0                 5   
4         True          2                    0.0                 5   
..         ...        ...                    ...               ...   
157      False         10                    0.2                 5   
158      False         10                    0.2                 5   
159      False         10                    0.2                10   
160      False         10                    0.2                10   
161      False         10                    0.2                10   

     n_estimators  Accuracy  
0              10  0.764595  
1              50  0.764319  
2             100  0.767636  
3              10  0.765886  
4        