# Airline Passenger Satisfaction - Ensemble: Random Forest
----
## Load data

In [None]:
%run ./01_data_prep.ipynb
%run ./utils.ipynb

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Baseline: a random forest with default hyperparameters, trained on the
# training split. fit() returns the estimator itself, so construction and
# training are chained into one statement.
clf = RandomForestClassifier().fit(X_train, y_train)

# Class predictions for the validation split (scored in a later cell).
y_pred = clf.predict(X_valid)


In [None]:
# Inspect the max_leaf_nodes hyperparameter of the baseline forest.
clf.max_leaf_nodes

In [None]:
from sklearn import metrics

# Fraction of validation samples whose predicted label matches the true one.
metrics.accuracy_score(y_true=y_valid, y_pred=y_pred)

In [None]:
from sklearn.model_selection import GridSearchCV
import numpy as np

# Hyperparameter candidates: 4 * 3 * 2 * 2 = 48 combinations.
param_grid = [
    {
        'n_estimators': [10, 25, 100, 200],
        'max_features': [10, 25, 30],
        'max_depth': [50, None],
        'bootstrap': [True, False],
    },
]

# Exhaustive search over the grid with 5-fold cross-validation on the
# training split; n_jobs=-1 runs the fits on all available cores.
dt_gs = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, n_jobs=-1)

dt_gs.fit(X_train, y_train)

In [None]:
# Mean cross-validated score of the best hyperparameter combination found
# by the grid search.
dt_gs.best_score_

In [None]:
# Hyperparameter combination that achieved the best cross-validated score.
dt_gs.best_params_

In [None]:
# Sweep max_depth and record accuracy on the validation and training splits
# so the two curves can be compared when plotted.
p = [1, 10, 25, 50, 100]
lst_test = []
lst_train = []
for depth in p:
    forest = RandomForestClassifier(max_depth=depth, n_jobs=-1)
    forest.fit(X_train, y_train)
    # Despite its name, lst_test collects validation-split scores.
    lst_test.append(forest.score(X_valid, y_valid))
    lst_train.append(forest.score(X_train, y_train))

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Accuracy versus max_depth for the depth sweep. lst_test holds scores
# computed on the validation split (X_valid / y_valid), so the legend names
# the series "Validation" rather than "Test".
plt.plot(p, lst_test, color='red', label='Validation Accuracy')
plt.plot(p, lst_train, color='b', label='Train Accuracy')
plt.xlabel('Model Complexity --->')
plt.ylabel('Accuracy')
plt.title('Best value of max_depth')
plt.legend()
# Persist the figure next to the notebook.
plt.savefig('RF_complexity_depth.png')

In [None]:
# Learning curve for a random forest left at its default hyperparameters.
model = RandomForestClassifier()

# NOTE(review): plot_learning_curve is not defined in this notebook; it is
# presumably provided by the %run of utils.ipynb — confirm.
plot_learning_curve(
    model,
    "RF Learning Curve",
    X_train,
    y_train,
    n_jobs=-1,
)
plt.savefig("learning_curve_RF_big.png")