In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

import warnings

warnings.filterwarnings('ignore')

%matplotlib inline

### Recall: Logistic Regression Model for Titanic Survival

In [None]:
# Read the passenger table from the data directory (path is relative to the notebook)
titanic = pd.read_csv(filepath_or_buffer = '../data/titanic.csv')

In [None]:
# Preview the first five rows to check the columns that were loaded
titanic.head(n = 5)

In [None]:
# Remove the Name column, then one-hot encode Sex and Pclass
# (drop_first leaves out one indicator column per encoded feature).
# NOTE: this rebinds `titanic`, so re-running the cell without re-loading the CSV
# raises a KeyError because 'Name' is already gone.
titanic = (
    titanic
    .drop(columns = 'Name')
    .pipe(pd.get_dummies, columns = ['Sex', 'Pclass'], drop_first = True)
)

In [None]:
# Target is the Survived column; every remaining column is used as a feature
y = titanic['Survived']
X = titanic.drop(columns = ['Survived'])

In [None]:
# Fixed random_state so every run produces the same train/test partition
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state = 321,
)

In [None]:
# The features are not scaled and warnings are silenced globally at the top of the
# notebook, so a ConvergenceWarning from the solver would go unnoticed. Raising the
# iteration cap gives the optimizer room to converge; if the default cap was already
# sufficient, the fitted model is unchanged.
logistic_model = LogisticRegression(max_iter = 1000)
logistic_model.fit(X_train, y_train)

### How did we do on the training set?

In [None]:
# Accuracy of the logistic model on the rows it was fitted on
y_pred_train = logistic_model.predict(X_train)
print(metrics.accuracy_score(y_true = y_train, y_pred = y_pred_train))

### What about on the test set?

In [None]:
# Accuracy of the logistic model on the held-out rows
y_pred = logistic_model.predict(X_test)
print(metrics.accuracy_score(y_true = y_test, y_pred = y_pred))

## What if we try a more flexible model?

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, export_graphviz
#import graphviz
from IPython.display import SVG

We'll start by building a single decision tree classifier.

In [None]:
# Seed the tree so tie-breaking between equally good splits is repeatable;
# without a seed the test-set accuracy can differ from run to run.
# The seed matches the one used for the train/test split.
tree = DecisionTreeClassifier(random_state = 321)
tree.fit(X_train, y_train)

## How did we do on the training set?

In [None]:
# Accuracy of the single decision tree on the rows it was fitted on
y_pred_train = tree.predict(X_train)
print(metrics.accuracy_score(y_true = y_train, y_pred = y_pred_train))

## 98.8% accuracy!!! What about on the test set?

In [None]:
# Accuracy of the single decision tree on the held-out rows
y_pred = tree.predict(X_test)
print(metrics.accuracy_score(y_true = y_test, y_pred = y_pred))

What happened!? Let's see how our model is making predictions.

In [None]:
# The two commented-out commands below regenerate the diagram: export_graphviz writes
# the fitted tree to tree.dot, and the Graphviz `dot` CLI renders that file to tree.png.
#export_graphviz(tree, 'tree.dot', feature_names = X.columns, filled=True, rounded=True, special_characters=False, impurity=False)

#! dot -Tpng tree.dot -o tree.png

# NOTE(review): the commands above would write tree.png to the working directory, but
# the image displayed here is loaded from ../assets/ -- presumably a pre-rendered copy;
# confirm it still matches the tree fitted in this notebook.
from IPython.display import Image
Image(filename='../assets/tree.png') 

The problem is that our model essentially memorizes the training set without trying to uncover patterns that it could generalize.

To correct for this problem, we can take an ensemble approach, which means that we will build many decision trees on subsets of the features and data and then average the predictions of all of the trees. This will force our model to try and find more general patterns that will work on the test set.

In [None]:
# Random forests draw bootstrap samples and feature subsets at random, so an unseeded
# fit gives different accuracies on every run. Seeding makes the results reproducible;
# the seed matches the one used for the train/test split.
forest = RandomForestClassifier(random_state = 321)
forest.fit(X_train, y_train)

## Training Set Accuracy:

In [None]:
# Accuracy of the random forest on the rows it was fitted on
y_pred_train = forest.predict(X_train)
print(metrics.accuracy_score(y_true = y_train, y_pred = y_pred_train))

## Test Set Accuracy:

In [None]:
# Accuracy of the random forest on the held-out rows
y_pred = forest.predict(X_test)
print(metrics.accuracy_score(y_true = y_test, y_pred = y_pred))

We see some improvement over using a single tree, but we could do better. Random forests have a lot of hyperparameters that can be tuned to improve our model. Here are a few of these parameters:

* **n_estimators:** Number of decision trees to train. Default is 100 (it was 10 in scikit-learn versions before 0.22). More trees = less variance, but slower to train and predict
* **max_depth:** Maximum depth (number of splits). By default, there is no max depth.
* **min_samples_leaf:** Minimum number of samples per leaf. Setting this higher keeps the decision trees from paying too much attention to any single data point.

These parameters can be tuned to try to improve the model that you get, and there are ways to automatically tune these parameters. See, for example, sklearn's GridSearchCV or RandomizedSearchCV.

In [None]:
# Tuned forest: many shallow trees, each leaf required to hold at least 5 samples.
# random_state fixes the bootstrap/feature sampling so the reported accuracies and
# feature importances are reproducible (same seed as the train/test split).
forest = RandomForestClassifier(
    n_estimators = 1000,
    max_depth = 5,
    min_samples_leaf = 5,
    random_state = 321,
)
forest.fit(X_train, y_train)

In [None]:
# Accuracy of the tuned random forest on the rows it was fitted on
y_pred_train = forest.predict(X_train)
print(metrics.accuracy_score(y_true = y_train, y_pred = y_pred_train))

In [None]:
# Accuracy of the tuned random forest on the held-out rows
y_pred = forest.predict(X_test)
print(metrics.accuracy_score(y_true = y_test, y_pred = y_pred))

A nice perk of using random forest models is that we can see which features are the most important in making predictions.

In [None]:
# Pair each importance with its feature name and rank them, so the most influential
# features appear at the top of the chart instead of in raw column order.
importances = pd.Series(forest.feature_importances_, index = X.columns).sort_values(ascending = False)

fig, ax = plt.subplots(figsize = (7,5))
sns.barplot(x = importances.values, y = list(importances.index), ax = ax, edgecolor = 'black')
# Title and axis labels are set on the Axes object rather than through pyplot's
# implicit "current axes", so the figure is self-describing.
ax.set(title = 'Random Forest Feature Importance', xlabel = 'Importance', ylabel = 'Feature');