In [1]:
import pandas as pd

### Load your pickled data

In [2]:
features_numeric = pd.read_pickle('../1.Titanic/Data/features_numeric.pickle')

In [3]:
target = pd.read_pickle('../1.Titanic/Data/target.pickle')

### Train_test_split
After doing some initial cleaning on your data, split your dataset into train (what you'll be training your algorithm on) and test sets (what you'll be testing your algorithm on). 

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features_numeric, target, test_size=.3, random_state=42)

In [5]:
from sklearn.preprocessing import StandardScaler

In [6]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Supervised Machine Learning
**Decision Trees:**

https://blog.socialcops.com/engineering/machine-learning-python/:
> >
A decision tree examines one variable at a time, and splits into one of two branches based on the result of that value, at which point it does the same for the next variable. A fantastic visual explanation of how decision trees work can be found here: http://www.r2d3.us/visual-intro-to-machine-learning-part-1/.

To create this tree, we first initialize an instance of an untrained decision tree classifier. (Here we will set the maximum depth of the tree to 10). Next we “fit” this classifier to our training set, enabling it to learn about how different factors affect the survivability of a passenger. Now that the decision tree is ready, we can “score” it using our test data to determine how accurate it is.

In [7]:
from sklearn.tree import DecisionTreeClassifier

In [8]:
# We will start off with a simple decision tree classifier
dtc = DecisionTreeClassifier(max_depth=10, random_state=42)

In [9]:
dtc.fit (X_train, y_train)
dtc.score (X_test, y_test)
# The model correctly predicted the survival of 77% of the test set. Not bad for our first model!

0.77551020408163263

To build a more robust model, import GridSearchCV

In [16]:
from sklearn.model_selection import GridSearchCV

In [11]:
params = {
    'criterion':['gini', 'entropy'],
    'max_depth':[1,2,4,8,16, None],
    'min_samples_leaf':[1,5,10,15]
}

In [14]:
gs_dtc = GridSearchCV(DecisionTreeClassifier(), param_grid=params, cv=5)

In [15]:
gs_dtc.fit(X_train_scaled, y_train)
#this might take a minute because we're building 6*8 (48) different models

GridSearchCV(cv=5, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'criterion': ['gini', 'entropy'], 'max_depth': [1, 2, 4, 8, 16, None], 'min_samples_leaf': [1, 5, 10, 15]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [17]:
gs_dtc.best_estimator_
# This is our best model

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [18]:
gs_dtc.score(X_train_scaled, y_train)
# This is a much better score than our benchmark - but this is only on trained data

0.82494529540481398

In [20]:
gs_dtc.score(X_test_scaled, y_test)
# This is only slightly better than our benchmark

0.7857142857142857