# kaggle Titanic Problem

Load packages and data.

In [14]:
# Load packages.
import numpy as np
import pandas as pd
import os
from sklearn.ensemble import RandomForestClassifier
from pprint import pprint
from sklearn.model_selection import RandomizedSearchCV

# Load (cleaned) data.
train = pd.read_csv('./data/train_clean.csv')
test = pd.read_csv('./data/test_clean.csv')

In [15]:
rf = RandomForestClassifier()

# Look at parameters used by default forest.
print('Parameters currently in use:\n')
pprint(rf.get_params())

Parameters currently in use:

{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 'warn',
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}


## Random Hyperparameter Grid

Grid of hyperparameters from which to sample.

In [68]:
# Number of trees in random forest.
n_estimators = [int(x) for x in np.linspace(start=100, stop=1000, num=10)]
print(n_estimators)


# Number of features to consider at every split
max_features = ['auto', 'sqrt']
print(max_features)


# Maximum number of levels in tree.
max_depth = [int(x) for x in np.linspace(1,10, num=10)]
max_depth.append(None)
print(max_depth)

[100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
['auto', 'sqrt']
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, None]


In [69]:
# Create the random grid
random_grid = {'n_estimators' : n_estimators,
               'max_features' : max_features,
               'max_depth' : max_depth}
pprint(random_grid)

{'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, None],
 'max_features': ['auto', 'sqrt'],
 'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]}


In [87]:
# Use the random grid to search for best hyperparameters.

# First create the base model to tune.
rf = RandomForestClassifier()

# Random search of parameters, using 3 fold cross validation, search across all 120 different combinations.
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid, n_iter=120, cv=3, verbose=2, n_jobs=-1)

In [88]:
# Fit the random search model
y = train['Survived']
features = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Age']
X = pd.get_dummies(train[features])
X_test = pd.get_dummies(test[features])

rf_random.fit(X, y)


Fitting 3 folds for each of 120 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   10.8s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   32.2s
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:  1.2min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    n_estimators='warn',
                                                    n_jobs=None

In [89]:
# View best hyperparameters.
rf_random.best_params_

{'n_estimators': 200, 'max_features': 'sqrt', 'max_depth': 5}

In [None]:


# Test new model against base model.
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    error = np.mean(abs(predictions - test_labels) / test_labels)
    print('Average Error: {:0.4f}.'.format(error))
    
    return

base_model = RandomForestClassifier(n_estimators=150, max_depth=6, random_state=1)
base_model.fit(X, y)
base_error = evaluate(base_model, )


In [95]:
best_random_rf = RandomForestClassifier(n_estimators=200, max_features='sqrt', max_depth=5, random_state=1)
best_random_rf.fit(X, y)
predictions = best_random_rf.predict(X_test).astype(int)

print(predictions)

[0 0 0 0 1 0 1 0 1 0 0 0 1 0 1 1 0 0 1 0 0 1 1 0 1 0 1 0 0 0 0 0 1 1 0 0 1
 1 0 0 0 0 0 1 1 0 0 0 1 1 0 0 1 1 0 0 0 0 0 1 0 0 0 1 1 1 1 0 0 1 1 0 1 0
 1 0 0 1 0 1 1 0 0 0 0 0 1 1 1 1 1 0 1 0 0 0 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0
 1 1 1 1 0 0 1 0 1 1 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0
 0 0 1 0 0 1 0 0 1 1 0 1 1 1 1 0 0 1 0 0 1 1 0 0 0 0 0 1 1 0 1 1 0 0 1 0 1
 0 1 0 0 0 0 0 0 0 1 0 1 1 0 1 1 1 0 1 0 0 1 0 1 0 0 0 0 1 0 0 1 0 1 0 1 0
 1 0 1 1 0 1 0 0 0 1 0 0 0 0 0 0 1 1 1 1 0 0 1 0 1 0 1 1 1 0 0 0 0 0 0 0 1
 0 0 0 1 1 0 0 0 0 1 0 0 0 1 1 0 1 0 0 0 0 1 1 1 0 1 0 0 0 0 0 0 1 0 0 0 0
 1 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0
 1 0 0 0 0 0 1 0 0 0 1 1 1 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 1 1 0 1 0 0 1 1 0
 0 1 0 0 1 1 1 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 1 0 0 0 0
 0 1 1 1 1 1 0 1 0 0 1]


In [96]:
output = pd.DataFrame({'PassengerId': test.PassengerId, 'Survived': predictions})
output.to_csv('./submissions/random_forest_best_random.csv', index=False)
print('Your submission was successfully saved!')

Your submission was successfully saved!


In [94]:
output['Survived']

output.dtypes

PassengerId    int64
Survived       int64
dtype: object

In [76]:
gender_sub = pd.read_csv('./submissions/gender_submission.csv')

gender_sub.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
