In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv('train.csv')

df = df.drop(['Name', 'Ticket', 'Cabin'], axis=1)

age_median = df['Age'].median()
df['Age'] = df['Age'].fillna(age_median)

df['Embarked'] = df['Embarked'].fillna('S')

df['Sex'] = df['Sex'].map({'female':0, 'male':1})
df = pd.concat([df, pd.get_dummies(df['Embarked'], prefix='Embarked')], axis=1)

df = df.drop(['Embarked'], axis=1)

df_train = df.iloc[:712, :]
df_test = df.iloc[712:, :]

X_train = df_train.iloc[:, 2:].values
Y_train = df_train['Survived']

X_test = df_test.iloc[:, 2:].values
Y_test = df_test['Survived']

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV

parameter_grid = {
    'max_features': [0.5, 1.],
    'max_depth': [5., None]
}

grid_search = GridSearchCV(RandomForestClassifier(n_estimators=100), parameter_grid, cv=5, verbose=3)



In [3]:
grid_search.fit(X_train, Y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] max_features=0.5, max_depth=5.0 .................................
[CV] ........ max_features=0.5, max_depth=5.0, score=0.804196 -   0.3s
[CV] max_features=0.5, max_depth=5.0 .................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s


[CV] ........ max_features=0.5, max_depth=5.0, score=0.790210 -   0.2s
[CV] max_features=0.5, max_depth=5.0 .................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.5s remaining:    0.0s


[CV] ........ max_features=0.5, max_depth=5.0, score=0.895105 -   0.2s
[CV] max_features=0.5, max_depth=5.0 .................................
[CV] ........ max_features=0.5, max_depth=5.0, score=0.823944 -   0.2s
[CV] max_features=0.5, max_depth=5.0 .................................
[CV] ........ max_features=0.5, max_depth=5.0, score=0.773050 -   0.3s
[CV] max_features=1.0, max_depth=5.0 .................................
[CV] ........ max_features=1.0, max_depth=5.0, score=0.811189 -   0.3s
[CV] max_features=1.0, max_depth=5.0 .................................
[CV] ........ max_features=1.0, max_depth=5.0, score=0.811189 -   0.3s
[CV] max_features=1.0, max_depth=5.0 .................................
[CV] ........ max_features=1.0, max_depth=5.0, score=0.867133 -   0.3s
[CV] max_features=1.0, max_depth=5.0 .................................
[CV] ........ max_features=1.0, max_depth=5.0, score=0.809859 -   0.2s
[CV] max_features=1.0, max_depth=5.0 .................................
[CV] .

[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    5.2s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_features': [0.5, 1.0], 'max_depth': [5.0, None]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=3)

In [4]:
grid_search.grid_scores_

[mean: 0.81742, std: 0.04234, params: {'max_features': 0.5, 'max_depth': 5.0},
 mean: 0.81882, std: 0.02502, params: {'max_features': 1.0, 'max_depth': 5.0},
 mean: 0.80478, std: 0.02898, params: {'max_features': 0.5, 'max_depth': None},
 mean: 0.81039, std: 0.02781, params: {'max_features': 1.0, 'max_depth': None}]

In [5]:
grid_search.best_params_

{'max_depth': 5.0, 'max_features': 1.0}

In [6]:
model = RandomForestClassifier(n_estimators=100, max_features=1.0, max_depth=5.0, random_state=0)
model = model.fit(X_train, Y_train)

In [7]:
Y_prediction = model.predict(X_test)

In [8]:
np.sum(Y_prediction == Y_test) / float(len(Y_test))

0.86033519553072624

In [11]:
from sklearn.cross_validation import KFold

X = df.iloc[:, 2:].values
Y = df['Survived']

cv = KFold(n=len(Y), n_folds=5)
results = []

for training_set, test_set in cv:
    X_train = X[training_set]
    Y_train = Y[training_set]
    X_test = X[test_set]
    Y_test = Y[test_set]
    model = RandomForestClassifier(n_estimators=100)
    model.fit(X_train, Y_train)
    Y_prediction = model.predict(X_test)
    result = np.sum(Y_test == Y_prediction) * 1./len(Y_test)
    results.append(result)
    print "Prediction accuracy:", result

print "Overall prediction accuracy:", np.mean(results)

Prediction accuracy: 0.77094972067
Prediction accuracy: 0.808988764045
Prediction accuracy: 0.85393258427
Prediction accuracy: 0.769662921348
Prediction accuracy: 0.825842696629
Overall prediction accuracy: 0.805875337393
