In [45]:
import os
import sys
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV

In [46]:
train = pd.read_csv('train.csv')
train.set_index(train['PassengerId'],inplace=True)
train.head().T

PassengerId,1,2,3,4,5
PassengerId,1,2,3,4,5
Survived,0,1,1,1,0
Pclass,3,1,3,1,3
Name,"Braund, Mr. Owen Harris","Cumings, Mrs. John Bradley (Florence Briggs Th...","Heikkinen, Miss. Laina","Futrelle, Mrs. Jacques Heath (Lily May Peel)","Allen, Mr. William Henry"
Sex,male,female,female,female,male
Age,22,38,26,35,35
SibSp,1,1,0,1,0
Parch,0,0,0,0,0
Ticket,A/5 21171,PC 17599,STON/O2. 3101282,113803,373450
Fare,7.25,71.2833,7.925,53.1,8.05


## clean up the data

In [47]:
print len(train)
print sum(list(train.groupby('Sex').size()))
train.groupby('Sex').size()

891
891


Sex
female    314
male      577
dtype: int64

In [48]:
def clean_titanic_data(df):
    #clean embarked
    df['Embarked'] = df['Embarked'].apply(lambda x: 0 if x=='Q' else x)
    df['Embarked'] = df['Embarked'].apply(lambda x: 1 if x=='S' else x)
    df['Embarked'] = df['Embarked'].apply(lambda x: 2 if x=='C' else x)
    df['Embarked'] = df['Embarked'].fillna('3')
    #clean age
    avg_age = np.average(df[(df['Age'].notnull())]['Age'])
    print "avg age: ",avg_age
    df['Age'] = df['Age'].fillna(avg_age)
    #clean fare
    avg_fare = np.average(df[(df['Fare'].notnull())]['Fare'])
    print "avg Fare: ",avg_fare
    df['Fare'] = df['Fare'].fillna(avg_fare)
    #turn sex to number
    df['Sex'] = df['Sex'].apply(lambda x: 0 if x=='female' else 1)
    df['Sex'] = df['Sex'].fillna(2)
    #drop other stuff while testing
    df.drop(['Name', 'PassengerId', 'Ticket', 'Cabin'], axis=1, inplace=True)
    
    return df
    

In [49]:
clean_train = clean_titanic_data(train)

avg age:  29.6991176471
avg Fare:  32.2042079686


In [50]:
clean_train.head().T

PassengerId,1,2,3,4,5
Survived,0.0,1.0,1.0,1.0,0.0
Pclass,3.0,1.0,3.0,1.0,3.0
Sex,1.0,0.0,0.0,0.0,1.0
Age,22.0,38.0,26.0,35.0,35.0
SibSp,1.0,1.0,0.0,1.0,0.0
Parch,0.0,0.0,0.0,0.0,0.0
Fare,7.25,71.2833,7.925,53.1,8.05
Embarked,1.0,2.0,1.0,1.0,1.0


## test out some stuff

In [51]:
clean_train_y = clean_train[[0]]
clean_train.drop(['Survived'], axis=1, inplace=True)
df_train_index, df_test_index = train_test_split(clean_train.index, test_size=0.35)
df_test_x = clean_train.loc[df_test_index]
df_train_x = clean_train.loc[df_train_index]
df_test_y = clean_train_y.loc[df_test_index]
df_train_y = clean_train_y.loc[df_train_index]

In [52]:
df_train_y.head().T

PassengerId,761,891,78,16,499
Survived,0,0,0,1,0


In [53]:
df_train_x.head().T

PassengerId,761,891,78,16,499
Pclass,3.0,3.0,3.0,2,1.0
Sex,1.0,1.0,1.0,0,0.0
Age,29.69912,32.0,29.69912,55,25.0
SibSp,0.0,0.0,0.0,0,1.0
Parch,0.0,0.0,0.0,0,2.0
Fare,14.5,7.75,8.05,16,151.55
Embarked,1.0,0.0,1.0,1,1.0


In [54]:
estimator = GradientBoostingClassifier()
param_grid = dict(loss = ['deviance', 'exponential'], 
                  n_estimators = [10,25,50,100], 
                  max_features = [None,'auto'], 
                  max_depth = [2,3,5], 
                  min_samples_split = [2,10,25]
                 )


# http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
# estimator = RandomForestClassifier()
# param_grid = dict(n_estimators = [10,25,50,100], max_features=[None], class_weight = ['subsample','auto'],
#                  criterion = ['gini','entropy'], max_depth = [2,3,5], min_samples_split = [2,10,25])

# estimator = ExtraTreesClassifier()
# param_grid = dict(n_estimators = [10,25,50,100], max_features=[None], class_weight = ['subsample','auto'],
#                  criterion = ['gini','entropy'], max_depth = [2,3,5], min_samples_split = [2,10,25])


grid_search = GridSearchCV(estimator, param_grid = param_grid, cv = 2, n_jobs=1, verbose=0)

In [55]:
grid_search.fit(df_train_x,df_train_y['Survived'])

Fitting 2 folds for each of 144 candidates, totalling 288 fits
[CV] max_features=None, loss=deviance, min_samples_split=2, n_estimators=10, max_depth=2 
[CV]  max_features=None, loss=deviance, min_samples_split=2, n_estimators=10, max_depth=2, score=0.803448 -   0.0s
[CV] max_features=None, loss=deviance, min_samples_split=2, n_estimators=10, max_depth=2 
[CV]  max_features=None, loss=deviance, min_samples_split=2, n_estimators=10, max_depth=2, score=0.806228 -   0.0s
[CV] max_features=None, loss=deviance, min_samples_split=2, n_estimators=25, max_depth=2 
[CV]  max_features=None, loss=deviance, min_samples_split=2, n_estimators=25, max_depth=2, score=0.813793 -   0.0s
[CV] max_features=None, loss=deviance, min_samples_split=2, n_estimators=25, max_depth=2 
[CV]  max_features=None, loss=deviance, min_samples_split=2, n_estimators=25, max_depth=2, score=0.816609 -   0.0s
[CV] max_features=None, loss=deviance, min_samples_split=2, n_estimators=50, max_depth=2 
[CV]  max_features=None, lo

[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done   2 jobs       | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done   5 jobs       | elapsed:    0.1s



[CV] max_features=None, loss=deviance, min_samples_split=2, n_estimators=50, max_depth=2 
[CV]  max_features=None, loss=deviance, min_samples_split=2, n_estimators=50, max_depth=2, score=0.813149 -   0.0s
[CV] max_features=None, loss=deviance, min_samples_split=2, n_estimators=100, max_depth=2 
[CV]  max_features=None, loss=deviance, min_samples_split=2, n_estimators=100, max_depth=2, score=0.817241 -   0.0s
[CV] max_features=None, loss=deviance, min_samples_split=2, n_estimators=100, max_depth=2 
[CV]  max_features=None, loss=deviance, min_samples_split=2, n_estimators=100, max_depth=2, score=0.813149 -   0.1s
[CV] max_features=None, loss=deviance, min_samples_split=10, n_estimators=10, max_depth=2 
[CV]  max_features=None, loss=deviance, min_samples_split=10, n_estimators=10, max_depth=2, score=0.803448 -   0.0s
[CV] max_features=None, loss=deviance, min_samples_split=10, n_estimators=10, max_depth=2 
[CV]  max_features=None, loss=deviance, min_samples_split=10, n_estimators=10, max

[Parallel(n_jobs=1)]: Done   8 jobs       | elapsed:    0.3s
[Parallel(n_jobs=1)]: Done  13 jobs       | elapsed:    0.4s



[CV] max_features=None, loss=deviance, min_samples_split=10, n_estimators=50, max_depth=2 
[CV]  max_features=None, loss=deviance, min_samples_split=10, n_estimators=50, max_depth=2, score=0.813149 -   0.0s
[CV] max_features=None, loss=deviance, min_samples_split=10, n_estimators=100, max_depth=2 
[CV]  max_features=None, loss=deviance, min_samples_split=10, n_estimators=100, max_depth=2, score=0.813793 -   0.0s
[CV] max_features=None, loss=deviance, min_samples_split=10, n_estimators=100, max_depth=2 
[CV]  max_features=None, loss=deviance, min_samples_split=10, n_estimators=100, max_depth=2, score=0.816609 -   0.0s
[CV] max_features=None, loss=deviance, min_samples_split=25, n_estimators=10, max_depth=2 
[CV]  max_features=None, loss=deviance, min_samples_split=25, n_estimators=10, max_depth=2, score=0.803448 -   0.0s
[CV] max_features=None, loss=deviance, min_samples_split=25, n_estimators=10, max_depth=2 
[CV]  max_features=None, loss=deviance, min_samples_split=25, n_estimators=1

[Parallel(n_jobs=1)]: Done  18 jobs       | elapsed:    0.5s
[Parallel(n_jobs=1)]: Done  25 jobs       | elapsed:    0.7s



[CV] max_features=None, loss=deviance, min_samples_split=25, n_estimators=100, max_depth=2 
[CV]  max_features=None, loss=deviance, min_samples_split=25, n_estimators=100, max_depth=2, score=0.813149 -   0.0s
[CV] max_features=auto, loss=deviance, min_samples_split=2, n_estimators=10, max_depth=2 
[CV]  max_features=auto, loss=deviance, min_samples_split=2, n_estimators=10, max_depth=2, score=0.803448 -   0.0s
[CV] max_features=auto, loss=deviance, min_samples_split=2, n_estimators=10, max_depth=2 
[CV]  max_features=auto, loss=deviance, min_samples_split=2, n_estimators=10, max_depth=2, score=0.813149 -   0.0s
[CV] max_features=auto, loss=deviance, min_samples_split=2, n_estimators=25, max_depth=2 
[CV]  max_features=auto, loss=deviance, min_samples_split=2, n_estimators=25, max_depth=2, score=0.813793 -   0.0s
[CV] max_features=auto, loss=deviance, min_samples_split=2, n_estimators=25, max_depth=2 
[CV]  max_features=auto, loss=deviance, min_samples_split=2, n_estimators=25, max_dep

[Parallel(n_jobs=1)]: Done  32 jobs       | elapsed:    0.8s
[Parallel(n_jobs=1)]: Done  41 jobs       | elapsed:    1.0s



[CV]  max_features=auto, loss=deviance, min_samples_split=10, n_estimators=100, max_depth=2, score=0.820069 -   0.0s
[CV] max_features=auto, loss=deviance, min_samples_split=25, n_estimators=10, max_depth=2 
[CV]  max_features=auto, loss=deviance, min_samples_split=25, n_estimators=10, max_depth=2, score=0.755172 -   0.0s
[CV] max_features=auto, loss=deviance, min_samples_split=25, n_estimators=10, max_depth=2 
[CV]  max_features=auto, loss=deviance, min_samples_split=25, n_estimators=10, max_depth=2, score=0.775087 -   0.0s
[CV] max_features=auto, loss=deviance, min_samples_split=25, n_estimators=25, max_depth=2 
[CV]  max_features=auto, loss=deviance, min_samples_split=25, n_estimators=25, max_depth=2, score=0.796552 -   0.0s
[CV] max_features=auto, loss=deviance, min_samples_split=25, n_estimators=25, max_depth=2 
[CV]  max_features=auto, loss=deviance, min_samples_split=25, n_estimators=25, max_depth=2, score=0.785467 -   0.0s
[CV] max_features=auto, loss=deviance, min_samples_spl

[Parallel(n_jobs=1)]: Done  50 jobs       | elapsed:    1.2s
[Parallel(n_jobs=1)]: Done  61 jobs       | elapsed:    1.6s



[CV] max_features=None, loss=deviance, min_samples_split=10, n_estimators=50, max_depth=3 
[CV]  max_features=None, loss=deviance, min_samples_split=10, n_estimators=50, max_depth=3, score=0.830450 -   0.0s
[CV] max_features=None, loss=deviance, min_samples_split=10, n_estimators=100, max_depth=3 
[CV]  max_features=None, loss=deviance, min_samples_split=10, n_estimators=100, max_depth=3, score=0.800000 -   0.1s
[CV] max_features=None, loss=deviance, min_samples_split=10, n_estimators=100, max_depth=3 
[CV]  max_features=None, loss=deviance, min_samples_split=10, n_estimators=100, max_depth=3, score=0.830450 -   0.1s
[CV] max_features=None, loss=deviance, min_samples_split=25, n_estimators=10, max_depth=3 
[CV]  max_features=None, loss=deviance, min_samples_split=25, n_estimators=10, max_depth=3, score=0.813793 -   0.0s
[CV] max_features=None, loss=deviance, min_samples_split=25, n_estimators=10, max_depth=3 
[CV]  max_features=None, loss=deviance, min_samples_split=25, n_estimators=1

[Parallel(n_jobs=1)]: Done  72 jobs       | elapsed:    1.9s
[Parallel(n_jobs=1)]: Done  85 jobs       | elapsed:    2.3s



[CV] max_features=auto, loss=deviance, min_samples_split=10, n_estimators=50, max_depth=3 
[CV]  max_features=auto, loss=deviance, min_samples_split=10, n_estimators=50, max_depth=3, score=0.820690 -   0.0s
[CV] max_features=auto, loss=deviance, min_samples_split=10, n_estimators=50, max_depth=3 
[CV]  max_features=auto, loss=deviance, min_samples_split=10, n_estimators=50, max_depth=3, score=0.823529 -   0.0s
[CV] max_features=auto, loss=deviance, min_samples_split=10, n_estimators=100, max_depth=3 
[CV]  max_features=auto, loss=deviance, min_samples_split=10, n_estimators=100, max_depth=3, score=0.810345 -   0.1s
[CV] max_features=auto, loss=deviance, min_samples_split=10, n_estimators=100, max_depth=3 
[CV]  max_features=auto, loss=deviance, min_samples_split=10, n_estimators=100, max_depth=3, score=0.806228 -   0.1s
[CV] max_features=auto, loss=deviance, min_samples_split=25, n_estimators=10, max_depth=3 
[CV]  max_features=auto, loss=deviance, min_samples_split=25, n_estimators=1

[Parallel(n_jobs=1)]: Done  98 jobs       | elapsed:    2.6s
[Parallel(n_jobs=1)]: Done 113 jobs       | elapsed:    3.5s



[CV] max_features=None, loss=deviance, min_samples_split=25, n_estimators=10, max_depth=5 
[CV]  max_features=None, loss=deviance, min_samples_split=25, n_estimators=10, max_depth=5, score=0.817241 -   0.0s
[CV] max_features=None, loss=deviance, min_samples_split=25, n_estimators=10, max_depth=5 
[CV]  max_features=None, loss=deviance, min_samples_split=25, n_estimators=10, max_depth=5, score=0.806228 -   0.0s
[CV] max_features=None, loss=deviance, min_samples_split=25, n_estimators=25, max_depth=5 
[CV]  max_features=None, loss=deviance, min_samples_split=25, n_estimators=25, max_depth=5, score=0.827586 -   0.0s
[CV] max_features=None, loss=deviance, min_samples_split=25, n_estimators=25, max_depth=5 
[CV]  max_features=None, loss=deviance, min_samples_split=25, n_estimators=25, max_depth=5, score=0.820069 -   0.0s
[CV] max_features=None, loss=deviance, min_samples_split=25, n_estimators=50, max_depth=5 
[CV]  max_features=None, loss=deviance, min_samples_split=25, n_estimators=50, m

[Parallel(n_jobs=1)]: Done 128 jobs       | elapsed:    4.3s
[Parallel(n_jobs=1)]: Done 145 jobs       | elapsed:    5.0s



[CV] max_features=None, loss=exponential, min_samples_split=2, n_estimators=10, max_depth=2 
[CV]  max_features=None, loss=exponential, min_samples_split=2, n_estimators=10, max_depth=2, score=0.813793 -   0.0s
[CV] max_features=None, loss=exponential, min_samples_split=2, n_estimators=10, max_depth=2 
[CV]  max_features=None, loss=exponential, min_samples_split=2, n_estimators=10, max_depth=2, score=0.795848 -   0.0s
[CV] max_features=None, loss=exponential, min_samples_split=2, n_estimators=25, max_depth=2 
[CV]  max_features=None, loss=exponential, min_samples_split=2, n_estimators=25, max_depth=2, score=0.813793 -   0.0s
[CV] max_features=None, loss=exponential, min_samples_split=2, n_estimators=25, max_depth=2 
[CV]  max_features=None, loss=exponential, min_samples_split=2, n_estimators=25, max_depth=2, score=0.809689 -   0.0s
[CV] max_features=None, loss=exponential, min_samples_split=2, n_estimators=50, max_depth=2 
[CV]  max_features=None, loss=exponential, min_samples_split=2

[Parallel(n_jobs=1)]: Done 162 jobs       | elapsed:    5.4s
[Parallel(n_jobs=1)]: Done 181 jobs       | elapsed:    5.8s



[CV] max_features=auto, loss=exponential, min_samples_split=10, n_estimators=50, max_depth=2 
[CV]  max_features=auto, loss=exponential, min_samples_split=10, n_estimators=50, max_depth=2, score=0.809689 -   0.0s
[CV] max_features=auto, loss=exponential, min_samples_split=10, n_estimators=100, max_depth=2 
[CV]  max_features=auto, loss=exponential, min_samples_split=10, n_estimators=100, max_depth=2, score=0.824138 -   0.0s
[CV] max_features=auto, loss=exponential, min_samples_split=10, n_estimators=100, max_depth=2 
[CV]  max_features=auto, loss=exponential, min_samples_split=10, n_estimators=100, max_depth=2, score=0.823529 -   0.0s
[CV] max_features=auto, loss=exponential, min_samples_split=25, n_estimators=10, max_depth=2 
[CV]  max_features=auto, loss=exponential, min_samples_split=25, n_estimators=10, max_depth=2, score=0.796552 -   0.0s
[CV] max_features=auto, loss=exponential, min_samples_split=25, n_estimators=10, max_depth=2 
[CV]  max_features=auto, loss=exponential, min_sa

[Parallel(n_jobs=1)]: Done 200 jobs       | elapsed:    6.3s
[Parallel(n_jobs=1)]: Done 221 jobs       | elapsed:    7.0s



[CV] max_features=auto, loss=exponential, min_samples_split=2, n_estimators=50, max_depth=3 
[CV]  max_features=auto, loss=exponential, min_samples_split=2, n_estimators=50, max_depth=3, score=0.806897 -   0.0s
[CV] max_features=auto, loss=exponential, min_samples_split=2, n_estimators=50, max_depth=3 
[CV]  max_features=auto, loss=exponential, min_samples_split=2, n_estimators=50, max_depth=3, score=0.816609 -   0.0s
[CV] max_features=auto, loss=exponential, min_samples_split=2, n_estimators=100, max_depth=3 
[CV]  max_features=auto, loss=exponential, min_samples_split=2, n_estimators=100, max_depth=3, score=0.789655 -   0.1s
[CV] max_features=auto, loss=exponential, min_samples_split=2, n_estimators=100, max_depth=3 
[CV]  max_features=auto, loss=exponential, min_samples_split=2, n_estimators=100, max_depth=3, score=0.820069 -   0.1s
[CV] max_features=auto, loss=exponential, min_samples_split=10, n_estimators=10, max_depth=3 
[CV]  max_features=auto, loss=exponential, min_samples_sp

[Parallel(n_jobs=1)]: Done 242 jobs       | elapsed:    7.6s
[Parallel(n_jobs=1)]: Done 265 jobs       | elapsed:    8.9s



[CV] max_features=auto, loss=exponential, min_samples_split=2, n_estimators=10, max_depth=5 
[CV]  max_features=auto, loss=exponential, min_samples_split=2, n_estimators=10, max_depth=5, score=0.806897 -   0.0s
[CV] max_features=auto, loss=exponential, min_samples_split=2, n_estimators=10, max_depth=5 
[CV]  max_features=auto, loss=exponential, min_samples_split=2, n_estimators=10, max_depth=5, score=0.816609 -   0.0s
[CV] max_features=auto, loss=exponential, min_samples_split=2, n_estimators=25, max_depth=5 
[CV]  max_features=auto, loss=exponential, min_samples_split=2, n_estimators=25, max_depth=5, score=0.806897 -   0.0s
[CV] max_features=auto, loss=exponential, min_samples_split=2, n_estimators=25, max_depth=5 
[CV]  max_features=auto, loss=exponential, min_samples_split=2, n_estimators=25, max_depth=5, score=0.833910 -   0.0s
[CV] max_features=auto, loss=exponential, min_samples_split=2, n_estimators=50, max_depth=5 
[CV]  max_features=auto, loss=exponential, min_samples_split=2

[Parallel(n_jobs=1)]: Done 288 jobs       | elapsed:   10.7s
[Parallel(n_jobs=1)]: Done 288 out of 288 | elapsed:   10.7s finished


GridSearchCV(cv=2, error_score='raise',
       estimator=GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              random_state=None, subsample=1.0, verbose=0,
              warm_start=False),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid={'min_samples_split': [2, 10, 25], 'max_features': [None, 'auto'], 'n_estimators': [10, 25, 50, 100], 'max_depth': [2, 3, 5], 'loss': ['deviance', 'exponential']},
       pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,
       verbose=10)

In [56]:
best_estimator = grid_search.best_estimator_
best_estimator.fit(df_train_x,df_train_y['Survived'])

GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=10,
              min_weight_fraction_leaf=0.0, n_estimators=25,
              random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [57]:
train_score = best_estimator.score(df_train_x,df_train_y['Survived'])
test_score = best_estimator.score(df_test_x,df_test_y['Survived'])
print "train: ",train_score
print " test: ",test_score

train:  0.865284974093
 test:  0.814102564103


In [73]:
str(type(best_estimator)).split('.')[-1][0:-2]

'GradientBoostingClassifier'

## import test, score, output

In [58]:
test = pd.read_csv('test.csv')
test.set_index(test['PassengerId'],inplace=True)
clean_test = clean_titanic_data(test)

avg age:  30.2725903614
avg Fare:  35.6271884892


In [59]:
clean_test.head().T

PassengerId,892,893,894,895,896
Pclass,3.0,3,2.0,3.0,3.0
Sex,1.0,0,1.0,1.0,0.0
Age,34.5,47,62.0,27.0,22.0
SibSp,0.0,1,0.0,0.0,1.0
Parch,0.0,0,0.0,0.0,1.0
Fare,7.8292,7,9.6875,8.6625,12.2875
Embarked,0.0,1,0.0,1.0,1.0


In [60]:
output = best_estimator.predict_proba(clean_test)
output

array([[ 0.87167897,  0.12832103],
       [ 0.60262279,  0.39737721],
       [ 0.86064927,  0.13935073],
       [ 0.86064927,  0.13935073],
       [ 0.39372165,  0.60627835],
       [ 0.84162161,  0.15837839],
       [ 0.39818017,  0.60181983],
       [ 0.75097817,  0.24902183],
       [ 0.32149829,  0.67850171],
       [ 0.83378748,  0.16621252],
       [ 0.87167897,  0.12832103],
       [ 0.79084015,  0.20915985],
       [ 0.10136003,  0.89863997],
       [ 0.846886  ,  0.153114  ],
       [ 0.0857372 ,  0.9142628 ],
       [ 0.1050949 ,  0.8949051 ],
       [ 0.83729032,  0.16270968],
       [ 0.8221596 ,  0.1778404 ],
       [ 0.44235959,  0.55764041],
       [ 0.58000441,  0.41999559],
       [ 0.67168926,  0.32831074],
       [ 0.23751254,  0.76248746],
       [ 0.0857372 ,  0.9142628 ],
       [ 0.65045985,  0.34954015],
       [ 0.0837922 ,  0.9162078 ],
       [ 0.84800935,  0.15199065],
       [ 0.09909903,  0.90090097],
       [ 0.8221596 ,  0.1778404 ],
       [ 0.6944744 ,

In [61]:
survival = []
for x in output:
    survival.append(x[1])
survival_predictions = clean_test.copy()

In [62]:
survive_perc = pd.Series(survival, index=survival_predictions.index)
survival_predictions['survive_perc'] = survive_perc
survival_predictions['Survived'] = survival_predictions['survive_perc'].apply(lambda x: 1 if x>= 0.5 else 0)
survival_predictions['PassengerId'] = survival_predictions.index

In [63]:
survival_predictions.head().T

PassengerId,892,893,894,895,896
Pclass,3.0,3.0,2.0,3.0,3.0
Sex,1.0,0.0,1.0,1.0,0.0
Age,34.5,47.0,62.0,27.0,22.0
SibSp,0.0,1.0,0.0,0.0,1.0
Parch,0.0,0.0,0.0,0.0,1.0
Fare,7.8292,7.0,9.6875,8.6625,12.2875
Embarked,0.0,1.0,0.0,1.0,1.0
survive_perc,0.128321,0.397377,0.139351,0.139351,0.606278
Survived,0.0,0.0,0.0,0.0,1.0
PassengerId,892.0,893.0,894.0,895.0,896.0


In [64]:
output_df = survival_predictions[['PassengerId','Survived']]

In [65]:
output_df.head().T

PassengerId,892,893,894,895,896
PassengerId,892,893,894,895,896
Survived,0,0,0,0,1


In [66]:
output_df.to_csv('gradient2.csv', index=False)