In [1]:
import os
import sys
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.linear_model import LogisticRegression, \
    PassiveAggressiveClassifier, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier, \
    GradientBoostingClassifier, ExtraTreesClassifier,AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV

In [2]:
# Load the training set and index it by PassengerId; drop=False keeps
# PassengerId available as a regular column as well.
train = pd.read_csv('train.csv').set_index('PassengerId', drop=False)
train.head().T

PassengerId,1,2,3,4,5
PassengerId,1,2,3,4,5
Survived,0,1,1,1,0
Pclass,3,1,3,1,3
Name,"Braund, Mr. Owen Harris","Cumings, Mrs. John Bradley (Florence Briggs Th...","Heikkinen, Miss. Laina","Futrelle, Mrs. Jacques Heath (Lily May Peel)","Allen, Mr. William Henry"
Sex,male,female,female,female,male
Age,22,38,26,35,35
SibSp,1,1,0,1,0
Parch,0,0,0,0,0
Ticket,A/5 21171,PC 17599,STON/O2. 3101282,113803,373450
Fare,7.25,71.2833,7.925,53.1,8.05


## clean up the data

In [3]:
# Sanity check: the per-sex group sizes should add up to the total row count,
# i.e. no passenger falls outside the groupby('Sex') groups.
print(len(train))
sex_counts = train.groupby('Sex').size()
print(sex_counts.sum())
sex_counts

891
891


Sex
female    314
male      577
dtype: int64

In [4]:
def clean_titanic_data(data_df):
    """Return a numeric, model-ready copy of a raw Titanic passenger frame.

    The input frame is left untouched; all work happens on a copy.

    Steps:
      * ``Embarked`` is encoded as Q -> 0, S -> 1, C -> 2; missing or
        unrecognised values become the integer 3.
      * Missing ``Age`` values are filled with the mean age of the passenger's
        ``Pclass`` (classes 1, 2 and 3); each class mean is printed.
      * Missing ``Fare`` values are filled with the overall mean fare, which
        is printed.
      * ``Sex`` is encoded as female -> 0, male -> 1; missing or unrecognised
        values become 2.
      * The ``Name``, ``PassengerId``, ``Ticket`` and ``Cabin`` columns are
        dropped.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Raw passenger data. Must contain the columns ``Embarked``, ``Pclass``,
        ``Age``, ``Fare``, ``Sex``, ``Name``, ``PassengerId``, ``Ticket`` and
        ``Cabin``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the encoded/imputed columns and without the four
        dropped columns. Any other column is passed through unchanged.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    df = data_df.copy()

    #clean embarked
    # The fill value is the integer 3 (not the string '3') so the column holds
    # a single numeric type.
    embarked_codes = {'Q': 0, 'S': 1, 'C': 2}
    df['Embarked'] = df['Embarked'].map(embarked_codes).fillna(3).astype(int)

    #clean age by pClass
    for pclass in (1, 2, 3):
        in_class = df['Pclass'] == pclass
        # Series.mean() skips NaN, so this is the mean of the known ages.
        class_age = df.loc[in_class, 'Age'].mean()
        print("avg Age%d:  %s" % (pclass, class_age))
        df.loc[in_class & df['Age'].isnull(), 'Age'] = class_age

    #clean fare
    avg_fare = df['Fare'].mean()
    print("avg fare:  %s" % avg_fare)
    df['Fare'] = df['Fare'].fillna(avg_fare)

    #turn sex to number
    # Map first and fill afterwards: anything that is not 'female'/'male'
    # (including NaN) ends up as 2 instead of being silently counted as male.
    df['Sex'] = df['Sex'].map({'female': 0, 'male': 1}).fillna(2).astype(int)

    #drop other stuff while testing
    return df.drop(['Name', 'PassengerId', 'Ticket', 'Cabin'], axis=1)
    

In [5]:
# Clean the training frame; clean_titanic_data prints the averages it imputes with.
clean_train = clean_titanic_data(train)

avg Age1:  38.2334408602
avg Age2:  29.8776300578
avg Age3:  25.1406197183
avg fare:  32.2042079686


In [6]:
# Preview the first cleaned rows, transposed so each column reads as a row.
clean_train.head().T

PassengerId,1,2,3,4,5
Survived,0.0,1.0,1.0,1.0,0.0
Pclass,3.0,1.0,3.0,1.0,3.0
Sex,1.0,0.0,0.0,0.0,1.0
Age,22.0,38.0,26.0,35.0,35.0
SibSp,1.0,1.0,0.0,1.0,0.0
Parch,0.0,0.0,0.0,0.0,0.0
Fare,7.25,71.2833,7.925,53.1,8.05
Embarked,1.0,2.0,1.0,1.0,1.0


In [7]:
# Load the test set, indexed by PassengerId (kept as a column too), and run it
# through the same cleaning function as the training data.
test = pd.read_csv('test.csv').set_index('PassengerId', drop=False)
clean_test = clean_titanic_data(test)

# Separate copy that collects the per-model prediction columns, so clean_test
# itself stays a features-only frame.
survival_predictions = clean_test.copy()

avg Age1:  40.9183673469
avg Age2:  28.7775
avg Age3:  24.0279452055
avg fare:  35.6271884892


## grid-search several classifiers and collect their test-set predictions

In [8]:
# Split the target off the feature frame.
# The target is selected by name rather than by position, so the cell does not
# depend on 'Survived' being the first column. The guard makes the cell safe to
# re-run: once 'Survived' has been moved out, a second execution is a no-op
# instead of failing or picking another column as the target.
if 'Survived' in clean_train.columns:
    clean_train_y = clean_train[['Survived']]
    clean_train = clean_train.drop(['Survived'], axis=1)

In [9]:
# Candidate classifiers, each paired with the hyper-parameter grid to search.
# Every entry is a dict with:
#   'estimator'  - an unfitted classifier instance
#   'param_grid' - parameter name -> list of values to try
# NOTE(review): several values ('auto'/'subsample' class weights, 'deviance'
# loss, max_features='auto') are spellings from older scikit-learn releases --
# confirm them against the installed version.
models = [{'estimator': GradientBoostingClassifier(),
            'param_grid': dict(loss = ['deviance', 'exponential'],
                              n_estimators = [10,25,50,100],
                              max_features = [None,'auto'],
                              max_depth = [2,3,5,7,9],
                              min_samples_split = [2,5,10,25]
                 )},
          {'estimator': RandomForestClassifier(),
            'param_grid': dict(n_estimators = [10,25,50,100],
                               max_features=[None],
                               class_weight = ['subsample','auto'],
                               criterion = ['gini','entropy'],
                               max_depth = [2,3,5,7,9],
                               min_samples_split = [2,5,10,25]
                )},
          {'estimator': ExtraTreesClassifier(),
            'param_grid': dict(n_estimators = [10,25,50,100],
                               max_features=[None],
                               class_weight = ['subsample','auto'],
                               criterion = ['gini','entropy'],
                               max_depth = [2,3,5,7,9],
                               min_samples_split = [2,5,10,25]
                 )},
          {'estimator': AdaBoostClassifier(),
            'param_grid': dict(n_estimators = [10,25,50,100],
                               algorithm = ['SAMME','SAMME.R']
                 )},
          {'estimator': LogisticRegression(),
            # Only 'l2' is searched: the newton-cg and lbfgs solvers listed
            # below do not support an L1 penalty.
            'param_grid': dict(penalty = ['l2'],
                               C=[0.01,0.1,1.0],
                               class_weight = [None,'auto'],
                               solver = ['newton-cg', 'lbfgs'],
                               multi_class = ['ovr','multinomial']
                 )},
          {'estimator': DecisionTreeClassifier(),
            'param_grid': dict(criterion = ['gini','entropy'],
                               max_features=[None],
                               class_weight = [None,'auto'],
                               max_depth = [2,3,5,7,9],
                               min_samples_split = [2,5,10,25],
                               splitter = ['best','random']
                 )},
         ]

In [10]:
# Names of the probability columns added to survival_predictions by the models
# that pass the hold-out threshold below.
columns = []

# Hold out 35% of the training rows once, with a fixed seed, so every model is
# scored on the same rows: the 0.8 cut-off is then comparable across models
# and stable across re-runs.
df_train_index, df_test_index = train_test_split(clean_train.index, test_size=0.35, random_state=0)
df_test_x = clean_train.loc[df_test_index]
df_train_x = clean_train.loc[df_train_index]
df_test_y = clean_train_y.loc[df_test_index]
df_train_y = clean_train_y.loc[df_train_index]

for model in models:
    #run grid search for each type
    grid_search = GridSearchCV(model['estimator'], param_grid = model['param_grid'], cv = 2, n_jobs=1, verbose=0)
    grid_search.fit(df_train_x, df_train_y['Survived'])

    # GridSearchCV (refit=True by default) has already refitted the winning
    # parameters on the whole training split, so best_estimator_ is ready to
    # use and does not need another fit.
    best_estimator = grid_search.best_estimator_
    model_name = type(best_estimator).__name__

    #how did we do?
    print("%s  results:" % model_name)
    train_score = best_estimator.score(df_train_x, df_train_y['Survived'])
    test_score = best_estimator.score(df_test_x, df_test_y['Survived'])
    print("train:  %s" % train_score)
    print(" test:  %s" % test_score)
    print("%s \n\n" % (best_estimator,))

    #grab the output and add it to our test dataframe
    # Only models reaching at least 0.8 hold-out accuracy take part in the
    # ensemble.
    if test_score >= 0.8:
        column_name = model_name + "_survive_perc"
        columns.append(column_name)
        # Column 1 of predict_proba is the probability of the second class
        # (Survived == 1 for a 0/1 target). The rows are in clean_test order,
        # which survival_predictions shares because it is a copy of it.
        survival_predictions[column_name] = best_estimator.predict_proba(clean_test)[:, 1]

GradientBoostingClassifier  results:
train:  0.844559585492
 test:  0.839743589744
GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=2, max_features='auto', max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=10,
              min_weight_fraction_leaf=0.0, n_estimators=50,
              random_state=None, subsample=1.0, verbose=0,
              warm_start=False) 


RandomForestClassifier  results:
train:  0.829015544041
 test:  0.820512820513
RandomForestClassifier(bootstrap=True, class_weight='auto', criterion='gini',
            max_depth=3, max_features=None, max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=25,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False) 


ExtraTreesClassifier  results:
train:  0.911917098446
 test:  0.775641025641
ExtraTreesClassifier(bootstrap=False, class_weigh

In [11]:
# Preview the test features together with the per-model survival probabilities.
survival_predictions.head().T

PassengerId,892,893,894,895,896
Pclass,3.0,3.0,2.0,3.0,3.0
Sex,1.0,0.0,1.0,1.0,0.0
Age,34.5,47.0,62.0,27.0,22.0
SibSp,0.0,1.0,0.0,0.0,1.0
Parch,0.0,0.0,0.0,0.0,1.0
Fare,7.8292,7.0,9.6875,8.6625,12.2875
Embarked,0.0,1.0,0.0,1.0,1.0
GradientBoostingClassifier_survive_perc,0.115556,0.366967,0.16845,0.137117,0.484698
RandomForestClassifier_survive_perc,0.150922,0.541643,0.150922,0.150922,0.67445
AdaBoostClassifier_survive_perc,0.432939,0.511835,0.45977,0.432939,0.521524


In [12]:
# Ensemble vote: average the survival probabilities of the models listed in
# `columns`, then predict survival when that average reaches 0.5.
# NOTE(review): if `columns` is empty the average is NaN for every row, which
# compares False and therefore yields Survived == 0 everywhere.
merged_perc = survival_predictions[columns].mean(axis=1)
survival_predictions['merged_perc'] = merged_perc
survival_predictions['Survived'] = (merged_perc >= 0.5).astype(int)
# Expose the index as a regular column so it can be written to the output file.
survival_predictions['PassengerId'] = survival_predictions.index

In [13]:
# Preview the prediction frame again after the ensemble columns were added.
survival_predictions.head().T

PassengerId,892,893,894,895,896
Pclass,3.0,3.0,2.0,3.0,3.0
Sex,1.0,0.0,1.0,1.0,0.0
Age,34.5,47.0,62.0,27.0,22.0
SibSp,0.0,1.0,0.0,0.0,1.0
Parch,0.0,0.0,0.0,0.0,1.0
Fare,7.8292,7.0,9.6875,8.6625,12.2875
Embarked,0.0,1.0,0.0,1.0,1.0
GradientBoostingClassifier_survive_perc,0.115556,0.366967,0.16845,0.137117,0.484698
RandomForestClassifier_survive_perc,0.150922,0.541643,0.150922,0.150922,0.67445
AdaBoostClassifier_survive_perc,0.432939,0.511835,0.45977,0.432939,0.521524


## build and write the submission file

In [14]:
# Keep only the two columns that go into the output file.
submission_columns = ['PassengerId', 'Survived']
output_df = survival_predictions.loc[:, submission_columns]
output_df.head().T

PassengerId,892,893,894,895,896
PassengerId,892,893,894,895,896
Survived,0,0,0,0,1


In [15]:
# Write the predictions to disk; index=False leaves the index out of the file
# (PassengerId is already present as a regular column of output_df).
output_df.to_csv('ensemble_with_gridsearch7.csv', index=False)