In [70]:
import numpy as np
import pandas as pd
import os

from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score

In [101]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [87]:
print train.shape
print test.shape

(891, 12)
(418, 11)


In [88]:
def groupFamSize(size):
    if size == 1:
        return 1
    elif size > 1 and size < 5:
        return 2
    else:
        return 3

In [89]:
def binAge(age):
    if age <= 12.0:
        return 1
    elif age > 12.0 and age <= 18.0:
        return 2
    else:
        return 3

In [90]:
def binFare(fare):
    if fare <= 30:
        return 1
    elif fare > 30 and fare <= 50:
        return 2
    elif fare > 50 and fare <= 75:
        return 3
    else:
        return 4

In [91]:
def namesClean(train,test):
    for i in [train,test]:
        i['NameLen'] = i['Name'].apply(lambda x: len(x))
        i['Title'] = i['Name'].apply(lambda x: x.split(',')[1]).apply(lambda x: x.split('.')[0])
        del i['Name']
    return train,test

In [92]:
# Clean titles
def ageClean(train,test):
    data = train.append(test)
    for i in [train,test]:
        titleMeanAge = data.groupby('Title').Age.mean()
        i.Age = i.apply(lambda x: titleMeanAge[x.Title] if pd.isnull(x.Age) else x.Age, axis=1)
        i['BinAge'] = i['Age'].map(binAge)
        del i['Age']
    return train,test

In [102]:
def fareClean(train,test):
    avgFare = train['Fare'].mean()
    for i in [train,test]:
        i['Fare'].fillna(avgFare, inplace = True)
        i['BinFare'] = i['Fare'].map(binFare)
        del i['Fare']
    return train,test

In [94]:
def famSizeClean(train,test):
    for i in [train,test]:
        i['FamSize'] = i['SibSp'] + i['Parch'] + 1
        i['FamSizeGroup'] = i['FamSize'].map(groupFamSize)
        del i['SibSp']
        del i['Parch']
    return train,test

In [95]:
def embarkClean(train,test):
    for i in [train,test]:
        i['Embarked'] = i['Embarked'].fillna('S')
    return train,test

In [96]:
def cabinClean(train,test):
    for i in [train,test]:
        i['Floor'] = i['Cabin'].apply(lambda x: str(x)[0])
        del i['Cabin']
    return train, test

In [97]:
def dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked', 'Title', 'Floor']):
    for column in columns:
        train[column] = train[column].apply(lambda x: str(x))
        test[column] = test[column].apply(lambda x: str(x))
        good_cols = [column+'_'+i for i in train[column].unique() if i in test[column].unique()]
        train = pd.concat((train, pd.get_dummies(train[column], prefix = column)[good_cols]), axis = 1)
        test = pd.concat((test, pd.get_dummies(test[column], prefix = column)[good_cols]), axis = 1)
        del train[column]
        del test[column]
    return train, test

In [98]:
def cleanColumns(train,test,columns = ['Ticket']):
    for column in columns:
        del train[column]
        del test[column]
    return train,test

In [103]:
train,test = namesClean(train,test)
train,test = ageClean(train,test)
train,test = famSizeClean(train,test)
train,test = embarkClean(train,test)
train,test = fareClean(train,test)
train,test = cabinClean(train,test)
train,test = dummies(train,test)
train,test = cleanColumns(train,test)

In [104]:
pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns',None)
test

Unnamed: 0,PassengerId,NameLen,BinAge,FamSize,FamSizeGroup,BinFare,Pclass_3,Pclass_1,Pclass_2,Sex_male,Sex_female,Embarked_S,Embarked_C,Embarked_Q,Title_ Mr,Title_ Mrs,Title_ Miss,Title_ Master,Title_ Rev,Title_ Dr,Title_ Ms,Title_ Col,Floor_n,Floor_C,Floor_E,Floor_G,Floor_D,Floor_A,Floor_B,Floor_F
0,892,16,3,1,1,1,1,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
1,893,32,3,2,2,1,1,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0
2,894,25,3,1,1,1,0,0,1,1,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
3,895,16,3,1,1,1,1,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
4,896,44,3,3,2,1,1,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0
5,897,26,2,1,1,1,1,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
6,898,20,3,1,1,1,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0
7,899,28,3,3,2,1,0,0,1,1,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
8,900,41,2,1,1,1,1,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0
9,901,23,3,3,2,1,1,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0


In [105]:
featuresList = list(train.columns.values)
featuresList.remove('Survived')
featuresList.remove('PassengerId')
print featuresList

['NameLen', 'BinAge', 'FamSize', 'FamSizeGroup', 'BinFare', 'Pclass_3', 'Pclass_1', 'Pclass_2', 'Sex_male', 'Sex_female', 'Embarked_S', 'Embarked_C', 'Embarked_Q', 'Title_ Mr', 'Title_ Mrs', 'Title_ Miss', 'Title_ Master', 'Title_ Rev', 'Title_ Dr', 'Title_ Ms', 'Title_ Col', 'Floor_n', 'Floor_C', 'Floor_E', 'Floor_G', 'Floor_D', 'Floor_A', 'Floor_B', 'Floor_F']


In [106]:
#split features and labels
features = list(train[featuresList].values)
labels = list(train['Survived'].values)

In [107]:
# Cross validation
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size=0.20, random_state=42)

In [108]:
# Naive Bayes
nb = GaussianNB()
skb = SelectKBest(f_classif)

pipeline = Pipeline(steps=[("SKB",skb), ("NB", nb)])
params = {'SKB__k':range(2,8)}
split = StratifiedShuffleSplit(n_splits = 10, test_size=0.25, random_state=42)

gs = GridSearchCV(pipeline, params, cv = split, scoring = 'accuracy')
    
gs.fit(features,labels)
clf = gs.best_estimator_

print clf
print gs.best_score_

Pipeline(steps=[('SKB', SelectKBest(k=6, score_func=<function f_classif at 0x000000000A4A8518>)), ('NB', GaussianNB(priors=None))])
0.785650224215


In [109]:
testFeatures = test[featuresList].values
#test['Survived'] = gs.predict(testFeatures).astype(int)

In [110]:
# dt = DecisionTreeClassifier(random_state = 1122)
# # skb = SelectKBest(f_classif)

# pipeline = Pipeline(steps=[#("SKB",skb),
#                            ("DT", dt)])
# params = {#'SKB__k':[5],
#          'DT__criterion':['gini'],
#          'DT__min_samples_split':range(2,40)}

# split = StratifiedShuffleSplit(n_splits = 100, test_size=0.25, random_state=42)

# gs = GridSearchCV(pipeline, params, cv = split, scoring = 'accuracy')
    
# gs.fit(features,labels)
# clf = gs.best_estimator_

# print clf
# print gs.best_score_

clf2 = DecisionTreeClassifier(min_samples_split = 35, random_state = 1122)
clf2.fit(features_train,labels_train)
pred2 = clf2.predict(features_test)

print accuracy_score(pred2,labels_test)
clf2.feature_importances_

0.826815642458


array([  9.48646747e-02,   1.80513027e-02,   8.69197156e-02,
         4.09975883e-04,   2.61273952e-03,   1.45688635e-01,
         0.00000000e+00,   1.14511974e-03,   0.00000000e+00,
         0.00000000e+00,   1.00210898e-02,   1.45300756e-03,
         1.10018512e-03,   5.22871162e-01,   7.05629028e-03,
         0.00000000e+00,   0.00000000e+00,   4.35799966e-02,
         1.50478852e-02,   0.00000000e+00,   0.00000000e+00,
         3.62931044e-02,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   5.94212768e-03,
         0.00000000e+00,   6.94298832e-03])

In [111]:
test['Survived'] = clf2.predict(testFeatures).astype(int)

In [112]:
rf = RandomForestClassifier(max_features='auto',oob_score = True, random_state = 1122, n_jobs = -1)
params = {'criterion':['gini'],
         'min_samples_split':[2,10,15,20,25,30],
         'min_samples_leaf':[1,5,10],
         'n_estimators':[50,100,400,1000]}

gs = GridSearchCV(estimator = rf, param_grid = params, cv = 3, scoring = 'accuracy')
    
gs.fit(features,labels)
clf = gs.best_estimator_

print gs.best_params_
print clf
print gs.best_score_

# clf4 = RandomForestClassifier(n_estimators = 1500, 
#                               min_samples_split = 25,
#                               random_state = 24)
# clf4.fit(features_train, labels_train)
# pred4 = clf4.predict(features_test)

# print accuracy_score(pred4, labels_test)

{'min_samples_split': 30, 'n_estimators': 1000, 'criterion': 'gini', 'min_samples_leaf': 1}
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=30, min_weight_fraction_leaf=0.0,
            n_estimators=1000, n_jobs=-1, oob_score=True,
            random_state=1122, verbose=0, warm_start=False)
0.832772166105


In [113]:
test['Survived'] = gs.predict(testFeatures).astype(int)

In [114]:
final = test[['PassengerId','Survived']]
final.to_csv('prediction.csv', index=False)