In [1]:
import numpy as np
import pandas as pd
import os

from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score



In [47]:
# Read the labelled training set and the unlabelled test set from the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [48]:
# Sanity-check the dimensions of each frame and of the two stacked together.
# The stacked frame is built here because no earlier cell defines it; without
# this line the cell raises NameError on a fresh kernel.
combineTrainTest = pd.concat([train, test])

print(train.shape)
print(test.shape)
print(combineTrainTest.shape)

(891, 12)
(418, 11)
(1309, 12)


In [49]:
def groupFamSize(size):
    """Map a family size onto a coarse group code.

    Returns 1 when ``size`` equals 1, 2 when ``size`` is strictly between
    1 and 5, and 3 for every other value.
    """
    if size == 1:
        return 1
    return 2 if 1 < size < 5 else 3

In [50]:
def binAge(age):
    """Map an age onto one of three bin codes.

    Returns 1 for ages up to and including 12, 2 for ages above 12 up to and
    including 18, and 3 otherwise. A NaN age fails both comparisons and so
    lands in bin 3.
    """
    if age <= 12.0:
        return 1
    if age <= 18.0:
        return 2
    return 3

In [None]:
def binFare(fare):
    """Map a fare onto one of four bin codes.

    Bins are (-inf, 30] -> 1, (30, 50] -> 2, (50, 75] -> 3 and everything
    else -> 4. A NaN fare fails every comparison and so lands in bin 4.
    """
    # Inclusive upper bound of each bin, paired with the code it maps to.
    upper_bounds = ((30, 1), (50, 2), (75, 3))
    for bound, code in upper_bounds:
        if fare <= bound:
            return code
    return 4

In [51]:
def namesClean(train,test):
    """Replace the raw 'Name' column with two derived features on both frames.

    For each frame this adds 'NameLen' (the character count of the name) and
    'Title' (the segment after the first comma, cut at the first period; the
    whitespace following the comma is kept, e.g. ' Mr'), then removes 'Name'.
    Both frames are modified in place and returned.
    """
    def extract_title(full_name):
        after_comma = full_name.split(',')[1]
        return after_comma.split('.')[0]

    for frame in (train, test):
        # pop() removes the column from the frame and hands back its values.
        names = frame.pop('Name')
        frame['NameLen'] = names.map(len)
        frame['Title'] = names.map(extract_title)
    return train, test

In [52]:
# Fill missing ages from the per-title mean, then bin them
def ageClean(train,test):
    """Impute missing ages by title and replace 'Age' with a binned 'BinAge'.

    The mean age per 'Title' is computed over train and test stacked together.
    Each missing 'Age' is replaced by the mean for that row's title, the
    result is mapped through ``binAge`` into 'BinAge', and 'Age' is removed.
    Requires the 'Title' column to be present already. Both frames are
    modified in place and returned.
    """
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    data = pd.concat([train, test])
    # The lookup is the same for both frames, so compute it once up front.
    titleMeanAge = data.groupby('Title').Age.mean()
    for i in [train,test]:
        # Vectorised fill: look up each row's title mean and use it only where Age is missing.
        i['Age'] = i['Age'].fillna(i['Title'].map(titleMeanAge))
        i['BinAge'] = i['Age'].map(binAge)
        del i['Age']
    return train,test

In [53]:
def famSizeClean(train,test):
    """Collapse 'SibSp' and 'Parch' into family-size features on both frames.

    Adds 'FamSize' (SibSp + Parch + 1) and 'FamSizeGroup' (FamSize mapped
    through ``groupFamSize``), and removes the two source columns. Both
    frames are modified in place and returned.
    """
    for frame in (train, test):
        # pop() removes each source column while returning its values.
        sibsp = frame.pop('SibSp')
        parch = frame.pop('Parch')
        frame['FamSize'] = sibsp + parch + 1
        frame['FamSizeGroup'] = frame['FamSize'].map(groupFamSize)
    return train, test

In [54]:
def embarkClean(train,test):
    """Fill missing 'Embarked' values with 'S' on both frames.

    Both frames are modified in place and returned.
    """
    fallback = 'S'
    for frame in (train, test):
        filled = frame['Embarked'].fillna(fallback)
        frame['Embarked'] = filled
    return train, test

In [55]:
def cabinClean(train,test):
    """Replace 'Cabin' with 'Floor', the first character of the cabin value.

    Each value is converted with ``str`` before its first character is taken,
    so a missing (NaN) cabin yields 'n'. Both frames are modified in place
    and returned.
    """
    def first_char(cabin):
        return str(cabin)[0]

    for frame in (train, test):
        # pop() removes 'Cabin' from the frame and returns its values.
        frame['Floor'] = frame.pop('Cabin').map(first_char)
    return train, test

In [56]:
def dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked', 'Title', 'Floor']):
    """One-hot encode categorical columns, keeping only categories present in both frames.

    For each name in ``columns`` the values are converted to strings, dummy
    columns named '<column>_<value>' are appended to each frame, and the
    source column is removed. A category is kept only when it occurs in both
    train and test, so both results end up with the same dummy columns in the
    same order (the order of first appearance in train).

    The inputs are not modified; new frames are returned as ``(train, test)``.
    """
    # Work on copies: the results are new objects anyway, and stringifying the
    # caller's columns in place would leave the inputs half-transformed.
    train = train.copy()
    test = test.copy()
    for column in columns:
        train[column] = train[column].map(str)
        test[column] = test[column].map(str)
        # Collect the test-side categories once rather than once per training category.
        test_values = set(test[column].unique())
        good_cols = [column + '_' + value
                     for value in train[column].unique()
                     if value in test_values]
        train = pd.concat((train, pd.get_dummies(train[column], prefix = column)[good_cols]), axis = 1)
        test = pd.concat((test, pd.get_dummies(test[column], prefix = column)[good_cols]), axis = 1)
        del train[column]
        del test[column]
    return train, test

In [57]:
def cleanColumns(train,test,columns = ['Ticket']):
    """Fill missing test fares and drop unused columns from both frames.

    Missing 'Fare' values in ``test`` are replaced with the mean 'Fare' of
    ``train``; every name in ``columns`` is then deleted from both frames.
    Both frames are modified in place and returned.
    """
    # Assign the filled column back rather than calling fillna(inplace=True)
    # on the column selection: the chained in-place form warns on recent
    # pandas and leaves the frame unchanged under copy-on-write.
    test['Fare'] = test['Fare'].fillna(train['Fare'].mean())
    for column in columns:
        del train[column]
        del test[column]
    return train,test

In [58]:
# Apply every preprocessing step in sequence; each one takes and returns both frames.
# Order matters: ageClean reads the 'Title' column that namesClean creates, and
# dummies encodes the 'Title' and 'Floor' columns created by namesClean and cabinClean.
cleaning_steps = (
    namesClean,
    ageClean,
    famSizeClean,
    embarkClean,
    cabinClean,
    dummies,
    cleanColumns,
)
for clean_step in cleaning_steps:
    train, test = clean_step(train, test)

In [59]:
# Lift pandas' row and column display limits, then show the processed test frame.
for display_limit in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_limit, None)
test

Unnamed: 0,PassengerId,Fare,NameLen,BinAge,FamSize,FamSizeGroup,Pclass_3,Pclass_1,Pclass_2,Sex_male,Sex_female,Embarked_S,Embarked_C,Embarked_Q,Title_ Mr,Title_ Mrs,Title_ Miss,Title_ Master,Title_ Rev,Title_ Dr,Title_ Ms,Title_ Col,Floor_n,Floor_C,Floor_E,Floor_G,Floor_D,Floor_A,Floor_B,Floor_F
0,892,7.8292,16,3,1,1,1,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
1,893,7.0,32,3,2,2,1,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0
2,894,9.6875,25,3,1,1,0,0,1,1,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
3,895,8.6625,16,3,1,1,1,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
4,896,12.2875,44,3,3,2,1,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0
5,897,9.225,26,2,1,1,1,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
6,898,7.6292,20,3,1,1,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0
7,899,29.0,28,3,3,2,0,0,1,1,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
8,900,7.2292,41,2,1,1,1,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0
9,901,24.15,23,3,3,2,1,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0


In [60]:
# Every column except the target and the row identifier is used as a model feature.
featuresList = list(train.columns.values)
for excluded in ('Survived', 'PassengerId'):
    featuresList.remove(excluded)
print(featuresList)

['Fare', 'NameLen', 'BinAge', 'FamSize', 'FamSizeGroup', 'Pclass_3', 'Pclass_1', 'Pclass_2', 'Sex_male', 'Sex_female', 'Embarked_S', 'Embarked_C', 'Embarked_Q', 'Title_ Mr', 'Title_ Mrs', 'Title_ Miss', 'Title_ Master', 'Title_ Rev', 'Title_ Dr', 'Title_ Ms', 'Title_ Col', 'Floor_n', 'Floor_C', 'Floor_E', 'Floor_G', 'Floor_D', 'Floor_A', 'Floor_B', 'Floor_F']


In [61]:
# Separate model inputs from the target, both as plain Python lists with one
# entry per row of train.
features = list(train.loc[:, featuresList].values)
labels = list(train.loc[:, 'Survived'].values)

In [62]:
# Single hold-out split: 20% of the labelled rows are set aside for evaluation.
(features_train, features_test,
 labels_train, labels_test) = train_test_split(
    features,
    labels,
    test_size=0.20,
    random_state=42)

In [63]:
# Naive Bayes
# Grid-search the number of features kept by SelectKBest (k = 2..7) ahead of a
# Gaussian Naive Bayes model, scored by accuracy over 10 stratified shuffle
# splits that each hold out 25% of the rows.
skb = SelectKBest(f_classif)
nb = GaussianNB()
pipeline = Pipeline(steps=[("SKB", skb), ("NB", nb)])

params = {'SKB__k': range(2, 8)}
split = StratifiedShuffleSplit(n_splits=10, test_size=0.25, random_state=42)
gs = GridSearchCV(pipeline, params, cv=split, scoring='accuracy')

# The search is fitted on the full labelled set, not on the hold-out split.
gs.fit(features, labels)
clf = gs.best_estimator_

print(clf)
print(gs.best_score_)

  f = msb / msw


Pipeline(steps=[('SKB', SelectKBest(k=7, score_func=<function f_classif at 0x000000000A4A8518>)), ('NB', GaussianNB(priors=None))])
0.787443946188


In [64]:
# Feature matrix for the test set, using the same columns (and order) as training.
testFeatures = test.loc[:, featuresList].values
#test['Survived'] = gs.predict(testFeatures).astype(int)

In [65]:
# Disabled grid search over min_samples_split for the decision tree, kept for reference.
# dt = DecisionTreeClassifier(random_state = 1122)
# # skb = SelectKBest(f_classif)

# pipeline = Pipeline(steps=[#("SKB",skb),
#                            ("DT", dt)])
# params = {#'SKB__k':[5],
#          'DT__criterion':['gini'],
#          'DT__min_samples_split':range(2,40)}

# split = StratifiedShuffleSplit(n_splits = 100, test_size=0.25, random_state=42)

# gs = GridSearchCV(pipeline, params, cv = split, scoring = 'accuracy')

# gs.fit(features,labels)
# clf = gs.best_estimator_

# print clf
# print gs.best_score_

# Fit one decision tree on the training part of the hold-out split and score
# it on the held-out part.
clf2 = DecisionTreeClassifier(random_state=1122, min_samples_split=35)
clf2.fit(features_train, labels_train)
pred2 = clf2.predict(features_test)

print(accuracy_score(pred2, labels_test))

# Per-feature importances, in the same order as featuresList.
clf2.feature_importances_

0.832402234637


array([ 0.12493764,  0.10105568,  0.        ,  0.00129034,  0.        ,
        0.14222432,  0.00154948,  0.        ,  0.        ,  0.00090965,
        0.0109292 ,  0.00143946,  0.00363162,  0.51799655,  0.        ,
        0.        ,  0.        ,  0.04317371,  0.0149076 ,  0.        ,
        0.        ,  0.03595475,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ])

In [66]:
# Store the decision tree's predictions for the test set as integer labels.
tree_predictions = clf2.predict(testFeatures)
test['Survived'] = tree_predictions.astype(int)

In [67]:
# Random forest: grid-search the split/leaf sizes and the number of trees with
# 3-fold cross-validation on the full labelled set, scored by accuracy.
# max_features='sqrt' is what 'auto' meant for classifiers; 'auto' is
# deprecated and rejected by recent scikit-learn, while 'sqrt' is accepted by
# old and new releases alike.
rf = RandomForestClassifier(max_features='sqrt',
                            oob_score=True,
                            random_state=1122,
                            n_jobs=-1)
params = {'criterion': ['gini'],
          'min_samples_split': [2, 10, 15, 20, 25, 30],
          'min_samples_leaf': [1, 5, 10],
          'n_estimators': [50, 100, 400, 1000]}

gs = GridSearchCV(estimator=rf, param_grid=params, cv=3, scoring='accuracy')

gs.fit(features, labels)
clf = gs.best_estimator_

print(gs.best_params_)
print(clf)
print(gs.best_score_)

# clf4 = RandomForestClassifier(n_estimators = 1500, 
#                               min_samples_split = 25,
#                               random_state = 24)
# clf4.fit(features_train, labels_train)
# pred4 = clf4.predict(features_test)

# print accuracy_score(pred4, labels_test)

{'min_samples_split': 30, 'n_estimators': 1000, 'criterion': 'gini', 'min_samples_leaf': 1}
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=30, min_weight_fraction_leaf=0.0,
            n_estimators=1000, n_jobs=-1, oob_score=True,
            random_state=1122, verbose=0, warm_start=False)
0.832772166105


In [68]:
# Overwrite 'Survived' with predictions from the most recently fitted grid search.
search_predictions = gs.predict(testFeatures)
test['Survived'] = search_predictions.astype(int)

In [69]:
# Write the identifier and predicted label of each test row to prediction.csv.
submission_columns = ['PassengerId', 'Survived']
final = test[submission_columns]
final.to_csv('prediction.csv', index=False)