In [1]:
import numpy as np
import pandas as pd
import os

from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score



In [98]:
# Read the labelled and unlabelled passenger tables (paths are relative).
train, test = map(pd.read_csv, ('train.csv', 'test.csv'))

In [99]:
# Stack train on top of test so both receive identical feature engineering.
# pd.concat is used instead of DataFrame.append (removed in pandas 2.0).
# ignore_index=True gives the combined frame a unique 0..n-1 index rather than
# repeating the row labels of both inputs; columns missing from one input are
# filled with NaN.
combineTrainTest = pd.concat([train, test], ignore_index=True)

In [100]:
# Sanity check: the combined frame should hold the rows of both inputs.
# print(x) with a single argument behaves identically on Python 2 and 3.
print(train.shape)
print(test.shape)
print(combineTrainTest.shape)

(891, 12)
(418, 11)
(1309, 12)


In [101]:
def groupFamSize(size):
    """Bucket a family size into one of three groups.

    Returns 1 when size is exactly 1, 2 when size is strictly between
    1 and 5, and 3 for every other value (including sizes of 5 or more
    and anything below 1).
    """
    if size == 1:
        return 1
    if 1 < size < 5:
        return 2
    return 3

In [102]:
def preprocessing(data):
    """Engineer features and integer-encode categoricals for passenger data.

    The input frame is copied first, so the caller's frame is left untouched.

    Steps, in order:
      * FamSize = SibSp + Parch + 1, bucketed by groupFamSize into FamSizeGroup
      * Title = the text between the first ',' and the following '.' in Name
      * drop Name, Ticket, Cabin, Parch and SibSp
      * missing Age -> mean Age of the rows sharing the same Title
      * missing Embarked -> 'S'; every remaining missing value -> 0
      * Title, Sex and Embarked -> integer category codes

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns SibSp, Parch, Name, Ticket, Cabin, Age,
        Embarked and Sex.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the engineered and encoded columns.
    """
    # Work on a copy: adding/dropping columns directly on the argument would
    # leave the caller's frame half-processed.
    data = data.copy()

    # Group by family size (the passenger counts as one member)
    data['FamSize'] = data['SibSp'] + data['Parch'] + 1
    data['FamSizeGroup'] = data['FamSize'].map(groupFamSize)

    # Create title feature, e.g. 'Braund, Mr. Owen Harris' -> ' Mr'
    data['Title'] = data['Name'].apply(lambda x: x.split(',')[1].split('.')[0])

    # Remove the raw columns that are no longer needed
    data = data.drop(['Name', 'Ticket', 'Cabin', 'Parch', 'SibSp'], axis=1)

    # Impute missing ages with the mean age of the passenger's title group.
    # np.where works positionally, so it is safe even when the frame's index
    # contains repeated labels.
    titleMeanAge = data.groupby('Title')['Age'].mean()
    titleMeanAgePerRow = data['Title'].map(titleMeanAge)
    data['Age'] = np.where(data['Age'].isnull(), titleMeanAgePerRow, data['Age'])

    data['Embarked'] = data['Embarked'].fillna('S')
    # Anything still missing (any column) becomes 0
    data = data.fillna(0)

    # Replace each categorical column by its integer category code
    for column in ('Title', 'Sex', 'Embarked'):
        data[column] = data[column].astype('category').cat.codes

    return data

In [103]:
# Rebinds combineTrainTest to the processed frame. This cell is not
# re-runnable on its own: preprocessing() reads columns (SibSp, Parch,
# Name, ...) that are absent from its result, so a second run on the
# processed frame raises KeyError.
combineTrainTest = preprocessing(combineTrainTest)

In [104]:
# Preview the processed frame. A bounded head() keeps the notebook output
# small and leaves pandas' global display.max_rows option untouched
# (previously it was set to None and the entire frame was rendered).
combineTrainTest.head(10)

Unnamed: 0,Age,Embarked,Fare,PassengerId,Pclass,Sex,Survived,FamSize,FamSizeGroup,Title
0,22.0,2,7.25,1,3,1,0.0,2,2,12
1,38.0,0,71.2833,2,1,0,1.0,2,2,13
2,26.0,2,7.925,3,3,0,1.0,1,1,9
3,35.0,2,53.1,4,1,0,1.0,2,2,13
4,35.0,2,8.05,5,3,1,0.0,1,1,12
5,32.252151,1,8.4583,6,3,1,0.0,1,1,12
6,54.0,2,51.8625,7,1,1,0.0,1,1,12
7,2.0,2,21.075,8,3,1,0.0,5,3,8
8,27.0,2,11.1333,9,3,0,1.0,3,2,13
9,14.0,0,30.0708,10,2,0,1.0,2,2,13


In [105]:
# Undo the concatenation positionally: the first len(train) rows are the
# labelled passengers, the remainder are the test passengers. Using
# len(train) instead of a literal row count keeps this correct if the
# input files change.
train_Post = combineTrainTest.iloc[:len(train)]
test_Post = combineTrainTest.iloc[len(train):]

In [106]:
# Lift the column display limit so wide frames render every column,
# then preview the first five processed training rows.
pd.options.display.max_columns = None
train_Post.head(5)

Unnamed: 0,Age,Embarked,Fare,PassengerId,Pclass,Sex,Survived,FamSize,FamSizeGroup,Title
0,22.0,2,7.25,1,3,1,0.0,2,2,12
1,38.0,0,71.2833,2,1,0,1.0,2,2,13
2,26.0,2,7.925,3,3,0,1.0,1,1,9
3,35.0,2,53.1,4,1,0,1.0,2,2,13
4,35.0,2,8.05,5,3,1,0.0,1,1,12


In [107]:
# Model inputs: every column except the target and the row identifier.
# Index.drop raises if either label is missing, so a schema change is
# noticed here rather than further down.
featuresList = list(train_Post.columns.drop(['Survived', 'PassengerId']))
# print(x) with a single argument behaves identically on Python 2 and 3.
print(featuresList)

['Age', 'Embarked', 'Fare', 'Pclass', 'Sex', 'FamSize', 'FamSizeGroup', 'Title']


In [108]:
# Separate model inputs from the target; each is a plain Python list
# (one feature-row array per passenger, one label per passenger).
features = list(train_Post.loc[:, featuresList].values)
labels = list(train_Post.Survived.values)

In [109]:
# Hold out 25% of the labelled rows for a quick accuracy check
# (a single split with a fixed seed, not k-fold cross validation).
(features_train, features_test,
 labels_train, labels_test) = train_test_split(
    features,
    labels,
    test_size=0.25,
    random_state=42,
)

In [110]:
# Naive Bayes on the k best features (scored with f_classif), with k
# chosen by grid search.
nb = GaussianNB()
skb = SelectKBest(f_classif)

pipeline = Pipeline(steps=[("SKB", skb), ("NB", nb)])
# Materialise the candidates as a list so the grid is a plain sequence
# on both Python 2 and Python 3.
params = {'SKB__k': list(range(2, 8))}
# 10 stratified random 75/25 splits, seeded for reproducibility
split = StratifiedShuffleSplit(n_splits=10, test_size=0.25, random_state=42)

gs = GridSearchCV(pipeline, params, cv=split, scoring='accuracy')

gs.fit(features, labels)
clf = gs.best_estimator_

# print(x) with a single argument behaves identically on Python 2 and 3.
print(clf)
print(gs.best_score_)

Pipeline(steps=[('SKB', SelectKBest(k=2, score_func=<function f_classif at 0x000000000A136518>)), ('NB', GaussianNB(priors=None))])
0.784753363229


In [111]:
# Feature matrix for the unlabelled passengers, in the same column order
# as the training features (both are selected with featuresList).
testFeatures = test_Post[featuresList].values
# Store the predictions of the grid search fitted above as integer labels.
# NOTE: later cells assign test['Survived'] again, replacing these values.
test['Survived'] = gs.predict(testFeatures).astype(int)

In [122]:
# dt = DecisionTreeClassifier(random_state = 1122)
# # skb = SelectKBest(f_classif)

# pipeline = Pipeline(steps=[#("SKB",skb),
#                            ("DT", dt)])
# params = {#'SKB__k':[5],
#          'DT__criterion':['gini'],
#          'DT__min_samples_split':range(2,40)}

# split = StratifiedShuffleSplit(n_splits = 100, test_size=0.25, random_state=42)

# gs = GridSearchCV(pipeline, params, cv = split, scoring = 'accuracy')
    
# gs.fit(features,labels)
# clf = gs.best_estimator_

# print clf
# print gs.best_score_

# Decision tree with a fixed min_samples_split, scored on the hold-out split.
clf2 = DecisionTreeClassifier(min_samples_split=20, random_state=1122)
clf2.fit(features_train, labels_train)
pred2 = clf2.predict(features_test)

# accuracy_score expects (y_true, y_pred); the value is the same either
# way round, but the documented order avoids mistakes if the metric is
# ever swapped for an asymmetric one.
print(accuracy_score(labels_test, pred2))
# One importance per entry of featuresList, in the same order.
clf2.feature_importances_

0.820627802691


array([ 0.1367716 ,  0.02235788,  0.1599296 ,  0.16411939,  0.45655196,
        0.01911145,  0.03622786,  0.00493027])

In [123]:
# Replace the earlier predictions in test['Survived'] with the decision
# tree's (clf2) predictions for the unlabelled passengers.
test['Survived'] = clf2.predict(testFeatures).astype(int)

In [None]:
# Random forest tuned by an exhaustive grid search with 3-fold CV.
# max_features='sqrt' is what 'auto' meant for classifiers; 'auto' itself
# is deprecated and rejected by recent scikit-learn releases.
rf = RandomForestClassifier(max_features='sqrt', oob_score=True,
                            random_state=1122, n_jobs=-1)
params = {'criterion': ['gini', 'entropy'],
          'min_samples_split': [2, 10, 15, 20, 25, 30],
          'min_samples_leaf': [1, 5, 10],
          'n_estimators': [50, 100, 400]}

gs = GridSearchCV(estimator=rf, param_grid=params, cv=3, scoring='accuracy')

gs.fit(features, labels)
clf = gs.best_estimator_

# GridSearchCV exposes its results as best_params_ / best_estimator_ /
# best_score_; the previous gs.bestparams and gs.clf attributes do not
# exist and raised AttributeError.
print(gs.best_params_)
print(clf)
print(gs.best_score_)

# clf4 = RandomForestClassifier(n_estimators = 1500, 
#                               min_samples_split = 25,
#                               random_state = 24)
# clf4.fit(features_train, labels_train)
# pred4 = clf4.predict(features_test)

# print accuracy_score(pred4, labels_test)

In [120]:
# Overwrite test['Survived'] with predictions from whatever grid search
# `gs` currently refers to.
# NOTE(review): this cell's saved execution count (120) is lower than the
# decision-tree cells above (122, 123) and the random-forest cell has no
# execution count, so which model produced the saved predictions is
# unclear -- confirm with Restart & Run All.
test['Survived'] = gs.predict(testFeatures).astype(int)

In [124]:
# Write the submission file: passenger id plus predicted label, no index column.
final = test.loc[:, ['PassengerId', 'Survived']]
final.to_csv('prediction.csv', index=False)