In [1]:
import numpy as np
import pandas as pd
import os

from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score



In [2]:
# Read the training and test CSV files from the working directory.
train, test = map(pd.read_csv, ('train.csv', 'test.csv'))

In [3]:
# Stack the training rows followed by the test rows so both go through the
# same preprocessing. pd.concat replaces DataFrame.append, which was
# deprecated and later removed from pandas. ignore_index=True gives the
# combined frame a fresh 0..n-1 index instead of repeating the row labels
# of both inputs.
combineTrainTest = pd.concat([train, test], ignore_index=True)

In [4]:
# Sanity check: the combined frame should have as many rows as train and
# test together. Call syntax with a single argument prints the same text on
# Python 2 and Python 3.
print(train.shape)
print(test.shape)
print(combineTrainTest.shape)

(891, 12)
(418, 11)
(1309, 12)


In [5]:
def preprocessing(data):
    """Return a cleaned, fully numeric-coded copy of a passenger frame.

    Steps, in order:
      * every missing value in every column is replaced with 0;
      * ``FamSize`` is added as ``SibSp + Parch``;
      * ``Title`` is added as the text between the first comma and the
        following period of ``Name`` (the leading space is kept);
      * ``Name``, ``Ticket``, ``Cabin``, ``Parch`` and ``SibSp`` are dropped;
      * ``Title``, ``Sex`` and ``Embarked`` are replaced by their pandas
        category codes.

    The input frame is not modified; ``fillna`` already returns a new frame.
    """
    def _title_of(full_name):
        # "Surname, Title. Given names" -> " Title"
        after_surname = full_name.split(',')[1]
        return after_surname.split('.')[0]

    cleaned = data.fillna(0)

    cleaned['FamSize'] = cleaned['SibSp'] + cleaned['Parch']
    cleaned['Title'] = cleaned['Name'].apply(_title_of)

    # The raw text columns and the two family counts are no longer needed.
    cleaned = cleaned.drop(['Name', 'Ticket', 'Cabin', 'Parch', 'SibSp'], axis=1)

    # Integer-encode the categorical columns (codes follow category order).
    for column in ('Title', 'Sex', 'Embarked'):
        cleaned[column] = cleaned[column].astype('category').cat.codes

    return cleaned

In [6]:
# Clean and encode the combined frame. The name is rebound to the processed
# result, so re-running this cell on its own fails: preprocessing() needs
# the raw columns (e.g. SibSp, Name) that it has already dropped.
combineTrainTest = combineTrainTest.pipe(preprocessing)

In [7]:
# Undo the earlier stacking: the first len(train) rows of the combined frame
# are the training rows, everything after them is the test set. The row count
# is taken from `train` rather than hard-coded, and .iloc makes the
# positional slicing explicit.
n_train_rows = len(train)
train_Post = combineTrainTest.iloc[:n_train_rows]
test_Post = combineTrainTest.iloc[n_train_rows:]

In [8]:
# Peek at the first five raw training rows.
train.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [9]:
# Lift the column display limit so every column is rendered, then show the
# first five processed training rows.
pd.options.display.max_columns = None
train_Post.head(5)

Unnamed: 0,Age,Embarked,Fare,PassengerId,Pclass,Sex,Survived,FamSize,Title
0,22.0,3,7.25,1,3,1,0.0,1,12
1,38.0,1,71.2833,2,1,0,1.0,1,13
2,26.0,3,7.925,3,3,0,1.0,0,9
3,35.0,3,53.1,4,1,0,1.0,1,13
4,35.0,3,8.05,5,3,1,0.0,0,12


In [10]:
# Every processed column is a predictor except the target and the row id.
featuresList = list(train_Post.columns.values)
for excluded_column in ('Survived', 'PassengerId'):
    featuresList.remove(excluded_column)

In [11]:
# Separate the predictor rows from the target column; both are kept as
# plain Python lists of the underlying NumPy values.
features = list(train_Post.loc[:, featuresList].values)
labels = list(train_Post.loc[:, 'Survived'].values)

In [172]:
# Hold out 25% of the labelled rows for validation (fixed seed so the split
# is reproducible).
features_train, features_test, labels_train, labels_test = train_test_split(
    features,
    labels,
    test_size=0.25,
    random_state=42
)

In [22]:
# Naive Bayes: grid-search how many features SelectKBest keeps (k = 2..7)
# in front of a Gaussian Naive Bayes classifier.
nb = GaussianNB()
skb = SelectKBest(f_classif)

pipeline = Pipeline(steps=[("SKB", skb), ("NB", nb)])
params = {'SKB__k': list(range(2, 8))}

# Ten stratified random splits, each holding out 25% of the rows.
split = StratifiedShuffleSplit(n_splits=10, test_size=0.25, random_state=42)

gs = GridSearchCV(pipeline, params, cv=split, scoring='accuracy')

gs.fit(features, labels)
clf = gs.best_estimator_

# print() call syntax works on both Python 2 and Python 3 for one argument.
print(clf)
print(gs.best_score_)

Pipeline(steps=[('SKB', SelectKBest(k=2, score_func=<function f_classif at 0x000000000A06D518>)), ('NB', GaussianNB(priors=None))])
0.784753363229


In [31]:
# Build the test-set predictor matrix with the same columns used for
# training, then store the current model's integer predictions on `test`.
testFeatures = test_Post.loc[:, featuresList].values
test['Survived'] = clf.predict(testFeatures).astype(int)

In [29]:
# Decision tree: grid-search the number of selected features (k = 2..7)
# together with the tree's min_samples_split (2..19).
dt = DecisionTreeClassifier(random_state=1122)
skb = SelectKBest(f_classif)

pipeline = Pipeline(steps=[("SKB", skb), ("DT", dt)])
params = {'SKB__k': list(range(2, 8)),
          'DT__min_samples_split': list(range(2, 20))}

# Ten stratified random splits, each holding out 25% of the rows.
split = StratifiedShuffleSplit(n_splits=10, test_size=0.25, random_state=42)

gs = GridSearchCV(pipeline, params, cv=split, scoring='accuracy')

gs.fit(features, labels)
clf = gs.best_estimator_

# print() call syntax works on both Python 2 and Python 3 for one argument.
print(clf)
print(gs.best_score_)

# Disabled single-tree check on the hold-out split (not executed):
# clf2 = DecisionTreeClassifier(min_samples_split = 30, random_state = 1122)
# clf2.fit(features_train,labels_train)
# pred2 = clf2.predict(features_test)

# print(accuracy_score(pred2,labels_test))
# clf2.feature_importances_

Pipeline(steps=[('SKB', SelectKBest(k=5, score_func=<function f_classif at 0x000000000A06D518>)), ('DT', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=5, min_weight_fraction_leaf=0.0,
            presort=False, random_state=1122, splitter='best'))])
0.822421524664


In [32]:
# Predict through the fitted grid search object and store the results as
# integers on `test`.
grid_predictions = gs.predict(testFeatures)
test['Survived'] = grid_predictions.astype(int)

In [27]:
# AdaBoost: grid-search the number of selected features (k = 2..7) and the
# number of boosting rounds (150 or 1000).
# The unused `dt = DecisionTreeClassifier()` from the original cell is
# dropped: it was never passed to the AdaBoost classifier or the pipeline.
ab = AdaBoostClassifier(random_state=24)
skb = SelectKBest(f_classif)

pipeline = Pipeline(steps=[("SKB", skb), ("AB", ab)])
params = {'SKB__k': list(range(2, 8)),
          'AB__n_estimators': [150, 1000]}

# Ten stratified random splits, each holding out 25% of the rows.
split = StratifiedShuffleSplit(n_splits=10, test_size=0.25, random_state=42)
gs = GridSearchCV(pipeline, params, cv=split, scoring='accuracy')

gs.fit(features, labels)
clf = gs.best_estimator_

# print() call syntax works on both Python 2 and Python 3 for one argument.
print(clf)
print(gs.best_score_)

Pipeline(steps=[('SKB', SelectKBest(k=6, score_func=<function f_classif at 0x000000000A06D518>)), ('AB', AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=150, random_state=24))])
0.811659192825


In [113]:
# `clf3` is not defined anywhere in this notebook, so the original line
# raises NameError on a fresh kernel. Use `clf`, the best estimator from the
# grid search in the cell directly above.
# NOTE(review): presumably clf3 was the AdaBoost model - confirm.
test['Survived'] = clf.predict(testFeatures).astype(int)

In [147]:
# Random forest with fixed hyper-parameters, fitted on the training part of
# the hold-out split and scored on the held-out part.
clf4 = RandomForestClassifier(n_estimators=1500, max_features=.5,
                              min_samples_split=15, random_state=24)
clf4.fit(features_train, labels_train)
pred4 = clf4.predict(features_test)

# accuracy_score expects (y_true, y_pred). Accuracy is symmetric, so the
# value is unchanged, but the documented order avoids mistakes if the metric
# is ever swapped. print() call syntax works on Python 2 and Python 3.
print(accuracy_score(labels_test, pred4))

0.829596412556


In [140]:
# Store the random forest's integer predictions on `test`.
forest_predictions = clf4.predict(testFeatures)
test['Survived'] = forest_predictions.astype(int)

In [141]:
# Keep only the id and the predicted label, then write them out without the
# DataFrame index.
submission_columns = ['PassengerId', 'Survived']
final = test[submission_columns]
final.to_csv('prediction.csv', index=False)