In [1]:
import numpy as np
import pandas as pd
import os

from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score



In [97]:
# Load both input tables from the working directory in one statement.
# A generator expression is used so no loop variable is left behind in the notebook namespace.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [98]:
# Stack train on top of test so the encoding step sees every category value at once.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed
# in 2.0; concat is what append delegated to, so the result is the same.
# A column present in only one of the frames is NaN for the other frame's rows.
# Row labels are kept as-is (not reset); later cells split the frame by position.
combineTrainTest = pd.concat([train, test])

In [99]:
# Sanity check: (rows, columns) of each input and of the stacked frame.
# Single-argument print(...) calls behave the same on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print(train.shape)
print(test.shape)
print(combineTrainTest.shape)

(891, 12)
(418, 11)
(1309, 12)


In [100]:
def preprocessing(data):
    """Turn a raw passenger frame into an all-numeric feature frame.

    Steps, in order:
      1. Replace every missing value in every column with 0.
      2. Add 'FamSize' = 'SibSp' + 'Parch'.
      3. Add 'Title', the part of 'Name' between the first comma and the
         following period (any leading space is kept).
      4. Drop 'Name', 'Ticket', 'Cabin', 'Parch' and 'SibSp'.
      5. Replace 'Title', 'Sex' and 'Embarked' with their integer category codes.

    The input frame is not modified; a new frame is returned. The category
    codes depend on the set of values present in the frame passed in.
    """
    frame = data.fillna(0)

    # Siblings/spouses plus parents/children travelling with the passenger.
    frame['FamSize'] = frame['SibSp'] + frame['Parch']

    def _title_from_name(full_name):
        # "Surname, Title. Given names" -> " Title"
        after_comma = full_name.split(',')[1]
        return after_comma.split('.')[0]

    frame['Title'] = frame['Name'].apply(_title_from_name)

    # The raw text columns and the two FamSize inputs are no longer needed.
    frame = frame.drop(['Name', 'Ticket', 'Cabin', 'Parch', 'SibSp'], axis=1)

    # Label-encode the remaining categorical columns (codes, not one-hot dummies).
    for column in ('Title', 'Sex', 'Embarked'):
        frame[column] = frame[column].astype('category').cat.codes

    return frame

In [101]:
# Encode the stacked train+test frame in one pass, so the category codes are
# computed over the values of both sets together.
# NOTE: this cell is not re-runnable as written. preprocessing() reads 'Name',
# 'SibSp' and 'Parch' and then drops them, so running it again on the already
# processed frame raises KeyError. Re-run from the cell that builds combineTrainTest.
combineTrainTest = preprocessing(combineTrainTest)

In [102]:
# Undo the stacking by position: the first len(train) rows came from `train`,
# everything after them from `test`. Deriving the cut point from `train` itself
# (instead of a hard-coded row count) keeps the split correct if the input files change.
# .iloc makes the positional intent explicit.
train_Post = combineTrainTest.iloc[:len(train)]
test_Post = combineTrainTest.iloc[len(train):]

In [103]:
# Lift the limit on how many columns are rendered, then preview the first five
# processed training rows.
pd.options.display.max_columns = None
train_Post.head(5)

Unnamed: 0,Age,Embarked,Fare,PassengerId,Pclass,Sex,Survived,FamSize,Title
0,22.0,3,7.25,1,3,1,0.0,1,12
1,38.0,1,71.2833,2,1,0,1.0,1,13
2,26.0,3,7.925,3,3,0,1.0,0,9
3,35.0,3,53.1,4,1,0,1.0,1,13
4,35.0,3,8.05,5,3,1,0.0,0,12


In [104]:
# Predictor columns = every processed column except the row identifier and the target.
featuresList = train_Post.columns.tolist()
featuresList.remove('PassengerId')
featuresList.remove('Survived')

In [105]:
# Pull the target vector and the predictor matrix out of the frame as arrays.
labels = train_Post['Survived'].values
features = train_Post.loc[:, featuresList].values

In [106]:
# Hold-out split (a single split, not k-fold cross-validation): 25% of the
# labelled rows are set aside for scoring, with a fixed seed so the split is repeatable.
features_train, features_test, labels_train, labels_test = train_test_split(
    features,
    labels,
    test_size=0.25,
    random_state=42
)

In [107]:
# Naive Bayes baseline: fit on the training split, score on the held-out split.
clf = GaussianNB()
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)

# accuracy_score is documented as (y_true, y_pred). Accuracy is symmetric, so the
# value is unchanged, but the conventional order avoids surprises if the metric is swapped.
# print(...) with one argument works on both Python 2 and Python 3.
print(accuracy_score(labels_test, pred))

0.780269058296


In [108]:
# Feature matrix for the unlabeled rows, using the same columns (and column order)
# the models were fit on.
testFeatures = test_Post[featuresList].values
# Store the Naive Bayes predictions on the raw `test` frame as integers.
# Later cells assign to this same column again, so only the last assignment survives.
test['Survived'] = clf.predict(testFeatures).astype(int)

In [119]:
# Single decision tree; min_samples_split=30 stops nodes with fewer than 30 samples
# from being split further. random_state is fixed so the tree is repeatable.
clf2 = DecisionTreeClassifier(min_samples_split=30, random_state=1122)
clf2.fit(features_train, labels_train)
pred2 = clf2.predict(features_test)

# accuracy_score is documented as (y_true, y_pred); accuracy is symmetric so the value
# is unchanged. print(...) with one argument works on both Python 2 and Python 3.
print(accuracy_score(labels_test, pred2))

0.838565022422


In [120]:
# Importance of each input column to the fitted tree. The values should line up
# with the column order of the training matrix, i.e. featuresList — confirm before
# reading individual entries.
clf2.feature_importances_

array([ 0.05110728,  0.00313222,  0.21114151,  0.13193582,  0.47919583,
        0.06668009,  0.05680726])

In [121]:
# Overwrite the stored predictions with the decision tree's, cast to int.
# Other cells assign to test['Survived'] as well; whichever runs last wins.
test['Survived'] = clf2.predict(testFeatures).astype(int)

In [112]:
# AdaBoost over depth-1 trees (decision stumps): 1000 boosting rounds with a
# learning rate of 0.1, seeded for repeatability.
# NOTE(review): scikit-learn 1.2 renamed `base_estimator` to `estimator`. The old
# keyword is kept here to match the scikit-learn generation this notebook imports
# from (it still uses sklearn.cross_validation).
clf3 = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1),
                          n_estimators=1000,
                          learning_rate=0.1,
                          random_state=1122)
clf3.fit(features_train, labels_train)
pred3 = clf3.predict(features_test)

# accuracy_score is documented as (y_true, y_pred); accuracy is symmetric so the value
# is unchanged. print(...) with one argument works on both Python 2 and Python 3.
print(accuracy_score(labels_test, pred3))

0.816143497758


In [113]:
# Overwrite the stored predictions with the AdaBoost model's, cast to int.
# Other cells assign to test['Survived'] as well; whichever runs last wins.
test['Survived'] = clf3.predict(testFeatures).astype(int)

In [147]:
# Random forest: 1500 trees, each split considering half of the features
# (max_features=.5), no split on nodes with fewer than 15 samples, seeded for repeatability.
clf4 = RandomForestClassifier(n_estimators=1500, max_features=.5,
                              min_samples_split=15, random_state=24)
clf4.fit(features_train, labels_train)
pred4 = clf4.predict(features_test)

# accuracy_score is documented as (y_true, y_pred); accuracy is symmetric so the value
# is unchanged. print(...) with one argument works on both Python 2 and Python 3.
print(accuracy_score(labels_test, pred4))

0.829596412556


In [140]:
# Overwrite the stored predictions with the random forest's, cast to int; this is
# the last assignment in the notebook, so these are the values that get written out.
# NOTE(review): in the saved notebook this cell's execution count (140) is lower than
# that of the cell that fits clf4 (147), so the recorded run predicted with an earlier
# fit of clf4. Restart the kernel and run all cells top to bottom before submitting.
test['Survived'] = clf4.predict(testFeatures).astype(int)

In [141]:
# Keep only the identifier and the predicted label, then write them to
# prediction.csv without the row index.
final = test.loc[:, ['PassengerId', 'Survived']]
final.to_csv('prediction.csv', index=False)