In [47]:
import numpy as np
import pandas as pd
import os

from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score

In [48]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [49]:
combineTrainTest = train.append(test)

In [50]:
print train.shape
print test.shape
print combineTrainTest.shape

(891, 12)
(418, 11)
(1309, 12)


In [51]:
def preprocessing(data):
    
    data['FamSize'] = data['SibSp'] + data['Parch']
    
    # Create title feature
    data['Title'] = data['Name'].apply(lambda x: x.split(',')[1]).apply(lambda x: x.split('.')[0])
    data.drop(['Name','Ticket','Cabin','Parch','SibSp'], axis=1, inplace=True)
    # Remove Name, Ticket, Cabin Column
    
    titleMeanAge = data.groupby('Title').Age.mean()
    data.Age = data.apply(lambda x: titleMeanAge[x.Title] if pd.isnull(x.Age) else x.Age, axis=1)
    
    data = data.fillna(0)
    
    data["Title"] = data["Title"].astype('category')
    data["Title"] = data["Title"].cat.codes
    data["Sex"] = data["Sex"].astype('category')
    data["Sex"] = data["Sex"].cat.codes
    data["Embarked"] = data["Embarked"].astype('category')
    data["Embarked"] = data["Embarked"].cat.codes
    #categoricalFeatures = ['Sex','Embarked', 'Title']
    #data = pd.get_dummies(data,columns = categoricalFeatures) # Create categorical columns
    
    return data

In [52]:
combineTrainTest = preprocessing(combineTrainTest)

In [53]:
pd.set_option('display.max_columns',None)
combineTrainTest

Unnamed: 0,Age,Embarked,Fare,PassengerId,Pclass,Sex,Survived,FamSize,Title
0,22.000000,3,7.2500,1,3,1,0.0,1,12
1,38.000000,1,71.2833,2,1,0,1.0,1,13
2,26.000000,3,7.9250,3,3,0,1.0,0,9
3,35.000000,3,53.1000,4,1,0,1.0,1,13
4,35.000000,3,8.0500,5,3,1,0.0,0,12
5,32.252151,2,8.4583,6,3,1,0.0,0,12
6,54.000000,3,51.8625,7,1,1,0.0,0,12
7,2.000000,3,21.0750,8,3,1,0.0,4,8
8,27.000000,3,11.1333,9,3,0,1.0,2,13
9,14.000000,1,30.0708,10,2,0,1.0,1,13


In [54]:
train_Post = combineTrainTest[0:891]
test_Post = combineTrainTest[891:]

In [55]:
pd.set_option('display.max_columns',None)
train_Post.head()

Unnamed: 0,Age,Embarked,Fare,PassengerId,Pclass,Sex,Survived,FamSize,Title
0,22.0,3,7.25,1,3,1,0.0,1,12
1,38.0,1,71.2833,2,1,0,1.0,1,13
2,26.0,3,7.925,3,3,0,1.0,0,9
3,35.0,3,53.1,4,1,0,1.0,1,13
4,35.0,3,8.05,5,3,1,0.0,0,12


In [56]:
featuresList = list(train_Post.columns.values)
featuresList.remove('Survived')
featuresList.remove('PassengerId')

In [57]:
#split features and labels
features = list(train_Post[featuresList].values)
labels = list(train_Post['Survived'].values)

In [65]:
# Cross validation
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size=0.25, random_state=42)

In [59]:
# Naive Bayes
nb = GaussianNB()
skb = SelectKBest(f_classif)

pipeline = Pipeline(steps=[("SKB",skb), ("NB", nb)])
params = {'SKB__k':range(2,8)}
split = StratifiedShuffleSplit(n_splits = 10, test_size=0.25, random_state=42)

gs = GridSearchCV(pipeline, params, cv = split, scoring = 'accuracy')
    
gs.fit(features,labels)
clf = gs.best_estimator_

print clf
print gs.best_score_

Pipeline(steps=[('SKB', SelectKBest(k=2, score_func=<function f_classif at 0x000000000A520518>)), ('NB', GaussianNB(priors=None))])
0.784753363229


In [60]:
testFeatures = test_Post[featuresList].values
test['Survived'] = clf.predict(testFeatures).astype(int)

In [73]:
# dt = DecisionTreeClassifier(random_state = 1122)
# # skb = SelectKBest(f_classif)

# pipeline = Pipeline(steps=[#("SKB",skb),
#                            ("DT", dt)])
# params = {#'SKB__k':[5],
#          'DT__criterion':['gini'],
#          'DT__min_samples_split':range(2,40)}

# split = StratifiedShuffleSplit(n_splits = 100, test_size=0.25, random_state=42)

# gs = GridSearchCV(pipeline, params, cv = split, scoring = 'accuracy')
    
# gs.fit(features,labels)
# clf = gs.best_estimator_

# print clf
# print gs.best_score_

clf2 = DecisionTreeClassifier(min_samples_split = 25, random_state = 1122)
clf2.fit(features_train,labels_train)
pred2 = clf2.predict(features_test)

print accuracy_score(pred2,labels_test)
clf2.feature_importances_

0.816143497758


array([ 0.14571699,  0.01513729,  0.16932317,  0.17485343,  0.48641221,
        0.00330419,  0.00525273])

In [74]:
test['Survived'] = gs.predict(testFeatures).astype(int)

NotFittedError: This GridSearchCV instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.

In [63]:
ab = AdaBoostClassifier(random_state = 24)
dt = DecisionTreeClassifier()
skb = SelectKBest(f_classif)

pipeline = Pipeline(steps=[("SKB",skb), ("AB", ab)])
params = {'SKB__k':range(2,8),
         'AB__n_estimators':[150,1000]}

split = ShuffleSplit(n_splits = 10, test_size=0.25, random_state=42)
gs = GridSearchCV(pipeline, params, cv = split, scoring = 'accuracy')
   
gs.fit(features,labels)
clf = gs.best_estimator_

print clf
print gs.best_score_

KeyboardInterrupt: 

In [113]:
test['Survived'] = clf3.predict(testFeatures).astype(int)

In [64]:
clf4 = RandomForestClassifier(n_estimators = 1500, max_features = .5,
                              min_samples_split = 15, random_state = 24)
clf4.fit(features_train, labels_train)
pred4 = clf4.predict(features_test)

print accuracy_score(pred4, labels_test)

0.825112107623


In [140]:
test['Survived'] = clf4.predict(testFeatures).astype(int)

In [75]:
final = test[['PassengerId','Survived']]
final.to_csv('prediction.csv', index=False)