In [231]:
import pandas as pd
import numpy as np
import operator
import pickle
from collections import Counter,defaultdict
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier,KNeighborsRegressor
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB,BernoulliNB
from sklearn.feature_selection import SelectKBest
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.model_selection import train_test_split

In [373]:
# Read the training data
df = pd.read_csv('train.csv')

# fill nan and maps categorical string values with integers
df['Sex'] = df['Sex'].map({'male':1,'female':0})
df['Embarked'] = df['Embarked'].fillna('S')
df['Cabin'] = df['Cabin'].fillna(0)
df.loc[df['Cabin']!=0,'Cabin']=1
df['Embarked'] = df['Embarked'].map({'S':1,'C':2,'Q':3})

# Dropping the columns that seemd unusable
df = df.drop(['PassengerId','Ticket'],axis=1)
df.head(6)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,7.25,0,1
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,71.2833,1,2
2,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,7.925,0,1
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,53.1,1,1
4,0,3,"Allen, Mr. William Henry",1,35.0,0,0,8.05,0,1
5,0,3,"Moran, Mr. James",1,,0,0,8.4583,0,3


In [374]:
df['Fare'].describe()   

count    891.000000
mean      32.204208
std       49.693429
min        0.000000
25%        7.910400
50%       14.454200
75%       31.000000
max      512.329200
Name: Fare, dtype: float64

In [375]:
# Converting Fare into 4 categories based on quantiles
quantile=0.0
for i in range(4):
    df['Fare'][operator.and_(df['Fare'] > df['Fare'].quantile(quantile),df['Fare'] <= df['Fare'].quantile(quantile+0.25))] = i
    quantile += .25
df.head(6)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,0.0,0,1
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,3.0,1,2
2,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,1.0,0,1
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,3.0,1,1
4,0,3,"Allen, Mr. William Henry",1,35.0,0,0,1.0,0,1
5,0,3,"Moran, Mr. James",1,,0,0,1.0,0,3


Here we can see the number of titles among the passengers

In [379]:
# most common titles 
Counter(list(df.Name.str.split('\s+').str[1])).most_common(25)

[('Mr.', 502),
 ('Miss.', 179),
 ('Mrs.', 121),
 ('Master.', 40),
 ('Dr.', 7),
 ('Rev.', 6),
 ('y', 4),
 ('Planke,', 3),
 ('Impe,', 3),
 ('Mlle.', 2),
 ('Gordon,', 2),
 ('Col.', 2),
 ('Major.', 2),
 ('Carlo,', 1),
 ('Shawah,', 1),
 ('Walle,', 1),
 ('Melkebeke,', 1),
 ('Cruyssen,', 1),
 ('Messemaeker,', 1),
 ('Capt.', 1),
 ('Pelsmaeker,', 1),
 ('Steen,', 1),
 ('Mulder,', 1),
 ('Ms.', 1),
 ('Billiard,', 1)]

In [377]:
# A mapping is created using defaultdict to their title
x = Counter(list(df.Name.str.split('\s+').str[1])).most_common(5)
title_map = defaultdict(lambda: 0)
for i in range(5):
    title_map[x[i][0]]=i+1
title_map['Ms.'] = title_map['Miss.']
title_map

defaultdict(<function __main__.<lambda>>,
            {'Dr.': 5,
             'Master.': 4,
             'Miss.': 2,
             'Mr.': 1,
             'Mrs.': 3,
             'Ms.': 2})

In [124]:
df['Name'] = [title_map[i] for i in df.Name.str.split(' ').str[1]]
df.head(5)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,1,1,22.0,1,0,0.0,0,1
1,1,1,3,0,38.0,1,0,3.0,1,2
2,1,3,2,0,26.0,0,0,1.0,0,1
3,1,1,3,0,35.0,1,0,3.0,1,1
4,0,3,1,1,35.0,0,0,1.0,0,1


In [125]:
# Fill in the NaN values in age, we create a copy of ages without Nan

copy_df = df.dropna()
print(copy_df['Age'].describe())
# Split age into 8 categories 

for i in range(8):
    copy_df['Age'][operator.and_(copy_df['Age'] > i*10,copy_df['Age'] <= (i+1)*10)]=i+1
    
copy_df.head(10)

count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: Age, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,1,1,3.0,1,0,0.0,0,1
1,1,1,3,0,4.0,1,0,3.0,1,2
2,1,3,2,0,3.0,0,0,1.0,0,1
3,1,1,3,0,4.0,1,0,3.0,1,1
4,0,3,1,1,4.0,0,0,1.0,0,1
6,0,1,1,1,6.0,0,0,3.0,1,1
7,0,3,4,1,1.0,3,1,2.0,0,1
8,1,3,3,0,3.0,0,2,1.0,0,1
9,1,2,3,0,2.0,1,0,2.0,0,2
10,1,3,2,0,1.0,1,1,2.0,1,1


In [126]:
age = copy_df['Age']
copy_df = copy_df.drop(['Age','Survived'],axis=1)
features = copy_df.values

In [127]:
train_data,test_data,train_labels,test_labels = train_test_split(features, age, test_size=0.3,random_state=42)

In [128]:
selector = SelectKBest(k =8)
selector.fit(train_data, train_labels)
for i in range(len(selector.scores_)):
    print(copy_df.columns[i],selector.scores_[i])

('Pclass', 10.695080629503366)
('Name', 15.303207873636534)
('Sex', 1.4036332040446005)
('SibSp', 20.921389952717)
('Parch', 11.42309979067489)
('Fare', 7.703273351749964)
('Cabin', 5.601273139844905)
('Embarked', 0.9007802354083679)


In [129]:
# We can see that the attributes Sex and Embarked does not provide much information to predict age class.Hence we can remove those

copy_df = copy_df.drop(['Sex','Embarked'],axis=1)
features = copy_df.values
features.shape

(714L, 6L)

In [130]:
train_data,test_data,train_labels,test_labels = train_test_split(features, age, test_size=0.33,random_state=12)

In [371]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(features, age)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [132]:
# As we have trained our models lets fill nan values in the original DataFrame

nan_df = df[df['Age'].isnull()]
nan_df = nan_df.drop(['Survived','Sex','Age','Embarked'],axis=1)
pred_features = nan_df.values
pred_features.shape

(177L, 6L)

In [133]:
pred_ages = knn.predict(pred_features)
df.loc[df['Age'].isnull(),'Age'] = pred_ages
df.isnull().any()

Survived    False
Pclass      False
Name        False
Sex         False
Age         False
SibSp       False
Parch       False
Fare        False
Cabin       False
Embarked    False
dtype: bool

In [134]:
# Now we can see that there are no more Nan values left in the dataframe.
# Let's start working on a classifier to predict Survived class

labels = df['Survived']
df = df.drop('Survived',axis=1)

In [135]:
# Lets check the for best features 
train_data,test_data,train_labels,test_labels = train_test_split(df.values, labels, test_size=0.33,random_state=101)
kbest = SelectKBest()
selector.fit(train_data, train_labels)
for i in range(len(selector.scores_)):
    print(df.columns[i],selector.scores_[i])

('Pclass', 71.94731623934742)
('Name', 127.21734125738863)
('Sex', 243.34788418660605)
('Age', 0.3914049963634997)
('SibSp', 0.5666767015100321)
('Parch', 3.60043133934634)
('Fare', 56.74229470330746)
('Cabin', 70.55417791029136)
('Embarked', 7.861748766383377)


In [136]:
# Keep only highly scored features
#final_df = df
final_df = df.drop(['Age','SibSp'],axis=1)
train_data,test_data,train_labels,test_labels = train_test_split(df.values, labels, test_size=0.33,random_state=101)

In [137]:
list_of_classifiers = [DecisionTreeClassifier(),KNeighborsClassifier(),SVC(),MLPClassifier(),
                       GaussianNB(),BernoulliNB(),RandomForestClassifier(),AdaBoostClassifier()]
scores = []

In [138]:
for classifier in list_of_classifiers:
    clf = classifier
    clf.fit(train_data, train_labels)
    scores.append(clf.score(test_data, test_labels))
for i in range(len(list_of_classifiers)):
    print(list_of_classifiers[i],scores[i])
    print('\n')

(DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'), 0.7728813559322034)


(KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'), 0.7796610169491526)


(SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False), 0.8067796610169492)


(MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='con

In [101]:
# Classifiers that gave most accuracies are Adaboost,GaussianNB,MLPClassifier and SVC, Lets tune them
nb = GaussianNB()
nb.fit(train_data,train_labels)
nb.score(test_data,test_labels)

0.8

In [102]:
svm = SVC(kernel='poly',degree=2)
svm.fit(train_data,train_labels)
svm.score(test_data,test_labels)

0.823728813559322

In [216]:
ada = AdaBoostClassifier(n_estimators=200,learning_rate=0.13)
ada.fit(train_data,train_labels)
ada.score(test_data,test_labels)

0.823728813559322

In [366]:
mlp=MLPClassifier(hidden_layer_sizes =(220,130),learning_rate_init=0.1,max_iter=20000)
mlp.fit(train_data,train_labels)
mlp.score(test_data,test_labels)

0.823728813559322

In [378]:
# change defaultdict to normal dict as the lambda function cannot be pickled

titles = dict()
for key,value in title_map.items():
    titles[key] = value
    
# Save those three model as they give the same accuracy after tuning

with open('mlp.pkl','wb') as f:
    pickle.dump(mlp,f)
with open('ada.pkl','wb') as f:
    pickle.dump(ada,f)
with open('svm.pkl','wb') as f:
    pickle.dump(svm,f)
with open('title_map.pkl','wb') as f:
    pickle.dump(titles,f)