In [1]:
import numpy  as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.neural_network import MLPClassifier

from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestNeighbors

from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

import re

### Load data

In [9]:
data_train = pd.read_csv("data/train.csv")
data_test  = pd.read_csv("data/test.csv")
submission = pd.read_csv("data/gender_submission.csv")

data = [data_train, data_test]

data_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### Complete missing data

In [12]:
for dataset in data: 
    dataset['Age']      = dataset['Age'].replace(np.nan, dataset['Age'].mean(), regex=True)
    dataset['Fare']     = dataset['Fare'].replace(np.nan, dataset['Fare'].mean(), regex=True)
    dataset['Embarked'] = dataset['Embarked'].replace(np.nan, "M", regex=True)

### Add new data fields

In [16]:
for dataset in data:    
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

    dataset['IsAlone'] = 1 #initialize to yes/1 is alone
    dataset['IsAlone'].loc[dataset['FamilySize'] > 1] = 0 # now update to no/0 if family size is greater than 1

    #quick and dirty code split title from name: http://www.pythonforbeginners.com/dictionary/python-split
    dataset['Title'] = dataset['Name'].str.split(", ", expand=True)[1].str.split(".", expand=True)[0]
    
    #cleanup rare title names
    stat_min = 10
    title_names = (dataset['Title'].value_counts() < stat_min) #this will create a true false series with title name as index
    dataset['Title'] = dataset['Title'].apply(lambda x: 'Misc' if title_names.loc[x] == True else x)

data_train.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FamilySize,IsAlone,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,2,0,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,2,0,Mrs
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1,1,Miss
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,2,0,Mrs
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,1,1,Mr


### Remove some data fields

In [17]:
drop_columns = ['PassengerId', 'Name', 'Cabin', 'Ticket']

for dataset in data:    
    dataset.drop(drop_columns, axis=1, inplace = True)
    
data_train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,IsAlone,Title
0,0,3,male,22.0,1,0,7.25,S,2,0,Mr
1,1,1,female,38.0,1,0,71.2833,C,2,0,Mrs
2,1,3,female,26.0,0,0,7.925,S,1,1,Miss
3,1,1,female,35.0,1,0,53.1,S,2,0,Mrs
4,0,3,male,35.0,0,0,8.05,S,1,1,Mr


### One hot encoding

In [50]:
Sex_binarized      = pd.DataFrame(preprocessing.LabelBinarizer().fit_transform(data_train.Sex))
# Embarked_binarized = pd.DataFrame(preprocessing.LabelBinarizer().fit_transform(data_train.Embarked))
# Titles_binarized   = pd.DataFrame(preprocessing.LabelBinarizer().fit_transform(data_train.Titles))
Pclass_onehot   = pd.DataFrame(preprocessing.LabelBinarizer().fit_transform(data_train.Pclass))

onehot_fields      = ['Sex', 'Embarked', 'IsAlone', 'Title'] #'Pclass'
normalize_fields   = ['SibSp', 'Parch', 'FamilySize', 'Age', 'Fare']

data_train_onehot    = pd.get_dummies(data_train[onehot_fields])
onehot_final_fields  = data_train_onehot.columns.tolist()

x_onehot = pd.concat([Pclass_onehot, data_train_onehot], axis=1)
y = data_train.Survived
x_onehot.head()

Unnamed: 0,0,1,2,IsAlone,Sex_female,Sex_male,Embarked_C,Embarked_M,Embarked_Q,Embarked_S,Title_Master,Title_Misc,Title_Miss,Title_Mr,Title_Mrs
0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0
1,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1
2,0,0,1,1,1,0,0,0,0,1,0,0,1,0,0
3,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1
4,0,0,1,1,0,1,0,0,0,1,0,0,0,1,0


### Normalization

In [51]:
# Create x, where x the 'scores' column's values as floats
x = data_train[['Age', 'Fare', 'SibSp', 'Parch', 'FamilySize']].values.astype(float)

# Create a minimum and maximum processor object
min_max_scaler = preprocessing.MinMaxScaler()

# Create an object to transform the data to fit minmax processor
x_scaled = min_max_scaler.fit_transform(x)

# Run the normalizer on the dataframe
x_normalized = pd.DataFrame(x_scaled)

x = pd.concat([x_onehot, x_normalized], axis=1)
x.head()

Unnamed: 0,0,1,2,IsAlone,Sex_female,Sex_male,Embarked_C,Embarked_M,Embarked_Q,Embarked_S,Title_Master,Title_Misc,Title_Miss,Title_Mr,Title_Mrs,0.1,1.1,2.1,3,4
0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0.271174,0.014151,0.125,0.0,0.1
1,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0.472229,0.139136,0.125,0.0,0.1
2,0,0,1,1,1,0,0,0,0,1,0,0,1,0,0,0.321438,0.015469,0.0,0.0,0.0
3,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0.434531,0.103644,0.125,0.0,0.1
4,0,0,1,1,0,1,0,0,0,1,0,0,0,1,0,0.434531,0.015713,0.0,0.0,0.0


### Model

In [53]:
model = MLPClassifier(activation = 'relu', solver='lbfgs', hidden_layer_sizes=(150), random_state=10)
model.fit(x, y)
pred = model.predict(x)

accuracy_score(pred, y)

0.8608305274971941