# Titanic

In [181]:
%matplotlib notebook

In [182]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [183]:
gendersub = pd.read_csv('gender_submission.csv')
test = pd.read_csv('test.csv')
train = pd.read_csv('train.csv')

In [184]:
gendersub.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [185]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [186]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Preparation of the training data set

In [187]:
data = train.copy()

print '**Null values:**'
print data.isnull().sum()

**Null values:**
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [188]:
titles = ['None', 'Mr.', 'Mrs.', 'Miss.', 'Dr.', 'Capt.', 'Mme.', 'Major', 'Master.', 'Mlle.', 'Sir.', 'Countess.', 'Col.']

def title(s, numeric):
    titlemap = dict()
    n = 2
    for x in titles:
        titlemap[x] = n
        n += 1
    titlemap['None'] = 1
    
    for x in titles:
        if x in s:
            if numeric:
                return titlemap[x]
            else:
                return x
    # No title
    if numeric:
        return titlemap['None']
    else:
        return 'None'

def titlenum(s):
    return title(s, numeric=True)

def titlestr(s):
    return title(s, numeric=False)
    
data['TitleNum'] = data['Name'].map(titlenum)
data['Title'] = data['Name'].map(titlestr)

data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,TitleNum,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,3,Mr.
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,4,Mrs.
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,5,Miss.
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,4,Mrs.
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,3,Mr.


In [189]:
# Infer missing age from the title
known_age = data.loc[data['Age'].isnull() == False]
age = {x: None for x in titles}

for key in age:
    age[key] = known_age.loc[data['Title'] == key]['Age'].mean()
    if np.isnan(age[key]):
        age[key] = 999.

data.loc[data['Age'].isnull(), 'Age'] = data.loc[data['Age'].isnull(), 'Name'].map(lambda x: age[titlestr(x)])

In [190]:
data['Sex'] = [1 if x=='male' else 2 for x in data['Sex']]  # 1=male, 2=female
data['Embarked'] = [1 if x=='C' else 2 if x=='Q' else 3 for x in data['Embarked']] # 1=C, 2=Q, 3=S

def family(i):
    if i > 1:
        # Family
        return 2
    else:
        # Singleton
        return 1

data['Family'] = (data['SibSp'] + data['Parch']).map(family)

def deck(s):
    if pd.isnull(s):
        # Unknown deck
        return 1
    deckmap = {'G': 2, 'F': 3, 'D': 4, 'C': 5, 'B': 6, 'A': 7}
    for x in ['A', 'B', 'C', 'D', 'F', 'G']:
        if x in s:
            return deckmap[x]
    # Unknown deck
    return 1

data['Deck'] = data['Cabin'].map(deck)

data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,TitleNum,Title,Family,Deck
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,3,3,Mr.,1,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",2,38.0,1,0,PC 17599,71.2833,C85,1,4,Mrs.,1,5
2,3,1,3,"Heikkinen, Miss. Laina",2,26.0,0,0,STON/O2. 3101282,7.925,,3,5,Miss.,1,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",2,35.0,1,0,113803,53.1,C123,3,4,Mrs.,1,5
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,3,3,Mr.,1,1


## Linear regression

In [191]:
features = ['Pclass', 'Age', 'Sex', 'TitleNum', 'Family']

reg = linear_model.LinearRegression()
reg.fit(data[features], data['Survived'])

print('Coefficients:', reg.coef_)

data['Survived_LR'] = np.round(reg.predict(data[features])).astype(int)
data['LR_err'] = abs(data['Survived'] - data['Survived_LR'])
error = float(data['LR_err'].sum()) / len(data.index)

print('Accuracy on LP: {:.3f}%'.format(1-error))

scores = cross_val_score(reg, data[features], data['Survived'], cv=5)
print 'CV score = {:.3f} (+/- {:.3f}) (rounding not included)'.format(scores.mean(), scores.std())

('Coefficients:', array([-0.17408981, -0.00343592,  0.46494241,  0.0472306 , -0.08823355]))
Accuracy on LP: 0.807%
CV score = 0.393 (+/- 0.046) (rounding not included)


## Decision tree

In [217]:
features = ['Pclass', 'Age', 'Sex', 'TitleNum', 'Family']

dt = tree.DecisionTreeClassifier()
dt = dt.fit(data[features], data['Survived'])
data['Survived_DT'] = dt.predict(data[features])
data['DT_err'] = abs(data['Survived'] - data['Survived_DT'])
error = float(data['DT_err'].sum()) / len(data.index)

print('Accuracy on LP: {:.3f}%'.format(1-error))

with open('tree.dot', 'w') as f:
    f = tree.export_graphviz(dt, out_file=f, feature_names=features)
    
scores = cross_val_score(dt, data[features], data['Survived'], cv=5)
print 'CV score = {:.3f} (+/- {:.3f})'.format(scores.mean(), scores.std())
print dt.feature_importances_

Accuracy on LP: 0.903%
CV score = 0.812 (+/- 0.028)
[ 0.15483905  0.33080421  0.01838108  0.45381572  0.04215995]


## Random forest

In [193]:
features = ['Pclass', 'Age', 'Sex', 'TitleNum', 'Family']

rf = RandomForestClassifier()
rf = rf.fit(data[features], data['Survived'])
data['Survived_RF'] = rf.predict(data[features])
data['RF_err'] = abs(data['Survived'] - data['Survived_RF'])
error = float(data['RF_err'].sum()) / len(data.index)

print('Accuracy on LP: {:.3f}%'.format(1-error))
    
scores = cross_val_score(rf, data[features], data['Survived'], cv=5)
print 'CV score = {:.3f} (+/- {:.3f})'.format(scores.mean(), scores.std())
print rf.feature_importances_

Accuracy on LP: 0.898%
CV score = 0.812 (+/- 0.018)
[ 0.15081812  0.397742    0.18301194  0.23473667  0.03369127]


## Polynomial regression

In [194]:
features = ['Pclass', 'Age', 'Sex', 'TitleNum', 'Family']

pl = Pipeline([('poly', PolynomialFeatures(degree=2)), ('linear', LinearRegression(fit_intercept=False))])
pl = pl.fit(data[features], data['Survived'])
data['Survived_PL'] = np.round(pl.predict(data[features])).astype(int)
data['PL_err'] = abs(data['Survived'] - data['Survived_PL'])
error = float(data['PL_err'].sum()) / len(data.index)

print('Accuracy on LP: {:.3f}%'.format(1-error))

scores = cross_val_score(pl, data[features], data['Survived'], cv=5)
print 'CV score = {:.3f} (+/- {:.3f}) (rounding not included)'.format(scores.mean(), scores.std())

Accuracy on LP: 0.823%
CV score = 0.410 (+/- 0.081) (rounding not included)


# Test data

In [195]:
#data['Survived'] = dt.predict(data[features])

In [196]:
#data[['PassengerId', 'Survived']].set_index('PassengerId').to_csv('submission.csv')