In [None]:
%matplotlib notebook

import os
import re
import pandas as pd
import numpy as np
import scipy
from sklearn import linear_model
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [None]:
# Load the training and test sets from the current working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Median age based on title
# =========================
# Titles are collected from the training and test rows together.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
# like append, it keeps the original index labels of both frames.
full = pd.concat([train, test])

# Extract titles from names and assign numerical values
# Splitting a name on '.' and ',' and taking the second field yields the
# title for names shaped like "Surname, Title. Given names"; the trailing
# '.' is put back so that titles read "Mr.", "Mrs.", ...
titles = set()
names = full['Name'].values
p = re.compile(r'[.,]')
for n in names:
    if '.' in n:
        s = p.split(n)
        titles.add(s[1].strip() + '.')

# Number the titles in sorted order: iterating the set directly would make the
# title -> number encoding depend on set iteration order, which is not
# guaranteed to be the same from one interpreter run to the next.
title_num = {t: i for i, t in enumerate(sorted(titles))}
        
def name_to_title(name, known_titles=None):
    """Return the title contained in a passenger name, or 'unknown'.

    A title matches when it occurs as a substring of `name`. If several titles
    match, the one that starts earliest in the name wins (ties go to the longer
    title, then to alphabetical order), so the result never depends on the
    iteration order of the title collection.

    Parameters
    ----------
    name : str
        Full passenger name, e.g. "Braund, Mr. Owen Harris".
    known_titles : iterable of str, optional
        Titles to look for. Defaults to the module-level `titles` set.

    Returns
    -------
    str
        The matching title, or 'unknown' when no title occurs in `name`.
    """
    candidates = titles if known_titles is None else known_titles
    matches = [t for t in candidates if t in name]
    if not matches:
        return 'unknown'
    return min(matches, key=lambda t: (name.find(t), -len(t), t))
        
# Tag every row of the combined frame with the title found in its name.
full['Title'] = full['Name'].apply(name_to_title).astype('category')
# Pick the 'Age' column before aggregating: taking the median of the whole
# frame also touches text columns such as 'Name', which newer pandas versions
# reject instead of silently skipping.
title_vs_age = full.groupby('Title')['Age'].median().to_dict()

# Mean fare based on Pclass
# =========================
# Note: this lookup holds the *mean* fare per class (the age lookup above
# holds medians).
pclass_to_fare = full.groupby('Pclass')['Fare'].mean().to_dict()

# Preprocess data frame
# =====================
def preprocess(df):
    """Turn a raw passenger frame into the numeric feature frame used for modelling.

    Steps, in order: encode 'Sex' and 'Embarked' as integers, derive 'Title'
    from 'Name', drop 'Name', 'Ticket' and 'Cabin', fill missing 'Age' and
    'Fare' values from lookup tables, replace 'SibSp' and 'Parch' by their sum
    ('Family'), and encode 'Title' as a number.

    Uses the module-level names `name_to_title`, `title_vs_age`,
    `pclass_to_fare` and `title_num`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns 'Sex', 'Embarked', 'Name', 'Ticket',
        'Cabin', 'Age', 'Fare', 'Pclass', 'SibSp' and 'Parch'. Any other
        column is passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and the same row order as `df`.
        The input frame is not modified.

    Raises
    ------
    KeyError
        If a required column is missing, or if a row's title has no entry in
        `title_num` (this includes the 'unknown' placeholder).
    """
    # Work on a copy so the caller's frame keeps its raw values and the
    # function can be called again on the same input.
    df = df.copy()

    # Change sex to discrete numerical value (1 = male, 2 = female)
    df['Sex'] = df['Sex'].apply(lambda x: 1 if x == 'male' else 2)

    # Change embarkment location to discrete numerical (C = 1, Q = 2, S = 3, unknown = 0)
    # Missing values are handled with fillna rather than a NaN dictionary key,
    # because a NaN key is only found when it is the very same object.
    # Any value other than C/Q/S is also encoded as 0.
    embarked_codes = {'C': 1, 'Q': 2, 'S': 3}
    df['Embarked'] = df['Embarked'].map(embarked_codes).fillna(0).astype(int)

    # Derive the title from the name with the shared module-level helper.
    df['Title'] = df['Name'].apply(name_to_title)

    # Drop the name (replaced by the title), the ticket, and the cabin
    # (too many NaNs).
    df = df.drop(['Name', 'Ticket', 'Cabin'], axis=1)

    # Predict NaN in Age from the per-title lookup. Filling in place keeps
    # every row at its original position, so arrays computed from the result
    # still line up with the input frame.
    df['Age'] = df['Age'].fillna(df['Title'].map(title_vs_age))

    # Predict NaN in Fare from the per-class lookup.
    df['Fare'] = df['Fare'].fillna(df['Pclass'].map(pclass_to_fare))

    # Create family column and delete SibSp and Parch
    df['Family'] = df['SibSp'] + df['Parch']
    df = df.drop(['SibSp', 'Parch'], axis=1)

    # Change title to number
    df['Title'] = df['Title'].apply(lambda x: title_num[x])

    return df

# Replace the raw training frame with its preprocessed version and show the
# first rows as the cell output.
train = preprocess(train)
train.head()

In [None]:
# Prepare X, Y
# Besides the id and the target, leave out the columns this notebook adds to
# `train` after fitting ('Predict', 'OK'); otherwise re-running this cell
# would feed earlier predictions back in as features.
non_features = ['PassengerId', 'Survived', 'Predict', 'OK']
X = train.drop([c for c in non_features if c in train.columns], axis=1)
Y = train['Survived']

# Decision tree
# A fixed random_state makes the fitted tree reproducible between runs.
clf = tree.DecisionTreeClassifier(random_state=0)
clf = clf.fit(X, Y)
train['Predict'] = clf.predict(X)

In [None]:
# Check on learning data
check = train[['Survived', 'Predict']]
is_correct = check['Survived'] == check['Predict']
train['OK'] = is_correct.astype(int)
# Count hits and misses with sums: looking the counts up by label in
# value_counts() fails when every prediction is right (or every one is wrong).
ok = float(is_correct.sum())
missed = float((~is_correct).sum())

# Parenthesised single-argument form is valid on both Python 2 and Python 3.
print('Percent misclassified: {:.2f}%'.format(100 * missed / (ok + missed)))

In [None]:
# Test
# Hand preprocess a copy without any 'Survived' column left over from an
# earlier run of this cell, so `test` keeps its raw values and the feature
# columns match the ones the classifier was fitted on.
test_raw = test[[c for c in test.columns if c != 'Survived']].copy()
X = preprocess(test_raw).drop(['PassengerId'], axis=1)
# Attach the predictions by index label instead of by position, so each one
# lands on its own passenger even if preprocess returns rows in another order.
test['Survived'] = pd.Series(clf.predict(X), index=X.index)
test[['PassengerId', 'Survived']].to_csv('submission.csv', index=False)

**TODO:**

- add cross-validation
- compare different classifiers