In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%pylab inline


Populating the interactive namespace from numpy and matplotlib


In [2]:
# data preparation
import re
import operator

In [3]:
# modelling
from sklearn import cross_validation
from sklearn.cross_validation import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import svm
from sklearn.linear_model import LogisticRegression

In [4]:
# Read the training CSV (train.csv) into the `titanic` DataFrame.
titanic = pd.read_csv(filepath_or_buffer="c:/Users/c147141/Downloads/train.csv")


# Data Preparation

### Missing Data

In [5]:
# Passengers without an age receive the median of the known ages.
median_age = titanic['Age'].median()
titanic['Age'] = titanic['Age'].fillna(median_age)

# Swap the sex labels for numeric codes, in place: male -> 0, female -> 1.
for sex_label, sex_code in (('male', 0), ('female', 1)):
    titanic.loc[titanic['Sex'] == sex_label, 'Sex'] = sex_code

# A missing port is treated as 'S'; each port letter then becomes a number.
titanic['Embarked'] = titanic['Embarked'].fillna('S')
for port_label, port_code in (('S', 0), ('C', 1), ('Q', 2)):
    titanic.loc[titanic['Embarked'] == port_label, 'Embarked'] = port_code

### Generating extra features
First we recreate the features introduced in the tutorials.

In [6]:
## Name length: number of characters in the full Name string
titanic['NameLength'] = titanic['Name'].apply(len)


## Family Size: sum of the SibSp and Parch columns
titanic['FamilySize'] = titanic['SibSp'].add(titanic['Parch'])


## Find Title
def get_titel(name):
    """Return the title embedded in a passenger name, or "" when none is found.

    The title is the first run of ASCII letters that is preceded by a space
    and immediately followed by a period, e.g. "Mr" in
    "Braund, Mr. Owen Harris".

    Parameters
    ----------
    name : str
        Full passenger name.

    Returns
    -------
    str
        The matched title without the trailing period, or an empty string.
    """
    # Raw string: in a plain literal "\." is an invalid escape sequence, which
    # newer Python versions warn about.
    titles_search = re.search(r' ([A-Za-z]+)\.', name)
    if titles_search:
        return titles_search.group(1)
    return ""
titles = titanic['Name'].apply(get_titel)
# Integer code for every known title; related titles share one code.
title_mapping = {
    'Mr': 1,
    'Miss': 2, 'Ms': 2,
    'Mrs': 3,
    'Master': 4,
    'Dr': 5,
    'Rev': 6,
    'Major': 7, 'Col': 7, 'Capt': 7,
    'Mlle': 8, 'Mme': 8,
    'Don': 9, 'Sir': 9,
    'Lady': 10, 'Countess': 10, 'Jonkheer': 10,
}
# Overwrite each title string with its code; titles missing from the mapping
# are left as they are.
for title_name, title_code in title_mapping.items():
    titles[titles == title_name] = title_code
titanic["Title"] = titles


## Find FamilyId
# Maps "<surname><FamilySize>" keys to the integer id issued for that family.
# Only get_family_id() adds entries, so the values are always 1..len(mapping).
family_id_mapping = {}
def get_family_id(row):
    """Return the integer id of the family that `row` belongs to.

    A family is identified by the surname (the text before the first comma in
    row['Name']) concatenated with row['FamilySize']. The first time a key is
    seen it receives the next unused id (1, 2, 3, ...); later rows with the
    same key get the same id. Ids are remembered in the module-level
    `family_id_mapping`, so they stay consistent across calls.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide 'Name' and 'FamilySize'.

    Returns
    -------
    int
        The id associated with the row's family key.
    """
    last_name = row['Name'].split(',')[0]
    family_id = '{0}{1}'.format(last_name, row['FamilySize'])
    if family_id not in family_id_mapping:
        # Ids are issued consecutively starting at 1, so the largest id equals
        # the number of stored families; no need to scan for the maximum.
        family_id_mapping[family_id] = len(family_id_mapping) + 1
    return family_id_mapping[family_id]

family_ids = titanic.apply(get_family_id, axis=1)
# Rows whose FamilySize is below 3 all share the placeholder id -1.
is_small_family = titanic['FamilySize'] < 3
family_ids[is_small_family] = -1
titanic['FamilyId'] = family_ids

Next we engineer some additional features of our own.

In [7]:
# Use the Cabin Information.

# Passengers with no cabin recorded go into the artificial class 'Z'.
titanic['Cabin'] = titanic['Cabin'].fillna('Z')
# The cabin class is the first character of the Cabin string.
cabin_class = titanic['Cabin'].map(lambda cabin: cabin[0])
# Fold class 'T' into 'A' (original author's note: it occurs only once and
# also belongs to first class).
cabin_class[cabin_class == 'T'] = 'A'
# Class letter -> number: A..G become 1..7 and the missing class Z becomes 8.
cabin_mapping = dict(zip('ABCDEFGZ', range(1, 9)))
for class_letter, class_number in cabin_mapping.items():
    cabin_class[cabin_class == class_letter] = class_number

titanic['CabinClass'] = cabin_class

# Modelling
### Random Forest

In [8]:
# Feature columns fed to the models.
predictors = [
    'Pclass', 'Sex', 'Age', 'FamilySize', 'Fare', 'Embarked',
    'NameLength', 'Title', 'FamilyId', 'CabinClass',
]

alg = RandomForestClassifier(n_estimators=150, min_samples_split=8,
                             min_samples_leaf=4, random_state=1)

# Mean of the three per-fold scores (cv=3, estimator's default scorer).
cv_features, cv_target = titanic[predictors], titanic['Survived']
scores = cross_validation.cross_val_score(alg, cv_features, cv_target, cv=3)
print(scores.mean())

0.826038159371


### Gradient Boosting

In [9]:
# Gradient boosting with 25 estimators of depth 3, scored with 3-fold CV.
alg = GradientBoostingClassifier(n_estimators=25, max_depth=3, random_state=1)
cv_features, cv_target = titanic[predictors], titanic['Survived']
scores = cross_validation.cross_val_score(alg, cv_features, cv_target, cv=3)
print(scores.mean())

0.826038159371


### Logistic Regression

In [10]:
# Logistic regression on the same feature columns, scored with 3-fold CV.
alg = LogisticRegression(random_state=1)
cv_features, cv_target = titanic[predictors], titanic['Survived']
scores = cross_validation.cross_val_score(alg, cv_features, cv_target, cv=3)
print(scores.mean())

0.809203142536


### Ensembling models

In [11]:
# Ensembling Random Forest with Gradient Boosting and logistic regression.
# Every entry of `algorithms` is an [estimator, feature-column list] pair.
tree_features = ['Pclass', 'Sex', 'Age', 'FamilySize', 'Fare', 'Embarked',
                 'NameLength', 'Title', 'FamilyId', 'CabinClass']
# The logistic regression uses the same columns except 'FamilyId'.
linear_features = ['Pclass', 'Sex', 'Age', 'FamilySize', 'Fare', 'Embarked',
                   'NameLength', 'Title', 'CabinClass']

algorithms = [
    [GradientBoostingClassifier(n_estimators=25, max_depth=3, random_state=1),
     list(tree_features)],
    [RandomForestClassifier(n_estimators=150, min_samples_split=8,
                            min_samples_leaf=4, random_state=1),
     list(tree_features)],
    [LogisticRegression(random_state=1),
     linear_features],
]

# Three folds over all rows of the training frame.
kf = KFold(len(titanic), n_folds=3, random_state=1)

In [12]:
# Candidate probability thresholds. Uses the explicitly imported `np` and the
# call form of print so the cell does not depend on %pylab or on Python 2.
print(np.arange(0.4, 0.9, 0.05))

[ 0.4   0.45  0.5   0.55  0.6   0.65  0.7   0.75  0.8   0.85]


In [13]:
# Ensemble the three models listed in `algorithms` by majority vote.

# tried from 0.4 to 0.85, 0.5 should be the most stable one
# NOTE(review): the value set below is 0.4, not the 0.5 recommended above --
# confirm which one is intended.
# `cut` is the probability a model must exceed to count as a "survived" vote.
cut = 0.4
def criteria_cut(row, min_votes=2):
    """Combine per-model votes into one 0/1 prediction.

    Parameters
    ----------
    row : iterable of bool or 0/1
        One vote per model for a single passenger.
    min_votes : int, optional
        Number of positive votes required to predict 1. The default of 2
        keeps the original behaviour (a majority of three models).

    Returns
    -------
    int
        1 if the votes sum to at least `min_votes`, otherwise 0.
    """
    return 1 if sum(row) >= min_votes else 0

# Out-of-fold majority-vote predictions: for every fold, fit each model on the
# training rows and let it vote on the held-out rows.
predictions = []
for train, test in kf:
    train_target = titanic['Survived'].iloc[train]
    full_test_predictions = []
    # Loop names deliberately differ from the notebook-level `alg` and
    # `predictors`, so running this cell does not overwrite them.
    for model, features in algorithms:
        model.fit(titanic[features].iloc[train, :], train_target)
        # Second column of predict_proba (the positive class, assuming
        # Survived is coded 0/1).
        test_predictions = model.predict_proba(titanic[features].iloc[test, :])[:, 1]
        full_test_predictions.append(test_predictions)

    # One boolean column per model: did its probability exceed `cut`?
    full_test_pred = pd.DataFrame(full_test_predictions).T > cut
    # A row is predicted 1 when criteria_cut accepts its votes. (Averaging the
    # three probabilities and thresholding the mean at `cut` was the earlier,
    # since-abandoned variant.)
    test_predictions = full_test_pred.apply(criteria_cut, axis=1)
    predictions.append(test_predictions)
predictions = np.concatenate(predictions, axis=0)

# Compute accuracy by comparing to the training labels. The mean of the
# boolean comparison is the fraction of matches; unlike `sum([...])` it does
# not rely on %pylab having replaced the built-in `sum` with numpy's.
predictions = pd.Series(predictions)
accuracy = (predictions == titanic['Survived']).mean()
print(accuracy)

0.829405162738


# Generate Submission Files

In [31]:
# Read the test CSV (test.csv) into the `titanic_test` DataFrame.
titanic_test = pd.read_csv(filepath_or_buffer="c:/Users/c147141/Downloads/test.csv")

In [32]:
# prepare the test data
# Missing test ages are filled with the median Age of the training frame.
training_median_age = titanic['Age'].median()
titanic_test['Age'] = titanic_test['Age'].fillna(training_median_age)


# Same numeric sex codes as the training frame: male -> 0, female -> 1.
for sex_label, sex_code in (('male', 0), ('female', 1)):
    titanic_test.loc[titanic_test['Sex'] == sex_label, 'Sex'] = sex_code


# A missing port is treated as 'S'; each port letter then becomes a number.
titanic_test['Embarked'] = titanic_test['Embarked'].fillna('S')
for port_label, port_code in (('S', 0), ('C', 1), ('Q', 2)):
    titanic_test.loc[titanic_test['Embarked'] == port_label, 'Embarked'] = port_code


# A missing fare is replaced by the median fare of the test frame itself.
median_test_fare = titanic_test['Fare'].median()
titanic_test['Fare'] = titanic_test['Fare'].fillna(median_test_fare)


# Number of characters in the full Name string.
titanic_test['NameLength'] = titanic_test['Name'].apply(len)


# Sum of the SibSp and Parch columns.
titanic_test['FamilySize'] = titanic_test['SibSp'].add(titanic_test['Parch'])


titles_test = titanic_test['Name'].apply(get_titel)
# print pd.value_counts(titles_test)
# Integer code for every known title; this version also covers 'Dona'.
title_mapping = {
    'Mr': 1,
    'Miss': 2, 'Ms': 2,
    'Mrs': 3,
    'Master': 4,
    'Dr': 5,
    'Rev': 6,
    'Major': 7, 'Col': 7, 'Capt': 7,
    'Mlle': 8, 'Mme': 8,
    'Don': 9, 'Dona': 9, 'Sir': 9,
    'Lady': 10, 'Countess': 10, 'Jonkheer': 10,
}
# Overwrite each title string with its code; titles missing from the mapping
# are left as they are.
for title_name, title_code in title_mapping.items():
    titles_test[titles_test == title_name] = title_code
titanic_test["Title"] = titles_test


# get_family_id keeps using the shared family_id_mapping, so a family key
# already seen in the training frame gets the same id here.
family_ids = titanic_test.apply(get_family_id, axis=1)
# Rows whose FamilySize is below 3 all share the placeholder id -1.
is_small_family = titanic_test['FamilySize'] < 3
family_ids[is_small_family] = -1
titanic_test['FamilyId'] = family_ids


# Passengers with no cabin recorded go into the artificial class 'Z'.
titanic_test['Cabin'] = titanic_test['Cabin'].fillna('Z')
# The cabin class is the first character of the Cabin string.
cabin_class = titanic_test['Cabin'].map(lambda cabin: cabin[0])
# Fold class 'T' into 'A', mirroring the training-frame preparation.
cabin_class[cabin_class == 'T'] = 'A'
# Class letter -> number: A..G become 1..7 and the missing class Z becomes 8.
cabin_mapping = dict(zip('ABCDEFGZ', range(1, 9)))
for class_letter, class_number in cabin_mapping.items():
    cabin_class[cabin_class == class_letter] = class_number
titanic_test['CabinClass'] = cabin_class

Mr        240
Miss       78
Mrs        72
Master     21
Col         2
Rev         2
Dona        1
Ms          1
Dr          1
Name: Name, dtype: int64


In [36]:
# predict using the model
# `cut` is the probability a model must exceed to count as a "survived" vote.
cut = 0.4
def criteria_cut(row, min_votes=2):
    """Combine per-model votes into one 0/1 prediction.

    Parameters
    ----------
    row : iterable of bool or 0/1
        One vote per model for a single passenger.
    min_votes : int, optional
        Number of positive votes required to predict 1. The default of 2
        keeps the original behaviour (a majority of three models).

    Returns
    -------
    int
        1 if the votes sum to at least `min_votes`, otherwise 0.
    """
    return 1 if sum(row) >= min_votes else 0

full_predictions = []
for alg, predictors in algorithms:
    # Refit on every training row, then score the test frame.
    alg.fit(titanic[predictors], titanic['Survived'])
    test_features = titanic_test[predictors].astype(float)
    # Keep the second probability column of each model.
    predictions = alg.predict_proba(test_features)[:, 1]
    full_predictions.append(predictions)

# One boolean column per model: did its probability exceed `cut`?
probability_table = pd.DataFrame(full_predictions).T
full_pred = probability_table > cut

# Row-wise vote across the models gives the final 0/1 prediction.
predictions = full_pred.apply(criteria_cut, axis=1)


In [37]:
# generate submission file
# Two columns: the passenger id and the ensemble's 0/1 prediction.
submission_columns = {
    'PassengerId': titanic_test['PassengerId'],
    'Survived': predictions,
}
submission = pd.DataFrame(submission_columns)

# Written without the DataFrame index column.
submission.to_csv("c:/Users/c147141/Downloads/kaggle.csv", index=False)