# Data analysis libraries 

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split

from patsy import dmatrices

# Data munging

In [2]:
df = pd.read_csv('train.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 90.5 KB


Some null values are found in "Age", "Cabin", "Embarked". In particular, most of the "Cabin" column are null. We thus drop "Cabin" here as well as the "Ticket" column that seems not helpful.

In [4]:
df = df.drop(['Cabin','Ticket'],axis=1)

As pointed out by some kagglers, the titles in the "Name" column can be used to imputate the missing ages. By counting words in this column, most of the passengers turned out to have the title "Mr", "Mrs", "Miss, or "Master". Below we extract them and create a "Title" column. 

In [5]:
def createTitle(df):
    n = len(df)
    title = []

    for i in xrange(n):
    
        s = df.Name[i]
        sex = df.Sex[i]

        if s.find('Mrs') > 0:  # must be before Mr
            title.append('Mrs')

        elif s.find('Mr') > 0:
            title.append('Mr')

        elif s.find('Miss') > 0 or s.find('Ms') > 0 or s.find('Mlle') > 0:
            title.append('Miss')
        
        elif s.find('Master') > 0:
            title.append('Master')

        elif sex=='male':
            title.append('Mr')
        
        else:
            title.append('Ms')   
        
        df["Title"] = pd.Series(title)

    return df

In [6]:
df = createTitle(df)

Now that we have the titles which should be relevant to the passenger's age, the missing ages can be imputated based on this information. Here, we will use the average age for each title. Also, there are two missing values in the "Embarked" column. Here we simply use "S" that is the most common value. 

In [7]:
# useful boolean Series
male = df.Sex=='male'
female = df.Sex=='female'

Mr = df.Title=='Mr'
Miss = df.Title=='Miss'
Mrs = df.Title=='Mrs'
Master = df.Title=='Master'

In [8]:
mean_age_miss = np.average( df[Miss]['Age'].dropna() )
mean_age_mrs = np.average( df[Mrs]['Age'].dropna() )
mean_age_mr = np.average( df[Mr]['Age'].dropna() )
mean_age_master = np.average( df[Master]['Age'].dropna() )

In [9]:
mean_age = mean_age_mr, mean_age_master, mean_age_mrs, mean_age_miss

In [10]:
def impute(df, mean_age):
    
    Mr = df.Title=='Mr'
    Miss = df.Title=='Miss'
    Mrs = df.Title=='Mrs'
    Master = df.Title=='Master'    

    df['AgeFill'] = df['Age']

    age_null = df.Age.isnull()
    
    df.loc[ age_null & Mr ,'AgeFill'] = mean_age[0]
    df.loc[ age_null & Master ,'AgeFill'] = mean_age[1]
    df.loc[ age_null & Mrs ,'AgeFill'] = mean_age[2]
    df.loc[ age_null & Miss ,'AgeFill'] = mean_age[3]

    df.loc[ df.Embarked.isnull(), 'Embarked'] = df.Embarked.mode()[0]  #'S'
    df.loc[ df.Fare.isnull(), 'Fare' ] = df.Fare.mean()
        
    return df

In [11]:
df = impute(df, mean_age)

# Logistic regression and SVM

We first try L2-regularized logit and SVM by including all available information from the data. But note further feature engineering is possible and it may improve the performance.

In [12]:
formula = 'Survived ~ C(Pclass) + C(Sex) + AgeFill + SibSp + Parch + Fare + C(Embarked) + C(Title)'

In [13]:
y_df, x_df = dmatrices(formula, data=df, return_type='dataframe')

In [14]:
# to numpy array
y = y_df.values.ravel()
x = x_df.values

In [15]:
x_train, x_val, y_train, y_val = train_test_split(x,y)

In [16]:
# hyperparameters which should be determined by cross-validation

# for logit
d1 = {'C':[0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100]
     ,'penalty': ['l1','l2']}
param_grid1 = [d1]

# for SVM
d2 = {'C': [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100]
     ,'gamma': [0, 1, 3, 10, 30]}
param_grid2 = [d2]

In [17]:
clf_grid1 = GridSearchCV(LogisticRegression(), param_grid1, cv=10)
clf_grid2 = GridSearchCV(SVC(), param_grid2, cv=10)

In [18]:
clf_grid1.fit(x, y)

GridSearchCV(cv=10, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'penalty': ['l1', 'l2'], 'C': [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100]}],
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [19]:
clf_grid2.fit(x, y)

GridSearchCV(cv=10, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'C': [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100], 'gamma': [0, 1, 3, 10, 30]}],
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [20]:
# best hyporparameters
clf_grid1.best_params_, clf_grid2.best_params_

({'C': 1, 'penalty': 'l1'}, {'C': 1, 'gamma': 0})

In [21]:
# classification accuracy
clf_grid1.score(x,y), clf_grid2.score(x,y)

(0.83277216610549942, 0.88552188552188549)

# Cleaning the test set

In [22]:
df_test = pd.read_csv("test.csv")
df_test = df_test.drop(["Ticket", "Cabin"], axis=1)
df_test['Survived'] = 2

In [23]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 0 to 417
Data columns (total 10 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Fare           417 non-null float64
Embarked       418 non-null object
Survived       418 non-null int64
dtypes: float64(2), int64(5), object(3)
memory usage: 35.9 KB


As for the training set, after adding a "Title" column, we make an imputation for "Age" in terms of mean_age which determined from the training set. For the missing value in the "Fare" column, we simply use the mean for imputation.

In [24]:
df_test = createTitle(df_test)

In [25]:
df_test = impute(df_test, mean_age)

In [26]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 0 to 417
Data columns (total 12 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Fare           418 non-null float64
Embarked       418 non-null object
Survived       418 non-null int64
Title          418 non-null object
AgeFill        418 non-null float64
dtypes: float64(3), int64(5), object(4)
memory usage: 42.5 KB


# Make predictions on the test set

In [27]:
y_test, x_test = dmatrices(formula, data=df_test, return_type='dataframe')
x_test = x_test.values

In [28]:
y_pred_logit = clf_grid1.predict(x_test).astype(int)
y_pred_svm = clf_grid2.predict(x_test).astype(int)

In [29]:
# logit
df_test.Survived = pd.Series(y_pred_logit)
result = df_test[['PassengerId','Survived']]
result.to_csv('logit.csv', index=False)

In [30]:
# SVM
df_test.Survived = pd.Series(y_pred_svm)
result = df_test[['PassengerId','Survived']]
result.to_csv('svm.csv', index=False)