In [None]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Relative locations of the two CSV inputs read further down.
test_path, train_path = 'data/test.csv', 'data/train.csv'

## Explore

Read and inspect the data, then develop a strategy for transforming the features.

In [None]:
# Load both CSV files and lower-case every column name so the rest of
# the notebook can use attribute-style access (train.name, train.age, ...).
test = pd.read_csv(test_path).rename(columns=str.lower)
train = pd.read_csv(train_path).rename(columns=str.lower)

In [None]:
# Fixed random_state so the same ten rows are displayed on every run.
train.sample(10, random_state=0)

### Training Set Missing Values

In [None]:
# Fraction of missing values per column: null count divided by row count.
train.isnull().sum().div(len(train))

### Test Set Missing Values

In [None]:
# Fraction of missing values per column: null count divided by row count.
test.isnull().sum().div(len(test))

## Basic Feature Transformations

### Title

In [None]:
def get_title(name):
    ''' convert name to title group

        The title is the first run of word characters that is directly
        followed by a period (e.g. 'Mr.' in 'Braund, Mr. Owen Harris').
        Less frequent titles are folded into 'Nobility', 'Miss.' or
        'Military'; every other title is returned unchanged, period
        included.

        Returns np.nan when no title is found or when name is not a
        string (for example a missing value).
    '''
    def title_map(title):
        ''' map less frequent titles to groups
        '''
        groups = {
            'Nobility': ['Countess.', 'Don.', 'Jonkheer.', 'Lady.', 'Sir.'],
            'Miss.': ['Mlle.', 'Mme.', 'Ms.'],
            'Military': ['Capt.', 'Col.', 'Major.'],
        }
        for group, members in groups.items():
            if title in members:
                return group
        return title
    try:
        # raw string: '\w' in a plain literal is an invalid escape sequence
        return title_map(re.findall(r'\w+[.]', name)[0])
    except (IndexError, TypeError):
        # IndexError: no match; TypeError: name is not str/bytes.
        # np.nan (lower case) because the np.NaN alias is gone in NumPy 2.0
        return np.nan

In [None]:
# Derive the title group of every passenger from the name column.
train['title'] = train['name'].map(get_title)

In [None]:
# Per title: sum of 'survived' divided by the number of passengers in the
# group (a survival rate, assuming 'survived' is a 0/1 indicator).
gb = train.groupby('title')
ax = (gb['survived'].sum() / gb.size()).plot(
    kind='bar', title='Survival rate by title')
ax.set_ylabel('survived / passengers in group');

### Family Size

In [None]:
def family_size(df):
    ''' return the family size for every row of df:
        the sibsp column plus the parch column plus one
        (the +1 presumably counts the passenger themself)
    '''
    companions = df['sibsp'] + df['parch']
    return companions + 1


# Store the per-passenger family size as a new column.
train['fam_size'] = family_size(df=train)

In [None]:
# Split passengers on fam_size > 1 (False / True) and plot, per group,
# sum of 'survived' divided by the number of passengers in the group.
gb = train.groupby(train.fam_size > 1)
ax = (gb['survived'].sum() / gb.size()).plot(
    kind='bar', title='Survival rate by fam_size > 1')
ax.set_ylabel('survived / passengers in group');

### Tangent - General Impute Function

In [None]:
def impute_by_title(df, column):
    ''' calculate the median for numeric value 'column' after grouping by title
        return impute function to replace np.nan with
        the corresponding title's median

        df must contain a 'title' column and 'column'.  The returned
        function takes one row of df (use with df.apply(..., axis=1))
        and returns the row's own value when it is present.  When the
        value is missing it returns the median of the row's title group,
        or the overall median of 'column' if the title is missing or
        its group has no observed values.
    '''
    # GroupBy.median already ignores missing values; passing skipna=True
    # raises TypeError on pandas versions whose signature lacks it.
    title_median = df.groupby('title')[column].median()
    overall_median = df[column].median()

    def median_impute(row):
        value = row[column]
        if not pd.isnull(value):
            return value
        title = row['title']
        group_value = None if pd.isnull(title) else title_median.get(title)
        if group_value is None or pd.isnull(group_value):
            return overall_median
        return group_value
    return median_impute

### Fare Category

In [None]:
def fare_category(df, bins=4):
    ''' cut fare into quantile-based categories labelled 0 .. bins-1
        (four categories, i.e. quartiles, by default)
        test set contains NA values, hence the impute step

        Missing fares are first replaced through impute_by_title, so df
        needs both a 'fare' and a 'title' column.
        NOTE(review): the bin edges are computed from df itself, so two
        different frames get different edges -- confirm this is intended.
    '''
    fare_wo_na = df.apply(impute_by_title(df, 'fare'), axis=1)
    return pd.qcut(fare_wo_na, bins, labels=range(bins))

# Bucket the raw training fares into four quantile bins labelled 0-3.
train['fare_cat'] = pd.qcut(train['fare'], q=4, labels=range(4))

In [None]:
# Per fare category: sum of 'survived' divided by the number of
# passengers in the category.
gb = train.groupby('fare_cat')
ax = (gb['survived'].sum() / gb.size()).plot(
    kind='bar', title='Survival rate by fare category')
ax.set_ylabel('survived / passengers in group');

### Age Median Impute

In [None]:
# Distribution of the recorded ages, before any imputation.
ax = train.age.plot(kind='hist', title='Age (before imputation)')
ax.set_xlabel('age');

In [None]:
# Build the row-wise imputer once, then apply it to every passenger.
impute_age = impute_by_title(train, 'age')
train['age_imputed'] = train.apply(impute_age, axis=1)

In [None]:
# Distribution of ages after filling gaps with the per-title median.
ax = train.age_imputed.plot(kind='hist', title='Age (after median imputation by title)')
ax.set_xlabel('age_imputed');

## Wrap All Transformations

In [None]:
def get_title(name):
    ''' convert name to title group

        The title is the first run of word characters that is directly
        followed by a period (e.g. 'Mr.' in 'Braund, Mr. Owen Harris').
        Less frequent titles are folded into 'prestige', 'miss' or
        'military'; every other title is lower-cased and stripped of
        its period (e.g. 'Mr.' -> 'mr').

        Returns np.nan when no title is found or when name is not a
        string (for example a missing value).
    '''
    def title_map(title):
        ''' map less frequent titles to groups
        '''
        groups = {
            'prestige': ['Countess.', 'Don.', 'Dona.', 'Jonkheer.',
                         'Lady.', 'Sir.', 'Dr.'],
            'miss': ['Mlle.', 'Mme.', 'Ms.'],
            'military': ['Capt.', 'Col.', 'Major.'],
        }
        for group, members in groups.items():
            if title in members:
                return group
        return title.lower().strip('.')
    try:
        # raw string: '\w' in a plain literal is an invalid escape sequence
        return title_map(re.findall(r'\w+[.]', name)[0])
    except (IndexError, TypeError):
        # IndexError: no match; TypeError: name is not str/bytes.
        # np.nan (lower case) because the np.NaN alias is gone in NumPy 2.0
        return np.nan

def family_size(df):
    ''' return the family size for every row of df:
        the sibsp column plus the parch column plus one
        (the +1 presumably counts the passenger themself)
    '''
    companions = df['sibsp'] + df['parch']
    return companions + 1


def impute_by_title(df, column):
    ''' calculate the median for numeric valued column
        after grouping by title
        return impute function to replace np.nan with
        the corresponding title's median

        df must contain a 'title' column and 'column'.  The returned
        function takes one row of df (use with df.apply(..., axis=1))
        and returns the row's own value when it is present.  When the
        value is missing it returns the median of the row's title group,
        or the overall median of 'column' if the title is missing or
        its group has no observed values.
    '''
    # GroupBy.median already ignores missing values; passing skipna=True
    # raises TypeError on pandas versions whose signature lacks it.
    title_median = df.groupby('title')[column].median()
    overall_median = df[column].median()

    def median_impute(row):
        value = row[column]
        if not pd.isnull(value):
            return value
        title = row['title']
        group_value = None if pd.isnull(title) else title_median.get(title)
        if group_value is None or pd.isnull(group_value):
            return overall_median
        return group_value
    return median_impute


def fare_category(df, bins=4):
    ''' cut fare into quantile-based categories labelled 0 .. bins-1
        (four categories, i.e. quartiles, by default)
        test set contains NA values, hence the impute step

        Missing fares are first replaced through impute_by_title, so df
        needs both a 'fare' and a 'title' column.
        NOTE(review): the bin edges are computed from df itself, so two
        different frames get different edges -- confirm this is intended.
    '''
    fare_wo_na = df.apply(impute_by_title(df, 'fare'), axis=1)
    return pd.qcut(fare_wo_na, bins, labels=range(bins))


def prep_features(df):
    ''' prep all features

        Steps, in order:
          1. lower-case the column names
          2. add title, fam_size, fare_cat and age_imputed
          3. drop the raw columns those features replace
          4. swap sex / title / embarked for their dummy columns
        Returns a new frame.
    '''
    raw_cols = ['name', 'cabin', 'ticket', 'fare', 'sibsp', 'parch', 'age']
    categorical_cols = ['sex', 'title', 'embarked']

    frame = df.rename(columns=str.lower)
    # title first: the fare and age imputers group by it
    frame = frame.assign(title=frame['name'].apply(get_title))
    frame = frame.assign(
        fam_size=family_size(frame),
        fare_cat=fare_category(frame),
        age_imputed=frame.apply(impute_by_title(frame, 'age'), axis=1),
    )
    frame = frame.drop(raw_cols, axis=1)

    dummies = pd.get_dummies(frame[categorical_cols])
    return frame.drop(categorical_cols, axis=1).join(dummies)

## Test Transformations

In [None]:
# Re-read the raw files and run the complete feature pipeline on each.
train = prep_features(pd.read_csv(train_path))
test = prep_features(pd.read_csv(test_path))

In [None]:
# Count of missing values per column after the transformations.
train.isna().sum()

In [None]:
# Fixed random_state so the same ten rows are displayed on every run.
train.sample(10, random_state=0)

In [None]:
# Count of missing values per column after the transformations.
test.isna().sum()

In [None]:
# Fixed random_state so the same ten rows are displayed on every run.
test.sample(10, random_state=0)