In [2]:
import numpy as np
import pandas as pd

def feature_engineering_step1(_df):
    """Derive the helper features for one passenger frame.

    The helper columns are written onto ``_df`` itself (other cells of this
    notebook read them back from the original frame) and a selection of them
    is returned as a separate frame.

    Parameters
    ----------
    _df : pandas.DataFrame
        Must provide the columns ``Name``, ``Sex``, ``Pclass``, ``Age``,
        ``SibSp``, ``Parch``, ``Fare``, ``Cabin`` and ``Embarked``.
        ``Name`` is split on ``,`` and ``.``: the first piece becomes
        ``FamilyName`` and the second piece becomes the raw title.

    Returns
    -------
    pandas.DataFrame
        An independent copy with the columns ``Age``, ``Age_s``, ``HasAge``,
        ``Sex``, ``Pclass``, ``Fare_``, ``Fare_b``, ``Title_s``,
        ``Title_Age_s``, ``Embarked_``, ``Cabin_s``, ``HasFamily``,
        ``SibSp``, ``Parch``, ``FamilySize`` and ``FamilyName``.

    Notes
    -----
    * A name without a second piece gets no parsed title; it is then given
      ``'Miss'`` (female) or ``'Master'`` (male).
    * NOTE(review): titles that are not keys of ``title_mapping`` leave
      ``Title_s`` as NaN, whereas ``Title_Age_s`` falls back to ``'adult'``.
      Confirm whether ``Title_s`` should get a default as well.
    """
    # Raw honorific -> one of four coarse title groups.
    title_mapping = {
        'Capt': 'Mr',
        'Col': 'Mr',
        'Don': 'Mr',
        'Dr': 'Mr',
        'Jonkheer': 'Mr',
        'Lady': 'Mrs',
        'Major': 'Mr',
        'Master': 'Master',
        'Miss': 'Miss',
        'Mlle': 'Miss',
        'Mme': 'Mrs',
        'Mr': 'Mr',
        'Mrs': 'Mrs',
        'Ms': 'Miss',
        'Rev': 'Mr',
        'Sir': 'Mr',
        'the Countess': 'Mrs'
    }
    # Raw honorific -> age bucket used when the actual age is missing.
    title_age_mapping = {
        'Capt': 'elder',
        'Col': 'elder',
        'Don': 'adult',
        'Dr': 'adult',
        'Jonkheer': 'adult',
        'Lady': 'elder',
        'Major': 'elder',
        'Master': 'young',
        'Miss': 'young',
        'Mlle': 'young',
        'Mme': 'adult',
        'Mr': 'adult',
        'Mrs': 'adult',
        'Ms': 'adult',
        'Rev': 'adult',
        'Sir': 'elder',
        'the Countess': 'adult'
    }
    # First letter of the cabin code -> coarse cabin group ('X' = unknown).
    cabin_mapping = {
        'A': 'M',
        'B': 'G',
        'C': 'M',
        'D': 'G',
        'E': 'G',
        'F': 'G',
        'G': 'M',
        'T': 'X',
        'X': 'X'
    }
    fare_bin_edges = [0, 5, 10, 20, 30, 40]

    _df['Sex_'] = (_df['Sex'] == 'female').astype(int)

    # Split every name once into its ','/'.'-separated, stripped pieces.
    name_parts = _df['Name'].apply(
        lambda full_name: [piece.strip()
                           for piece in full_name.replace('.', ',').split(',')]
    )
    # `.str[1]` yields NaN (instead of raising) when a name has no title piece.
    _df['Title_'] = name_parts.str[1]
    _df['FamilyName'] = name_parts.str[0]

    # Sex-based fallback for names whose title could not be parsed.
    title_missing = _df['Title_'].isnull()
    _df.loc[title_missing & (_df['Sex'] == 'female'), 'Title_'] = 'Miss'
    _df.loc[title_missing & (_df['Sex'] == 'male'), 'Title_'] = 'Master'

    # Fare: missing values take the median fare of the passenger's class,
    # then the fare is divided by the travelling party size.
    relatives = _df['SibSp'] + _df['Parch']
    class_median_fare = _df.groupby('Pclass')['Fare'].transform('median')
    _df['Fare_'] = _df['Fare'].fillna(class_median_fare)
    _df['Fare_'] = _df['Fare_'] / (1 + relatives)
    _df['HasFare'] = _df['Fare'].notnull().astype(int)
    _df['Fare_b'] = np.digitize(_df['Fare_'], fare_bin_edges)

    # Family size
    _df['FamilySize'] = relatives
    _df['HasFamily'] = (relatives != 0).astype(int)

    # Age: bucket the known ages; NaN ages are bucketed too at this point
    # and overwritten further down once the title-based bucket exists.
    age_known = _df['Age'].notnull()
    _df['HasAge'] = age_known.astype(int)
    _df['Age_s'] = _df['Age'].apply(age_to_s)

    # Title groups
    _df['Title_s'] = _df['Title_'].map(title_mapping)
    _df['Title_Age_s'] = _df['Title_'].map(title_age_mapping).fillna('adult')

    # Rows without an age inherit the age bucket implied by their title.
    _df.loc[~age_known, 'Age_s'] = _df.loc[~age_known, 'Title_Age_s']

    # Cabin: a missing cabin is a float NaN and becomes the explicit
    # category 'X'; otherwise keep the first character of the cabin code.
    _df['Cabin_'] = _df['Cabin'].apply(
        lambda cabin: 'X' if isinstance(cabin, float) else cabin[0])
    _df['Cabin_s'] = _df['Cabin_'].map(cabin_mapping)

    # Embarked: a missing port (float NaN) defaults to 'S'.
    _df['Embarked_'] = _df['Embarked'].apply(
        lambda port: 'S' if isinstance(port, float) else port)

    returned_columns = [
        'Age', 'Age_s', 'HasAge', 'Sex', 'Pclass', 'Fare_', 'Fare_b', 'Title_s',
        'Title_Age_s', 'Embarked_', 'Cabin_s', 'HasFamily', 'SibSp', 'Parch',
        'FamilySize', 'FamilyName',
    ]
    # Explicit copy so callers can add columns without touching `_df`.
    return _df[returned_columns].copy()
#############################################
def age_to_s(x):
    """Bucket a numeric age into 'young', 'adult' or 'elder'.

    Ages up to and including 16 are 'young', ages above 16 up to and
    including 40 are 'adult', everything else is 'elder'. A NaN age fails
    every comparison and therefore also ends up as 'elder'.
    """
    # Inclusive upper bounds, checked from the youngest bucket upwards.
    for upper_bound, label in ((16, 'young'), (40, 'adult')):
        if x <= upper_bound:
            return label
    return 'elder'

def feature_engineering(df_train, df_test):
    """Build the one-hot encoded training and test feature frames.

    Both frames go through ``feature_engineering_step1``. Two family-level
    flags are then added, both derived from the *training* outcomes only:

    * ``FamilySurvived`` - 1 if the passenger's ``FamilyName`` matches a
      training passenger with ``FamilySize > 0`` who survived.
    * ``FamilyDied`` - 1 if the ``FamilyName`` matches a training passenger
      with ``FamilySize > 0`` who did not survive.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training passengers; must contain ``Survived`` in addition to the
        columns required by ``feature_engineering_step1``.
    df_test : pandas.DataFrame
        Test passengers (no ``Survived`` column is read).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_features, test_features)`` after ``pd.get_dummies``, with
        ``FamilyName`` removed.

    Notes
    -----
    NOTE(review): for training rows the flags are computed from a name list
    that includes the row's own outcome, so a training passenger with
    ``FamilySize > 0`` always matches their own ``Survived`` value. Confirm
    this is intended before using these columns for model validation.
    """
    df_d_train = feature_engineering_step1(df_train)
    df_d_test = feature_engineering_step1(df_test)

    # Family names (training rows travelling with relatives) by outcome.
    with_relatives = df_d_train['FamilySize'] > 0
    survived_names = df_d_train.loc[
        with_relatives & (df_train['Survived'] == 1), 'FamilyName'].unique()
    died_names = df_d_train.loc[
        with_relatives & (df_train['Survived'] == 0), 'FamilyName'].unique()

    # Vectorised membership test instead of scanning the name array per row.
    for frame in (df_d_train, df_d_test):
        frame['FamilySurvived'] = frame['FamilyName'].isin(survived_names).astype('int64')
        frame['FamilyDied'] = frame['FamilyName'].isin(died_names).astype('int64')

    # The raw family name would explode into one dummy column per family.
    del df_d_train['FamilyName']
    del df_d_test['FamilyName']

    return pd.get_dummies(df_d_train), pd.get_dummies(df_d_test)


In [3]:
import pylab as plt
%matplotlib inline
import numpy as np
import pandas as pd
import math

#from featureEngineering4 import feature_engineering

# Read the training and test passenger tables (comma-separated, relative paths).
df = pd.read_csv('../data/train.csv')
df_test = pd.read_csv('../data/test.csv')

# Build the one-hot encoded feature frames. feature_engineering_step1 also
# writes its helper columns onto `df` and `df_test` themselves; later cells
# read those columns back from `df`.
df_d, df_d_test = feature_engineering(df, df_test)

In [4]:
# Raw inputs next to the helper columns that feature_engineering_step1 wrote
# onto `df` in place (Age_s, Title_s, Title_Age_s, Cabin_s, FamilyName).
feature1 = [
    'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
    'Age_s', 'Title_s', 'Title_Age_s', 'Cabin_s', 'FamilyName',
]
# Preview only the first rows instead of rendering the whole frame.
df[feature1].head(10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Age_s,Title_s,Title_Age_s,Cabin_s,FamilyName
0,0,3,male,22,1,0,7.2500,adult,Mr,adult,X,Braund
1,1,1,female,38,1,0,71.2833,adult,Mrs,adult,M,Cumings
2,1,3,female,26,0,0,7.9250,adult,Miss,young,X,Heikkinen
3,1,1,female,35,1,0,53.1000,adult,Mrs,adult,M,Futrelle
4,0,3,male,35,0,0,8.0500,adult,Mr,adult,X,Allen
5,0,3,male,,0,0,8.4583,adult,Mr,adult,X,Moran
6,0,1,male,54,0,0,51.8625,elder,Mr,adult,G,McCarthy
7,0,3,male,2,3,1,21.0750,young,Master,young,X,Palsson
8,1,3,female,27,0,2,11.1333,adult,Mrs,adult,X,Johnson
9,1,2,female,14,1,0,30.0708,young,Mrs,adult,X,Nasser


In [5]:
# Preview the first five rows of the engineered training features.
df_d.head()

Unnamed: 0,Age,HasAge,Pclass,Fare_,Fare_b,HasFamily,SibSp,Parch,FamilySize,FamilySurvived,...,Title_s_Mrs,Title_Age_s_adult,Title_Age_s_elder,Title_Age_s_young,Embarked__C,Embarked__Q,Embarked__S,Cabin_s_G,Cabin_s_M,Cabin_s_X
0,22,1,3,3.625,1,1,1,0,1,0,...,0,1,0,0,0,0,1,0,0,1
1,38,1,1,35.64165,5,1,1,0,1,1,...,1,1,0,0,1,0,0,0,1,0
2,26,1,3,7.925,2,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,1
3,35,1,1,26.55,4,1,1,0,1,1,...,1,1,0,0,0,0,1,0,1,0
4,35,1,3,8.05,2,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,1


In [6]:
# List the column names of the engineered (one-hot encoded) training frame.
df_d.columns

Index([u'Age', u'HasAge', u'Pclass', u'Fare_', u'Fare_b', u'HasFamily',
       u'SibSp', u'Parch', u'FamilySize', u'FamilySurvived', u'FamilyDied',
       u'Age_s_adult', u'Age_s_elder', u'Age_s_young', u'Sex_female',
       u'Sex_male', u'Title_s_Master', u'Title_s_Miss', u'Title_s_Mr',
       u'Title_s_Mrs', u'Title_Age_s_adult', u'Title_Age_s_elder',
       u'Title_Age_s_young', u'Embarked__C', u'Embarked__Q', u'Embarked__S',
       u'Cabin_s_G', u'Cabin_s_M', u'Cabin_s_X'],
      dtype='object')

In [7]:
# Base feature set: class, fare bin, family information, sex and the two
# age-bucket encodings.
feature2 = [
    'Pclass', 'Fare_b', 'FamilySize', 'FamilySurvived', 'FamilyDied',
    'Sex_female', 'Sex_male',
    'Age_s_young', 'Age_s_adult', 'Age_s_elder',
    'Title_Age_s_young', 'Title_Age_s_adult', 'Title_Age_s_elder',
]

# Extended feature set: everything in feature2 plus the port and cabin dummies.
feature3 = feature2 + [
    'Embarked__C', 'Embarked__Q', 'Embarked__S',
    'Cabin_s_G', 'Cabin_s_M', 'Cabin_s_X',
]

# Preview only the first rows instead of rendering the whole frame.
df_d[feature2].head(10)

Unnamed: 0,Pclass,Fare_b,FamilySize,FamilySurvived,FamilyDied,Sex_female,Sex_male,Age_s_young,Age_s_adult,Age_s_elder,Title_Age_s_young,Title_Age_s_adult,Title_Age_s_elder
0,3,1,1,0,1,0,1,0,1,0,0,1,0
1,1,5,1,1,0,1,0,0,1,0,0,1,0
2,3,2,0,0,0,1,0,0,1,0,1,0,0
3,1,4,1,1,1,1,0,0,1,0,0,1,0
4,3,2,0,0,0,0,1,0,1,0,0,1,0
5,3,2,0,1,1,0,1,0,1,0,0,1,0
6,1,6,0,0,0,0,1,0,0,1,0,1,0
7,3,1,4,0,1,0,1,1,0,0,1,0,0
8,3,1,2,1,0,1,0,0,1,0,0,1,0
9,2,3,1,1,1,1,0,1,0,0,0,1,0


In [None]:
# Summary statistics for the engineered training features.
df_d.describe()

In [None]:
# Summary statistics for the training frame; `df` also carries the helper
# columns that feature_engineering_step1 added in place.
df.describe()