In [1]:
import numpy as np
import pandas as pd

In [2]:
def process_titanic_train(path='titanic_train.csv'):
    """Load the Titanic training CSV, impute missing values and add features.

    Parameters
    ----------
    path : str, path object or file-like, default 'titanic_train.csv'
        Passed straight to ``pandas.read_csv``. The default keeps the
        original zero-argument call working unchanged.

    Returns
    -------
    df : pandas.DataFrame
        The processed frame. ``PassengerId``, ``Name`` and ``Ticket`` are
        dropped; ``title``, ``age_group``, ``family_size``, ``ticket_feat``
        and ``fare_tier`` are added; ``Cabin`` becomes a 0/1 presence flag;
        ``Fare`` is replaced by its natural logarithm.
    fill_dict : dict
        The values used for imputation, keyed by column name. ``'Pclass'``,
        ``'Name'``, ``'SibSp'``, ``'Parch'`` and ``'Embarked'`` map to
        scalars; ``'Age'`` and ``'Fare'`` map to Series of group means
        indexed by ``(Pclass, Sex)``.
    """
    common_titles = ['mr', 'mrs', 'miss', 'master']
    group_keys = ['Pclass', 'Sex']

    def _title_of(name):
        # The title is the last word before the first period, e.g.
        # "Braund, Mr. Owen Harris" -> "mr". A name with no words before the
        # period yields '' (later bucketed as 'special_title') rather than
        # raising IndexError.
        words = str(name).split('.')[0].split()
        return words[-1].lower() if words else ''

    def _impute_mode(frame, column, record):
        # Fill NaNs with the most frequent value and remember which value
        # was used. Reassignment (not chained inplace fillna) so the fill
        # also takes effect under pandas Copy-on-Write.
        most_common = frame[column].mode()[0]
        frame[column] = frame[column].fillna(most_common)
        record[column] = most_common

    def _impute_group_mean(frame, column, record):
        # Select the column *before* aggregating: averaging the whole frame
        # raises TypeError on pandas >= 2.0 because of the string columns.
        grouped = frame.groupby(group_keys)[column]
        frame[column] = frame[column].fillna(grouped.transform('mean'))
        record[column] = grouped.mean()

    df = pd.read_csv(path).drop(columns=['PassengerId'])
    fill_dict = {}

    _impute_mode(df, 'Pclass', fill_dict)

    name_fill = "no_name. is missing value"
    df['Name'] = df['Name'].fillna(name_fill)
    fill_dict['Name'] = name_fill

    # Collapse every title outside the four common ones into one bucket.
    title = df['Name'].map(_title_of)
    df['title'] = title.where(title.isin(common_titles), 'special_title')
    df = df.drop(columns=['Name'])

    # Missing Sex is inferred from the title: mrs/miss -> female, else male.
    inferred_sex = pd.Series(
        np.where(df['title'].isin(['mrs', 'miss']), 'female', 'male'),
        index=df.index,
    )
    df['Sex'] = df['Sex'].fillna(inferred_sex)

    # Age and Fare use the mean of the passenger's (Pclass, Sex) group; this
    # must run after Pclass and Sex themselves have been filled.
    _impute_group_mean(df, 'Age', fill_dict)
    _impute_mode(df, 'SibSp', fill_dict)
    _impute_mode(df, 'Parch', fill_dict)
    _impute_group_mean(df, 'Fare', fill_dict)

    # Cabin is reduced to "was a cabin recorded?".
    df['Cabin'] = df['Cabin'].notna().astype('int64')

    _impute_mode(df, 'Embarked', fill_dict)

    df['age_group'] = pd.cut(df['Age'], bins=[0, 5, 14, 60, 100],
                             labels=['infant', 'child', 'adult', 'old'])
    df['family_size'] = df['SibSp'] + df['Parch']

    # First character of the ticket; leading digits 4-9 share one bucket.
    # A missing ticket is treated as "9", so it lands in that bucket too.
    ticket_head = df['Ticket'].fillna("9").astype(str).str[0]
    df['ticket_feat'] = ticket_head.where(
        ~ticket_head.isin(['4', '5', '6', '7', '8', '9']), 'other_num')
    df = df.drop(columns=['Ticket'])

    df['fare_tier'] = pd.cut(df['Fare'], bins=[0, 50, 1000],
                             labels=['economical', 'expensive'],
                             include_lowest=True)

    # NOTE(review): a fare of exactly 0 becomes -inf here. The transform is
    # left as a plain log so that any companion test-set processing stays
    # consistent; consider np.log1p in both places if zero fares occur.
    df['Fare'] = np.log(df['Fare'])

    return df, fill_dict

In [4]:
# Run the training pipeline with its default CSV path; keep both the
# processed frame and the dictionary of fill values it returns.
df, fill_dict = process_titanic_train()

In [5]:
# Preview the first rows of the processed frame.
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,title,age_group,family_size,ticket_feat,fare_tier
0,0,3,male,22.0,1,0,1.981001,0,S,mr,adult,1,A,economical
1,1,1,female,38.0,1,0,4.266662,1,C,mrs,adult,1,P,expensive
2,1,3,female,26.0,0,0,2.070022,0,S,miss,adult,0,S,economical
3,1,1,female,35.0,1,0,3.972177,1,S,mrs,adult,1,1,expensive
4,0,3,male,35.0,0,0,2.085672,0,S,mr,adult,0,3,economical


In [6]:
# Count remaining missing values per column (all zeros means imputation covered every column).
df.isnull().sum()

Survived       0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Cabin          0
Embarked       0
title          0
age_group      0
family_size    0
ticket_feat    0
fare_tier      0
dtype: int64