In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="darkgrid")

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

import string
import warnings
warnings.filterwarnings('ignore')

In [None]:
def concat_df(train_data, test_data):
    """Stack the training and test frames row-wise into one frame.

    The frames are concatenated along axis 0 with ``sort=True`` and the row
    index of the result is replaced by a fresh 0..n-1 range.
    """
    stacked = pd.concat([train_data, test_data], axis=0, sort=True)
    return stacked.reset_index(drop=True)

def divide_df(all_data, n_train=891):
    """Split a combined frame back into its training and test parts.

    Parameters
    ----------
    all_data : pd.DataFrame
        Combined frame as produced by ``concat_df``. Rows are selected by
        index *label* (``.loc``), so the index is expected to be the 0..n-1
        range that ``concat_df`` creates.
    n_train : int, default 891
        Number of leading rows that belong to the training set. The default
        reproduces the previously hard-coded 0-890 / 891+ split.

    Returns
    -------
    tuple of (pd.DataFrame, pd.DataFrame)
        The training rows (all columns kept) and the test rows with the
        ``Survived`` column dropped.
    """
    # .loc slices are inclusive on both ends, hence the "- 1" for the train part.
    train_part = all_data.loc[:n_train - 1]
    test_part = all_data.loc[n_train:].drop(['Survived'], axis=1)
    return train_part, test_part

# Read both splits from disk, then stack them into one frame for shared preprocessing.
df_train, df_test = (pd.read_csv(path) for path in ('input/train.csv', 'input/test.csv'))
df_all = concat_df(df_train, df_test)

# Attach a human-readable label to each frame as a plain attribute.
df_train.name, df_test.name, df_all.name = 'Training Set', 'Test Set', 'All Set'

dfs = [df_train, df_test]

In [None]:
def display_missing(df):
    """Print the number of null entries in every column of ``df``.

    One line is printed per column, in column order, followed by a
    trailing ``print('\\n')`` that separates successive reports.
    """
    null_counts = df.isnull().sum()
    for col, n_missing in null_counts.items():
        print('{} column missing values: {}'.format(col, n_missing))
    print('\n')
    
# Report missing-value counts for each split, preceded by the split's label.
for df in dfs:
    header = '{}'.format(df.name)
    print(header)
    display_missing(df)

In [None]:
# Median age for every (Sex, Pclass) group. 'Age' is selected *before*
# aggregating so the median is never attempted on non-numeric columns
# (which raises on recent pandas releases).
age_by_pclass_sex = df_all.groupby(['Sex', 'Pclass'])['Age'].median()

for pclass in range(1, 4):
    for sex in ['female', 'male']:
        print('Median age of Pclass {} {}s: {}'.format(pclass, sex, age_by_pclass_sex[sex][pclass]))
print('Median age of all passengers: {}'.format(df_all['Age'].median()))

# Filling the missing values in Age with the medians of Sex and Pclass groups.
# transform('median') returns one value per row, aligned to df_all's own index,
# so the result can be used directly by fillna regardless of how groupby.apply
# labels its output.
group_median_age = df_all.groupby(['Sex', 'Pclass'])['Age'].transform('median')
df_all['Age'] = df_all['Age'].fillna(group_median_age)

In [None]:
# Print the per-column null counts of the combined frame.
display_missing(df_all)

In [None]:
# Fill the gaps in Embarked and Fare with fixed values.
df_all['Embarked'] = df_all['Embarked'].fillna('S')
df_all['Fare'] = df_all['Fare'].fillna(0)

# Deck is the first letter of Cabin; rows without a cabin get 'M' (Missing).
df_all['Deck'] = [cabin[0] if pd.notnull(cabin) else 'M' for cabin in df_all['Cabin']]

# Regroup the deck letters into [A, BC, DE, FG, M]; the passenger on deck T is
# folded into A.
deck_groups = {
    'T': 'A',
    'B': 'BC', 'C': 'BC',
    'D': 'DE', 'E': 'DE',
    'F': 'FG', 'G': 'FG',
}
df_all['Deck'] = df_all['Deck'].replace(deck_groups)

# Cabin is no longer needed once Deck has been derived.
df_all.drop(columns=['Cabin'], inplace=True)

# Split back into the two sets and re-check the null counts of each.
df_train, df_test = divide_df(df_all)
dfs = [df_train, df_test]
for df in dfs:
    display_missing(df)

In [None]:
ticket_counts = df_all['Ticket'].value_counts()

# Number of passengers sharing each row's ticket (one value per row).
ticket_freq = df_all['Ticket'].map(ticket_counts)

# "The fare for a single passenger" is the fare recorded on the first row of a
# ticket divided by the number of passengers on that ticket.
first_fare_by_ticket = df_all.drop_duplicates('Ticket').set_index('Ticket')['Fare']
df_all['avgFare'] = (df_all['Ticket'].map(first_fare_by_ticket) / ticket_freq).round(4)
df_all['TicketFreq'] = ticket_freq.astype(float)

# Compute the "survival rate for each ticket": survivors / passengers with a
# known outcome. If no outcome is known for the ticket, or the ticket is shared
# by 2 passengers or fewer, fill 0.3838 in.
# 0.3838 is the survival rate over all.
ticket_prior_rate = 0.3838
ticket_known = df_all['Survived'].notnull().groupby(df_all['Ticket']).transform('sum')
ticket_survivors = df_all['Survived'].eq(1).groupby(df_all['Ticket']).transform('sum')
# 0 / 0 yields NaN for tickets without any known outcome; those rows are
# overwritten with the prior by the mask below.
ticket_rate = (ticket_survivors / ticket_known).round(4)
ticket_use_prior = ticket_known.eq(0) | ticket_freq.le(2)
df_all['TicketSurvivalRate'] = ticket_rate.mask(ticket_use_prior, ticket_prior_rate)

In [None]:
# Extract family surname.
df_all['Family'] = df_all['Name'].str.split(',', expand=True)[0]

family_counts = df_all['Family'].value_counts()

for name in family_counts.index:
    idx = df_all[df_all['Family']==name].index
    surv_notnull = sum(df_all.loc[idx, 'Survived'].notnull())
    if surv_notnull==0 or family_counts[name]<=2:
        df_all.loc[idx, 'FamilySurvivalRate'] = 0.3838
    else:
        try:
            survived = df_all.loc[idx, 'Survived'].value_counts()[1.0]
        except:
            survived = 0
        surv_rate = round(survived / surv_notnull, 4)
        df_all.loc[idx, 'FamilySurvivalRate'] = surv_rate

df_all['CompSurvivalRate'] = (df_all['TicketSurvivalRate'] + df_all['FamilySurvivalRate']) / 2

In [None]:
# Age bands: (0, 15], (15, 65], (65, 100].
df_all['AgeLevel'] = pd.cut(df_all['Age'], [0, 15, 65, 100])

# Per-passenger fare split into 7 quantile bins.
df_all['FareLevel'] = pd.qcut(df_all['avgFare'], 7)

# 'Title' has 6 classes: Mr, Mrs, Master, Miss, NobleMen, NobleWomen.
# The title is the text between ', ' and the following '.' in Name.
df_all['Title'] = df_all['Name'].str.split(', ', expand=True)[1].str.split('.', expand=True)[0]
df_all['Title'] = df_all['Title'].replace(['Miss', 'Ms', 'Mlle'], 'Miss')
df_all['Title'] = df_all['Title'].replace(['Rev', 'Dr', 'Col', 'Major',
                                           'Capt', 'Don', 'Sir', 'Jonkheer'], 'NobleMen')
df_all['Title'] = df_all['Title'].replace(['Dona', 'Lady', 'Mme', 'the Countess'], 'NobleWomen')

# 'FamilySize' has 4 classes: Alone(1), Small(2-4), Medium(5-6), Large(7+).
# Binning with pd.cut gives the same labels as an explicit size->label lookup
# for sizes 1-8 and 11, and also covers any other size instead of turning it
# into NaN.
family_size = df_all['SibSp'] + df_all['Parch'] + 1
df_all['FamilySize'] = pd.cut(
    family_size,
    bins=[0, 1, 4, 6, np.inf],
    labels=['Alone', 'Small', 'Medium', 'Large'],
).astype(object)  # plain string labels rather than a categorical column

In [None]:
# -----------------------------------------------------------------
# Work on a copy so df_all keeps its un-encoded values.
df_encode = df_all.copy()

# Replace each categorical / binned column with integer codes, using a fresh
# encoder per column.
non_numeric_features = ['Embarked', 'Sex', 'Deck', 'FamilySize', 'AgeLevel', 'FareLevel', 'Title']
for feature in non_numeric_features:
    encoder = LabelEncoder()
    df_encode[feature] = encoder.fit_transform(df_encode[feature])

In [None]:
# Raw and intermediate columns that are not passed on as features.
drop_cols = [
    'Age', 'Fare', 'Name', 'PassengerId', 'Ticket', 'avgFare', 'Parch', 'SibSp', 'Family',
    'TicketSurvivalRate', 'FamilySurvivalRate',
]
df_encode = df_encode.drop(columns=drop_cols)
df_encode.columns

In [None]:
# Target first, followed by the features in a fixed order.
col_order = [
    'Survived', 'Embarked', 'Pclass', 'Sex', 'Deck', 'TicketFreq', 'FamilySize', 'Title',
    'AgeLevel', 'FareLevel', 'CompSurvivalRate',
]
# Columns expanded into one indicator column per value; the rest stay as they are.
expand_cols = ['Embarked', 'Pclass', 'Sex', 'Deck', 'FamilySize', 'AgeLevel', 'Title']

df_encode = pd.get_dummies(df_encode.reindex(columns=col_order), columns=expand_cols)

In [None]:
# Inspect one encoded row (index label 20) as a sanity check.
df_encode.loc[20]

In [None]:
df_train, df_test = divide_df(df_encode)

# The first column is the target; every column after it is a feature.
y_train = df_train.iloc[:, 0]
X_train = df_train.iloc[:, 1:]

print(
    'X_train shape: {}'.format(X_train.shape),
    'y_train shape: {}'.format(y_train.shape),
    'df_test shape: {}'.format(df_test.shape),
    sep='\n',
)

In [None]:
# Random forest with out-of-bag scoring enabled, so an accuracy estimate is
# available from the fit itself (printed below).
rf = RandomForestClassifier(criterion='gini',
                            n_estimators=1000,
                            max_depth=7,
                            min_samples_split=10,
                            min_samples_leaf=6,
                            # 'auto' meant sqrt(n_features) for classifiers and is
                            # rejected by recent scikit-learn releases; 'sqrt' is the
                            # explicit, equivalent spelling accepted by all versions.
                            max_features='sqrt',
                            oob_score=True,
                            random_state=1,
                            n_jobs=-1)
rf.fit(X_train, y_train)
print("%.4f" % rf.oob_score_)

In [None]:
# Pair each feature name (column 0) with its importance (column 1) and show
# the table sorted from most to least important.
imp = rf.feature_importances_
imp_table = pd.DataFrame({0: X_train.columns, 1: imp})
imp_table.sort_values(by=1, ascending=False)

In [None]:
# Predict on the test features and cast the predicted labels to int.
res = rf.predict(df_test).astype(int)

# Use the sample submission file as a template and overwrite its Survived column.
table = pd.read_csv('input/gender_submission.csv')
table['Survived'] = res
table.to_csv('submission.csv', index=False)