In [2]:
import numpy as np
import pandas as pd
import sys 
import os
sys.path.append(os.path.abspath("../src"))

import functions as fnc

In [3]:
# Load the processed training split; every later cell reads from `data`.
data = pd.read_csv(filepath_or_buffer=r'../data/processed/train.csv')

In [4]:
# Let wide frames render up to 50 columns before pandas truncates them.
pd.options.display.max_columns = 50

In [5]:
# Columns screened as potential model features (order is preserved in the
# loops and printed output further down).
candidates = [
    # un-prefixed columns
    "Pclass",
    "Sex",
    "SibSp",
    "Parch",
    "Embarked",
    # J_-prefixed columns (presumably engineered upstream -- TODO confirm)
    "J_missing_age",
    "J_age_binned",
    "J_fare_binned",
    "J_title_grouped",
    "J_nrelatives",
    "J_ticket_prefix",
    "J_ticket_location",
    "J_cabin_letter",
    "J_n_siblings_spouses",
]

In [6]:
# Complement of the target: 1 where Survived == 0, otherwise 0.
# int64 matches the dtype produced by assigning the scalar 0 to a new column.
data['died'] = (data['Survived'] == 0).astype('int64')

## missing value cut

In [7]:
# Percentage of null entries per column, highest first, rounded to 2 dp.
(
    data.isnull()
    .sum()
    .mul(100)
    .div(data.shape[0])
    .to_frame("% Missing")
    .sort_values(by="% Missing", ascending=False)
    .round(2)
)

Unnamed: 0,% Missing
J_ticket_location,94.73
J_room_number,77.55
Cabin,77.1
Age,19.87
Embarked,0.22
PassengerId,0.0
J_fare_binned,0.0
J_n_siblings_spouses,0.0
J_cabin_letter,0.0
J_ticket_number,0.0


The ticket location column and, to a lesser extent, the room number and cabin columns have a high percentage of missing values. They may still be predictive as a missing Y/N flag, but this is unlikely. Consider removing them to increase model stability.

## information value analysis

In [8]:
# For each candidate, keep the WoE table and gradient returned by the helper,
# plus the total information value (sum of the per-bin 'IV_i' contributions).
IVs = {}
woe_tables = {}
for feature in candidates:
    print(feature)

    woe_table, woe_grad = fnc.calculate_woe_for_column(data, feature, target='Survived')
    IVs[feature] = {
        'woe_table': woe_table,
        'woe_grad': woe_grad,
        'IV': woe_table['IV_i'].sum(),
    }

Pclass
Sex
SibSp
Parch
Embarked
J_missing_age
J_age_binned
J_fare_binned
J_title_grouped
J_nrelatives
J_ticket_prefix
J_ticket_location
J_cabin_letter
J_n_siblings_spouses


In [9]:
# One row per candidate feature with its total IV in a single 'IV' column.
iv_summary_table = pd.Series(
    {name: entry['IV'] for name, entry in IVs.items()},
    name='IV',
).to_frame()

# Display only: ranked view, rounded for readability (the stored table is unchanged).
iv_summary_table.sort_values('IV', ascending=False).round(decimals=2)

Unnamed: 0,IV
J_title_grouped,149.71
Sex,134.17
J_fare_binned,62.59
Pclass,50.09
J_cabin_letter,27.8
J_n_siblings_spouses,23.59
J_age_binned,12.58
Embarked,12.27
J_ticket_location,11.28
SibSp,11.15


In [10]:
# Persist the descending-IV ordering; the correlation step below receives this table.
iv_summary_table = iv_summary_table.sort_values(by='IV', ascending=False)

We can see that our new title column is very predictive (the function removed the categories containing <5% of the sample). Adding the 'Miss' category has added value compared to plain male/female: as we saw before, young women have a very high chance of surviving.

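Note: in the WoE tables here, "Bad" corresponds to Survived == 1 and "Good" to Survived == 0, so "Bad rate" is the survival rate.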

We could consider cutting features with a very low IV (e.g. <5); however, it is possible that xgboost will find some relationship using multiple features.

In [11]:
# Show the stored WoE table for J_title_grouped, the feature with the highest IV above.
IVs['J_title_grouped']['woe_table']

Survived,Good,Bad,Bad rate,Good+Bad,Good%-Bad%,WoE,IV_i
J_title_grouped,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Miss,0.100182,0.380117,0.702703,0.207632,-27.993481,-1.333489,37.328998
Mr,0.795993,0.236842,0.156371,0.581369,55.915061,1.212196,67.780032
Mrs,0.047359,0.292398,0.793651,0.141414,-24.503883,-1.820361,44.605921


## correlation analysis

In [26]:
# Run the project's correlation screen on the data, passing the IV ranking;
# it returns the features to keep and the ones flagged for removal.
correlation_analysis = fnc.CorrelationAnalysis(data, iv_summary_table)
selected, rejected = correlation_analysis.analyse_correlations()

running for spearman
SibSp correlated with J_n_siblings_spouses
Parch correlated with J_n_siblings_spouses
J_n_siblings_spouses correlated with SibSp
J_n_siblings_spouses correlated with Parch
running for kendall
SibSp correlated with J_n_siblings_spouses
J_n_siblings_spouses correlated with SibSp
running for pearson
SibSp correlated with J_n_siblings_spouses
J_n_siblings_spouses correlated with SibSp
{'SibSp': ['J_n_siblings_spouses'], 'Parch': ['J_n_siblings_spouses'], 'J_n_siblings_spouses': ['Parch', 'SibSp']}
filtering for SibSp
rejected ['SibSp']
filtering for Parch
rejected ['Parch']
filtering for J_n_siblings_spouses


In [27]:
# Adjacent literals are joined at compile time, so .format() fills the single
# placeholder in the combined message and the printed text is unchanged.
print(
    'we can consider removing the highly correlated variables: {}. '
    'in my experience with xgboost it will not make too much difference, '
    'however if we were using a logistic regression i would do this.'.format(rejected)
)

we can consider removing the highly correlated variables: ['SibSp', 'Parch']. in my experience with xgboost it will not make too much difference, however if we were using a logistic regression i would do this.
