In [1]:
# Mount Google Drive at /content/drive (Colab-only helper) so files stored
# there can be read by path from this notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Weight of Evidence

Weight Of Evidence (WOE) was developed primarily for the credit and financial industries to help build more predictive models to evaluate the risk of loan default.
It is computed from the basic odds ratio: $\mathrm{WoE} = \ln\left(\dfrac{\text{Proportion of Goods}}{\text{Proportion of Bads}}\right)$.
WOE will be 0 if P(Goods) / P(Bads) = 1, that is, if the outcome is random for that group. If P(Bads) > P(Goods), the odds ratio will be < 1 and the WOE will be < 0; if, on the other hand, P(Goods) > P(Bads) in a group, then WOE > 0.

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

%matplotlib inline

In [4]:
# Read only the categorical feature (Cabin) and the binary target (Survived)
# from the Titanic training file on the mounted Drive.
data = pd.read_csv(
    "/content/drive/MyDrive/Feature Engineering/titanic_train.csv",
    usecols=['Cabin', 'Survived'],
)
data.head()

Unnamed: 0,Survived,Cabin
0,1,C12239
1,0,
2,0,
3,0,
4,1,


In [5]:
# Give missing cabins their own explicit label so they form a category of
# their own in the encoding below.
#
# The result is assigned back to the column rather than calling
# `data.Cabin.fillna(..., inplace=True)`: an in-place method on an
# attribute-accessed column is chained assignment, which recent pandas
# versions warn about and which leaves `data` unchanged under Copy-on-Write.
data['Cabin'] = data['Cabin'].fillna('Missing')
data.head()

Unnamed: 0,Survived,Cabin
0,1,C12239
1,0,Missing
2,0,Missing
3,0,Missing
4,1,Missing


In [6]:
# Number of distinct cabin labels (cardinality of the raw feature).
len(data['Cabin'].unique())

26993

In [7]:
# Reduce cardinality: keep only the first character of each cabin label
# (e.g. 'C12239' -> 'C', 'Missing' -> 'M').
data['Cabin'] = (
    data['Cabin']
    .astype(str)
    .str[0]
)
data.head()

Unnamed: 0,Survived,Cabin
0,1,C
1,0,M
2,0,M
3,0,M
4,1,M


In [8]:
# Distinct cabin letters left after truncating to the first character.
data['Cabin'].unique()

array(['C', 'M', 'A', 'D', 'B', 'E', 'F', 'G', 'T'], dtype=object)

In [9]:
# Hold out 30% of the rows for testing, with a fixed seed for repeatability.
# 'Survived' is deliberately kept inside X as well, because the per-cabin
# target statistics below are computed from X_train alone.
X_train, X_test, y_train, y_test = train_test_split(
    data[['Cabin', 'Survived']],
    data['Survived'],
    test_size=0.3,
    random_state=0,
)

X_train.shape, X_test.shape

((70000, 2), (30000, 2))

In [10]:
# Survival rate per cabin letter, estimated on the training rows only.
X_train.groupby('Cabin')['Survived'].mean()

Cabin
A    0.273529
B    0.627943
C    0.709877
D    0.733488
E    0.612613
F    0.830769
G    0.860606
M    0.341836
T    0.520000
Name: Survived, dtype: float64

In [11]:
# Same per-cabin survival rate as above, held in a one-column DataFrame
# ('Survived') indexed by cabin letter so further columns can be added.
prob_df = X_train.groupby('Cabin')['Survived'].mean().to_frame()
prob_df

Unnamed: 0_level_0,Survived
Cabin,Unnamed: 1_level_1
A,0.273529
B,0.627943
C,0.709877
D,0.733488
E,0.612613
F,0.830769
G,0.860606
M,0.341836
T,0.52


In [12]:
# Rebuild the per-cabin table from the training rows and add the complementary
# rate: 'Survived' is the share of survivors, 'Died' is 1 minus that share.
prob_df = (
    X_train.groupby('Cabin')['Survived']
    .mean()
    .to_frame()
    .assign(Died=lambda rates: 1 - rates['Survived'])
)
prob_df

Unnamed: 0_level_0,Survived,Died
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0.273529,0.726471
B,0.627943,0.372057
C,0.709877,0.290123
D,0.733488,0.266512
E,0.612613,0.387387
F,0.830769,0.169231
G,0.860606,0.139394
M,0.341836,0.658164
T,0.52,0.48


In [13]:
# The WoE cell below takes ln(Survived / Died). A rate of exactly 0 on either
# side breaks that: Survived == 0 gives ln(0) = -inf, and Died == 0 (every
# passenger in the category survived) divides by zero and gives +inf.
# Nudge both cases to a tiny positive value so the encoding stays finite.
prob_df.loc[prob_df.Survived == 0, 'Survived'] = 0.00001
prob_df.loc[prob_df.Died == 0, 'Died'] = 0.00001

In [14]:
# Weight of Evidence per cabin letter: natural log of the survive/die odds.
# NOTE(review): this is the within-category log-odds; the textbook WoE uses
# each category's share of all goods over its share of all bads, which differs
# from this value by a constant offset. Confirm which definition is intended.
prob_df['WoE'] = np.log(prob_df['Survived'] / prob_df['Died'])
prob_df

Unnamed: 0_level_0,Survived,Died,WoE
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,0.273529,0.726471,-0.976789
B,0.627943,0.372057,0.523401
C,0.709877,0.290123,0.894785
D,0.733488,0.266512,1.012391
E,0.612613,0.387387,0.458308
F,0.830769,0.169231,1.591089
G,0.860606,0.139394,1.820333
M,0.341836,0.658164,-0.655121
T,0.52,0.48,0.080043


In [15]:
# Preview the cabin-letter -> WoE lookup as a plain dictionary.
prob_df.WoE.to_dict()

{'A': -0.9767888434747211,
 'B': 0.5234005479407174,
 'C': 0.8947845266531916,
 'D': 1.0123911132209809,
 'E': 0.4583075894825441,
 'F': 1.5910887737659043,
 'G': 1.820332841672111,
 'M': -0.6551212180368441,
 'T': 0.08004270767353656}

In [16]:
# Keep the cabin-letter -> WoE mapping for encoding the train and test sets.
ordered_labels = prob_df.WoE.to_dict()

In [17]:
# Replace each cabin letter with its WoE value, learned on the training set.
# `.assign` builds new frames instead of setting a column on the objects
# returned by train_test_split, which can trigger SettingWithCopyWarning.
# A cabin letter absent from `ordered_labels` maps to NaN.
X_train = X_train.assign(Cabin_ordered=X_train['Cabin'].map(ordered_labels))
X_test = X_test.assign(Cabin_ordered=X_test['Cabin'].map(ordered_labels))

In [18]:
# Peek at the first five encoded training rows.
X_train.head(n=5)

Unnamed: 0,Cabin,Survived,Cabin_ordered
42865,M,0,-0.655121
91446,M,0,-0.655121
66374,M,0,-0.655121
79106,C,0,0.894785
13404,M,1,-0.655121


In [19]:
# Display the encoded training frame; pandas abbreviates the middle rows.
X_train

Unnamed: 0,Cabin,Survived,Cabin_ordered
42865,M,0,-0.655121
91446,M,0,-0.655121
66374,M,0,-0.655121
79106,C,0,0.894785
13404,M,1,-0.655121
...,...,...,...
21243,M,1,-0.655121
45891,A,0,-0.976789
42613,M,0,-0.655121
43567,C,0,0.894785
