In [7]:
import re
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

%matplotlib inline

## Types of Encoding:
- Ordinal Date encoding
- Frequency/Count encoding (nominal)
- Label encoding (ordinal)
- One Hot encoding (nominal)
- Mean encoding (nominal)
- Non-Target guided encoding (ordinal)
- Target guided encoding (ordinal)
- Probability Ratio encoding (ordinal)

In [2]:
# Load the comma-separated Titanic passenger table from the local data folder.
data = pd.read_csv("data/titanic.csv", sep=",")

In [3]:
# Preview the first ten passengers to check the columns and spot missing Age/Cabin values.
data.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


## Ordinal Date Encoding

In [6]:
# Capture the current local timestamp. The value differs on every run, so the
# outputs derived from it below will not reproduce exactly.
today = datetime.now()
today

datetime.datetime(2024, 3, 6, 19, 47, 11, 843013)

In [8]:
# Step back exactly one day from the captured timestamp.
yesterday = today - timedelta(days=1)
yesterday

datetime.datetime(2024, 3, 5, 19, 47, 11, 843013)

In [12]:
# Fifteen consecutive timestamps, newest first: today, yesterday, ..., 14 days ago.
days = [today - timedelta(days=offset) for offset in range(15)]

In [13]:
# Wrap the timestamps in a single-column frame so the .dt accessor can be used.
df = pd.DataFrame({'Days': days})
df.head()

Unnamed: 0,Days
0,2024-03-06 19:47:11.843013
1,2024-03-05 19:47:11.843013
2,2024-03-04 19:47:11.843013
3,2024-03-03 19:47:11.843013
4,2024-03-02 19:47:11.843013


In [16]:
# Derive the weekday name (e.g. 'Wednesday') of each timestamp.
df['Week_day'] = df['Days'].dt.day_name()

In [17]:
# Check the new Week_day column.
df.head(5)

Unnamed: 0,Days,Week_day
0,2024-03-06 19:47:11.843013,Wednesday
1,2024-03-05 19:47:11.843013,Tuesday
2,2024-03-04 19:47:11.843013,Monday
3,2024-03-03 19:47:11.843013,Sunday
4,2024-03-02 19:47:11.843013,Saturday


In [18]:
# Ordinal code for each weekday name: Monday=1 through Sunday=7.
diction = dict(zip(
    ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
    range(1, 8),
))

In [19]:
# Translate each weekday name to its ordinal code through the lookup dict;
# a name missing from the dict would come out as NaN.
df['Days_of_the_Week'] = df['Week_day'].map(diction) 
df.head(5)

Unnamed: 0,Days,Week_day,Days_of_the_Week
0,2024-03-06 19:47:11.843013,Wednesday,3
1,2024-03-05 19:47:11.843013,Tuesday,2
2,2024-03-04 19:47:11.843013,Monday,1
3,2024-03-03 19:47:11.843013,Sunday,7
4,2024-03-02 19:47:11.843013,Saturday,6


## Frequency / Count Encoding

In [50]:
# Adult census data: the file has no header row, so columns are numbered 0-14.
# Fetched over HTTPS rather than plaintext HTTP so the download cannot be
# tampered with in transit.
# NOTE: fields are separated by ", ", so every string value keeps a leading
# space (e.g. ' Private'). Later cells rely on this (dict keys such as
# ' United-States', the dummy column 'Sex__ Male'), so it is left as-is here.
data = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
    header=None,
    index_col=False,
)

In [51]:
# Preview the raw, header-less columns.
data.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [52]:
# Count NaN values per column. Missing entries stored as the placeholder
# string '?' are ordinary strings and are NOT counted here.
data.isnull().sum()

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
dtype: int64

In [53]:
# Row count per label in column 1 (employment type), most frequent first.
data[1].value_counts()

1
Private             22696
Self-emp-not-inc     2541
Local-gov            2093
?                    1836
State-gov            1298
Self-emp-inc         1116
Federal-gov           960
Without-pay            14
Never-worked            7
Name: count, dtype: int64

In [54]:
# Number of distinct labels in column 1 (NaN, if present, counts as one label).
data[1].nunique(dropna=False)

9

In [55]:
# Keep only the categorical columns, selected by their position-based labels.
columns = [1, 3, 5, 6, 7, 8, 9, 13]
data = data.loc[:, columns]

In [56]:
# Give the eight retained columns readable names (same order as the selection above).
data.columns = ['Employment','Degree','Status','Designation','family_job','Race','Sex','Country']

In [57]:
# Show the renamed categorical frame.
data

Unnamed: 0,Employment,Degree,Status,Designation,family_job,Race,Sex,Country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba
...,...,...,...,...,...,...,...,...
32556,Private,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,White,Female,United-States
32557,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,United-States
32558,Private,HS-grad,Widowed,Adm-clerical,Unmarried,White,Female,United-States
32559,Private,HS-grad,Never-married,Adm-clerical,Own-child,White,Male,United-States


In [58]:
# Report the cardinality of every categorical column.
for col in data.columns:
    n_labels = len(data[col].unique())
    print(f"{col} : {n_labels} labels")

Employment : 9 labels
Degree : 16 labels
Status : 7 labels
Designation : 15 labels
family_job : 6 labels
Race : 5 labels
Sex : 2 labels
Country : 42 labels


In [59]:
# Row count per country label, most frequent first.
data['Country'].value_counts()

Country
United-States                 29170
Mexico                          643
?                               583
Philippines                     198
Germany                         137
Canada                          121
Puerto-Rico                     114
El-Salvador                     106
India                           100
Cuba                             95
England                          90
Jamaica                          81
South                            80
China                            75
Italy                            73
Dominican-Republic               70
Vietnam                          67
Guatemala                        64
Japan                            62
Poland                           60
Columbia                         59
Taiwan                           51
Haiti                            44
Iran                             43
Portugal                         37
Nicaragua                        34
Peru                             31
France              

In [60]:
# Lookup table: country label -> number of rows carrying that label.
frequency_map = data['Country'].value_counts().to_dict()
frequency_map

{' United-States': 29170,
 ' Mexico': 643,
 ' ?': 583,
 ' Philippines': 198,
 ' Germany': 137,
 ' Canada': 121,
 ' Puerto-Rico': 114,
 ' El-Salvador': 106,
 ' India': 100,
 ' Cuba': 95,
 ' England': 90,
 ' Jamaica': 81,
 ' South': 80,
 ' China': 75,
 ' Italy': 73,
 ' Dominican-Republic': 70,
 ' Vietnam': 67,
 ' Guatemala': 64,
 ' Japan': 62,
 ' Poland': 60,
 ' Columbia': 59,
 ' Taiwan': 51,
 ' Haiti': 44,
 ' Iran': 43,
 ' Portugal': 37,
 ' Nicaragua': 34,
 ' Peru': 31,
 ' France': 29,
 ' Greece': 29,
 ' Ecuador': 28,
 ' Ireland': 24,
 ' Hong': 20,
 ' Cambodia': 19,
 ' Trinadad&Tobago': 19,
 ' Laos': 18,
 ' Thailand': 18,
 ' Yugoslavia': 16,
 ' Outlying-US(Guam-USVI-etc)': 14,
 ' Honduras': 13,
 ' Hungary': 13,
 ' Scotland': 12,
 ' Holand-Netherlands': 1}

In [61]:
# Frame before the Country column is replaced by its frequency.
data

Unnamed: 0,Employment,Degree,Status,Designation,family_job,Race,Sex,Country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba
...,...,...,...,...,...,...,...,...
32556,Private,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,White,Female,United-States
32557,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,United-States
32558,Private,HS-grad,Widowed,Adm-clerical,Unmarried,White,Female,United-States
32559,Private,HS-grad,Never-married,Adm-clerical,Own-child,White,Male,United-States


In [62]:
# Frequency-encode Country: each label is replaced by its row count.
# Not idempotent: on a second run the counts are no longer keys of
# frequency_map, so every value would become NaN.
data['Country'] = data['Country'].map(frequency_map)
data.head(5)

Unnamed: 0,Employment,Degree,Status,Designation,family_job,Race,Sex,Country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,29170
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,29170
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,29170
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,29170
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,95


### Advantages:
- Easy to implement
- Doesn't increase the feature space (no additional feature columns, unlike one-hot encoding)
### Disadvantages:
- Categories that occur with the same frequency receive the same encoded value, so the model can no longer tell them apart

## One Hot encoding

In [63]:
# Starting point for the one-hot example: Sex is still a string column.
data.head(3)

Unnamed: 0,Employment,Degree,Status,Designation,family_job,Race,Sex,Country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,29170
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,29170
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,29170


In [64]:
# One-hot encode Sex; dropping the first level leaves a single indicator column.
# The "Sex_" prefix, the default "_" separator and the value's leading space
# together produce the column name 'Sex__ Male'.
gender = pd.get_dummies(data['Sex'], prefix="Sex_", drop_first=True)
data = pd.concat([data.drop(columns=['Sex']), gender], axis=1)

In [65]:
# Sex has been replaced by the boolean indicator column 'Sex__ Male'.
data.head(10)

Unnamed: 0,Employment,Degree,Status,Designation,family_job,Race,Country,Sex__ Male
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,29170,True
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,29170,True
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,29170,True
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,29170,True
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,95,False
5,Private,Masters,Married-civ-spouse,Exec-managerial,Wife,White,29170,False
6,Private,9th,Married-spouse-absent,Other-service,Not-in-family,Black,81,False
7,Self-emp-not-inc,HS-grad,Married-civ-spouse,Exec-managerial,Husband,White,29170,True
8,Private,Masters,Never-married,Prof-specialty,Not-in-family,White,29170,False
9,Private,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,29170,True


In [69]:
# Convert the boolean indicator to 0/1 integers. A direct cast replaces the
# earlier `np.where(col == True, 1, 0)` round trip and gives the same values.
data['Sex__ Male'] = data['Sex__ Male'].astype(int)
data.head(10)

Unnamed: 0,Employment,Degree,Status,Designation,family_job,Race,Country,Sex__ Male
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,29170,1
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,29170,1
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,29170,1
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,29170,1
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,95,0
5,Private,Masters,Married-civ-spouse,Exec-managerial,Wife,White,29170,0
6,Private,9th,Married-spouse-absent,Other-service,Not-in-family,Black,81,0
7,Self-emp-not-inc,HS-grad,Married-civ-spouse,Exec-managerial,Husband,White,29170,1
8,Private,Masters,Never-married,Prof-specialty,Not-in-family,White,29170,0
9,Private,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,29170,1


In [71]:
# Row count per occupation label; note the '?' placeholder category.
data['Designation'].value_counts()

Designation
Prof-specialty       4140
Craft-repair         4099
Exec-managerial      4066
Adm-clerical         3770
Sales                3650
Other-service        3295
Machine-op-inspct    2002
?                    1843
Transport-moving     1597
Handlers-cleaners    1370
Farming-fishing       994
Tech-support          928
Protective-serv       649
Priv-house-serv       149
Armed-Forces            9
Name: count, dtype: int64

## Label Encoding:

In [72]:
# Integer-code the occupation labels. LabelEncoder is imported in the
# notebook's top import cell.
# NOTE: scikit-learn documents LabelEncoder as an encoder for target values (y);
# OrdinalEncoder is the equivalent intended for input features.
encoder = LabelEncoder()
data['Designation'] = encoder.fit_transform(data['Designation'])
data.head(10)

Unnamed: 0,Employment,Degree,Status,Designation,family_job,Race,Country,Sex__ Male
0,State-gov,Bachelors,Never-married,1,Not-in-family,White,29170,1
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,4,Husband,White,29170,1
2,Private,HS-grad,Divorced,6,Not-in-family,White,29170,1
3,Private,11th,Married-civ-spouse,6,Husband,Black,29170,1
4,Private,Bachelors,Married-civ-spouse,10,Wife,Black,95,0
5,Private,Masters,Married-civ-spouse,4,Wife,White,29170,0
6,Private,9th,Married-spouse-absent,8,Not-in-family,Black,81,0
7,Self-emp-not-inc,HS-grad,Married-civ-spouse,4,Husband,White,29170,1
8,Private,Masters,Never-married,10,Not-in-family,White,29170,0
9,Private,Bachelors,Married-civ-spouse,4,Husband,White,29170,1


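## Mean Encoding

> **Note:** the cells below replace each `Race` label with its share of the rows (label count / total rows). No target column is involved, so this is effectively a normalised frequency encoding; a mean (target) encoding would instead use the per-category mean of the target variable.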

In [73]:
# Current state of the frame; Race is still a string column.
data.head(5)

Unnamed: 0,Employment,Degree,Status,Designation,family_job,Race,Country,Sex__ Male
0,State-gov,Bachelors,Never-married,1,Not-in-family,White,29170,1
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,4,Husband,White,29170,1
2,Private,HS-grad,Divorced,6,Not-in-family,White,29170,1
3,Private,11th,Married-civ-spouse,6,Husband,Black,29170,1
4,Private,Bachelors,Married-civ-spouse,10,Wife,Black,95,0


In [85]:
# Row count per Race label, most frequent first.
info = data['Race'].value_counts()
info

Race
White                 27816
Black                  3124
Asian-Pac-Islander     1039
Amer-Indian-Eskimo      311
Other                   271
Name: count, dtype: int64

In [89]:
# Non-null entries per column; the first column's count serves as the row total.
count = data.count()
# Explicit positional access: `count[0]` on a label-indexed Series is deprecated
# and raises a FutureWarning.
count.iloc[0]

  count[0]


32561

In [108]:
# Share of rows held by each Race label (label count / total rows).
# Computed in one vectorised step: this avoids the deprecated `count[0]` lookup
# and no longer rebinds the global `diction`, which holds the weekday lookup
# used earlier in the notebook.
mean_map = (info / len(data)).to_dict()

  diction = {index:(x/count[0])}


In [109]:
# Inspect the label -> row-share lookup.
mean_map

{' White': 0.8542735173981143,
 ' Black': 0.0959429992936335,
 ' Asian-Pac-Islander': 0.03190933939375326,
 ' Amer-Indian-Eskimo': 0.009551303706888609,
 ' Other': 0.008322840207610331}

In [110]:
# Replace each Race label with its row share from mean_map.
data['Race'] = data['Race'].map(mean_map)
data.head(5)

Unnamed: 0,Employment,Degree,Status,Designation,family_job,Race,Country,Sex__ Male
0,State-gov,Bachelors,Never-married,1,Not-in-family,0.854274,29170,1
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,4,Husband,0.854274,29170,1
2,Private,HS-grad,Divorced,6,Not-in-family,0.854274,29170,1
3,Private,11th,Married-civ-spouse,6,Husband,0.095943,29170,1
4,Private,Bachelors,Married-civ-spouse,10,Wife,0.095943,95,0


## Non-Target Guided Encoding

In [111]:
# Row count per Degree label, most frequent first.
degree = data['Degree'].value_counts()
degree

Degree
HS-grad         10501
Some-college     7291
Bachelors        5355
Masters          1723
Assoc-voc        1382
11th             1175
Assoc-acdm       1067
10th              933
7th-8th           646
Prof-school       576
9th               514
12th              433
Doctorate         413
5th-6th           333
1st-4th           168
Preschool          51
Name: count, dtype: int64

In [112]:
# Share of rows held by each Degree label (label count / total rows).
# `count.iloc[0]` replaces the deprecated positional `count[0]`, and the
# vectorised division no longer rebinds the global `diction` (the weekday
# lookup used earlier in the notebook).
count = data.count()
target_mean_map = (degree / count.iloc[0]).to_dict()

  diction = {index:(x/count[0])}


In [116]:
# Replace each Degree label with its row share from target_mean_map.
data['Degree'] = data['Degree'].map(target_mean_map)
data.head(5)

Unnamed: 0,Employment,Degree,Status,Designation,family_job,Race,Country,Sex__ Male
0,State-gov,0.164461,Never-married,1,Not-in-family,0.854274,29170,1
1,Self-emp-not-inc,0.164461,Married-civ-spouse,4,Husband,0.854274,29170,1
2,Private,0.322502,Divorced,6,Not-in-family,0.854274,29170,1
3,Private,0.036086,Married-civ-spouse,6,Husband,0.095943,29170,1
4,Private,0.164461,Married-civ-spouse,10,Wife,0.095943,95,0


In [125]:
# Rank the frequency-encoded Degree values: 1 = most common category.
# The distinct frequencies are ranked directly. Counting their occurrences
# instead would merge two categories that share a frequency into one entry with
# a doubled count, pushing them ahead of genuinely more common categories.
distinct_frequencies = sorted(data['Degree'].dropna().unique().tolist(), reverse=True)
rank_map = {frequency: rank for rank, frequency in enumerate(distinct_frequencies, start=1)}
rank_map

{0.32250238014802984: 1,
 0.22391818433094807: 2,
 0.16446055096587942: 3,
 0.05291606523141181: 4,
 0.042443413900064494: 5,
 0.03608611529129941: 6,
 0.032769263843248055: 7,
 0.02865391112066583: 8,
 0.019839685513344186: 9,
 0.0176898743896072: 10,
 0.01578575596572587: 11,
 0.013298117379687356: 12,
 0.012683885630048217: 13,
 0.010226958631491662: 14,
 0.005159546696968767: 15,
 0.001566290961579804: 16}

In [126]:
# Swap each frequency value for its rank (1 = most common degree).
data['Degree'] = data['Degree'].map(rank_map)
data.head(10)

Unnamed: 0,Employment,Degree,Status,Designation,family_job,Race,Country,Sex__ Male
0,State-gov,3,Never-married,1,Not-in-family,0.854274,29170,1
1,Self-emp-not-inc,3,Married-civ-spouse,4,Husband,0.854274,29170,1
2,Private,1,Divorced,6,Not-in-family,0.854274,29170,1
3,Private,6,Married-civ-spouse,6,Husband,0.095943,29170,1
4,Private,3,Married-civ-spouse,10,Wife,0.095943,95,0
5,Private,4,Married-civ-spouse,4,Wife,0.854274,29170,0
6,Private,11,Married-spouse-absent,8,Not-in-family,0.095943,81,0
7,Self-emp-not-inc,1,Married-civ-spouse,4,Husband,0.854274,29170,1
8,Private,4,Never-married,10,Not-in-family,0.854274,29170,0
9,Private,3,Married-civ-spouse,4,Husband,0.854274,29170,1


## Target Guided Encoding

In [161]:
# Reload the Titanic table; `data` no longer refers to the census frame after this.
data = pd.read_csv("data/titanic.csv", sep=",")
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [162]:
# NaN count per column; Cabin is missing for most passengers.
data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [163]:
# Number of distinct Cabin values, with NaN counted as one of them.
data['Cabin'].nunique(dropna=False)

148

In [164]:
# List the raw cabin codes: a deck letter plus a number, sometimes several per passenger.
data['Cabin'].unique()

array([nan, 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',
       'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',
       'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',
       'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4',
       'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35',
       'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19',
       'B49', 'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54',
       'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40',
       'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44',
       'A34', 'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14',
       'B37', 'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38',
       'B39', 'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68',
       'B41', 'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48',
       'E58', 'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63',
       'C62 C64',

In [165]:
# Label passengers without a recorded cabin with the placeholder deck 'M'.
data['Cabin'] = data['Cabin'].fillna('M')

In [166]:
# Strip every non-letter character so only the deck letter(s) remain,
# e.g. 'C85' -> 'C' and 'B57 B59 B63 B66' -> 'BBBB'. The vectorised string
# method replaces the earlier per-row `re.sub` lambda.
data['Cabin_re'] = data['Cabin'].astype(str).str.replace(r'[^A-Za-z]', '', regex=True)
data


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Cabin_re
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,M,S,M
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,M,S,M
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,C
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,M,S,M
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,M,S,M
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,B
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,M,S,M
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,C


In [167]:
# Check the new Cabin_re column next to the original Cabin values.
data.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Cabin_re
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,M,S,M
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,M,S,M
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,C
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,M,S,M


In [169]:
# Passengers with several cabins still carry repeated or mixed letters ('CCC', 'FG').
data['Cabin_re'].unique()

array(['M', 'C', 'E', 'G', 'D', 'A', 'CCC', 'B', 'F', 'FG', 'DD', 'BB',
       'FE', 'CC', 'BBBB', 'T', 'BBB'], dtype=object)

In [171]:
# Keep only the first deck letter, so 'CCC' -> 'C' and 'FG' -> 'F'.
data['Cabin_re'] = data['Cabin_re'].astype(str).str[0]

In [172]:
# Confirm each passenger now has a single deck letter.
data['Cabin_re'].unique()

array(['M', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [175]:
# Deck letters ordered by survival rate, lowest first.
survival_by_deck = data.groupby(['Cabin_re'])['Survived'].mean()
data_values = survival_by_deck.sort_values().index

In [177]:
# Lookup table: deck letter -> position in the ascending survival-rate order (0 = lowest).
data_cabin_map = {deck: rank for rank, deck in enumerate(data_values)}
data_cabin_map

{'T': 0, 'M': 1, 'A': 2, 'G': 3, 'C': 4, 'F': 5, 'B': 6, 'E': 7, 'D': 8}

In [178]:
# Replace each deck letter with its survival-rate rank (0 = lowest survival rate).
data['Cabin_re'] = data['Cabin_re'].map(data_cabin_map)
data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Cabin_re
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,M,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,4
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,M,S,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,4
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,M,S,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,M,S,1
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,6
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,M,S,1
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,4


In [179]:
# Every deck letter is now an integer rank.
data['Cabin_re'].unique()

array([1, 4, 7, 3, 8, 2, 6, 5, 0])

## Probability Ratio Encoding

In [192]:
# Reload only the two columns this section needs: the feature (Cabin) and the target (Survived).
data = pd.read_csv("data/titanic.csv", usecols=["Cabin","Survived"])

In [193]:
# Preview the two-column frame; most Cabin entries are missing.
data.head(10)

Unnamed: 0,Survived,Cabin
0,0,
1,1,C85
2,1,
3,1,C123
4,0,
5,0,
6,0,E46
7,0,
8,1,
9,1,


In [194]:
# Give missing cabins an explicit label; its first letter 'M' becomes their deck code below.
data['Cabin'] = data['Cabin'].fillna('Missing')

In [195]:
# Confirm the NaN cabins now read 'Missing'.
data.head(10)

Unnamed: 0,Survived,Cabin
0,0,Missing
1,1,C85
2,1,Missing
3,1,C123
4,0,Missing
5,0,Missing
6,0,E46
7,0,Missing
8,1,Missing
9,1,Missing


In [196]:
# Verify that no NaN values remain in either column.
data.isnull().sum()

Survived    0
Cabin       0
dtype: int64

In [197]:
# Raw cabin codes after filling, with 'Missing' now among them.
data['Cabin'].unique()

array(['Missing', 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',
       'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',
       'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',
       'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4',
       'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35',
       'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19',
       'B49', 'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54',
       'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40',
       'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44',
       'A34', 'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14',
       'B37', 'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38',
       'B39', 'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68',
       'B41', 'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48',
       'E58', 'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63',
       'C62

In [198]:
# Reduce every cabin code to its first character, i.e. the deck letter
# ('Missing' -> 'M').
data['Cabin'] = data['Cabin'].astype(str).str[0]
data['Cabin'].unique()

array(['M', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [199]:
# Survival rate per deck letter: the mean of the 0/1 Survived flag within each deck.
# pandas is already imported in the notebook's top import cell.
probability_map = data.groupby('Cabin')['Survived'].mean()
probability_map_df = probability_map.to_frame()
probability_map_df

Unnamed: 0_level_0,Survived
Cabin,Unnamed: 1_level_1
A,0.466667
B,0.744681
C,0.59322
D,0.757576
E,0.75
F,0.615385
G,0.5
M,0.299854
T,0.0


In [200]:
# Complementary probability: share of each deck's passengers who did not survive.
probability_map_df['Died'] = 1 - probability_map_df['Survived']
probability_map_df

Unnamed: 0_level_0,Survived,Died
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0.466667,0.533333
B,0.744681,0.255319
C,0.59322,0.40678
D,0.757576,0.242424
E,0.75,0.25
F,0.615385,0.384615
G,0.5,0.5
M,0.299854,0.700146
T,0.0,1.0


In [201]:
# Survival odds per deck: P(survived) / P(died).
# A deck where every passenger survived has P(died) == 0 and would produce an
# infinite ratio, so a tiny probability is substituted in that case to keep the
# encoding finite. Decks with P(died) > 0 are unaffected.
died_probability = probability_map_df['Died']
died_probability = died_probability.mask(died_probability == 0, 1e-5)
probability_map_df['Probability_ratio'] = probability_map_df['Survived'] / died_probability
probability_map_df

Unnamed: 0_level_0,Survived,Died,Probability_ratio
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,0.466667,0.533333,0.875
B,0.744681,0.255319,2.916667
C,0.59322,0.40678,1.458333
D,0.757576,0.242424,3.125
E,0.75,0.25,3.0
F,0.615385,0.384615,1.6
G,0.5,0.5,1.0
M,0.299854,0.700146,0.428274
T,0.0,1.0,0.0


In [203]:
# Lookup table: deck letter -> survival odds. This rebinds `probability_map`,
# which previously held the per-deck survival-rate Series.
probability_map = probability_map_df['Probability_ratio'].to_dict()
probability_map

{'A': 0.875,
 'B': 2.916666666666666,
 'C': 1.4583333333333333,
 'D': 3.125,
 'E': 3.0,
 'F': 1.6000000000000003,
 'G': 1.0,
 'M': 0.42827442827442824,
 'T': 0.0}

In [204]:
# Encode each deck letter as its survival odds.
data['Cabin'] = data['Cabin'].map(probability_map)

In [205]:
# Final frame: Cabin now holds the probability-ratio encoding.
data

Unnamed: 0,Survived,Cabin
0,0,0.428274
1,1,1.458333
2,1,0.428274
3,1,1.458333
4,0,0.428274
...,...,...
886,0,0.428274
887,1,2.916667
888,0,0.428274
889,1,1.458333
