## Handling Categorical Features

### One Hot Encoding

In [70]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [71]:
df = pd.read_csv("titanic_train.csv",usecols=['Sex'])
df.head()

Unnamed: 0,Sex
0,male
1,female
2,female
3,female
4,male


In [72]:
pd.get_dummies(df).head()

Unnamed: 0,Sex_female,Sex_male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1


In [73]:
pd.get_dummies(df,drop_first=True).head()

Unnamed: 0,Sex_male
0,1
1,0
2,0
3,0
4,1


In [74]:
df = pd.read_csv("titanic_train.csv",usecols=["Embarked"])
df.head()

Unnamed: 0,Embarked
0,S
1,C
2,S
3,S
4,S


In [75]:
df.Embarked.unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [76]:
df.dropna(inplace=True)

In [77]:
pd.get_dummies(df).head()

Unnamed: 0,Embarked_C,Embarked_Q,Embarked_S
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1


In [78]:
pd.get_dummies(df,drop_first=True).head()

Unnamed: 0,Embarked_Q,Embarked_S
0,0,1
1,0,0
2,0,1
3,0,1
4,0,1


In [79]:
df = pd.read_csv("train.csv",usecols=['X0','X1',"X2",'X3',"X4","X5",'X6'])
df.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6
0,k,v,at,a,d,u,j
1,k,t,av,e,d,y,l
2,az,w,n,c,d,x,j
3,az,t,n,f,d,x,l
4,az,v,n,f,d,h,d


In [80]:
## printing count of unique labels for each variable
for i in df.columns:
    print(len(df[i].unique()))

47
27
44
7
4
29
12


In [81]:
def one_hot_encoding(df,var):
    if len(df[var]) >10:
        Top_10_labels = df[var].value_counts().sort_values(ascending=False).head(10).index
    
    else:
         Top_10_labels = df[var].value_counts().sort_values(ascending=False).index
    
    for label in Top_10_labels:
        df[var+'_'+label] = np.where(df[var]==label,1,0) 
    

In [82]:
for i in ['X0','X1','X2','X3','X4','X5','X6']:
    one_hot_encoding(df,i)
    

In [83]:
df.columns

Index(['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X0_z', 'X0_ak', 'X0_y',
       'X0_ay', 'X0_t', 'X0_x', 'X0_o', 'X0_f', 'X0_n', 'X0_w', 'X1_aa',
       'X1_s', 'X1_b', 'X1_l', 'X1_v', 'X1_r', 'X1_i', 'X1_a', 'X1_c', 'X1_o',
       'X2_as', 'X2_ae', 'X2_ai', 'X2_m', 'X2_ak', 'X2_r', 'X2_n', 'X2_s',
       'X2_f', 'X2_e', 'X3_c', 'X3_f', 'X3_a', 'X3_d', 'X3_g', 'X3_e', 'X3_b',
       'X4_d', 'X4_a', 'X4_c', 'X4_b', 'X5_v', 'X5_w', 'X5_q', 'X5_r', 'X5_d',
       'X5_s', 'X5_n', 'X5_p', 'X5_m', 'X5_i', 'X6_g', 'X6_j', 'X6_d', 'X6_i',
       'X6_l', 'X6_a', 'X6_h', 'X6_k', 'X6_c', 'X6_b'],
      dtype='object')

In [84]:
df.shape

(4209, 68)

In [85]:
df.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X0_z,X0_ak,X0_y,...,X6_g,X6_j,X6_d,X6_i,X6_l,X6_a,X6_h,X6_k,X6_c,X6_b
0,k,v,at,a,d,u,j,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,k,t,av,e,d,y,l,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,az,w,n,c,d,x,j,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,az,t,n,f,d,x,l,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,az,v,n,f,d,h,d,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [86]:
df.drop(columns=["X0",'X1',"X2","X3","X4","X5","X6"])
df.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X0_z,X0_ak,X0_y,...,X6_g,X6_j,X6_d,X6_i,X6_l,X6_a,X6_h,X6_k,X6_c,X6_b
0,k,v,at,a,d,u,j,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,k,t,av,e,d,y,l,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,az,w,n,c,d,x,j,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,az,t,n,f,d,x,l,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,az,v,n,f,d,h,d,0,0,0,...,0,0,1,0,0,0,0,0,0,0


## Ordinal Number Encoding 

In [87]:
import datetime

In [88]:
today_date = datetime.datetime.today()

In [89]:
today_date

datetime.datetime(2021, 4, 29, 13, 22, 24, 420591)

In [90]:
today_date - datetime.timedelta(1)

datetime.datetime(2021, 4, 28, 13, 22, 24, 420591)

In [91]:
today_date - datetime.timedelta(2)

datetime.datetime(2021, 4, 27, 13, 22, 24, 420591)

In [92]:
days = [today_date - datetime.timedelta(x) for x in range(0,15)]

In [93]:
data = pd.DataFrame(days)
data.columns = ['Day']
data.head()

Unnamed: 0,Day
0,2021-04-29 13:22:24.420591
1,2021-04-28 13:22:24.420591
2,2021-04-27 13:22:24.420591
3,2021-04-26 13:22:24.420591
4,2021-04-25 13:22:24.420591


In [94]:
data['weekday']=data['Day'].dt.day_name()
data.head()

Unnamed: 0,Day,weekday
0,2021-04-29 13:22:24.420591,Thursday
1,2021-04-28 13:22:24.420591,Wednesday
2,2021-04-27 13:22:24.420591,Tuesday
3,2021-04-26 13:22:24.420591,Monday
4,2021-04-25 13:22:24.420591,Sunday


In [95]:
dictionary ={'Monday':1,'Tuesday':2,'Wednesday':3,'Thursday':4,'Friday':5,
             'Saturday':6,'Sunday':7
    
}

In [96]:
dictionary

{'Monday': 1,
 'Tuesday': 2,
 'Wednesday': 3,
 'Thursday': 4,
 'Friday': 5,
 'Saturday': 6,
 'Sunday': 7}

In [97]:
data['weekday_ordinal'] = data["weekday"].map(dictionary)
data.head()

Unnamed: 0,Day,weekday,weekday_ordinal
0,2021-04-29 13:22:24.420591,Thursday,4
1,2021-04-28 13:22:24.420591,Wednesday,3
2,2021-04-27 13:22:24.420591,Tuesday,2
3,2021-04-26 13:22:24.420591,Monday,1
4,2021-04-25 13:22:24.420591,Sunday,7


## Count/Frequency Encoding

In [98]:
train_set = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data' , header = None,index_col=None)
train_set.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [99]:
columns=[1,3,5,6,7,8,9,13]
train_set = train_set[columns]


In [100]:
train_set.columns =['Employment','Degree','Marital_status','Designation','Family_job','Race','Sex','Country']

In [101]:
train_set.head()

Unnamed: 0,Employment,Degree,Marital_status,Designation,Family_job,Race,Sex,Country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba


In [102]:
for feature in train_set.columns:
    print(feature,' : ',len(train_set[feature].unique()),'labels')

Employment  :  9 labels
Degree  :  16 labels
Marital_status  :  7 labels
Designation  :  15 labels
Family_job  :  6 labels
Race  :  5 labels
Sex  :  2 labels
Country  :  42 labels


In [103]:
train_set['Country'].value_counts().to_dict()

{' United-States': 29170,
 ' Mexico': 643,
 ' ?': 583,
 ' Philippines': 198,
 ' Germany': 137,
 ' Canada': 121,
 ' Puerto-Rico': 114,
 ' El-Salvador': 106,
 ' India': 100,
 ' Cuba': 95,
 ' England': 90,
 ' Jamaica': 81,
 ' South': 80,
 ' China': 75,
 ' Italy': 73,
 ' Dominican-Republic': 70,
 ' Vietnam': 67,
 ' Guatemala': 64,
 ' Japan': 62,
 ' Poland': 60,
 ' Columbia': 59,
 ' Taiwan': 51,
 ' Haiti': 44,
 ' Iran': 43,
 ' Portugal': 37,
 ' Nicaragua': 34,
 ' Peru': 31,
 ' France': 29,
 ' Greece': 29,
 ' Ecuador': 28,
 ' Ireland': 24,
 ' Hong': 20,
 ' Cambodia': 19,
 ' Trinadad&Tobago': 19,
 ' Laos': 18,
 ' Thailand': 18,
 ' Yugoslavia': 16,
 ' Outlying-US(Guam-USVI-etc)': 14,
 ' Honduras': 13,
 ' Hungary': 13,
 ' Scotland': 12,
 ' Holand-Netherlands': 1}

In [104]:
train_set['Country'] = np.where(train_set['Country']==' ?',"Others",train_set['Country'])
train_set.Country.value_counts()

 United-States                 29170
 Mexico                          643
Others                           583
 Philippines                     198
 Germany                         137
 Canada                          121
 Puerto-Rico                     114
 El-Salvador                     106
 India                           100
 Cuba                             95
 England                          90
 Jamaica                          81
 South                            80
 China                            75
 Italy                            73
 Dominican-Republic               70
 Vietnam                          67
 Guatemala                        64
 Japan                            62
 Poland                           60
 Columbia                         59
 Taiwan                           51
 Haiti                            44
 Iran                             43
 Portugal                         37
 Nicaragua                        34
 Peru                             31
 

In [105]:
country_map = train_set['Country'].value_counts().to_dict()

In [106]:
train_set['Country'] = train_set['Country'].map(country_map) 

In [107]:
train_set.head(20)

Unnamed: 0,Employment,Degree,Marital_status,Designation,Family_job,Race,Sex,Country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,29170
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,29170
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,29170
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,29170
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,95
5,Private,Masters,Married-civ-spouse,Exec-managerial,Wife,White,Female,29170
6,Private,9th,Married-spouse-absent,Other-service,Not-in-family,Black,Female,81
7,Self-emp-not-inc,HS-grad,Married-civ-spouse,Exec-managerial,Husband,White,Male,29170
8,Private,Masters,Never-married,Prof-specialty,Not-in-family,White,Female,29170
9,Private,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,29170


#### Advantages
1. Easy to Use
2. Not increasing any feature space

#### Disadvantages
1. It will provide the same weight if the frequencies are same

## Target Guided Ordinal Encoding
1. Ordering the labels according to the target
2. replace the labels by the joint probability of being 1 or 0

In [108]:
df = pd.read_csv("titanic_train.csv",usecols=['Cabin','Survived'])
df.head()

Unnamed: 0,Survived,Cabin
0,0,
1,1,C85
2,1,
3,1,C123
4,0,


In [109]:
df.Cabin.fillna('Missing',inplace=True)
df.head()

Unnamed: 0,Survived,Cabin
0,0,Missing
1,1,C85
2,1,Missing
3,1,C123
4,0,Missing


In [110]:
df['Cabin'] = df.Cabin.astype(str).str[0]
df.head()

Unnamed: 0,Survived,Cabin
0,0,M
1,1,C
2,1,M
3,1,C
4,0,M


In [111]:
df.Cabin.unique()

array(['M', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [112]:
df.groupby(['Cabin'])['Survived'].mean()

Cabin
A    0.466667
B    0.744681
C    0.593220
D    0.757576
E    0.750000
F    0.615385
G    0.500000
M    0.299854
T    0.000000
Name: Survived, dtype: float64

In [113]:
df.groupby(['Cabin'])['Survived'].mean().sort_values().index

Index(['T', 'M', 'A', 'G', 'C', 'F', 'B', 'E', 'D'], dtype='object', name='Cabin')

In [114]:
ordinal_labels = df.groupby(['Cabin'])['Survived'].mean().sort_values().index

In [115]:
ordinal_labels2 = {k:i for i,k in enumerate(ordinal_labels)}
ordinal_labels2

{'T': 0, 'M': 1, 'A': 2, 'G': 3, 'C': 4, 'F': 5, 'B': 6, 'E': 7, 'D': 8}

In [116]:
df['Cabin_ordinal_labels'] = df.Cabin.map(ordinal_labels2)
df.head()

Unnamed: 0,Survived,Cabin,Cabin_ordinal_labels
0,0,M,1
1,1,C,4
2,1,M,1
3,1,C,4
4,0,M,1


## Mean Encoding

In [117]:
df.groupby(['Cabin'])['Survived'].mean()

Cabin
A    0.466667
B    0.744681
C    0.593220
D    0.757576
E    0.750000
F    0.615385
G    0.500000
M    0.299854
T    0.000000
Name: Survived, dtype: float64

In [118]:
mean_ordinal = df.groupby(['Cabin'])['Survived'].mean().to_dict()

In [119]:
df['mean_ordinal_encode'] = df['Cabin'].map(mean_ordinal)
df.head()

Unnamed: 0,Survived,Cabin,Cabin_ordinal_labels,mean_ordinal_encode
0,0,M,1,0.299854
1,1,C,4,0.59322
2,1,M,1,0.299854
3,1,C,4,0.59322
4,0,M,1,0.299854


#### 

## Probability Ratio Encoding
1. Probability of survived based on Cabin ---- categorical feature
2. probability of Not survived ---- 1- p(survived)
3. p(survived)/p(Not survived)
4. dictionary to map cabin with probability
5. replace with the categorical feature

In [120]:
df = pd.read_csv('titanic_train.csv',usecols=['Cabin','Survived'])
df.head()

Unnamed: 0,Survived,Cabin
0,0,
1,1,C85
2,1,
3,1,C123
4,0,


In [121]:
df['Cabin'].fillna('Missing',inplace=True)
df.head()

Unnamed: 0,Survived,Cabin
0,0,Missing
1,1,C85
2,1,Missing
3,1,C123
4,0,Missing


In [122]:
df.Cabin.unique()

array(['Missing', 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',
       'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',
       'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',
       'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4',
       'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35',
       'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19',
       'B49', 'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54',
       'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40',
       'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44',
       'A34', 'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14',
       'B37', 'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38',
       'B39', 'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68',
       'B41', 'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48',
       'E58', 'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63',
       'C62

In [123]:
df.Cabin = df.Cabin.astype(str).str[0]
df.head()

Unnamed: 0,Survived,Cabin
0,0,M
1,1,C
2,1,M
3,1,C
4,0,M


In [124]:
df.Cabin.unique()

array(['M', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [125]:
df.groupby(['Cabin'])['Survived'].mean()

Cabin
A    0.466667
B    0.744681
C    0.593220
D    0.757576
E    0.750000
F    0.615385
G    0.500000
M    0.299854
T    0.000000
Name: Survived, dtype: float64

In [126]:
prob = df.groupby(['Cabin'])['Survived'].mean()

In [127]:
prob = pd.DataFrame(prob)
prob

Unnamed: 0_level_0,Survived
Cabin,Unnamed: 1_level_1
A,0.466667
B,0.744681
C,0.59322
D,0.757576
E,0.75
F,0.615385
G,0.5
M,0.299854
T,0.0


In [128]:
prob["Died"] = 1-prob['Survived']
prob

Unnamed: 0_level_0,Survived,Died
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0.466667,0.533333
B,0.744681,0.255319
C,0.59322,0.40678
D,0.757576,0.242424
E,0.75,0.25
F,0.615385,0.384615
G,0.5,0.5
M,0.299854,0.700146
T,0.0,1.0


In [129]:
prob['Probability_Ratio'] = prob['Survived']/prob["Died"]
prob

Unnamed: 0_level_0,Survived,Died,Probability_Ratio
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,0.466667,0.533333,0.875
B,0.744681,0.255319,2.916667
C,0.59322,0.40678,1.458333
D,0.757576,0.242424,3.125
E,0.75,0.25,3.0
F,0.615385,0.384615,1.6
G,0.5,0.5,1.0
M,0.299854,0.700146,0.428274
T,0.0,1.0,0.0


In [130]:
prob.Probability_Ratio.to_dict()

{'A': 0.875,
 'B': 2.916666666666666,
 'C': 1.4583333333333333,
 'D': 3.125,
 'E': 3.0,
 'F': 1.6000000000000003,
 'G': 1.0,
 'M': 0.42827442827442824,
 'T': 0.0}

In [131]:
map_prob = prob.Probability_Ratio.to_dict()

In [132]:
df['Cabin_Encoded'] = df['Cabin'].map(map_prob)
df.head()

Unnamed: 0,Survived,Cabin,Cabin_Encoded
0,0,M,0.428274
1,1,C,1.458333
2,1,M,0.428274
3,1,C,1.458333
4,0,M,0.428274
