## One Hot Encoding

This technique creates a binary column for each category in a categorical feature. 

If there are n categories in a feature, n binary columns are created. 

For example, if we have a feature called "Color" with categories Red, Green, and Blue, we would create three binary columns: Color_Red, Color_Green, and Color_Blue. A value of 1 is assigned to the column that corresponds to the category of the observation, and 0 to all other columns.

In [1]:
import pandas as pd

In [283]:
titanic1 = pd.read_csv("D:\\complete machine learning course\\titanic\\train.csv", usecols = ['Sex'])
titanic1.head()

Unnamed: 0,Sex
0,male
1,female
2,female
3,female
4,male


In [4]:
type(titanic1)

pandas.core.frame.DataFrame

In [5]:
pd.get_dummies(data = titanic1['Sex'], prefix= 'sex', drop_first=True).head()

Unnamed: 0,sex_male
0,1
1,0
2,0
3,0
4,1


In [6]:
titanic2 = pd.read_csv("D:\\complete machine learning course\\titanic\\train.csv", usecols = ['Embarked'])
titanic2.head()

Unnamed: 0,Embarked
0,S
1,C
2,S
3,S
4,S


In [7]:
titanic2['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [8]:
titanic2.value_counts()

Embarked
S           644
C           168
Q            77
dtype: int64

In [9]:
titanic2_copy = pd.get_dummies(data = titanic2['Embarked'], prefix = 'Embarked', drop_first=True)
titanic2_copy.head()

Unnamed: 0,Embarked_Q,Embarked_S
0,0,1
1,0,0
2,0,1
3,0,1
4,0,1


In [10]:
titanic2 = pd.concat([titanic2, titanic2_copy], axis = 1)

In [11]:
titanic2.head()

Unnamed: 0,Embarked,Embarked_Q,Embarked_S
0,S,0,1
1,C,0,0
2,S,0,1
3,S,0,1
4,S,0,1


In [12]:
titanic2.drop('Embarked', axis = 1, inplace = True)

In [13]:
titanic2.head()

Unnamed: 0,Embarked_Q,Embarked_S
0,0,1
1,0,0
2,0,1
3,0,1
4,0,1


In [14]:
df = pd.read_csv("D:\\complete machine learning course\\Feature-Engineering-Live-sessions-master\\Feature-Engineering-Live-sessions-master\\mercedes.csv", usecols = ['X0', 'X1', 'X2', 'X3', 'X4', 'X5'])
df.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5
0,k,v,at,a,d,u
1,k,t,av,e,d,y
2,az,w,n,c,d,x
3,az,t,n,f,d,x
4,az,v,n,f,d,h


In [15]:
df.columns

Index(['X0', 'X1', 'X2', 'X3', 'X4', 'X5'], dtype='object')

In [16]:
df['X0'].value_counts()

z     360
ak    349
y     324
ay    313
t     306
x     300
o     269
f     227
n     195
w     182
j     181
az    175
aj    151
s     106
ap    103
h      75
d      73
al     67
v      36
af     35
m      34
ai     34
e      32
ba     27
at     25
a      21
ax     19
aq     18
am     18
i      18
u      17
aw     16
l      16
ad     14
au     11
k      11
b      11
r      10
as     10
bc      6
ao      4
c       3
aa      2
q       2
ac      1
g       1
ab      1
Name: X0, dtype: int64

In [17]:
df['X0'].value_counts().index

Index(['z', 'ak', 'y', 'ay', 't', 'x', 'o', 'f', 'n', 'w', 'j', 'az', 'aj',
       's', 'ap', 'h', 'd', 'al', 'v', 'af', 'm', 'ai', 'e', 'ba', 'at', 'a',
       'ax', 'aq', 'am', 'i', 'u', 'aw', 'l', 'ad', 'au', 'k', 'b', 'r', 'as',
       'bc', 'ao', 'c', 'aa', 'q', 'ac', 'g', 'ab'],
      dtype='object')

In [28]:
pd.set_option('display.max_columns',300)
abc = pd.get_dummies(data = df['X0'])
abc.head()

Unnamed: 0,a,aa,ab,ac,ad,af,ai,aj,ak,al,am,ao,ap,aq,as,at,au,aw,ax,ay,az,b,ba,bc,c,d,e,f,g,h,i,j,k,l,m,n,o,q,r,s,t,u,v,w,x,y,z
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [32]:
df1 = df.copy()
df1 = pd.concat([df1, abc],axis = 1)

In [33]:
df1.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,a,aa,ab,ac,ad,af,ai,aj,ak,al,am,ao,ap,aq,as,at,au,aw,ax,ay,az,b,ba,bc,c,d,e,f,g,h,i,j,k,l,m,n,o,q,r,s,t,u,v,w,x,y,z
0,k,v,at,a,d,u,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,k,t,av,e,d,y,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,az,w,n,c,d,x,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,az,t,n,f,d,x,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,az,v,n,f,d,h,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [34]:
df1.drop(['X1','X2','X3','X4','X5'], axis = 1, inplace = True)

In [35]:
df1.head()

Unnamed: 0,X0,a,aa,ab,ac,ad,af,ai,aj,ak,al,am,ao,ap,aq,as,at,au,aw,ax,ay,az,b,ba,bc,c,d,e,f,g,h,i,j,k,l,m,n,o,q,r,s,t,u,v,w,x,y,z
0,k,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,k,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,az,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,az,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,az,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


##### For single feature if we are generating these many columns it may lead to overfitting and curse of dimensionalty

# Ordinal Number Encoding

In [36]:
import datetime

In [72]:
today = datetime.datetime.today()
print(today)

now = datetime.datetime.now()
print(now)


2023-04-02 16:52:42.189481
2023-04-02 16:52:42.190484


In [61]:
date = today.date()

print('date : ', date)

print('year : ', today.year)
print('year : ', date.year)

print('month : ', today.month)
print('month : ', date.month)

print('day   : ', today.day)
print('day   : ', date.day)

date :  2023-04-02
year :  2023
year :  2023
month :  4
month :  4
day   :  2
day   :  2


In [74]:
time = today.time()

print('time : ', time)

print('hour : ', time.hour)
print('hour : ', today.hour)

print('min  : ', time.minute)
print('min  : ', today.minute)

print('sec  : ', time.second)
print('sec  : ', today.second)

print('microsecond : ', time.microsecond)
print('microsecond : ', today.microsecond)


time :  16:52:42.189481
hour :  16
hour :  16
min  :  52
min  :  52
sec  :  42
sec  :  42
microsecond :  189481
microsecond :  189481


In [81]:
delta = datetime.timedelta(3)
delta

datetime.timedelta(days=3)

In [90]:
last_3_days = today - delta
last_3_days

datetime.datetime(2023, 3, 30, 16, 52, 42, 189481)

In [97]:
print(today.day)
print(last_3_days.day)
last_3_days.date() - today.date()

2
30


datetime.timedelta(days=-3)

In [114]:
days = [today - datetime.timedelta(i) for i in range(1,10)]
days

[datetime.datetime(2023, 4, 1, 16, 52, 42, 189481),
 datetime.datetime(2023, 3, 31, 16, 52, 42, 189481),
 datetime.datetime(2023, 3, 30, 16, 52, 42, 189481),
 datetime.datetime(2023, 3, 29, 16, 52, 42, 189481),
 datetime.datetime(2023, 3, 28, 16, 52, 42, 189481),
 datetime.datetime(2023, 3, 27, 16, 52, 42, 189481),
 datetime.datetime(2023, 3, 26, 16, 52, 42, 189481),
 datetime.datetime(2023, 3, 25, 16, 52, 42, 189481),
 datetime.datetime(2023, 3, 24, 16, 52, 42, 189481)]

In [120]:
dates = pd.DataFrame(days,columns=['Day'])
dates

Unnamed: 0,Day
0,2023-04-01 16:52:42.189481
1,2023-03-31 16:52:42.189481
2,2023-03-30 16:52:42.189481
3,2023-03-29 16:52:42.189481
4,2023-03-28 16:52:42.189481
5,2023-03-27 16:52:42.189481
6,2023-03-26 16:52:42.189481
7,2023-03-25 16:52:42.189481
8,2023-03-24 16:52:42.189481


In [123]:
dates['Day'].dtype

dtype('<M8[ns]')

In [137]:
dates['weekday'] = dates['Day'].dt.strftime('%A')
dates

Unnamed: 0,Day,weekday
0,2023-04-01 16:52:42.189481,Saturday
1,2023-03-31 16:52:42.189481,Friday
2,2023-03-30 16:52:42.189481,Thursday
3,2023-03-29 16:52:42.189481,Wednesday
4,2023-03-28 16:52:42.189481,Tuesday
5,2023-03-27 16:52:42.189481,Monday
6,2023-03-26 16:52:42.189481,Sunday
7,2023-03-25 16:52:42.189481,Saturday
8,2023-03-24 16:52:42.189481,Friday


In [170]:
dictionary = dict(zip(dates['weekday'].value_counts().index, range(1,8)))
print(dictionary.keys())

dict_keys(['Saturday', 'Friday', 'Thursday', 'Wednesday', 'Tuesday', 'Monday', 'Sunday'])


In [154]:
dates['weekday_encode'] = dates['weekday'].map(dictionary)

In [156]:
dates

Unnamed: 0,Day,weekday,weekday_encode
0,2023-04-01 16:52:42.189481,Saturday,1
1,2023-03-31 16:52:42.189481,Friday,2
2,2023-03-30 16:52:42.189481,Thursday,3
3,2023-03-29 16:52:42.189481,Wednesday,4
4,2023-03-28 16:52:42.189481,Tuesday,5
5,2023-03-27 16:52:42.189481,Monday,6
6,2023-03-26 16:52:42.189481,Sunday,7
7,2023-03-25 16:52:42.189481,Saturday,1
8,2023-03-24 16:52:42.189481,Friday,2


In [179]:
import sklearn
from sklearn.preprocessing import OrdinalEncoder

oe = OrdinalEncoder()
# if order is not matter
cols = ['weekday']
dates['weekday_ordinal'] = oe.fit_transform(dates[cols])

In [180]:
oe = OrdinalEncoder(categories=[['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']])
# when order has importance 
cols = ['weekday']
dates['weekday_ordinal_specified'] = oe.fit_transform(dates[cols])

In [182]:
dates

Unnamed: 0,Day,weekday,weekday_encode,weekday_ordinal,weekday_ordinal_specified,weekday.1
0,2023-04-01 16:52:42.189481,Saturday,1,2.0,6.0,2.0
1,2023-03-31 16:52:42.189481,Friday,2,0.0,5.0,0.0
2,2023-03-30 16:52:42.189481,Thursday,3,4.0,4.0,4.0
3,2023-03-29 16:52:42.189481,Wednesday,4,6.0,3.0,6.0
4,2023-03-28 16:52:42.189481,Tuesday,5,5.0,2.0,5.0
5,2023-03-27 16:52:42.189481,Monday,6,1.0,1.0,1.0
6,2023-03-26 16:52:42.189481,Sunday,7,3.0,0.0,3.0
7,2023-03-25 16:52:42.189481,Saturday,1,2.0,6.0,2.0
8,2023-03-24 16:52:42.189481,Friday,2,0.0,5.0,0.0


In [194]:
titanic = pd.read_csv("D:\\complete machine learning course\\titanic\\train.csv",usecols = ['Sex'])
titanic.head()

Unnamed: 0,Sex
0,male
1,female
2,female
3,female
4,male


In [195]:
sex_dict = titanic['Sex'].value_counts().to_dict()
sex_dict

{'male': 577, 'female': 314}

In [198]:
titanic['sex_count'] = titanic['Sex'].map(sex_dict)

In [199]:
titanic.head()

Unnamed: 0,Sex,sex_count
0,male,577
1,female,314
2,female,314
3,female,314
4,male,577


##### Advantages
1. Easy To Use
2. Not increasing feature space
##### Disadvantages
1. It will provide same weight if the frequencies are same

## Target Guided Ordinal Encoding

The steps for performing Target Guided Ordinal Encoding are as follows:

1. Calculate the mean of the target variable for each category of the categorical variable.
2. Sort the categories based on their mean target value.
3. Assign a numerical value to each category based on its position in the sorted list.

In [216]:
titanic_target = pd.read_csv("D:\\complete machine learning course\\titanic\\train.csv",usecols = ['Cabin','Survived'])
titanic_target.head()

Unnamed: 0,Survived,Cabin
0,0,
1,1,C85
2,1,
3,1,C123
4,0,


In [217]:
titanic_target.Cabin.fillna('Missing',inplace = True)
titanic_target.head()

Unnamed: 0,Survived,Cabin
0,0,Missing
1,1,C85
2,1,Missing
3,1,C123
4,0,Missing


In [233]:
titanic_target.Cabin= titanic_target.Cabin.str[0]

In [238]:
titanic_target.Cabin.value_counts()

M    687
C     59
B     47
D     33
E     32
A     15
F     13
G      4
T      1
Name: Cabin, dtype: int64

In [240]:
titanic_target.groupby('Cabin').mean()

Unnamed: 0_level_0,Survived
Cabin,Unnamed: 1_level_1
A,0.466667
B,0.744681
C,0.59322
D,0.757576
E,0.75
F,0.615385
G,0.5
M,0.299854
T,0.0


In [246]:
target = titanic_target.groupby('Cabin')['Survived'].mean().sort_values().index
target

Index(['T', 'M', 'A', 'G', 'C', 'F', 'B', 'E', 'D'], dtype='object', name='Cabin')

In [251]:
enumerate(target,0)

<enumerate at 0x1e2777efb40>

In [257]:
label_dict = {key:value for value, key in enumerate(target, 0)}
label_dict

{'T': 0, 'M': 1, 'A': 2, 'G': 3, 'C': 4, 'F': 5, 'B': 6, 'E': 7, 'D': 8}

In [262]:
titanic_target['Target_encode'] = titanic_target['Cabin'].map(label_dict)

In [264]:
titanic_target

Unnamed: 0,Survived,Cabin,Target_encode
0,0,M,1
1,1,C,4
2,1,M,1
3,1,C,4
4,0,M,1
...,...,...,...
886,0,M,1
887,1,B,6
888,0,M,1
889,1,C,4


## Mean Encoding

In [279]:
mean_en = titanic_target.groupby(['Cabin'])['Survived'].mean().to_dict()
mean_en

{'A': 0.4666666666666667,
 'B': 0.7446808510638298,
 'C': 0.5932203389830508,
 'D': 0.7575757575757576,
 'E': 0.75,
 'F': 0.6153846153846154,
 'G': 0.5,
 'M': 0.29985443959243085,
 'T': 0.0}

In [281]:
titanic_target['mean_en'] = titanic_target['Cabin'].map(mean_en)

In [282]:
titanic_target

Unnamed: 0,Survived,Cabin,Target_encode,mean_en
0,0,M,1,0.299854
1,1,C,4,0.593220
2,1,M,1,0.299854
3,1,C,4,0.593220
4,0,M,1,0.299854
...,...,...,...,...
886,0,M,1,0.299854
887,1,B,6,0.744681
888,0,M,1,0.299854
889,1,C,4,0.593220
