# Encoding
## Converting discrete categorical variable to discrete numerical variable
## There are two types of categorical variables 
##   1-Nominal(eg. item type)
##   2-Ordinal(eg. outlet_size(small,medium,high)

In [1]:
import numpy as np
import pandas as pd

In [2]:
import os

In [3]:
os.getcwd()

'C:\\Users\\KIRTI RAJ PRADHAN'

In [4]:
os.chdir("C:\\Users\\KIRTI RAJ PRADHAN\\Downloads")

In [5]:
df=pd.read_csv("homeprices.csv")

In [6]:
df

Unnamed: 0,town,area,price
0,Chennai,2600,5500000
1,Chennai,3000,5650000
2,Chennai,3200,6100000
3,Chennai,3600,6800000
4,Bangalore,2600,5850000
5,Bangalore,2800,6150000
6,Bangalore,3300,6500000
7,Bangalore,3600,7100000
8,Hyderabad,2600,5750000
9,Hyderabad,2900,6000000


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   town    12 non-null     object
 1   area    12 non-null     int64 
 2   price   12 non-null     int64 
dtypes: int64(2), object(1)
memory usage: 416.0+ bytes


In [8]:
df["town"].value_counts()

Chennai      4
Bangalore    4
Hyderabad    4
Name: town, dtype: int64

## To convert nominal categorical into numeric
## Option 1: get_dummies() using pandas

In [9]:
df_du=pd.get_dummies(df)
df_du

Unnamed: 0,area,price,town_Bangalore,town_Chennai,town_Hyderabad
0,2600,5500000,0,1,0
1,3000,5650000,0,1,0
2,3200,6100000,0,1,0
3,3600,6800000,0,1,0
4,2600,5850000,1,0,0
5,2800,6150000,1,0,0
6,3300,6500000,1,0,0
7,3600,7100000,1,0,0
8,2600,5750000,0,0,1
9,2900,6000000,0,0,1


In [10]:
df_dum=pd.get_dummies(df,drop_first=True)
df_dum

Unnamed: 0,area,price,town_Chennai,town_Hyderabad
0,2600,5500000,1,0
1,3000,5650000,1,0
2,3200,6100000,1,0
3,3600,6800000,1,0
4,2600,5850000,0,0
5,2800,6150000,0,0
6,3300,6500000,0,0
7,3600,7100000,0,0
8,2600,5750000,0,1
9,2900,6000000,0,1


## Option 2: OneHotEncoding using sklearn

In [11]:
from sklearn.preprocessing import OneHotEncoder

In [12]:
enc=OneHotEncoder(drop='first')
enc_df=pd.DataFrame(enc.fit_transform(df[['town']]).toarray(),columns=['Chennai','Hyderabad'])
df_ohe=pd.concat([df,enc_df],axis='columns')
df_ohe.drop('town',axis='columns',inplace=True)
df_ohe

Unnamed: 0,area,price,Chennai,Hyderabad
0,2600,5500000,1.0,0.0
1,3000,5650000,1.0,0.0
2,3200,6100000,1.0,0.0
3,3600,6800000,1.0,0.0
4,2600,5850000,0.0,0.0
5,2800,6150000,0.0,0.0
6,3300,6500000,0.0,0.0
7,3600,7100000,0.0,0.0
8,2600,5750000,0.0,1.0
9,2900,6000000,0.0,1.0


In [13]:
df1=pd.DataFrame({'size':['small','medium','high']})
df1

Unnamed: 0,size
0,small
1,medium
2,high


In [14]:
df1['size'].value_counts()

small     1
medium    1
high      1
Name: size, dtype: int64

### Label Encoding
***In Label encoding, each category is assigned a value from 1 to N, where N is the number of categories of that feature.***

***It converts to numeric as per alphabetical order.***

In [15]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
df1['size_le_enc']=le.fit_transform(df1['size'])
df1

Unnamed: 0,size,size_le_enc
0,small,2
1,medium,1
2,high,0


### Ordinal Encoding
***convert to numeric as per given order in the function (ascending order)***

In [16]:
from sklearn.preprocessing import OrdinalEncoder
oe = OrdinalEncoder(categories=[['small','medium','high']])
df1['size_ord_enc']=oe.fit_transform(df1[['size']])
df1

Unnamed: 0,size,size_le_enc,size_ord_enc
0,small,2,0.0
1,medium,1,1.0
2,high,0,2.0


### Feature Mapping
**convert to numeric, by mapping each category to a value***

In [17]:
df1['size_fm_pan'] = df1['size'].map({'small':0,'medium':1,'high':2})
df1

Unnamed: 0,size,size_le_enc,size_ord_enc,size_fm_pan
0,small,2,0.0,0
1,medium,1,1.0,1
2,high,0,2.0,2


In [18]:
df=pd.DataFrame({'town':['chennai','Bangalore','Hyderabad']})
df

Unnamed: 0,town
0,chennai
1,Bangalore
2,Hyderabad


In [19]:
df['town'].value_counts()

chennai      1
Bangalore    1
Hyderabad    1
Name: town, dtype: int64

In [21]:
### OneHotEncoding
from sklearn.preprocessing import OneHotEncoder
enc=OneHotEncoder(drop='first')
enc_df=pd.DataFrame(enc.fit_transform(df[['town']]).toarray(),columns=['Chennai','Hyderabad'])
df_ohe=pd.concat([df,enc_df],axis='columns')
df_ohe

Unnamed: 0,town,Chennai,Hyderabad
0,chennai,0.0,1.0
1,Bangalore,0.0,0.0
2,Hyderabad,1.0,0.0


In [22]:
## Dummy Encoding
dum = pd.get_dummies(df['town'],drop_first=True)
df_dum = pd.concat([df,dum],axis='columns')
df_dum

Unnamed: 0,town,Hyderabad,chennai
0,chennai,0,1
1,Bangalore,0,0
2,Hyderabad,1,0
