In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the customer records from the CSV (relative path) and preview the
# first five rows.
data = pd.read_csv(filepath_or_buffer="Customers.csv")
data.head(n=5)

Unnamed: 0,age,income,gender,m_status,buys
0,25,high,male,single,no
1,25,high,male,married,no
2,35,high,male,single,yes
3,35,medium,male,single,yes
4,30,low,female,single,yes


In [3]:
# Count missing values per column (isna is the canonical name for isnull).
data.isna().sum()

age         0
income      0
gender      0
m_status    0
buys        0
dtype: int64

# Manual encoding with pandas (no encoder classes)

In [4]:
# Look at the raw income labels before encoding them.
data["income"]

0       high
1       high
2       high
3     medium
4        low
5        low
6        low
7     medium
8        low
9     medium
10    medium
11    medium
12      high
13    medium
14      high
15      high
16    medium
17      high
18    medium
19      high
20    medium
21       low
Name: income, dtype: object

In [5]:
# Encode the ordered income labels as integers (low < medium < high).
# An explicit Series.map is used instead of Series.replace: replacing strings
# with numbers relies on replace() silently downcasting the object column,
# which newer pandas releases deprecate.
income_codes = data["income"].map({"high": 3, "medium": 2, "low": 1})

# map() yields NaN for anything outside the mapping (an unexpected label, or
# this cell being run a second time), so fail before overwriting the column.
assert income_codes.notna().all(), "income contains a value outside high/medium/low"

data["income"] = income_codes.astype("int64")
data["income"]

0     3
1     3
2     3
3     2
4     1
5     1
6     1
7     2
8     1
9     2
10    2
11    2
12    3
13    2
14    3
15    3
16    2
17    3
18    2
19    3
20    2
21    1
Name: income, dtype: int64

In [6]:
# The encoded values were already written back to data["income"] by the
# previous cell, so re-assigning the column to itself did nothing. Check the
# result of the encoding instead.
assert data["income"].isin([1, 2, 3]).all(), "income should only hold the codes 1, 2, 3"

In [7]:
# Show the frame with income now stored as integer codes.
data

Unnamed: 0,age,income,gender,m_status,buys
0,25,3,male,single,no
1,25,3,male,married,no
2,35,3,male,single,yes
3,35,2,male,single,yes
4,30,1,female,single,yes
5,32,1,female,single,no
6,22,1,female,married,yes
7,22,2,male,married,no
8,25,1,female,single,yes
9,35,2,female,married,yes


# Label Encoder

In [8]:
from sklearn.preprocessing import LabelEncoder

In [9]:
# Encoder that assigns an integer code to each distinct label it is fitted on.
le = LabelEncoder()

In [10]:
# Fit on the gender labels and overwrite the column with their integer codes
# (the displayed result has female -> 0 and male -> 1).
data["gender"] = le.fit_transform(data["gender"])

In [11]:
# Show the frame with gender label-encoded.
data

Unnamed: 0,age,income,gender,m_status,buys
0,25,3,1,single,no
1,25,3,1,married,no
2,35,3,1,single,yes
3,35,2,1,single,yes
4,30,1,0,single,yes
5,32,1,0,single,no
6,22,1,0,married,yes
7,22,2,1,married,no
8,25,1,0,single,yes
9,35,2,0,married,yes


# One-Hot Encoding (pandas get_dummies)

In [12]:
# One indicator column per distinct marital status.
dummy = pd.get_dummies(data["m_status"])

In [13]:
# Inspect the indicator columns ("married" and "single").
dummy

Unnamed: 0,married,single
0,0,1
1,1,0
2,0,1
3,0,1
4,0,1
5,0,1
6,1,0
7,1,0
8,0,1
9,1,0


In [14]:
# `dummy` holds two indicator columns, so it cannot be stored in the single
# m_status column as a whole: recent pandas raises a ValueError for that
# assignment. For a two-level category one indicator is enough, so keep
# "married" (1 = married, 0 = single), cast to int because newer pandas
# returns boolean dummies.
data["m_status"] = dummy["married"].astype(int)
data

Unnamed: 0,age,income,gender,m_status,buys
0,25,3,1,0,no
1,25,3,1,1,no
2,35,3,1,0,yes
3,35,2,1,0,yes
4,30,1,0,0,yes
5,32,1,0,0,no
6,22,1,0,1,yes
7,22,2,1,1,no
8,25,1,0,0,yes
9,35,2,0,1,yes


# Ordinal Encoder

In [15]:
from sklearn.preprocessing import OrdinalEncoder

In [16]:
# Re-display the frame before encoding the remaining string column, "buys".
data

Unnamed: 0,age,income,gender,m_status,buys
0,25,3,1,0,no
1,25,3,1,1,no
2,35,3,1,0,yes
3,35,2,1,0,yes
4,30,1,0,0,yes
5,32,1,0,0,no
6,22,1,0,1,yes
7,22,2,1,1,no
8,25,1,0,0,yes
9,35,2,0,1,yes


In [17]:
# List the distinct target labels so their order can be fixed for the encoder.
data["buys"].unique()

array(['no', 'yes'], dtype=object)

In [18]:
# Explicit category order: "no" is encoded as 0 and "yes" as 1.
li = ["no", "yes"]
enc = OrdinalEncoder(categories=[li])

In [19]:
# The encoder needs 2-D input, hence the one-column frame selection.
encoded = enc.fit_transform(data.loc[:, ["buys"]])

In [20]:
# Overwrite the string labels with the encoder's numeric codes.
data["buys"] = encoded

In [21]:
# Show the fully encoded frame; "buys" is displayed as 0.0 / 1.0.
data

Unnamed: 0,age,income,gender,m_status,buys
0,25,3,1,0,0.0
1,25,3,1,1,0.0
2,35,3,1,0,1.0
3,35,2,1,0,1.0
4,30,1,0,0,1.0
5,32,1,0,0,0.0
6,22,1,0,1,1.0
7,22,2,1,1,0.0
8,25,1,0,0,1.0
9,35,2,0,1,1.0


# Hashing Encoder

In [22]:
# Read the second dataset (relative path) and preview the first five rows.
data1 = pd.read_csv(filepath_or_buffer="agora.csv")
data1.head(n=5)

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [23]:
# Count missing values per column before imputing.
data1.isna().sum()

Marketing Spend    0
Administration     0
Transport          1
Area               0
Profit             0
dtype: int64

In [24]:
# Impute the missing Transport entry with the column mean.
data1["Transport"] = data1["Transport"].fillna(value=data1["Transport"].mean())

In [25]:
# Confirm that no missing values remain after the imputation.
data1.isna().sum()

Marketing Spend    0
Administration     0
Transport          0
Area               0
Profit             0
dtype: int64

In [27]:
!pip install category-encoders

Collecting category-encoders
  Downloading category_encoders-2.2.2-py2.py3-none-any.whl (80 kB)
Installing collected packages: category-encoders
Successfully installed category-encoders-2.2.2


In [29]:
import category_encoders as ce

In [37]:
# Hash the Area column into three output columns.
hash_ = ce.HashingEncoder(cols=["Area"], n_components=3)

In [38]:
# Fit the hashing encoder and display the transformed frame: Area is replaced
# by col_0..col_2. The result is only shown, not assigned to a variable.
hash_.fit_transform(data1)

Unnamed: 0,col_0,col_1,col_2,Marketing Spend,Administration,Transport,Profit
0,0,1,0,114523.61,136897.8,471784.1,192261.83
1,0,0,1,162597.7,151377.59,443898.53,191792.06
2,1,0,0,153441.51,101145.55,407934.54,191050.39
3,0,1,0,144372.41,118671.85,383199.62,182901.99
4,1,0,0,142107.34,91391.77,366168.42,166187.94
5,0,1,0,131876.9,99814.71,362861.36,156991.12
6,0,0,1,134615.46,147198.87,127716.82,156122.51
7,1,0,0,130298.13,145530.06,323876.68,155752.6
8,0,1,0,120542.52,148718.95,311613.29,152211.77
9,0,0,1,123334.88,108679.17,304981.62,149759.96
