# **Encoding**

In [1]:
# import library
import pandas as pd

In [2]:
# read data from kaggle
df =pd.read_csv('/kaggle/input/customer-purchase/CustomerPurchase.csv')

In [3]:
# read data from github
df = pd.read_csv('https://github.com/YBIFoundation/Dataset/raw/main/Customer%20Purchase.csv')

In [4]:
# display first 5 rows
df.head()

Unnamed: 0,Customer ID,Age,Gender,Education,Review,Purchased
0,1021,30,Female,School,Average,No
1,1022,68,Female,UG,Poor,No
2,1023,70,Female,PG,Good,No
3,1024,72,Female,PG,Good,No
4,1025,16,Female,UG,Average,No


In [5]:
# info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Customer ID  50 non-null     int64 
 1   Age          50 non-null     int64 
 2   Gender       50 non-null     object
 3   Education    50 non-null     object
 4   Review       50 non-null     object
 5   Purchased    50 non-null     object
dtypes: int64(2), object(4)
memory usage: 2.5+ KB


# **Label Encoding**

In [6]:
# define label (y)
y = df['Purchased']

In [7]:
# category count
y.value_counts()

No     26
Yes    24
Name: Purchased, dtype: int64

In [8]:
# label manual encoding 
y.replace({'No':0,'Yes':1}, inplace = True)

In [9]:
# category count
y.value_counts()

0    26
1    24
Name: Purchased, dtype: int64

In [10]:
# define label (y)
y = df['Purchased']

In [11]:
# lablel encoding with sklearn
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
y = le.fit_transform(y)

In [12]:
# encoding classes
le.classes_

array([0, 1])

# **Define Features (X)**

In [13]:
# define features (X)
X = df.drop(['Purchased', 'Customer ID'], axis=1)

In [14]:
X.head()

Unnamed: 0,Age,Gender,Education,Review
0,30,Female,School,Average
1,68,Female,UG,Poor
2,70,Female,PG,Good
3,72,Female,PG,Good
4,16,Female,UG,Average


**Nominal Variables : Gender**

**Ordinal Variables : Education and Review**

# **Nominal Features Encoding with Dummy Variable**

In [15]:
pd.get_dummies(X, columns = ['Gender'])

Unnamed: 0,Age,Education,Review,Gender_Female,Gender_Male
0,30,School,Average,1,0
1,68,UG,Poor,1,0
2,70,PG,Good,1,0
3,72,PG,Good,1,0
4,16,UG,Average,1,0
5,31,School,Average,1,0
6,18,School,Good,0,1
7,60,School,Poor,1,0
8,65,UG,Average,1,0
9,74,UG,Good,0,1


In [16]:
X = pd.get_dummies(X, columns = ['Gender'], drop_first=True)

In [17]:
X.head()

Unnamed: 0,Age,Education,Review,Gender_Male
0,30,School,Average,0
1,68,UG,Poor,0
2,70,PG,Good,0
3,72,PG,Good,0
4,16,UG,Average,0


# **Ordinal Feature Encoding**

In [18]:
X['Education'].value_counts()

PG        18
School    16
UG        16
Name: Education, dtype: int64

In [19]:
X.replace({'Education':{'School':0,'UG':1,'PG':2}}, inplace=True)

In [20]:
X['Education'].value_counts()

2    18
0    16
1    16
Name: Education, dtype: int64

In [21]:
X['Review'].value_counts()

Poor       18
Good       18
Average    14
Name: Review, dtype: int64

In [22]:
X.replace({'Review':{'Poor':0,'Average':1,'Good':2}}, inplace=True)

In [23]:
X['Review'].value_counts()

0    18
2    18
1    14
Name: Review, dtype: int64

In [24]:
X.head()

Unnamed: 0,Age,Education,Review,Gender_Male
0,30,0,1,0
1,68,1,0,0
2,70,2,2,0
3,72,2,2,0
4,16,1,1,0


In [25]:
# define features (X)
X = df.drop(['Purchased', 'Customer ID'], axis=1)
X.head()

Unnamed: 0,Age,Gender,Education,Review
0,30,Female,School,Average
1,68,Female,UG,Poor
2,70,Female,PG,Good
3,72,Female,PG,Good
4,16,Female,UG,Average


In [26]:
# one hot encoding
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder()
X

Unnamed: 0,Age,Gender,Education,Review
0,30,Female,School,Average
1,68,Female,UG,Poor
2,70,Female,PG,Good
3,72,Female,PG,Good
4,16,Female,UG,Average
5,31,Female,School,Average
6,18,Male,School,Good
7,60,Female,School,Poor
8,65,Female,UG,Average
9,74,Male,UG,Good


In [27]:
X = ohe.fit_transform(X[['Gender']])
X.toarray()

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.]])

In [28]:
X.toarray()

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.]])