In [15]:
# Implementation of Encoding Categorical Data - Ordinal Encoding
# Author: Muhammad Humayun Khan

import numpy as np
import pandas as pd

dataset = '/content/drive/MyDrive/datasets/customer.csv'
df = pd.read_csv(dataset)
df.sample(5)

Unnamed: 0,age,gender,review,education,purchased
29,83,Female,Average,UG,Yes
19,97,Male,Poor,PG,Yes
8,65,Female,Average,UG,No
46,64,Female,Poor,PG,No
31,22,Female,Poor,School,Yes


In [16]:
# the above dataset having some issues as
# gender is nominal data as same value to each gender
# age is numerical data
# review and education are ordinal as having different conditions or values
# purchased is labelled data
# In order to only use the ordinal encoding, needs to remove the gender and age

df = df.iloc[:,2:]
df.head()

Unnamed: 0,review,education,purchased
0,Average,School,No
1,Poor,UG,No
2,Good,PG,No
3,Good,PG,No
4,Average,UG,No


In [17]:
# import the ordinal encoder algo from preprocessing of the sklearn
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(df.iloc[:,0:2],df.iloc[:,-1],test_size=0.2)
X_train

Unnamed: 0,review,education
13,Average,School
6,Good,School
35,Poor,School
43,Poor,PG
10,Good,UG
28,Poor,School
15,Poor,UG
9,Good,UG
49,Good,UG
12,Poor,School


In [18]:
# Need to inform the encoder about the categories/value range
# The following Poor is low value followed by higher and same from School and UG, PG as well

oe = OrdinalEncoder(categories=[['Poor','Average','Good'],['School','UG','PG']])
oe.fit(X_train)

In [19]:
# Now transform the values such as 0, 1, 2 etc to the columns values
X_train = oe.transform(X_train)
X_train

array([[1., 0.],
       [2., 0.],
       [0., 0.],
       [0., 2.],
       [2., 1.],
       [0., 0.],
       [0., 1.],
       [2., 1.],
       [2., 1.],
       [0., 0.],
       [2., 0.],
       [0., 2.],
       [1., 2.],
       [0., 1.],
       [2., 2.],
       [0., 1.],
       [1., 1.],
       [0., 2.],
       [0., 2.],
       [2., 2.],
       [2., 0.],
       [1., 2.],
       [1., 1.],
       [2., 2.],
       [1., 0.],
       [0., 2.],
       [0., 2.],
       [1., 0.],
       [1., 1.],
       [1., 1.],
       [2., 1.],
       [1., 1.],
       [0., 2.],
       [0., 0.],
       [2., 0.],
       [2., 0.],
       [1., 0.],
       [0., 1.],
       [0., 2.],
       [0., 2.]])

In [20]:
oe.categories_

[array(['Poor', 'Average', 'Good'], dtype=object),
 array(['School', 'UG', 'PG'], dtype=object)]

In [21]:
X_train

array([[1., 0.],
       [2., 0.],
       [0., 0.],
       [0., 2.],
       [2., 1.],
       [0., 0.],
       [0., 1.],
       [2., 1.],
       [2., 1.],
       [0., 0.],
       [2., 0.],
       [0., 2.],
       [1., 2.],
       [0., 1.],
       [2., 2.],
       [0., 1.],
       [1., 1.],
       [0., 2.],
       [0., 2.],
       [2., 2.],
       [2., 0.],
       [1., 2.],
       [1., 1.],
       [2., 2.],
       [1., 0.],
       [0., 2.],
       [0., 2.],
       [1., 0.],
       [1., 1.],
       [1., 1.],
       [2., 1.],
       [1., 1.],
       [0., 2.],
       [0., 0.],
       [2., 0.],
       [2., 0.],
       [1., 0.],
       [0., 1.],
       [0., 2.],
       [0., 2.]])

In [22]:
# the output is the classification data and needs label encoder
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()     # create object
le.fit(y_train)         # train with the y data

In [23]:
le.classes_

array(['No', 'Yes'], dtype=object)

In [24]:
# transofrm the y-data to the digits as follow
y_train = le.transform(y_train)
y_test = le.transform(y_test)
y_train

array([0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1,
       0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1])