In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every warning for the rest of the session to keep cell output uncluttered.
# NOTE(review): this also hides deprecation warnings raised by the imported libraries.
warnings.filterwarnings('ignore')

**Ordinal Data** is a type of categorical data that has a clear order or ranking between its values, but the intervals between the values are not necessarily equal or meaningful. It is commonly used in situations where the order of the categories matters but the differences between them are not quantified.

**Example — Pain Scale**: "No Pain" < "Mild Pain" < "Moderate Pain" < "Severe Pain".

This transformation (ordinal encoding) should be applied to the input columns, i.e. `X`.

**Label Encoding** - encodes target labels with values between 0 and n_classes - 1.

This transformation should be used to encode the target values, i.e. `y`, and not the input `X`.

In [2]:
# Synthetic demo data: two ordinal feature columns plus a binary target.
SEED = 42        # fixed seed so every run regenerates exactly the same sample
N_SAMPLES = 50   # row count shared by all three columns so their lengths always agree

# Seed NumPy's legacy global generator. The three draws below consume that single
# random stream, so their order must not change if the values are to stay the same.
np.random.seed(SEED)

# Ordinal feature 1 (Low < Medium < High)
ordinal_col1 = np.random.choice(['Low', 'Medium', 'High'], size=N_SAMPLES, p=[0.4, 0.4, 0.2])

# Ordinal feature 2 (Poor < Average < Good)
ordinal_col2 = np.random.choice(['Poor', 'Average', 'Good'], size=N_SAMPLES, p=[0.3, 0.5, 0.2])

# Binary target (yes/no), both outcomes equally likely
target_column = np.random.choice(['yes', 'no'], size=N_SAMPLES, p=[0.5, 0.5])

# Assemble the three columns into a single DataFrame
df = pd.DataFrame({
    'Ordinal_Col1': ordinal_col1,
    'Ordinal_Col2': ordinal_col2,
    'Target': target_column,
})


In [4]:
# Preview the first rows to confirm the three columns were generated as expected.
df.head()

Unnamed: 0,Ordinal_Col1,Ordinal_Col2,Target
0,Low,Good,yes
1,High,Average,no
2,Medium,Good,yes
3,Medium,Good,no
4,Low,Average,no


In [5]:
# Split into train/test sets first; the encoders in the following cells are then
# fitted on the training rows only.
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split

# Features are every column except 'Target'; 20% of the rows are held out for testing.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='Target'),
    df['Target'],
    test_size=0.2,
    random_state=42,
)

In [8]:
# Show only the first rows; displaying the whole training frame floods the notebook.
X_train.head(10)

Unnamed: 0,Ordinal_Col1,Ordinal_Col2
12,High,Good
4,Low,Average
37,Low,Average
8,Medium,Poor
3,Medium,Good
6,Low,Poor
41,Medium,Average
46,Low,Average
47,Medium,Average
15,Low,Average


In [6]:
from sklearn.preprocessing import OrdinalEncoder

# One explicit category list per feature column, written lowest -> highest, so the
# integer codes follow the intended ranking.
oe = OrdinalEncoder(
    categories=[
        ['Low', 'Medium', 'High'],    # order for Ordinal_Col1
        ['Poor', 'Average', 'Good'],  # order for Ordinal_Col2
    ]
)

# Learn the mapping from the training split only, then apply it to both splits.
oe.fit(X_train)
X_train_encoded = oe.transform(X_train)
X_test_encoded = oe.transform(X_test)

In [7]:
# Encoded training features. Column 0 is Ordinal_Col1 (Low=0, Medium=1, High=2) and
# column 1 is Ordinal_Col2 (Poor=0, Average=1, Good=2), per the category lists given
# to the OrdinalEncoder.
X_train_encoded


array([[2., 2.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 2.],
       [0., 0.],
       [1., 1.],
       [0., 1.],
       [1., 1.],
       [0., 1.],
       [1., 1.],
       [0., 0.],
       [1., 1.],
       [2., 1.],
       [0., 1.],
       [0., 2.],
       [0., 1.],
       [1., 0.],
       [2., 0.],
       [0., 2.],
       [0., 0.],
       [2., 0.],
       [0., 1.],
       [2., 1.],
       [0., 0.],
       [1., 2.],
       [2., 1.],
       [2., 1.],
       [0., 2.],
       [0., 0.],
       [0., 1.],
       [0., 0.],
       [1., 0.],
       [0., 0.],
       [1., 1.],
       [2., 0.],
       [0., 1.],
       [0., 0.],
       [1., 1.],
       [1., 2.]])

In [10]:
from sklearn.preprocessing import LabelEncoder

# Fit the label mapping on the training targets only, then reuse it for both splits.
le = LabelEncoder().fit(y_train)
y_train_encoded = le.transform(y_train)
y_test_encoded = le.transform(y_test)


In [12]:
# Integer-coded training targets; compared with y_train, 'no' maps to 0 and 'yes' maps to 1.
y_train_encoded

array([0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1])

In [13]:
# Show only the first rows of the original labels for comparison with the encoded array.
y_train.head(10)

Unnamed: 0,Target
12,no
4,no
37,no
8,yes
3,no
6,yes
41,yes
46,no
47,no
15,no
