# Hello Categorical Feature Encoding

In [1]:
# Prerequisites
import sys
import numpy as np
import pandas as pd

# Compact array display: 3 decimals, no scientific notation for small values
np.set_printoptions(precision=3, suppress=True)

# Report the interpreter and NumPy versions this notebook is running on
for label, version in (("Python Version: ", sys.version), ("Numpy Version: ", np.__version__)):
    print(label, version)

Python Version:  3.12.7 (tags/v3.12.7:0b05ead, Oct  1 2024, 03:06:41) [MSC v.1941 64 bit (AMD64)]
Numpy Version:  2.2.2


### Perform Label and One-Hot Encoding on a 2-Dimensional NumPy Feature Array

In [2]:
import random

# Seed the stdlib RNG so that "Restart Kernel -> Run All" regenerates exactly
# the same synthetic data set (and therefore the same encodings below).
SEED = 42
random.seed(SEED)

nr_samples = 15

# Generate some random numerical features (8 columns), drawn uniformly between -4 and 4
X_num_np = np.array([[random.uniform(-4, 4) for _ in range(8)] for _ in range(nr_samples)])

# Generate a dummy categorical feature with 5 possible categories, as a column vector
categories = ['A', 'B', 'C', 'D', 'E']
X_categ_np = np.array([random.choice(categories) for _ in range(nr_samples)]).reshape(-1, 1)

# Concatenate numeric and categorical features into a single NumPy array,
# placing the categorical column at index 2 (after the first two numeric columns).
# NOTE: a NumPy array has a single dtype, so mixing floats with strings makes the
# whole array a string array -- the numeric columns are stored as text here
# (see the dtype printed below).
X = np.hstack((X_num_np[:, :2], X_categ_np, X_num_np[:, 2:]))

# Generate binary classification labels (0 or 1) as a NumPy array
y = np.array([random.randint(0, 1) for _ in range(nr_samples)])

print("X shape: ", X.shape)
print("X datatype: ", X.dtype)

# Wrap in a pandas DataFrame purely for nicer display of the first rows
df_X = pd.DataFrame(X)
df_X.head()


X shape:  (15, 9)
X datatype:  <U32


Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.8667449532177729,2.4831070053168647,E,-0.8131764969802964,-1.6667789037311334,3.0146037832220136,-0.9850546369852902,-2.266307284365543,-2.3613989150609083
1,3.802316877585179,-0.1304730102905233,A,0.8166382809127439,-2.201343553757871,2.123443555757164,-3.088712174377644,1.1248374768955998,1.629355986507428
2,0.5979459227408137,0.0203386036326156,A,-3.0753227278867463,-0.4617168949897747,-2.867495973529937,2.5869984342731094,0.2333595259013767,-2.5787844348675097
3,1.2755912993707677,-0.8315184764906238,E,3.467061674056156,1.9517183535605265,3.644287473440136,1.650084102361566,-2.166417286789379,-1.510142692889323
4,-0.7863868482231817,-0.1697550430068863,E,3.4669014151253874,-1.972779872175062,2.1163484007248456,-0.5304585749490149,-3.917403407323147,-0.5638175463389388


### Label Encode the Categorical Column 2

In [4]:
from sklearn.preprocessing import LabelEncoder

# Learn the category -> integer code mapping from the categorical column (index 2).
# NOTE: scikit-learn documents LabelEncoder as a target (y) encoder; OrdinalEncoder
# is its feature-oriented counterpart. It is used here to demonstrate the encoding.
le = LabelEncoder().fit(X[:, 2])

# Encode into a copy so the original feature array X is left unmodified
X_le = X.copy()
X_le[:, 2] = le.transform(X[:, 2])
print("X shape after label encoding: ", X_le.shape)

# Wrap in a pandas DataFrame purely for nicer display of the first rows
df_X_le = pd.DataFrame(X_le)
df_X_le.head()

X shape after label encoding:  (15, 9)


Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.8667449532177729,2.4831070053168647,4,-0.8131764969802964,-1.6667789037311334,3.0146037832220136,-0.9850546369852902,-2.266307284365543,-2.3613989150609083
1,3.802316877585179,-0.1304730102905233,0,0.8166382809127439,-2.201343553757871,2.123443555757164,-3.088712174377644,1.1248374768955998,1.629355986507428
2,0.5979459227408137,0.0203386036326156,0,-3.0753227278867463,-0.4617168949897747,-2.867495973529937,2.5869984342731094,0.2333595259013767,-2.5787844348675097
3,1.2755912993707677,-0.8315184764906238,4,3.467061674056156,1.9517183535605265,3.644287473440136,1.650084102361566,-2.166417286789379,-1.510142692889323
4,-0.7863868482231817,-0.1697550430068863,4,3.4669014151253874,-1.972779872175062,2.1163484007248456,-0.5304585749490149,-3.917403407323147,-0.5638175463389388


Display the mapping from each original category value to its encoded integer.

In [6]:
# Position of each class in le.classes_ is the integer code assigned to it
mapping = {label: code for code, label in enumerate(le.classes_)}

# Display the mapping
print("Original Value -> Encoded Values")
for original, encoded in mapping.items():
    print(f"{original} -> {encoded}")

Original Value -> Encoded Values
A -> 0
B -> 1
C -> 2
D -> 3
E -> 4


### One-Hot Encode the Categorical Column 2

In [7]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# One-hot encode column index 2 and pass all other columns through unchanged.
# In the result the one-hot columns come first, followed by the passthrough columns.
#
# sparse_threshold=0 forces ColumnTransformer to always return a dense array.
# With the default threshold, a sparser result (e.g. many more categories) would
# come back as a SciPy sparse matrix, which np.array() cannot turn into a regular
# 2-D array -- the shape print and the DataFrame below would then be wrong.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [2])],
    remainder='passthrough',
    sparse_threshold=0,
)
X_ohe = np.array(ct.fit_transform(X.copy()))
print("X shape after one-hot encoding: ", X_ohe.shape)

# Wrap in a pandas DataFrame purely for nicer display of the first rows
df_X_ohe = pd.DataFrame(X_ohe)
df_X_ohe.head()

X shape after one-hot encoding:  (15, 13)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,0.0,0.0,0.0,1.0,0.8667449532177729,2.4831070053168647,-0.8131764969802964,-1.6667789037311334,3.0146037832220136,-0.9850546369852902,-2.266307284365543,-2.3613989150609083
1,1.0,0.0,0.0,0.0,0.0,3.802316877585179,-0.1304730102905233,0.8166382809127439,-2.201343553757871,2.123443555757164,-3.088712174377644,1.1248374768955998,1.629355986507428
2,1.0,0.0,0.0,0.0,0.0,0.5979459227408137,0.0203386036326156,-3.0753227278867463,-0.4617168949897747,-2.867495973529937,2.5869984342731094,0.2333595259013767,-2.5787844348675097
3,0.0,0.0,0.0,0.0,1.0,1.2755912993707677,-0.8315184764906238,3.467061674056156,1.9517183535605265,3.644287473440136,1.650084102361566,-2.166417286789379,-1.510142692889323
4,0.0,0.0,0.0,0.0,1.0,-0.7863868482231817,-0.1697550430068863,3.4669014151253874,-1.972779872175062,2.1163484007248456,-0.5304585749490149,-3.917403407323147,-0.5638175463389388
