# Encoding Experiment

In [19]:
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

In [2]:
# Seed NumPy's global RNG so the np.random draws that follow are reproducible
# when the notebook is executed top to bottom.
np.random.seed(seed=42)

# Number of synthetic rows to generate.
n = 6_000

In [3]:
# Category vocabularies for the synthetic population.
# education_levels is listed from least to most schooling.
education_levels = [
    'Preschool',
    'Primary',
    'Middle',
    'HighSchool',
    'Bachelors',
    'Masters',
    'PhD',
]
job_types = ['Tech', 'Sales', 'Clerical', 'Labor', 'Management']
regions = ['Urban', 'Suburban', 'Rural']

# Draw the raw features from NumPy's global RNG. The order of these calls
# determines the generated values, so it must not be changed.
education = np.random.choice(education_levels, size=n)
job = np.random.choice(job_types, size=n)
region = np.random.choice(regions, size=n)
age = np.random.randint(low=18, high=65, size=n)  # high is exclusive -> ages 18..64

# Points each education level contributes to the latent score. The gaps widen
# after HighSchool (3 -> 5 -> 7 -> 9): the effect is ordered but not evenly spaced.
edu_map = dict(zip(education_levels, (0, 1, 2, 3, 5, 7, 9)))

# Latent score = small age effect + education points + Gaussian noise (sd 1.5).
# job_type and region do not enter the score, so they carry no signal.
edu_points = np.array([edu_map[level] for level in education])
noise = np.random.normal(loc=0, scale=1.5, size=n)
income_score = age * 0.03 + edu_points + noise

# Binary target: 1 for scores strictly above the 60th percentile (about 40% of rows).
cutoff = np.percentile(income_score, 60)
income = (income_score > cutoff).astype(int)

df = pd.DataFrame({
    'age': age,
    'education': education,
    'job_type': job,
    'region': region,
    'income': income,
})

df.head()

Unnamed: 0,age,education,job_type,region,income
0,58,PhD,Clerical,Suburban,1
1,21,HighSchool,Labor,Urban,0
2,46,Bachelors,Management,Urban,1
3,23,PhD,Clerical,Rural,1
4,18,Middle,Sales,Urban,0


# Train Test Split

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
# Target is the binary income flag; features are every other column of df.
y = df['income']
X = df.drop(columns='income')

# 75/25 split, stratified on y so both parts keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

In [6]:
# Sanity check: shapes of the four split parts, in (X_train, X_test, y_train, y_test) order.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((4500, 4), (1500, 4), (4500,), (1500,))

# EXPERIMENT 1: LabelEncoder on ALL Categorical Features

In [8]:
# Work on an independent copy so the raw feature frame X is left untouched.
X_le = X.copy(deep=True)

In [9]:
# Replace each categorical column with the integer codes produced by a fresh
# LabelEncoder. This is the deliberately naive encoding that experiment 1 measures.
categorical_columns = ['education', 'job_type', 'region']
for col in categorical_columns:
    le = LabelEncoder().fit(X_le[col])
    X_le[col] = le.transform(X_le[col])

In [10]:
# 75/25 stratified split of the label-encoded frame.
# Note: this rebinds X_train / X_test / y_train / y_test.
X_train, X_test, y_train, y_test = train_test_split(
    X_le,
    y,
    stratify=y,
    random_state=42,
    test_size=0.25,
)

In [11]:
# Logistic regression on the label-encoded features; max_iter=2000 gives the
# solver a generous iteration budget.
model = LogisticRegression(max_iter=2000)
model.fit(X=X_train, y=y_train)

In [12]:
# Class predictions on the held-out rows.
y_pred = model.predict(X=X_test)

In [13]:
# Hold-out metrics for experiment 1.
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("LabelEncoder everywhere")
print("Accuracy:", accuracy)
print("F1:", f1)

LabelEncoder everywhere
Accuracy: 0.624
F1: 0.4719101123595505


# EXPERIMENT 2: OneHotEncoder for ALL Categoricals (No Ordinal Info)

In [15]:
# Column-wise preprocessing for experiment 2:
#   - 'age' is forwarded unchanged
#   - every categorical column, education included, is one-hot encoded, so no
#     ordering information is given to the model
preprocessor_ohe = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', ['age']),
        ('cat', OneHotEncoder(), ['education', 'job_type', 'region']),
    ]
)

In [16]:
# Preprocessing and classifier chained together, so the encoder is fitted
# only on the data passed to pipeline_ohe.fit().
pipeline_ohe = Pipeline(
    steps=[
        ('prep', preprocessor_ohe),
        ('model', LogisticRegression(max_iter=2000)),
    ]
)

In [17]:
# Rebuild the split from the raw frame before fitting. Without this, X_train and
# X_test are whatever the previous experiment left behind (the LabelEncoder
# integer codes), so the one-hot pipeline would not see the original categorical
# columns and the cell would depend on hidden state from experiment 1.
# Same test_size / random_state / stratify as the other experiments, so all
# three are evaluated on the same rows.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop('income', axis=1),
    df['income'],
    test_size=0.25,
    random_state=42,
    stratify=df['income'],
)

# Fit encoder + classifier on the training rows only, then predict the hold-out rows.
pipeline_ohe.fit(X_train, y_train)
y_pred = pipeline_ohe.predict(X_test)

In [18]:
# Hold-out metrics for experiment 2.
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("OneHotEncoder everywhere")
print("Accuracy:", accuracy)
print("F1:", f1)

OneHotEncoder everywhere
Accuracy: 0.898
F1: 0.8717518860016765


# EXPERIMENT 3: Correct Encoding (Ordinal + OneHot)

In [20]:
# Explicit rank order for the education column, lowest to highest.
# The position in this list becomes the integer code assigned by OrdinalEncoder.
education_order = [
    'Preschool',
    'Primary',
    'Middle',
    'HighSchool',
    'Bachelors',
    'Masters',
    'PhD',
]

In [21]:
# Column-wise preprocessing for experiment 3:
#   - 'age' is forwarded unchanged
#   - 'education' is mapped to its rank in education_order (ordinal feature)
#   - 'job_type' and 'region' have no natural order, so they are one-hot encoded
preprocessor_correct = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', ['age']),
        ('edu', OrdinalEncoder(categories=[education_order]), ['education']),
        ('cat', OneHotEncoder(), ['job_type', 'region']),
    ]
)

In [22]:
# Preprocessing and classifier chained together, so the encoders are fitted
# only on the data passed to pipeline_correct.fit().
pipeline_correct = Pipeline(
    steps=[
        ('prep', preprocessor_correct),
        ('model', LogisticRegression(max_iter=2000)),
    ]
)


In [24]:
# Rebuild X, y and the split from df so this experiment does not depend on
# whichever split an earlier cell left bound to these names.
y = df['income']
X = df.drop(columns='income')

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.25,
    random_state=42,
)

In [25]:
# Fit encoders + classifier on the training rows, then predict the hold-out rows.
# fit() returns the fitted pipeline itself, so the two calls can be chained.
y_pred = pipeline_correct.fit(X_train, y_train).predict(X_test)

In [26]:
# Hold-out metrics for experiment 3.
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Correct encoding (Ordinal + OneHot)")
print("Accuracy:", accuracy)
print("F1:", f1)

Correct encoding (Ordinal + OneHot)
Accuracy: 0.9006666666666666
F1: 0.8767576509511993


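| Encoding Strategy                   | Accuracy | F1-score |
| ----------------------------------- | -------- | -------- |
| LabelEncoder everywhere             | 0.624    | 0.472    |
| OneHotEncoder everywhere            | 0.898    | 0.872    |
| Correct encoding (Ordinal + OneHot) | 0.901    | 0.877    |

**Takeaway.** `LabelEncoder` assigns integer codes in alphabetical order (`Bachelors`=0, `HighSchool`=1, `Masters`=2, `Middle`=3, `PhD`=4, `Preschool`=5, `Primary`=6), which scrambles the real education ranking. A linear model cannot recover the ordering from those codes, so accuracy falls to 0.624. One-hot encoding discards the order but gives every level its own coefficient, so it performs almost as well as the ordinal + one-hot setup (0.898 vs. 0.901 accuracy). The ordinal encoding reaches that score with fewer features, because `education` stays a single column instead of seven indicator columns.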
