# Encoding

In [1]:
import math
import os

import numpy as np
import pandas as pd


In [2]:
# Location of the practice dataset.
# Set the ENCODING_PRACTICE_CSV environment variable to point at your own copy
# of the file; when it is not set, the original hardcoded path is used so the
# notebook keeps working on the machine it was written on.
DATA_PATH = os.environ.get(
    "ENCODING_PRACTICE_CSV",
    "C:\\Users\\Kashish\\Downloads\\encoding_practice.csv",
)

df = pd.read_csv(DATA_PATH)

In [3]:
# Target Encoding needs a numeric column to average over, so a synthetic
# target is built from the purchase category.
# ------------------------------------------------------------

# Baseline spend assigned to each purchase category
base_spend = dict(
    Grocery=1200,
    Clothing=2500,
    Electronics=6000,
    Furniture=8000,
)

In [4]:
# Create a simple target value (Monthly_Spend).
# Each row gets the base spend of its Purchase_Category plus a small,
# ID-dependent offset ((ID % 7) * 50) so rows in one category are not all equal.
# Computed column-wise (vectorized) instead of row-by-row with apply().
category_base = df["Purchase_Category"].map(base_spend)

# map() yields NaN for categories missing from base_spend; fail loudly instead
# of silently producing NaN targets (the row-wise lookup raised KeyError too).
unmapped = df.loc[category_base.isna(), "Purchase_Category"].unique()
if len(unmapped) > 0:
    raise KeyError(
        f"No base spend defined for Purchase_Category value(s): {list(unmapped)}"
    )

df["Monthly_Spend"] = category_base + (df["ID"] % 7) * 50

In [5]:
# Names of every categorical column used by the encoding examples
cat_cols = [
    "Gender",
    "City",
    "Education",
    "Marital_Status",
    "Purchase_Category",
]

# Preview the first rows, including the new Monthly_Spend column
df.head()

Unnamed: 0,ID,Gender,City,Education,Marital_Status,Purchase_Category,Monthly_Spend
0,1,Male,Delhi,Graduate,Single,Electronics,6050
1,2,Female,Mumbai,Postgraduate,Married,Clothing,2600
2,3,Male,Chennai,HighSchool,Single,Grocery,1350
3,4,Other,Pune,Graduate,Single,Electronics,6200
4,5,Female,Bangalore,HighSchool,Married,Clothing,2750


In [7]:
# LABEL ENCODING
# Definition:
# Every distinct category is replaced by an integer code (0, 1, 2, ...),
# numbered in the order the categories first appear in the column.
# Simple to use, but the integers suggest an ordering the categories
# do not actually have.
# ------------------------------------------------------------

df_le = df.copy()

for col in cat_cols:
    # factorize returns (codes, uniques); only the integer codes are stored
    df_le[f"{col}_LE"] = pd.factorize(df_le[col])[0]

df_le.to_csv("label_encoded.csv", index=False)
df_le.head()


Unnamed: 0,ID,Gender,City,Education,Marital_Status,Purchase_Category,Monthly_Spend,Gender_LE,City_LE,Education_LE,Marital_Status_LE,Purchase_Category_LE
0,1,Male,Delhi,Graduate,Single,Electronics,6050,0,0,0,0,0
1,2,Female,Mumbai,Postgraduate,Married,Clothing,2600,1,1,1,1,1
2,3,Male,Chennai,HighSchool,Single,Grocery,1350,0,2,2,0,2
3,4,Other,Pune,Graduate,Single,Electronics,6200,2,3,0,0,0
4,5,Female,Bangalore,HighSchool,Married,Clothing,2750,1,4,2,1,1


In [8]:
# ONE-HOT ENCODING
# Definition:
# Each category value gets its own indicator column
# (e.g. Gender_Male, Gender_Female) that flags the rows holding that value.
# The flags are booleans (True/False) or 0/1 integers depending on the
# pandas version's default dtype.
# No artificial ordering is introduced, at the cost of many more columns.
# ------------------------------------------------------------

# With `columns=` given, get_dummies prefixes new columns with the source
# column name and keeps every level by default.
df_ohe = pd.get_dummies(df, columns=cat_cols)

df_ohe.to_csv("one_hot_encoded.csv", index=False)
df_ohe.head()


Unnamed: 0,ID,Monthly_Spend,Gender_Female,Gender_Male,Gender_Other,City_Bangalore,City_Chennai,City_Delhi,City_Mumbai,City_Pune,Education_Graduate,Education_HighSchool,Education_Postgraduate,Marital_Status_Married,Marital_Status_Single,Purchase_Category_Clothing,Purchase_Category_Electronics,Purchase_Category_Furniture,Purchase_Category_Grocery
0,1,6050,False,True,False,False,False,True,False,False,True,False,False,False,True,False,True,False,False
1,2,2600,True,False,False,False,False,False,True,False,False,False,True,True,False,True,False,False,False
2,3,1350,False,True,False,False,True,False,False,False,False,True,False,False,True,False,False,False,True
3,4,6200,False,False,True,False,False,False,False,True,True,False,False,False,True,False,True,False,False
4,5,2750,True,False,False,True,False,False,False,False,False,True,False,True,False,True,False,False,False


In [9]:
# ORDINAL ENCODING
# Definition:
# Categories become integers whose order follows a ranking we choose.
# Example: HighSchool < Graduate < Postgraduate
# NOTE(review): only Education looks like it has a natural order; the
# rankings for the other columns appear to be arbitrary, for demonstration.
# ------------------------------------------------------------

# Levels listed from lowest to highest rank for each column
ordered_levels = {
    "Education": ["HighSchool", "Graduate", "Postgraduate"],
    "Marital_Status": ["Single", "Married"],
    "Gender": ["Other", "Female", "Male"],
    "City": ["Delhi", "Mumbai", "Chennai", "Pune", "Bangalore"],
    "Purchase_Category": ["Grocery", "Clothing", "Electronics", "Furniture"],
}

# Turn each ordered list into a {level: rank} lookup (rank starts at 0)
ordinal_maps = {
    column: {level: rank for rank, level in enumerate(levels)}
    for column, levels in ordered_levels.items()
}

df_ord = df.copy()

for col, ranks in ordinal_maps.items():
    # Values missing from the lookup become NaN
    df_ord[f"{col}_ORD"] = df_ord[col].map(ranks)

df_ord.to_csv("ordinal_encoded.csv", index=False)
df_ord.head()


Unnamed: 0,ID,Gender,City,Education,Marital_Status,Purchase_Category,Monthly_Spend,Education_ORD,Marital_Status_ORD,Gender_ORD,City_ORD,Purchase_Category_ORD
0,1,Male,Delhi,Graduate,Single,Electronics,6050,1,0,2,0,2
1,2,Female,Mumbai,Postgraduate,Married,Clothing,2600,2,1,1,1,1
2,3,Male,Chennai,HighSchool,Single,Grocery,1350,0,0,2,2,0
3,4,Other,Pune,Graduate,Single,Electronics,6200,1,0,0,3,2
4,5,Female,Bangalore,HighSchool,Married,Clothing,2750,0,1,1,4,1


In [10]:
# BINARY ENCODING (manual, beginner-friendly version)
# Definition:
# Step 1: turn each category into an integer code
# Step 2: spread that integer over 0/1 columns, one per binary digit
# Needs far fewer columns than one-hot when a column has many categories.
# ------------------------------------------------------------

df_bin = df.copy()

# Step 1: integer code for every city
codes, unique_vals = pd.factorize(df_bin["City"])
codes = codes.astype(int)

# Number of binary digits required to represent every code (at least one)
n_unique = len(unique_vals)
if n_unique > 1:
    n_bits = math.ceil(math.log2(n_unique))
else:
    n_bits = 1

df_bin["City_CODE"] = codes

# Step 2: one column per binary digit; City_BIN_0 is the least-significant bit
for bit in range(n_bits):
    shifted = codes >> bit              # move the wanted digit to position 0
    df_bin[f"City_BIN_{bit}"] = shifted & 1

df_bin.to_csv("binary_encoded_city.csv", index=False)
df_bin.head()


Unnamed: 0,ID,Gender,City,Education,Marital_Status,Purchase_Category,Monthly_Spend,City_CODE,City_BIN_0,City_BIN_1,City_BIN_2
0,1,Male,Delhi,Graduate,Single,Electronics,6050,0,0,0,0
1,2,Female,Mumbai,Postgraduate,Married,Clothing,2600,1,1,0,0
2,3,Male,Chennai,HighSchool,Single,Grocery,1350,2,0,1,0
3,4,Other,Pune,Graduate,Single,Electronics,6200,3,1,1,0
4,5,Female,Bangalore,HighSchool,Married,Clothing,2750,4,0,0,1


In [11]:
# TARGET ENCODING
# Definition:
# Each category is replaced by the MEAN of the target for that category.
# Example: Clothing → average Monthly_Spend over all Clothing rows.
# Here the mean is taken over the whole frame, including each row's own target.
# ------------------------------------------------------------

df_te = df.copy()

# transform("mean") broadcasts each category's mean back onto its rows
df_te["Purchase_Category_TE"] = (
    df.groupby("Purchase_Category")["Monthly_Spend"].transform("mean")
)

df_te.to_csv("target_encoded.csv", index=False)
df_te.head()


Unnamed: 0,ID,Gender,City,Education,Marital_Status,Purchase_Category,Monthly_Spend,Purchase_Category_TE
0,1,Male,Delhi,Graduate,Single,Electronics,6050,6139.285714
1,2,Female,Mumbai,Postgraduate,Married,Clothing,2600,2627.272727
2,3,Male,Chennai,HighSchool,Single,Grocery,1350,1353.846154
3,4,Other,Pune,Graduate,Single,Electronics,6200,6139.285714
4,5,Female,Bangalore,HighSchool,Married,Clothing,2750,2627.272727


In [12]:
# FREQUENCY ENCODING
# Definition:
# Each category is replaced by the number of rows in which it occurs.
# Example: a City found in 10 rows → value becomes 10.
# ------------------------------------------------------------

city_counts = df["City"].value_counts()                # absolute row counts
city_share = df["City"].value_counts(normalize=True)   # proportion of rows (0–1)

df_fe = df.assign(
    City_FREQ=df["City"].map(city_counts),
    City_FREQ_NORM=df["City"].map(city_share),
)

df_fe.to_csv("frequency_encoded.csv", index=False)
df_fe.head()


Unnamed: 0,ID,Gender,City,Education,Marital_Status,Purchase_Category,Monthly_Spend,City_FREQ,City_FREQ_NORM
0,1,Male,Delhi,Graduate,Single,Electronics,6050,10,0.2
1,2,Female,Mumbai,Postgraduate,Married,Clothing,2600,10,0.2
2,3,Male,Chennai,HighSchool,Single,Grocery,1350,10,0.2
3,4,Other,Pune,Graduate,Single,Electronics,6200,10,0.2
4,5,Female,Bangalore,HighSchool,Married,Clothing,2750,10,0.2


In [15]:
# Write every encoded frame to its CSV file, leaving out the index column.
encoded_outputs = {
    "label_encoded.csv": df_le,            # Label Encoding
    "one_hot_encoded.csv": df_ohe,         # One-Hot Encoding
    "ordinal_encoded.csv": df_ord,         # Ordinal Encoding
    "binary_encoded_city.csv": df_bin,     # Binary Encoding
    "target_encoded.csv": df_te,           # Target Encoding
    "frequency_encoded.csv": df_fe,        # Frequency Encoding
}

for filename, frame in encoded_outputs.items():
    frame.to_csv(filename, index=False)

print("saved")

saved
