# Encoding Categorical Variables

In [2]:
# Import all the tools we need

# Regular EDA and plotting libraries
import pandas as pd
import sklearn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif

In [3]:
import pickle

# Read the pickled DataFrame from disk into `df1`.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
with open("../data/alter/df1_Cleaned_after_outliars.pkl", mode="rb") as pickle_file:
    df1 = pickle.load(pickle_file)

In [4]:
# Work on an independent (deep) copy so that `df1` itself is left unmodified.
df_encoded = df1.copy(deep=True)

In [5]:
# Preview the first five rows of the working copy.
df_encoded.head(n=5)


Unnamed: 0,ID,year,loan_limit,Gender,approv_in_adv,loan_type,loan_purpose,Credit_Worthiness,open_credit,business_or_commercial,...,credit_type,Credit_Score,co-applicant_credit_type,age,submission_of_application,LTV,Region,Security_Type,Status,dtir1
0,24890,2019,cf,Sex Not Available,nopre,type1,p1,l1,nopc,nob/c,...,EXP,758,CIB,25-34,to_inst,98.728814,south,direct,0.0,45.0
1,24891,2019,cf,Male,nopre,type2,p1,l1,nopc,b/c,...,EQUI,552,EXP,55-64,to_inst,75.122549,North,direct,0.0,39.0
2,24892,2019,cf,Male,pre,type1,p1,l1,nopc,nob/c,...,EXP,834,CIB,35-44,to_inst,80.019685,south,direct,0.0,46.0
3,24893,2019,cf,Male,nopre,type1,p4,l1,nopc,nob/c,...,EXP,587,CIB,45-54,not_inst,69.3769,North,direct,0.0,42.0
4,24894,2019,cf,Joint,pre,type1,p1,l1,nopc,nob/c,...,CRIF,602,EXP,25-34,not_inst,91.886544,North,direct,0.0,39.0


In [6]:
# Print the distinct values of every column, one labelled section per column.
for column_name in df_encoded.columns:
    print(f"\nColumn: {column_name}")
    distinct_values = df_encoded[column_name].unique()
    print(distinct_values)



Column: ID
[ 24890  24891  24892 ... 173557 173558 173559]

Column: year
[2019]

Column: loan_limit
['cf' 'Unknown' 'ncf']

Column: Gender
['Sex Not Available' 'Male' 'Joint' 'Female']

Column: approv_in_adv
['nopre' 'pre' 'Unknown']

Column: loan_type
['type1' 'type2' 'type3']

Column: loan_purpose
['p1' 'p4' 'p3' 'p2' 'Unknown']

Column: Credit_Worthiness
['l1' 'l2']

Column: open_credit
['nopc' 'opc']

Column: business_or_commercial
['nob/c' 'b/c']

Column: loan_amount
[116500. 206500. 406500. 456500. 696500. 706500. 346500. 266500. 376500.
 436500. 136500. 466500. 226500.  76500. 356500. 156500. 586500. 306500.
 316500. 336500. 426500. 476500. 196500. 186500. 246500. 216500. 506500.
 656500. 256500. 396500. 166500. 236500. 796500. 416500. 386500. 596500.
 606500.  86500. 286500. 146500. 446500. 636500. 486500. 326500.  56500.
 496500. 106500. 126500. 296500. 176500. 566500. 686500. 556500. 676500.
 366500. 276500. 716500.  66500. 616500.  96500.  26500. 666500. 546500.
 526500. 72

In [7]:
# Label-encode every object-dtype column and record how each category was coded.
#
# NOTE(review): LabelEncoder numbers classes in sorted (lexicographic) order, so
# ordered categories do not keep their natural order -- the recorded mapping for
# `age` places '<25' at 5, after '65-74'. Confirm this is acceptable downstream.
label_encoders = {}     # column name -> fitted LabelEncoder, kept for later use
category_mappings = {}  # column name -> {category label: integer code}

# Select the columns and read the raw labels from the untouched `df1` rather than
# from `df_encoded`: after one run `df_encoded` has no object columns left, so
# selecting from it would make a re-run of this cell silently empty both dicts.
for col in df1.select_dtypes(include=["object"]).columns:
    le = LabelEncoder()
    # Cast to str so the encoder receives string input for every value.
    df_encoded[col] = le.fit_transform(df1[col].astype(str))

    # Save the encoder + mapping
    label_encoders[col] = le
    # A class's code is its position in `classes_`; enumerate yields plain Python
    # ints and avoids a redundant transform() call.
    category_mappings[col] = {label: code for code, label in enumerate(le.classes_)}

# Show mappings
for col, mapping in category_mappings.items():
    print(f"{col}: {mapping}")

loan_limit: {'Unknown': 0, 'cf': 1, 'ncf': 2}
Gender: {'Female': 0, 'Joint': 1, 'Male': 2, 'Sex Not Available': 3}
approv_in_adv: {'Unknown': 0, 'nopre': 1, 'pre': 2}
loan_type: {'type1': 0, 'type2': 1, 'type3': 2}
loan_purpose: {'Unknown': 0, 'p1': 1, 'p2': 2, 'p3': 3, 'p4': 4}
Credit_Worthiness: {'l1': 0, 'l2': 1}
open_credit: {'nopc': 0, 'opc': 1}
business_or_commercial: {'b/c': 0, 'nob/c': 1}
Neg_ammortization: {'Unknown': 0, 'neg_amm': 1, 'not_neg': 2}
interest_only: {'int_only': 0, 'not_int': 1}
lump_sum_payment: {'lpsm': 0, 'not_lpsm': 1}
construction_type: {'mh': 0, 'sb': 1}
occupancy_type: {'ir': 0, 'pr': 1, 'sr': 2}
Secured_by: {'home': 0, 'land': 1}
total_units: {'1U': 0, '2U': 1, '3U': 2, '4U': 3}
credit_type: {'CIB': 0, 'CRIF': 1, 'EQUI': 2, 'EXP': 3}
co-applicant_credit_type: {'CIB': 0, 'EXP': 1}
age: {'25-34': 0, '35-44': 1, '45-54': 2, '55-64': 3, '65-74': 4, '<25': 5, '>74': 6}
submission_of_application: {'not_inst': 0, 'to_inst': 1}
Region: {'North': 0, 'North-East': 

In [8]:
import pickle

# Persist the encoded DataFrame as a pickle file.
# Only `df_encoded` is written by this cell.
with open("../data/alter/df1_encoded.pkl", mode="wb") as pickle_file:
    pickle.dump(df_encoded, pickle_file)

In [9]:
# Frequency of each value in the `Status` column (missing values are excluded).
# NOTE(review): the recorded output lists a single value (0.0) -- confirm that is expected.
df_encoded["Status"].value_counts(sort=True, dropna=True)

Status
0.0    148470
Name: count, dtype: int64