In [1]:
import pandas as pd
# Read the pre-encoded LendingClub extract into the working frame and preview
# its first rows.
source_csv = "pre_encoded_lendingclub.csv"
df = pd.read_csv(source_csv)
df.head()

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_length,home_ownership,...,total_il_high_credit_limit,hardship_flag,disbursement_method,debt_settlement_flag,loan_default,log_income,fico_avg,issue_year,issue_month,issue_d
0,3600.0,3600.0,3600.0,36,13.99,123.03,C,C4,10,MORTGAGE,...,13734.0,N,Cash,N,0,10.915107,677.0,2015.0,12.0,2015-12-01
1,24700.0,24700.0,24700.0,36,11.99,820.28,C,C1,10,MORTGAGE,...,24667.0,N,Cash,N,0,11.082158,717.0,2015.0,12.0,2015-12-01
2,20000.0,20000.0,20000.0,60,10.78,432.66,B,B4,10,MORTGAGE,...,14877.0,N,Cash,N,0,11.050906,697.0,2015.0,12.0,2015-12-01
3,10400.0,10400.0,10400.0,60,22.45,289.91,F,F1,3,MORTGAGE,...,88097.0,N,Cash,N,0,11.556311,697.0,2015.0,12.0,2015-12-01
4,11950.0,11950.0,11950.0,36,13.44,405.18,C,C3,4,RENT,...,4000.0,N,Cash,N,0,10.434145,692.0,2015.0,12.0,2015-12-01


# ENCODING 
Encoding plan for the categorical columns:

- `grade` → ordinal (A → G) → label encode
- `sub_grade` → ordinal (A1 → G5) → label encode or one-hot
- `home_ownership` → nominal → one-hot encode
- `purpose` → nominal → one-hot encode
- `verification_status` → nominal → one-hot encode

### Label Encoding grade & sub_grade

In [2]:
from sklearn.preprocessing import LabelEncoder

def _to_ordered_categorical(values, ordered_levels):
    """Cast `values` to an ordered categorical over `ordered_levels`.

    Casting to a fixed CategoricalDtype silently turns every label that is not
    in the category list into NaN (and later into code -1). To avoid losing
    data that way, any non-null label outside `ordered_levels` raises instead.

    Parameters
    ----------
    values : pd.Series
        Column holding the raw labels.
    ordered_levels : list
        Allowed labels, from lowest to highest rank.

    Returns
    -------
    pd.Series
        The column as an ordered categorical.

    Raises
    ------
    ValueError
        If `values` contains a non-null label missing from `ordered_levels`.
    """
    unexpected = set(values.dropna().unique()) - set(ordered_levels)
    if unexpected:
        raise ValueError(
            f"Labels outside the ordinal scale in column {values.name!r}: "
            f"{sorted(map(repr, unexpected))}"
        )
    return values.astype(pd.CategoricalDtype(categories=ordered_levels, ordered=True))


# Encode grade (A–G): codes run 0..6 in rank order.
# Missing values keep the pandas convention of code -1.
grade_order = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
df['grade'] = _to_ordered_categorical(df['grade'], grade_order)
df['grade_encoded'] = df['grade'].cat.codes

# Encode sub_grade (A1–G5): five sub-levels per grade, codes run 0..34.
sub_grade_order = [f"{g}{n}" for g in grade_order for n in range(1,6)]
df['sub_grade'] = _to_ordered_categorical(df['sub_grade'], sub_grade_order)
df['sub_grade_encoded'] = df['sub_grade'].cat.codes


### One-hot Encode Nominal Categorical Columns

In [3]:
# One-hot encode the nominal features. With drop_first=True the first level of
# each feature gets no dummy column and acts as the implicit reference level.
nominal_columns = ['home_ownership', 'purpose', 'verification_status']
df = pd.get_dummies(df, columns=nominal_columns, drop_first=True)


### Drop Unused Original Categorical Columns  

In [4]:
# The ordinal codes live in grade_encoded / sub_grade_encoded, so the original
# label columns are no longer needed.
df = df.drop(columns=['grade', 'sub_grade'])


In [5]:
# Preview each group of encoded columns. A cell only renders its last bare
# expression, so every preview is passed to display() explicitly; otherwise
# the first three would be computed and discarded.
# `display` is supplied by the IPython/Jupyter kernel and needs no import.
display(df[['grade_encoded', 'sub_grade_encoded']].head())
display(df.filter(like='home_ownership_').head(1))
display(df.filter(like='purpose_').head(1))
display(df.filter(like='verification_status_').head(1))


Unnamed: 0,verification_status_Source Verified,verification_status_Verified
0,0,0


In [6]:
# List the dummy columns generated for each one-hot encoded feature
for prefix in ('verification_status_', 'home_ownership_', 'purpose_'):
    print([name for name in df.columns if prefix in name])


['verification_status_Source Verified', 'verification_status_Verified']
['home_ownership_OTHER', 'home_ownership_OWN', 'home_ownership_RENT']
['purpose_credit_card', 'purpose_debt_consolidation', 'purpose_home_improvement', 'purpose_house', 'purpose_major_purchase', 'purpose_medical', 'purpose_moving', 'purpose_other', 'purpose_small_business', 'purpose_vacation']


In [7]:
# Count the rows flagged in each one-hot encoded column, one feature at a time
dummy_groups = [
    ("Verification Status Dummies:\n", 'verification_status_'),
    ("\nHome Ownership Dummies:\n", 'home_ownership_'),
    ("\nPurpose Dummies:\n", 'purpose_'),
]
for heading, prefix in dummy_groups:
    print(heading, df.filter(like=prefix).sum())


Verification Status Dummies:
 verification_status_Source Verified    521273
verification_status_Verified           418336
dtype: int64

Home Ownership Dummies:
 home_ownership_OTHER       478
home_ownership_OWN      144832
home_ownership_RENT     534421
dtype: int64

Purpose Dummies:
 purpose_credit_card           295279
purpose_debt_consolidation    780321
purpose_home_improvement       87504
purpose_house                   7253
purpose_major_purchase         29425
purpose_medical                15554
purpose_moving                  9480
purpose_other                  81428
purpose_small_business         15416
purpose_vacation                9065
dtype: int64


### With all dummies (for XGBoost, RandomForest)

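✅ For tree-based models (XGBoost, Random Forest, LightGBM): they are not affected by multicollinearity and can make use of every dummy variable.

**Note:** the one-hot step above was run with `drop_first=True`, so the first level of each nominal feature has no dummy column in this file. Re-run that step with `drop_first=False` if a truly complete set of dummies is required.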

In [9]:
# Persist the encoded frame exactly as it stands, leaving the row index out of
# the file
encoded_csv = "lendingclub_full_encoded.csv"
df.to_csv(encoded_csv, index=False)
