In [41]:
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.8.1-py3-none-any.whl.metadata (7.9 kB)
Collecting scikit-learn>=1.6.0 (from category_encoders)
  Downloading scikit_learn-1.6.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Downloading category_encoders-2.8.1-py3-none-any.whl (85 kB)
Downloading scikit_learn-1.6.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.1 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.1/13.1 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m0:01[0m:01[0m
[?25hInstalling collected packages: scikit-learn, category_encoders
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.5.1
    Uninstalling scikit-learn-1.5.1:
      Successfully uninstalled scikit-learn-1.5.1
Successfully installed category_encoders-2.8.1 scikit-learn-1.6.1


In [43]:
import pandas as pd
from sklearn.impute import SimpleImputer
import numpy as np
import matplotlib.pyplot as plt

In [2]:
application_df = pd.read_csv("ApplicationData.csv")

In [3]:
application_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9492 entries, 0 to 9491
Data columns (total 16 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   Application_ID                                 9492 non-null   object 
 1   Application_Date                               9492 non-null   object 
 2   Loan_Approval_Status                           9354 non-null   object 
 3   Age_at_Application                             9471 non-null   float64
 4   Marital_Status                                 7712 non-null   object 
 5   Gender                                         9488 non-null   object 
 6   Own_Rent_Home                                  8188 non-null   object 
 7   Occupation_Professional_Category               9361 non-null   object 
 8   Type_of_Business_Industry_of_Employment        7210 non-null   object 
 9   Employment_Status                              9357 

In [4]:
application_df.head()

Unnamed: 0,Application_ID,Application_Date,Loan_Approval_Status,Age_at_Application,Marital_Status,Gender,Own_Rent_Home,Occupation_Professional_Category,Type_of_Business_Industry_of_Employment,Employment_Status,Time_at_Current_Employment_(Months),Time_Employed_with_Previous_Employer_(Months),Number_of_Dependents,Loan_Purpose,Monthly_Income,Government_Employee
0,C302100001,2015-09-15,Approved,39.0,Unmarried,Female,Own,,Other,Full-Time,6.0,,2.0,Home Improvement,1500.59,
1,C302100002,2015-08-31,Approved,29.0,Unmarried,Female,Family,,,Full-Time,25.0,,0.0,Debt Consolidation,3769.36,
2,C302100003,2015-09-21,Approved,24.0,Unmarried,Male,Family,,,Full-Time,7.0,,0.0,Home Improvement,2197.1,
3,C302100004,2015-11-14,Approved,54.0,Unmarried,Male,Rent,,,Full-Time,105.0,,4.0,Debt Consolidation,1094.45,False
4,C302100005,2015-11-18,Approved,30.0,Unmarried,Female,Family,,,Full-Time,60.0,,1.0,Vacation,3798.4,True


In [5]:
application_df.describe()

Unnamed: 0,Age_at_Application,Time_at_Current_Employment_(Months),Time_Employed_with_Previous_Employer_(Months),Number_of_Dependents,Monthly_Income
count,9471.0,9260.0,298.0,7678.0,9492.0
mean,34.817443,90.2054,62.432886,0.905314,30655.75
std,9.703598,303.204377,66.225743,1.086836,2169866.0
min,18.0,1.0,1.0,0.0,0.0
25%,27.0,24.0,18.0,0.0,1562.26
50%,33.0,55.0,38.0,1.0,2474.36
75%,41.0,126.0,79.75,1.0,4061.93
max,69.0,24203.0,419.0,13.0,205626500.0


In [6]:
application_df.duplicated().sum()

0

In [7]:
application_df.isnull().sum() / len(application_df) * 100

Application_ID                                    0.000000
Application_Date                                  0.000000
Loan_Approval_Status                              1.453856
Age_at_Application                                0.221239
Marital_Status                                   18.752634
Gender                                            0.042141
Own_Rent_Home                                    13.737885
Occupation_Professional_Category                  1.380110
Type_of_Business_Industry_of_Employment          24.041298
Employment_Status                                 1.422250
Time_at_Current_Employment_(Months)               2.444164
Time_Employed_with_Previous_Employer_(Months)    96.860514
Number_of_Dependents                             19.110830
Loan_Purpose                                      0.442478
Monthly_Income                                    0.000000
Government_Employee                               1.906869
dtype: float64

# Data Cleaning

In [11]:
application_clean_df = application_df.copy()

In [15]:
# columns with <5% missing
categorical_missing = [
    "Gender",
    "Loan_Purpose",
    "Loan_Approval_Status",
    "Employment_Status",
    "Occupation_Professional_Category",
    "Government_Employee"
]

In [16]:
numerical_missing = ["Age_at_Application"]

# Categorical Imputation with Unknown (<5%)
### We are using Unknown instead of Mode because Unknown works better for modeling

In [17]:
# Impute categorical features with Unknown
application_clean_df[categorical_missing] = application_clean_df[categorical_missing].fillna("Unknown")

# Numerical Imputation with Median(<5%>)
### We are using SimpleImputer so it integrates well into a pipeline later:

In [18]:
# Impute Age_at_Application with median
num_imputer = SimpleImputer(strategy="median")
application_clean_df[['Age_at_Application']] = num_imputer.fit_transform(application_clean_df[['Age_at_Application']])

In [20]:
application_clean_df.isnull().sum() / len(application_df) * 100

Application_ID                                    0.000000
Application_Date                                  0.000000
Loan_Approval_Status                              0.000000
Age_at_Application                                0.000000
Marital_Status                                   18.752634
Gender                                            0.000000
Own_Rent_Home                                    13.737885
Occupation_Professional_Category                  0.000000
Type_of_Business_Industry_of_Employment          24.041298
Employment_Status                                 0.000000
Time_at_Current_Employment_(Months)               2.444164
Time_Employed_with_Previous_Employer_(Months)    96.860514
Number_of_Dependents                             19.110830
Loan_Purpose                                      0.000000
Monthly_Income                                    0.000000
Government_Employee                               0.000000
dtype: float64

# Cleaning Columns with missing values of (5-25%)

### Create Missingness Indicator for Categorical Columns
We create a new column for each moderately missing feature to indicate whether the original value was missing because, if the borrower didn't provide marital status or housing info might be a subtle indicator of risk or profile.

In [21]:
# Create binary flags: 1 if missing, 0 if not
application_clean_df['Marital_Status_Missing'] = application_clean_df['Marital_Status'].isnull().astype(int)
application_clean_df['Own_Rent_Home_Missing'] = application_clean_df['Own_Rent_Home'].isnull().astype(int)
application_clean_df['Type_of_Business_Missing'] = application_clean_df['Type_of_Business_Industry_of_Employment'].isnull().astype(int)
application_clean_df['Number_of_Dependents_Missing'] = application_clean_df['Number_of_Dependents'].isnull().astype(int)

# We’ll impute them with the most common value (mode) or Unknown

In [22]:
# Option 1: Fill with the most frequent value (mode)
application_clean_df['Own_Rent_Home'].fillna(application_clean_df['Own_Rent_Home'].mode()[0], inplace=True)
application_clean_df['Marital_Status'].fillna(application_clean_df['Marital_Status'].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  application_clean_df['Own_Rent_Home'].fillna(application_clean_df['Own_Rent_Home'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  application_clean_df['Marital_Status'].fillna(application_clean_df['Marital_Status'].mode()[0], inplace=True)


## We fill "Type_of_Business_Industry_of_Employment" with unknown because business types are diverse, and imputing with the mode could bias the model — keeping “Unknown” lets the model learn if “not disclosing” is meaningful.

In [23]:
application_clean_df['Type_of_Business_Industry_of_Employment'].fillna("Unknown", inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  application_clean_df['Type_of_Business_Industry_of_Employment'].fillna("Unknown", inplace=True)


In [28]:
application_clean_df['Time_at_Current_Employment_(Months)'].fillna(application_clean_df['Time_at_Current_Employment_(Months)'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  application_clean_df['Time_at_Current_Employment_(Months)'].fillna(application_clean_df['Time_at_Current_Employment_(Months)'].median(), inplace=True)


# Impute Numerical Columns with Median.
### For 'Number_of_Dependents', which is numeric, use the median (resistant to outliers).

In [24]:
application_clean_df['Number_of_Dependents'].fillna(application_clean_df['Number_of_Dependents'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  application_clean_df['Number_of_Dependents'].fillna(application_clean_df['Number_of_Dependents'].median(), inplace=True)


# Drop with High missingness(96.86%) 

In [25]:
application_clean_df.drop(columns=['Time_Employed_with_Previous_Employer_(Months)'], inplace=True)

In [29]:
application_clean_df.isnull().sum() / len(application_df) * 100

Application_ID                             0.0
Application_Date                           0.0
Loan_Approval_Status                       0.0
Age_at_Application                         0.0
Marital_Status                             0.0
Gender                                     0.0
Own_Rent_Home                              0.0
Occupation_Professional_Category           0.0
Type_of_Business_Industry_of_Employment    0.0
Employment_Status                          0.0
Time_at_Current_Employment_(Months)        0.0
Number_of_Dependents                       0.0
Loan_Purpose                               0.0
Monthly_Income                             0.0
Government_Employee                        0.0
Marital_Status_Missing                     0.0
Own_Rent_Home_Missing                      0.0
Type_of_Business_Missing                   0.0
Number_of_Dependents_Missing               0.0
dtype: float64

# Handle Outliers and Skewed Features

## Monthly_Income is very skewed with unrealistic max values 
## The Mean is much higher than the median this shows right skew (some incomes are extremely high)

## Time_at_Current_Employment_(Months) → has extreme outliers (like someone working for 2,000 years!)

In [26]:
# Check the outliers
application_clean_df["Monthly_Income"].describe()

count    9.492000e+03
mean     3.065575e+04
std      2.169866e+06
min      0.000000e+00
25%      1.562260e+03
50%      2.474360e+03
75%      4.061930e+03
max      2.056265e+08
Name: Monthly_Income, dtype: float64

# Cap the outliers (99th percentile)
### This is called winsorization — we limit extreme values so they don’t affect the model too much.

In [30]:
# Calculate the 99th percentile value
income_cap = application_clean_df['Monthly_Income'].quantile(0.99)

# Cap everything above that to the 99th percentile
application_clean_df['Monthly_Income_Capped'] = application_clean_df['Monthly_Income'].clip(upper=income_cap)

# Apply log transformation
## This helps shrink big values and spread out smaller ones so the model learns better.

In [32]:
# Add 1 to avoid log(0) error
application_clean_df['Monthly_Income_Log'] = np.log1p(application_clean_df['Monthly_Income_Capped'])

# Handling Time_at_Current_Employment_(Months)

### Someone has been working 24,203 months = 2000 years.

### These values can confuse models

# Set a reasonable upper limit
## Let’s assume 480 months (40 years) is the upper bound for employment, assuming he started working at 20 and retires at 60.

In [33]:
# Cap values above 40 years (480 months)
application_clean_df['Employment_Time_Capped'] = application_clean_df['Time_at_Current_Employment_(Months)'].clip(upper=480)

# Encode Categorical Variables

### We will use One-Hot Encoding for categories that are <= 10, i.e Gender, Marital_Status, Own_Rent_Home
### We will use Target Encoding for categories that are > 10, i.e Occupation_Professional_Category, Type_of_Business

# One-Hot Encoding

In [38]:
# Use pandas get_dummies for simple one-hot encoding
categorical_cols = ['Gender', 'Marital_Status', 'Own_Rent_Home', 'Employment_Status', 'Loan_Purpose']
df_encoded = pd.get_dummies(application_clean_df, columns=categorical_cols, drop_first=True)

# Binary Encoding (Manual for Government_Employee)

In [39]:
df_encoded['Government_Employee'] = application_clean_df['Government_Employee'].map({'Yes': 1, 'No': 0, 'Unknown': 0})

In [None]:
# Target Encoding (for high-cardinality columns)
## This replaces each category with the mean target value (e.g., default rate for that group).

### Step 1: Split data first to avoid data leakage!

#  Feature Engineering

# Employment Stability Score
### Combines how long someone has worked at their current job with whether they are employed or not.
### If Employed → use time worked
### If Unemployed or Unknown → assign lower or 0 score

In [50]:
def compute_employment_stability(row):
    if row['Employment_Status'] in ['Employed', 'Self-employed']:
        return row['Time_at_Current_Employment_(Months)']
    else:
        return 0  # or a small penalty like -1 if you want to distinguish unemployment

application_clean_df['Employment_Stability_Score'] = application_clean_df.apply(compute_employment_stability, axis=1)

# Age Bins Feature
### Categorizing age into buckets helps some models better understand relationships.

In [51]:
def categorize_age(age):
    if age < 25:
        return 'Young'
    elif age < 35:
        return 'Emerging'
    elif age < 50:
        return 'Established'
    else:
        return 'Senior'

application_clean_df['Age_Bin'] = application_clean_df['Age_at_Application'].apply(categorize_age)

# Has Dependents Flag Feature
### Indicates whether the person financially supports others.
### Has_Dependents = 1 if Number_of_Dependents > 0 else 0

In [52]:
application_clean_df['Has_Dependents'] = (application_clean_df['Number_of_Dependents'] > 0).astype(int)

# Convert to datetime

In [54]:
# Convert Application_Date to datetime format
application_clean_df['Application_Date'] = pd.to_datetime(application_clean_df['Application_Date'], errors='coerce')

# Check if any dates failed to convert
invalid_dates = application_clean_df['Application_Date'].isna().sum()
print(f"Number of invalid dates after conversion: {invalid_dates}")

Number of invalid dates after conversion: 1363


In [55]:
# View rows where Application_Date couldn't be parsed
invalid_date_rows = application_clean_df[application_clean_df['Application_Date'].isna()]
print(invalid_date_rows[['Application_ID', 'Application_Date']].head())

    Application_ID Application_Date
115     C302100116              NaT
116     C302100117              NaT
117     C302100118              NaT
118     C302100119              NaT
119     C302100120              NaT


In [56]:
application_clean_df.to_csv("clean_application_data.csv", index = False)

In [57]:
cad = pd.read_csv("clean_application_data.csv")
cad.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9492 entries, 0 to 9491
Data columns (total 25 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   Application_ID                           9492 non-null   object 
 1   Application_Date                         8129 non-null   object 
 2   Loan_Approval_Status                     9492 non-null   object 
 3   Age_at_Application                       9492 non-null   float64
 4   Marital_Status                           9492 non-null   object 
 5   Gender                                   9492 non-null   object 
 6   Own_Rent_Home                            9492 non-null   object 
 7   Occupation_Professional_Category         9492 non-null   object 
 8   Type_of_Business_Industry_of_Employment  9492 non-null   object 
 9   Employment_Status                        9492 non-null   object 
 10  Time_at_Current_Employment_(Months)      9492 no