## ML model for Classification:

In [1]:
import numpy as np
import pandas as pd

In [2]:
## Importing Dataset:
# Load the raw loan-default records; the path is relative to the notebook's working directory.
df = pd.read_csv('../Dataset/Loan_default.csv')
print(df.shape)  # (rows, columns) of the raw frame
df.head()  # preview the first five rows

(255347, 18)


Unnamed: 0,LoanID,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
0,I38PQUQS96,56,85994,50587,520,80,4,15.23,36,0.44,Bachelor's,Full-time,Divorced,Yes,Yes,Other,Yes,0
1,HPSK72WA7R,69,50432,124440,458,15,1,4.81,60,0.68,Master's,Full-time,Married,No,No,Other,Yes,0
2,C1OZ6DPJ8Y,46,84208,129188,451,26,3,21.17,24,0.31,Master's,Unemployed,Divorced,Yes,Yes,Auto,No,1
3,V2KKSFM3UN,32,31713,44799,743,0,3,7.07,24,0.23,High School,Full-time,Married,No,No,Business,No,0
4,EY08JDHTZP,60,20437,9139,633,8,4,6.51,48,0.73,Bachelor's,Unemployed,Divorced,No,Yes,Auto,No,0


## Feature Engineering:

There are no missing values or outliers in this dataset, so neither imputation nor outlier handling is needed.

#### Step-1: Encoding Categorical Features

Firstly, we can use binary encoding for binary categorical features

In [3]:
## Encoding Binary Categorical Features
Features = ['HasCoSigner', 'HasDependents', 'HasMortgage']

# Map 'Yes'/'No' to 1/0. The identity entries (1 -> 1, 0 -> 0) make the cell
# idempotent: re-running it on already-encoded columns leaves them unchanged
# instead of turning every value into NaN.
for feature in Features:
   df[feature] = df[feature].map({'Yes': 1, 'No': 0, 1: 1, 0: 0})
df.head()

Unnamed: 0,LoanID,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
0,I38PQUQS96,56,85994,50587,520,80,4,15.23,36,0.44,Bachelor's,Full-time,Divorced,1,1,Other,1,0
1,HPSK72WA7R,69,50432,124440,458,15,1,4.81,60,0.68,Master's,Full-time,Married,0,0,Other,1,0
2,C1OZ6DPJ8Y,46,84208,129188,451,26,3,21.17,24,0.31,Master's,Unemployed,Divorced,1,1,Auto,0,1
3,V2KKSFM3UN,32,31713,44799,743,0,3,7.07,24,0.23,High School,Full-time,Married,0,0,Business,0,0
4,EY08JDHTZP,60,20437,9139,633,8,4,6.51,48,0.73,Bachelor's,Unemployed,Divorced,0,1,Auto,0,0


In [4]:
## Drop useless columns:
# LoanID is a per-row identifier and carries no predictive signal.
# errors='ignore' keeps the cell re-runnable once the column is already gone.
df = df.drop(columns=['LoanID'], errors='ignore')

For Marital Status and Loan Purpose, the classes have nearly equal frequencies, their ordering is not meaningful, and there are more than two classes, so we use one-hot encoding.

In [5]:
# Class frequencies of LoanPurpose, used to judge how balanced the categories are.
df['LoanPurpose'].value_counts()

LoanPurpose
Business     51298
Home         51286
Education    51005
Other        50914
Auto         50844
Name: count, dtype: int64

In [6]:
# One-hot encode the nominal columns; drop_first=True omits the first level of
# each column so the remaining dummy columns are not redundant with each other.
df = pd.get_dummies(
   data=df,
   columns=['MaritalStatus', 'LoanPurpose'],
   drop_first=True,
)

Education is an ordinal variable where order matters, so we use ordinal encoding

In [7]:
from sklearn.preprocessing import OrdinalEncoder

# Explicit category order: the position in this list becomes the encoded value
# (High School -> 0.0 ... PhD -> 3.0).
oe = OrdinalEncoder(
   categories=[["High School", "Bachelor's", "Master's", "PhD"]],
)
# The encoder expects a 2-D input; flatten the single output column back to 1-D.
df['Education'] = oe.fit_transform(df[['Education']].to_numpy()).ravel()

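There is some order to employment type, although it may not be significant. We can use Label Encoding for Employment Type. Note that `LabelEncoder` assigns codes in alphabetical order of the class names (Full-time = 0, Part-time = 1, Self-employed = 2, Unemployed = 3), as the counts below confirm.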

In [8]:
# Class frequencies of EmploymentType before encoding.
df['EmploymentType'].value_counts()

EmploymentType
Part-time        64161
Unemployed       63824
Self-employed    63706
Full-time        63656
Name: count, dtype: int64

In [9]:
from sklearn.preprocessing import LabelEncoder

# LabelEncoder numbers the classes in sorted order of their labels, so the
# integer codes follow the alphabetical order of the employment types.
le = LabelEncoder()
df['EmploymentType'] = le.fit_transform(df['EmploymentType'])
# Compare these counts with the pre-encoding counts to see which code belongs to which class.
df['EmploymentType'].value_counts()

EmploymentType
1    64161
3    63824
2    63706
0    63656
Name: count, dtype: int64

In [10]:
# Check that every column is numeric (or boolean dummy) and non-null after encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 255347 entries, 0 to 255346
Data columns (total 21 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Age                    255347 non-null  int64  
 1   Income                 255347 non-null  int64  
 2   LoanAmount             255347 non-null  int64  
 3   CreditScore            255347 non-null  int64  
 4   MonthsEmployed         255347 non-null  int64  
 5   NumCreditLines         255347 non-null  int64  
 6   InterestRate           255347 non-null  float64
 7   LoanTerm               255347 non-null  int64  
 8   DTIRatio               255347 non-null  float64
 9   Education              255347 non-null  float64
 10  EmploymentType         255347 non-null  int64  
 11  HasMortgage            255347 non-null  int64  
 12  HasDependents          255347 non-null  int64  
 13  HasCoSigner            255347 non-null  int64  
 14  Default                255347 non-nu

#### Step-2: Feature Scaling for Numerical Features:

In [11]:
# Preview the encoded frame before any scaling is applied.
df.head()

Unnamed: 0,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,...,HasMortgage,HasDependents,HasCoSigner,Default,MaritalStatus_Married,MaritalStatus_Single,LoanPurpose_Business,LoanPurpose_Education,LoanPurpose_Home,LoanPurpose_Other
0,56,85994,50587,520,80,4,15.23,36,0.44,1.0,...,1,1,1,0,False,False,False,False,False,True
1,69,50432,124440,458,15,1,4.81,60,0.68,2.0,...,0,0,1,0,True,False,False,False,False,True
2,46,84208,129188,451,26,3,21.17,24,0.31,2.0,...,1,1,0,1,False,False,False,False,False,False
3,32,31713,44799,743,0,3,7.07,24,0.23,0.0,...,0,0,0,0,True,False,True,False,False,False
4,60,20437,9139,633,8,4,6.51,48,0.73,1.0,...,0,1,0,0,False,False,False,False,False,False


Age is a continuous variable whose values fall into patches/bins, so we first discretize it.

In [12]:
from sklearn.preprocessing import KBinsDiscretizer

# Number of bins from distribution of age
# strategy='uniform' creates equal-width bins; encode='ordinal' returns the bin
# index (0 .. n_bins-1) as a single numeric column.
discretizer = KBinsDiscretizer(n_bins=14, encode='ordinal', strategy='uniform')

# Age is overwritten with its bin index; the raw ages are no longer available in df.
df['Age'] = discretizer.fit_transform(df[['Age']])
df['Age'].value_counts()

Age
4.0     19876
10.0    19774
12.0    19728
7.0     19709
3.0     19696
6.0     19639
0.0     19597
9.0     19555
13.0    19492
1.0     19419
5.0     14784
2.0     14742
8.0     14711
11.0    14625
Name: count, dtype: int64

In [13]:
num_features = ['Age', 'Income', 'LoanAmount', 'CreditScore', 'MonthsEmployed', 'InterestRate', 'LoanTerm']
from sklearn.preprocessing import MinMaxScaler

# Keep the fitted scaler under its own name. Binding the instance to the name
# `MinMaxScaler` would shadow the class, so re-running this cell (or using the
# class in a later cell) would fail with "object is not callable".
minmax_scaler = MinMaxScaler()
# Rescale each listed column to the [0, 1] range, column by column.
df[num_features] = minmax_scaler.fit_transform(df[num_features])
df.sample(3)

Unnamed: 0,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,...,HasMortgage,HasDependents,HasCoSigner,Default,MaritalStatus_Married,MaritalStatus_Single,LoanPurpose_Business,LoanPurpose_Education,LoanPurpose_Home,LoanPurpose_Other
138406,0.153846,0.622375,0.112858,0.140255,0.94958,1,0.267391,0.0,0.79,3.0,...,1,1,1,0,False,True,True,False,False,False
205078,0.307692,0.391558,0.171131,0.845173,0.798319,1,0.158696,0.75,0.17,1.0,...,1,0,1,0,False,True,False,False,True,False
205752,0.846154,0.364025,0.758305,0.07286,0.285714,1,0.775652,0.5,0.64,1.0,...,0,0,1,0,False,False,False,False,True,False


In [22]:
## Save the transformed dataset as a separate CSV file
df = df.astype(float) # Convert all data to float (the one-hot dummy columns are boolean until here)
print(df.shape)
# index=False: the RangeIndex carries no information, so it is not written out.
df.to_csv('../Dataset/Loan_default_transformed.csv',index=False)

(255347, 21)


### Feature Engineering Pipeline:
Now we will create a pipeline that automatically performs the same feature engineering steps.

> Note: We will still need to drop Loan ID manually

In [15]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder,LabelEncoder, MinMaxScaler,  KBinsDiscretizer, FunctionTransformer

## Defining feature classes
# Each list names the columns routed to one branch of the ColumnTransformer below.
binary_features = ['HasCoSigner', 'HasDependents','HasMortgage']  # 'Yes'/'No' -> 1/0
ordinal_features = ['Education']  # ordered categories -> ordinal codes
nominal_features = ['MaritalStatus','LoanPurpose']  # unordered categories -> one-hot
label_encoded_features = ['EmploymentType']  # categories -> integer labels
# 'Age' is deliberately absent: it gets its own bin-then-scale transformer.
# Columns not listed anywhere fall through to the transformer's remainder.
numerical_features = ['Income', 'LoanAmount', 'CreditScore', 'MonthsEmployed', 'InterestRate', 'LoanTerm']

In [16]:
# Reload the untouched raw data under a separate name so the pipeline starts
# from the original columns rather than the already-transformed `df`.
DF = pd.read_csv('../Dataset/Loan_default.csv')
# LoanID is dropped by hand; the pipeline does not remove it.
DF = DF.drop(columns=['LoanID'])

# Custom function for binary encoding
def BinaryEncoder(dataset):
   """Map 'Yes'/'No' to 1/0 in every column listed in ``binary_features``.

   The encoding is applied to a copy, so the frame passed in is left
   untouched and the function can be called repeatedly on the same raw data.

   Parameters
   ----------
   dataset : pandas.DataFrame
      Frame containing all columns named in the module-level
      ``binary_features`` list.

   Returns
   -------
   pandas.DataFrame
      Copy of ``dataset`` with the binary columns encoded. Values other than
      'Yes'/'No' become NaN (``Series.map`` semantics).
   """
   encoded = dataset.copy()
   for col in binary_features:
      encoded[col] = encoded[col].map({'Yes': 1, 'No': 0})
   return encoded

# Custom function for label encoding
def label_encode_columns(dataset):
   """Label-encode every object-dtype column of ``dataset``; pass other columns through.

   NOTE: a fresh LabelEncoder is fitted on each call, so the integer codes
   depend only on the values present in the frame passed in.
   """
   def _encode_if_object(col):
      # Only object-dtype (string-like) columns are encoded.
      if col.dtype == 'O':
         return LabelEncoder().fit_transform(col)
      return col

   return dataset.apply(_encode_if_object)

# Function Transformers wrapping the custom binary and label encoders
# validate=False passes the DataFrame through unchanged (the encoders rely on
# pandas methods); feature_names_out='one-to-one' reports the input column
# names as the output names.
FT_binary = FunctionTransformer(BinaryEncoder, validate=False, feature_names_out='one-to-one')
FT_label = FunctionTransformer(label_encode_columns, validate=False, feature_names_out='one-to-one')

In [17]:
## Custom transformer for Age as it needs both discretization and scaling
def bin_and_scale_age(dataset):
   """Discretize age into 14 equal-width bins, then min-max scale the bin index.

   Parameters
   ----------
   dataset : array-like
      Age values, either 1-D (n_samples,) or 2-D (n_samples, 1). DataFrames,
      Series, lists and ndarrays are all accepted.

   Returns
   -------
   numpy.ndarray
      2-D array holding the scaled bin index of every sample.

   Notes
   -----
   Both the discretizer and the scaler are created and fitted inside this
   function, so bin edges and the scaling range are recomputed from whatever
   data is passed on every call.
   """
   # Normalise to an ndarray first: a pandas Series has no .reshape, so the
   # 1-D branch below would otherwise fail for Series input.
   dataset = np.asarray(dataset)
   if dataset.ndim == 1: # Reshape into 2D
      dataset = dataset.reshape(-1, 1)

   # Step-1: Discretization (ordinal bin index per sample)
   kbins = KBinsDiscretizer(n_bins=14, encode='ordinal', strategy='uniform')
   binned = kbins.fit_transform(dataset)

   # Step-2: Scale the binned values
   scaler = MinMaxScaler()
   return scaler.fit_transform(binned)

# Wrap the function in FunctionTransformer
# validate=True lets scikit-learn check the input and hand the function a 2-D array.
# No feature_names_out is set, so this transformer does not report output names itself.
FT_age = FunctionTransformer(bin_and_scale_age, validate=True)

In [18]:
## Defining Column Transformer
# Each entry is (name, transformer, columns). Output columns are emitted in the
# order of the entries, followed by the passthrough remainder.
Ct = ColumnTransformer(transformers=[
   ('binary', FT_binary, binary_features),
   ('ordinal', OrdinalEncoder(categories=[["High School","Bachelor's","Master's","PhD"]]), ordinal_features),
   # drop='first' omits the first level of each nominal column
   ('Nominal->OHE', OneHotEncoder(drop='first'), nominal_features),
   ('Label->LE', FT_label, label_encoded_features),
   ('Age Transformer', FT_age, ['Age']),
   ('Numerical Scaling-> MinMax', MinMaxScaler(), numerical_features),
# remainder='passthrough' keeps every column not listed above unchanged
]  ,remainder='passthrough',
   force_int_remainder_cols = False, # Store the remainder columns by name rather than by integer position
)
Ct

In [19]:
## Create Pipeline
# Single-step pipeline around the column transformer; further steps can be appended later.
pipe = Pipeline(steps=[
   ('Preprocessor',Ct)
])

# Apply transformation
# DF still contains every column left after dropping LoanID; anything not
# handled by a transformer is passed through unchanged.
df_processed = pipe.fit_transform(DF)
# The result is a plain NumPy array, so column names are lost at this point.
display(df_processed,df_processed.shape)

array([[1.  , 1.  , 1.  , ..., 4.  , 0.44, 0.  ],
       [1.  , 0.  , 0.  , ..., 1.  , 0.68, 0.  ],
       [0.  , 1.  , 1.  , ..., 3.  , 0.31, 1.  ],
       ...,
       [1.  , 1.  , 1.  , ..., 3.  , 0.5 , 0.  ],
       [0.  , 1.  , 1.  , ..., 1.  , 0.44, 0.  ],
       [1.  , 0.  , 1.  , ..., 2.  , 0.48, 0.  ]])

(255347, 21)

In [20]:
def get_transformed_column_names(ct, original_features):
   """Extract transformed column names from a fitted ColumnTransformer.

   Parameters
   ----------
   ct : ColumnTransformer
      Fitted transformer; only its ``transformers_`` attribute is read.
   original_features : sequence of str
      Column names of the frame the transformer was fitted on, in order.
      Used to translate integer column positions back into names.

   Returns
   -------
   list
      Output column names in the order the transformer emits them.
   """
   original_features = list(original_features)

   def _as_names(features):
      """Return the column spec as a list of names, resolving integer positions."""
      if isinstance(features, str):
         # A scalar column spec; wrap it so it is not iterated character by character.
         return [features]
      return [
         original_features[col]
         if isinstance(col, (int, np.integer)) and not isinstance(col, bool)
         else col
         for col in features
      ]

   output_features = []

   for name, transformer, features in ct.transformers_:
      if transformer == 'passthrough':
         # Remainder columns may be stored as integer positions; report their names.
         output_features.extend(_as_names(features))
      elif transformer == 'drop':
         continue
      elif hasattr(transformer, "get_feature_names_out"):
         output_features.extend(transformer.get_feature_names_out(_as_names(features)))
      else:
         # Handle FunctionTransformer case: no names are available, so fall back
         # to "<step name>_<position>", assuming one output per input column.
         output_features.extend([f"{name}_{i}" for i in range(len(_as_names(features)))])

   return output_features

# Extract correct feature names
# Names come from the fitted 'Preprocessor' step, in the same order as the array's columns.
transformed_column_names = get_transformed_column_names(pipe.named_steps["Preprocessor"], DF.columns)

# Convert transformed data back to DataFrame
# df_processed is rebound from the NumPy array to a labelled DataFrame.
df_processed = pd.DataFrame(df_processed, columns=transformed_column_names)
df_processed.head(3)

Unnamed: 0,HasCoSigner,HasDependents,HasMortgage,Education,MaritalStatus_Married,MaritalStatus_Single,LoanPurpose_Business,LoanPurpose_Education,LoanPurpose_Home,LoanPurpose_Other,...,Age Transformer_0,Income,LoanAmount,CreditScore,MonthsEmployed,InterestRate,LoanTerm,NumCreditLines,DTIRatio,Default
0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.769231,0.525885,0.18607,0.400729,0.672269,0.575217,0.5,4.0,0.44,0.0
1,1.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.262461,0.487512,0.287796,0.12605,0.122174,1.0,1.0,0.68,0.0
2,0.0,1.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.538462,0.512656,0.506892,0.275046,0.218487,0.833478,0.25,3.0,0.31,1.0


In [21]:
# ## Rename correctly:
# names = {'binary_0':'HasCoSigner', "binary_1":'HasDependents', "binary_2":'HasMortgage',
#    'Label->LE_0':'EmploymentType', "Age Transformer_0":'Age',}
# df_processed = df_processed.rename(columns=names)
# df_processed.head(3)