## ML model for Classification:

In [None]:
import numpy as np
import pandas as pd

In [None]:
## Importing Dataset:
# Load the raw loan-default data; the path is relative to the notebook's working directory.
df = pd.read_csv('../Dataset/Loan_default.csv')
print(df.shape)  # (rows, columns) of the raw frame
df.head()  # preview the first rows

## Feature Engineering:

There are no missing values or outliers in this dataset, so there is no need for imputation or outlier detection.

#### Step-1: Encoding Categorical Features

Firstly, we can use binary encoding for binary categorical features

In [None]:
## Encoding Binary Categorical Features
# Each listed column is overwritten in place with 1 for 'Yes' and 0 for 'No'.
# NOTE: the mapping only has the keys 'Yes' and 'No', so any other value
# (including the 1/0 produced by a previous run of this cell) becomes NaN.
# Reload df before re-running this cell.
Features = ['HasCoSigner', 'HasDependents', 'HasMortgage']

for feature in Features:
   df[feature] = df[feature].map({'Yes':1, 'No':0})
df.head()

In [None]:
## Drop useless columns:
# LoanID is removed from df here. Re-running this cell without reloading df
# fails because the column is already gone.
df = df.drop(columns=['LoanID'])

Marital status and loan purpose each have more than two classes, their class frequencies are fairly similar, and the classes have no natural order, so we use one-hot encoding for them.

In [None]:
df['LoanPurpose'].value_counts()

In [None]:
# One-hot encode the two nominal columns; drop_first=True omits the first
# level of each column so the remaining dummies are not redundant.
df = pd.get_dummies(df,columns=['MaritalStatus','LoanPurpose'],drop_first=True)

Education is an ordinal variable where order matters, so we use ordinal encoding

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Education levels listed from lowest to highest; the encoder assigns codes in this order.
oe = OrdinalEncoder(categories=[["High School","Bachelor's","Master's","PhD"]])
# The encoder expects a 2-D input, so the column is passed as an (n_rows, 1) array.
df['Education'] = oe.fit_transform(df[['Education']].to_numpy())

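There is some order to employment type, although it may not be significant, so we use label encoding for it. Note that `LabelEncoder` assigns integer codes by sorting the category names, so the codes do not necessarily follow that order.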

In [None]:
df['EmploymentType'].value_counts()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace each employment-type label with an integer code, overwriting the column.
# NOTE(review): LabelEncoder is documented for target labels and orders classes
# by sorted label value — confirm the resulting codes match the intended ordering.
le = LabelEncoder()
df['EmploymentType'] = le.fit_transform(df['EmploymentType'])
df['EmploymentType'].value_counts()  # frequency of each integer code

In [None]:
df.info()

#### Step-2: Feature Scaling for Numerical Features:

In [None]:
df.head()

Age is a continuous variable whose values fall into patches/bins, so we first discretize it.

In [None]:
from sklearn.preprocessing import KBinsDiscretizer

# Number of bins from distribution of age
# strategy='uniform' requests equal-width bins; encode='ordinal' returns the bin index.
discretizer = KBinsDiscretizer(n_bins=14, encode='ordinal', strategy='uniform')

df['Age'] = discretizer.fit_transform(df[['Age']])  # overwrites raw ages with bin indices
df['Age'].value_counts()  # number of rows per bin

In [None]:
# Columns rescaled to the [0, 1] range; 'Age' already holds bin indices at this point.
num_features = ['Age', 'Income', 'LoanAmount', 'CreditScore', 'MonthsEmployed', 'InterestRate', 'LoanTerm']
from sklearn.preprocessing import MinMaxScaler

# Bind the scaler instance to its own name. Assigning it to `MinMaxScaler`
# would replace the imported class with an instance, so any later
# `MinMaxScaler()` call would fail until the class is imported again.
minmax_scaler = MinMaxScaler()
df[num_features] = minmax_scaler.fit_transform(df[num_features])
df.sample(3)

In [None]:
## Save the transformed dataset as another dataset
df = df.astype(float) # Convert all data to float
# The export below is currently disabled; uncomment it to write the file.
# df.to_csv('../Dataset/Loan_default_transformed.csv',index=False)

### Feature Engineering Pipeline:
Now we will create a pipeline which will automatically perform feature engineering in the same way.

> Note: We will still need to drop Loan ID manually

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder,LabelEncoder, MinMaxScaler,  KBinsDiscretizer, FunctionTransformer

## Defining feature classes
# Column groups consumed by the column transformer defined later in the notebook.
binary_features = ['HasCoSigner', 'HasDependents','HasMortgage']  # Yes/No columns
ordinal_features = ['Education']  # ordered categories
nominal_features = ['MaritalStatus','LoanPurpose']  # unordered categories
label_encoded_features = ['EmploymentType']  # encoded as integer label codes
# 'Age' is intentionally absent: it gets its own bin-then-scale transformer.
numerical_features = ['Income', 'LoanAmount', 'CreditScore', 'MonthsEmployed', 'InterestRate', 'LoanTerm']

In [None]:
# Reload the raw CSV into a separate frame (DF); df above has already been transformed step by step.
DF = pd.read_csv('../Dataset/Loan_default.csv')
DF = DF.drop(columns=['LoanID'])  # LoanID is dropped manually, outside the pipeline

# Custom function for binary encoding
def BinaryEncoder(dataset, columns=None):
   """Return a copy of ``dataset`` with Yes/No columns encoded as 1/0.

   Parameters
   ----------
   dataset : pandas.DataFrame
      Frame containing the columns to encode. It is not modified.
   columns : iterable of str, optional
      Names of the columns to encode. When omitted, the module-level
      ``binary_features`` list is used, as before.

   Returns
   -------
   pandas.DataFrame
      Copy of ``dataset`` in which every selected column has been mapped
      with ``{'Yes': 1, 'No': 0}``. Values other than 'Yes'/'No' become NaN.
   """
   if columns is None:
      columns = binary_features

   # Work on a copy so the caller's frame (or a slice of it) is never mutated.
   encoded = dataset.copy()
   for col in columns:
      encoded[col] = encoded[col].map({'Yes': 1, 'No': 0})
   return encoded

# Custom function for label encoding
def label_encode_columns(dataset):
   """Label-encode every object-dtype column of ``dataset``.

   Columns whose dtype is not object are returned unchanged. A new
   ``LabelEncoder`` is fitted for each object column on every call, so the
   integer codes depend on the values present in ``dataset`` at call time.
   """
   def _encode_if_object(col):
      # Only object-dtype columns are encoded; everything else passes through.
      if col.dtype == 'O':
         return LabelEncoder().fit_transform(col)
      return col

   return dataset.apply(_encode_if_object)

# Function transformers wrapping the two custom encoders defined in this cell.
# feature_names_out='one-to-one' declares that output column names equal the input names.
# NOTE(review): validate=False is assumed to hand the input to the function
# without array conversion — confirm against the FunctionTransformer docs.
FT_binary = FunctionTransformer(BinaryEncoder, validate=False, feature_names_out='one-to-one')
FT_label = FunctionTransformer(label_encode_columns, validate=False, feature_names_out='one-to-one')

In [None]:
## Custom transformer for Age as it needs both discretization and scaling
def bin_and_scale_age(dataset):
   """Discretize ages into 14 uniform bins, then min-max scale the bin indices.

   ``dataset`` is expected to be an array; a 1-D array is reshaped into a
   single column first. Both the discretizer and the scaler are created and
   fitted on ``dataset`` itself on every call.
   """
   # Estimators need a 2-D input, so promote a flat array to one column.
   ages = dataset.reshape(-1, 1) if dataset.ndim == 1 else dataset

   # Discretization: ordinal index of the equal-width bin each value falls in.
   age_bins = KBinsDiscretizer(n_bins=14, encode='ordinal', strategy='uniform').fit_transform(ages)

   # Scaling: min-max scale the bin indices.
   return MinMaxScaler().fit_transform(age_bins)

# Wrap the function in FunctionTransformer
# validate=True asks FunctionTransformer to check the input before calling the function.
# No feature_names_out is given here, unlike FT_binary and FT_label.
FT_age = FunctionTransformer(bin_and_scale_age, validate=True)

In [None]:
## Defining Column Transformer
# Each entry is (name, transformer, columns). Columns not listed in any entry
# are kept unchanged because remainder='passthrough'.
Ct = ColumnTransformer(transformers=[
   ('binary', FT_binary, binary_features),
   ('ordinal', OrdinalEncoder(categories=[["High School","Bachelor's","Master's","PhD"]]), ordinal_features),
   ('Nominal->OHE', OneHotEncoder(drop='first'), nominal_features),
   ('Label->LE', FT_label, label_encoded_features),
   ('Age Transformer', FT_age, ['Age']),
   ('Numerical Scaling-> MinMax', MinMaxScaler(), numerical_features),
]  ,remainder='passthrough',
   force_int_remainder_cols = False, # NOTE(review): intended to keep remainder columns stored by name rather than integer position — confirm against the installed scikit-learn version
)
Ct

In [None]:
## Create Pipeline
# Single-step pipeline wrapping the column transformer.
pipe = Pipeline(steps=[
   ('Preprocessor',Ct)
])

# Apply transformation
# fit_transform fits the preprocessing step on DF and returns the transformed data.
df_processed = pipe.fit_transform(DF)
display(df_processed,df_processed.shape)  # show the transformed data and its shape

In [None]:
def get_transformed_column_names(ct, original_features):
   """Extract transformed column names from a fitted ColumnTransformer.

   Parameters
   ----------
   ct : fitted ColumnTransformer
      Object exposing ``transformers_`` as ``(name, transformer, columns)`` triples.
   original_features : sequence of str
      Column names of the data the transformer was fitted on, in order.
      Used to turn integer column positions of passed-through columns back
      into names.

   Returns
   -------
   list
      Output column names in the order the transformers appear in
      ``ct.transformers_``. Dropped transformers contribute nothing.
      Transformers without ``get_feature_names_out`` get the placeholder
      names ``"<name>_<i>"``, one per input column.
   """
   from numbers import Integral

   original_features = list(original_features)

   def _resolve(col):
      # Passed-through columns may be recorded as integer positions instead
      # of names; map those back through the original column order.
      # bool is excluded because it is also an Integral.
      if isinstance(col, Integral) and not isinstance(col, bool):
         return original_features[col]
      return col

   output_features = []

   for name, transformer, features in ct.transformers_:
      if isinstance(transformer, str) and transformer == 'passthrough':
         output_features.extend(_resolve(col) for col in features)
      elif isinstance(transformer, str) and transformer == 'drop':
         continue
      elif hasattr(transformer, "get_feature_names_out"):
         output_features.extend(transformer.get_feature_names_out(features))
      else:
         # Transformer cannot report its output names (e.g. a FunctionTransformer
         # created without feature_names_out); assume one output per input column.
         output_features.extend(f"{name}_{i}" for i in range(len(features)))

   return output_features

# Extract correct feature names
transformed_column_names = get_transformed_column_names(pipe.named_steps["Preprocessor"], DF.columns)

# Convert transformed data back to DataFrame
# NOTE: df_processed is rebound here from the raw pipeline output to a labelled
# DataFrame, so this cell should only be run once per pipeline run.
df_processed = pd.DataFrame(df_processed, columns=transformed_column_names)
df_processed.head(3)

In [None]:
# ## Rename correctly:
# names = {'binary_0':'HasCoSigner', "binary_1":'HasDependents', "binary_2":'HasMortgage',
#    'Label->LE_0':'EmploymentType', "Age Transformer_0":'Age',}
# df_processed = df_processed.rename(columns=names)
# df_processed.head(3)