In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import os

# Show every column and every row when a frame is displayed (no truncation).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# fillna_proportional() shuffles its fill values with NumPy's global RNG;
# seeding it here makes a Restart & Run All reproduce the same imputation.
SEED = 42
np.random.seed(SEED)

In [38]:
# Read the raw train and test tables from the local Dataset folder.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('Dataset/train.csv', 'Dataset/test.csv')
)

In [39]:
# Preview the first rows of the raw training table (includes the Loan_Status target).
df_train.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [40]:
# Preview the first rows of the raw test table.
df_test.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban


In [41]:
# Training features: the raw training table without the Loan_Status target.
df_X_train = df_train.drop('Loan_Status', axis='columns').copy()
df_X_train.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban


In [42]:
# Training target, kept next to Loan_ID so each label stays traceable to its row.
df_y_train = df_train.loc[:, ['Loan_ID', 'Loan_Status']].copy()
df_y_train.head()

Unnamed: 0,Loan_ID,Loan_Status
0,LP001002,Y
1,LP001003,N
2,LP001005,Y
3,LP001006,Y
4,LP001008,Y


In [43]:
# Work on a copy of the test table so df_test itself is left as loaded.
df_X_test = df_test.copy()
df_X_test.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban


### EDA

In [44]:
def fillna_proportional(df, column, random_state=None):
    """Fill missing values of ``column`` with its existing values, in proportion.

    Each observed value receives a share of the missing cells proportional to
    its frequency among the non-missing cells. Shares are made integral with
    the largest-remainder method (floor every share, then hand the leftover
    cells to the largest fractional parts), so no share can become negative
    and the shares always add up to the number of missing cells. The fill
    values are shuffled before being written into the missing positions.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to fill. It is modified in place and also returned.
    column : label
        Column of ``df`` whose missing values are filled.
    random_state : int, numpy.random.Generator or None, default None
        Seed (or generator) for the shuffle. With ``None`` NumPy's global
        random state is used, as before.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object. It is returned unchanged when the column has
        no missing values, or no observed values to sample from.
    """
    missing_mask = df[column].isna()
    missing_count = int(missing_mask.sum())
    vcount = df[column].value_counts()

    # Nothing to fill, or nothing to fill with (column entirely missing).
    if missing_count == 0 or vcount.empty:
        return df

    categories = list(vcount.index)
    exact_share = vcount.to_numpy(dtype=float) / float(vcount.sum()) * missing_count

    # Largest-remainder allocation: floor first, then top up by fractional part.
    to_fill_count = np.floor(exact_share).astype(int)
    shortfall = missing_count - int(to_fill_count.sum())
    if shortfall > 0:
        by_remainder = np.argsort(-(exact_share - to_fill_count), kind='stable')
        to_fill_count[by_remainder[:shortfall]] += 1

    fill_values = np.concatenate([np.repeat(category, count)
                                  for category, count in zip(categories, to_fill_count)])

    if random_state is None:
        np.random.shuffle(fill_values)
    else:
        np.random.default_rng(random_state).shuffle(fill_values)

    # A boolean mask (rather than index labels) stays correct when the index
    # contains duplicate labels.
    df.loc[missing_mask, column] = fill_values

    return df

In [45]:
def outlier_capping(data, upper_quantile, lower_quantile):
    """Clip ``data`` to the range between two of its own quantiles.

    Values below the ``lower_quantile`` quantile are raised to it and values
    above the ``upper_quantile`` quantile are lowered to it. Note the argument
    order: the upper quantile comes first.
    """
    return data.clip(
        lower=data.quantile(lower_quantile),
        upper=data.quantile(upper_quantile),
    )

In [46]:
from sklearn.preprocessing import StandardScaler

def preprocess(df, ID, treat_na=False, treat_outliers=False, upper_quantile=0.99, lower_quantile=0.01, scaler=None):
    """Clean, encode and standardize a raw feature table.

    Steps, in order:
      1. drop columns with more than 40% missing values;
      2. split the remaining columns into numeric and non-numeric, leaving
         the ``ID`` column out of both groups so it is never transformed;
      3. lowercase string-typed non-numeric columns;
      4. map every column with exactly two distinct non-null values to
         0/1 (in sorted order of the two values) as nullable ``Int64``;
      5. optionally fill missing values (proportional fill for categorical
         and binary columns, median for numeric ones);
      6. optionally cap numeric columns at the given quantiles;
      7. one-hot encode the remaining categorical columns (first level dropped);
      8. standardize the numeric columns.

    Column roles (binary / categorical / numeric), medians, quantile caps and
    dummy columns are all derived from the frame passed in, so two frames
    processed separately can end up with different columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table. It is not modified; a new frame is returned.
    ID : label
        Identifier column, carried through untouched.
    treat_na : bool, default False
        Fill missing values (step 5).
    treat_outliers : bool, default False
        Cap numeric columns (step 6).
    upper_quantile, lower_quantile : float
        Quantiles used as caps when ``treat_outliers`` is true.
    scaler : object with ``transform``, optional
        An already fitted scaler. When given, the numeric columns are
        transformed with it instead of fitting a new ``StandardScaler`` —
        pass the scaler returned for the training data when processing
        test data so both share the same statistics.

    Returns
    -------
    (pandas.DataFrame, scaler)
        The processed frame and the scaler that was used. If the frame has no
        numeric columns and no scaler was supplied, the returned
        ``StandardScaler`` is unfitted.
    """
    # Step 1: Drop columns with excessive missing values
    missing_percentage = df.isnull().mean()
    columns_to_drop = missing_percentage[missing_percentage > 0.4].index.tolist()
    df = df.drop(columns=columns_to_drop)

    # Step 2: Identify numeric and categorical columns
    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()

    # The ID is an identifier, not a feature: keep it out of both groups so a
    # numeric ID is not scaled or binary-mapped either.
    num_cols = [col for col in num_cols if col != ID]
    cat_cols = [col for col in cat_cols if col != ID]

    # Step 3: Normalize string categorical data - lowercase categorical columns.
    # Only string-typed columns have a .str accessor (bool/datetime do not).
    for col in cat_cols:
        if pd.api.types.is_string_dtype(df[col].dtype):
            df[col] = df[col].str.lower()

    # Step 4: Identify and map binary columns
    bin_cols = []
    for col in num_cols + cat_cols:
        if df[col].nunique() == 2:
            bin_cols.append(col)
            low_value, high_value = sorted(df[col].dropna().unique())
            # Missing values are not keys of the mapping, so they stay missing;
            # Int64 keeps the column integer-typed despite them.
            df[col] = df[col].map({low_value: 0, high_value: 1}).astype('Int64')

    # Remove bin_cols from num_cols and cat_cols
    num_cols = [col for col in num_cols if col not in bin_cols]
    cat_cols = [col for col in cat_cols if col not in bin_cols]

    # Step 5: Handle missing values if treat_na is True
    if treat_na:
        # Categorical and Binary Columns
        for col in (cat_cols + bin_cols):
            df = fillna_proportional(df, col)
        # Numerical Columns
        for col in num_cols:
            df[col] = df[col].fillna(df[col].median())

    # Step 6: Outlier treatment if treat_outliers is True
    if treat_outliers:
        for col in num_cols:
            df[col] = outlier_capping(df[col], upper_quantile, lower_quantile)

    # Step 7: Convert categorical columns to dummy variables
    if cat_cols:
        df = pd.get_dummies(df, columns=cat_cols, drop_first=True, dtype='int')

    # Step 8: Normalize Data - fit a new scaler unless a fitted one was supplied
    if scaler is None:
        scaler = StandardScaler()
        if num_cols:
            df[num_cols] = scaler.fit_transform(df[num_cols])
    elif num_cols:
        df[num_cols] = scaler.transform(df[num_cols])

    return df, scaler


In [47]:
# Column dtypes and non-null counts of the raw training features, before preprocessing.
df_X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
dtypes: float64(4), int64(1), object(7)
memory usage: 57.7+ KB


In [89]:
# Preprocess the training features; keep the returned scaler, which is saved to disk later.
df_X_train_cleaned, scaler = preprocess(
    df=df_X_train,
    ID='Loan_ID',
    treat_na=True,
    treat_outliers=True,
)
df_X_train_cleaned.head()

Unnamed: 0,Loan_ID,Gender,Married,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Dependents_1,Dependents_2,Dependents_3+,Property_Area_semiurban,Property_Area_urban
0,LP001002,1,0,0,0,0.140201,-0.810858,-0.213994,0.274463,1,0,0,0,0,1
1,LP001003,1,1,0,0,-0.139573,0.013305,-0.213994,0.274463,1,1,0,0,0,0
2,LP001005,1,1,0,1,-0.489401,-0.810858,-1.004311,0.274463,1,0,0,0,0,1
3,LP001006,1,1,1,0,-0.581554,0.477853,-0.31597,0.274463,1,0,0,0,0,1
4,LP001008,1,0,0,0,0.173571,-0.810858,-0.048282,0.274463,1,0,0,0,0,1


In [90]:
# df_X_train_cleaned.describe([.01,.02,.03,.04,.05,.95,.96,.97,.98,.99])

In [91]:
# Preprocess the test features with the same options as the training features.
# NOTE(review): preprocess() fits a new StandardScaler on whatever frame it is
# given, and the one fitted here is discarded (`_`). The test features are
# therefore standardized with test-set statistics, not with the training
# `scaler` that is saved alongside the model. Medians, quantile caps and
# binary/dummy columns are likewise derived from the test frame itself.
df_X_test_cleaned, _ = preprocess(df_X_test, ID='Loan_ID',treat_na=True, treat_outliers=True)
df_X_test_cleaned.head()

Unnamed: 0,Loan_ID,Gender,Married,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Dependents_1,Dependents_2,Dependents_3+,Property_Area_semiurban,Property_Area_urban
0,LP001015,1,1,0,0,0.35984,-0.762204,-0.448487,0.268027,1,0,0,0,0,1
1,LP001022,1,1,0,0,-0.49615,-0.01073,-0.163351,0.268027,1,1,0,0,0,1
2,LP001031,1,1,0,0,0.126741,0.139565,1.297971,0.268027,1,0,1,0,0,1
3,LP001035,1,1,0,0,-0.734429,0.513299,-0.626697,0.268027,1,0,1,0,0,1
4,LP001051,1,0,1,0,-0.431401,-0.762204,-1.018759,0.268027,1,0,0,0,0,1


In [92]:
# Dtypes and non-null counts after preprocessing (binary columns become Int64, dummies int64).
df_X_train_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Loan_ID                  614 non-null    object 
 1   Gender                   614 non-null    Int64  
 2   Married                  614 non-null    Int64  
 3   Education                614 non-null    Int64  
 4   Self_Employed            614 non-null    Int64  
 5   ApplicantIncome          614 non-null    float64
 6   CoapplicantIncome        614 non-null    float64
 7   LoanAmount               614 non-null    float64
 8   Loan_Amount_Term         614 non-null    float64
 9   Credit_History           614 non-null    Int64  
 10  Dependents_1             614 non-null    int64  
 11  Dependents_2             614 non-null    int64  
 12  Dependents_3+            614 non-null    int64  
 13  Property_Area_semiurban  614 non-null    int64  
 14  Property_Area_urban      6

In [93]:
# Missing values per column in the raw training features.
df_X_train.isna().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
dtype: int64

In [94]:
# Missing values per column after preprocessing the training features.
df_X_train_cleaned.isna().sum()

Loan_ID                    0
Gender                     0
Married                    0
Education                  0
Self_Employed              0
ApplicantIncome            0
CoapplicantIncome          0
LoanAmount                 0
Loan_Amount_Term           0
Credit_History             0
Dependents_1               0
Dependents_2               0
Dependents_3+              0
Property_Area_semiurban    0
Property_Area_urban        0
dtype: int64

In [95]:
# Missing values per column in the raw test features.
df_X_test.isna().sum()

Loan_ID               0
Gender               11
Married               0
Dependents           10
Education             0
Self_Employed        23
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            5
Loan_Amount_Term      6
Credit_History       29
Property_Area         0
dtype: int64

In [96]:
# Missing values per column after preprocessing the test features.
df_X_test_cleaned.isna().sum()

Loan_ID                    0
Gender                     0
Married                    0
Education                  0
Self_Employed              0
ApplicantIncome            0
CoapplicantIncome          0
LoanAmount                 0
Loan_Amount_Term           0
Credit_History             0
Dependents_1               0
Dependents_2               0
Dependents_3+              0
Property_Area_semiurban    0
Property_Area_urban        0
dtype: int64

### Model Creation

In [97]:
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn import metrics

In [98]:
# Model inputs: the preprocessed features without the Loan_ID identifier.
X_train = df_X_train_cleaned.drop(columns=['Loan_ID']).copy()

# preprocess() derives binary and dummy columns from each frame independently,
# so the test frame is not guaranteed to have the same columns, in the same
# order, as the training frame. Align it to the training layout: columns absent
# from the test frame are added as 0, extra test-only columns are dropped.
X_test = (
    df_X_test_cleaned
    .drop(columns=['Loan_ID'])
    .reindex(columns=X_train.columns, fill_value=0)
)

In [99]:
# First rows of the training feature matrix (Loan_ID removed).
X_train.head()

Unnamed: 0,Gender,Married,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Dependents_1,Dependents_2,Dependents_3+,Property_Area_semiurban,Property_Area_urban
0,1,0,0,0,0.140201,-0.810858,-0.213994,0.274463,1,0,0,0,0,1
1,1,1,0,0,-0.139573,0.013305,-0.213994,0.274463,1,1,0,0,0,0
2,1,1,0,1,-0.489401,-0.810858,-1.004311,0.274463,1,0,0,0,0,1
3,1,1,1,0,-0.581554,0.477853,-0.31597,0.274463,1,0,0,0,0,1
4,1,0,0,0,0.173571,-0.810858,-0.048282,0.274463,1,0,0,0,0,1


In [100]:
# Target labels are still the raw 'Y'/'N' strings at this point.
df_y_train.head()

Unnamed: 0,Loan_ID,Loan_Status
0,LP001002,Y
1,LP001003,N
2,LP001005,Y
3,LP001006,Y
4,LP001008,Y


In [101]:
# Encode the target as integers: 'Y' -> 1, 'N' -> 0.
mapping = {'Y': 1, 'N': 0}
y_train = df_y_train.loc[:, 'Loan_Status'].map(mapping)
y_train.head()

0    1
1    0
2    1
3    1
4    1
Name: Loan_Status, dtype: int64

In [102]:
# Logistic regression with library-default settings, fitted on the full training set.
model_logist = LogisticRegression()
model_logist.fit(X_train, y_train)

In [103]:
# XGBoost classifier with library-default settings, fitted on the full training set.
model_xgb = xgb.XGBClassifier()
model_xgb.fit(X_train, y_train)

In [104]:
# Save the XGBoost model and the scaler fitted on the training features.
# Only these two objects are persisted; the other preprocessing steps
# (binary mapping, imputation, capping, dummies) are not saved here.
import joblib
import os
model_path = './model/model.pkl'
scaler_path = './model/scaler.pkl'
# Create the target folder if it does not exist yet (both files share it).
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(model_xgb, model_path)
joblib.dump(scaler, scaler_path)
print(f"Model saved to: {model_path} \nScaler saved to: {scaler_path}")

Model saved to: ./model/model.pkl 
Scaler saved to: ./model/scaler.pkl


In [105]:
# Class predictions from both models on the training and the test features.
y_pred_train_logist = model_logist.predict(X_train)
y_pred_test_logist = model_logist.predict(X_test)

y_pred_train_xgb = model_xgb.predict(X_train)
y_pred_test_xgb = model_xgb.predict(X_test)

In [106]:
# Classification report for logistic regression.
# NOTE(review): computed on the same rows the model was fitted on (y_train vs.
# training predictions), so it measures fit, not generalization.
report_logist = metrics.classification_report(y_train, y_pred_train_logist)
print(report_logist)

              precision    recall  f1-score   support

           0       0.88      0.44      0.59       192
           1       0.79      0.97      0.87       422

    accuracy                           0.81       614
   macro avg       0.83      0.71      0.73       614
weighted avg       0.82      0.81      0.78       614



In [107]:
# Classification report for XGBoost.
# NOTE(review): computed on the same rows the model was fitted on (y_train vs.
# training predictions), so it measures fit, not generalization.
report_xgb = metrics.classification_report(y_train, y_pred_train_xgb)
print(report_xgb)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       192
           1       1.00      1.00      1.00       422

    accuracy                           1.00       614
   macro avg       1.00      1.00      1.00       614
weighted avg       1.00      1.00      1.00       614



In [108]:
# Pair each test Loan_ID with the XGBoost prediction for that row (still 0/1 here).
predictions = pd.DataFrame({
    'Loan_ID': df_X_test['Loan_ID'],
    'Loan_Status': y_pred_test_xgb
})
predictions.head()

Unnamed: 0,Loan_ID,Loan_Status
0,LP001015,1
1,LP001022,1
2,LP001031,1
3,LP001035,1
4,LP001051,0


In [109]:
# remap the output
mapping = {
    1: 'Y',
    0: 'N'
}

predictions['Loan_Status'] = predictions['Loan_Status'].map(mapping)
predictions.head()

Unnamed: 0,Loan_ID,Loan_Status
0,LP001015,Y
1,LP001022,Y
2,LP001031,Y
3,LP001035,Y
4,LP001051,N


In [110]:
# Write the labelled predictions to CSV, creating the output folder if needed.
pred_path = './Predictions/predictions.csv'
os.makedirs(os.path.dirname(pred_path), exist_ok=True)

# index=False: only the Loan_ID and Loan_Status columns are written.
predictions.to_csv(pred_path, index=False)