## Preprocessing

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

**Loading Cleaned Dataset**

In [2]:
cleaned_train_df = pd.read_csv("cleaned_train_df.csv")
cleaned_test_df = pd.read_csv("cleaned_test_df.csv")

cleaned_train2 = cleaned_train_df.copy()
cleaned_test2 = cleaned_test_df.copy()

In [3]:
print(cleaned_train_df.shape)
print(cleaned_train_df.columns)
cleaned_train_df.head()

(614, 15)
Index(['Unnamed: 0', 'Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status',
       'LoanAmount_Clipped'],
      dtype='object')


Unnamed: 0.1,Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,LoanAmount_Clipped
0,0,LP001002,Male,No,0,Graduate,No,5849,0.0,128.0,360.0,1.0,Urban,Y,128.0
1,1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N,128.0
2,2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y,66.0
3,3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y,120.0
4,4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y,141.0


**Encoding Variables**

In [4]:
# target variable
cleaned_train_df['Loan_Status'] = cleaned_train_df['Loan_Status'].map({'Y': 1, 'N': 0})

# categorical variable
categorical_cols = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area']
encoded_train_df = pd.get_dummies(cleaned_train_df, columns = categorical_cols, drop_first = True)

**Splitting Features & Target**

In [5]:
X = encoded_train_df.drop('Loan_Status', axis = 1)
y = encoded_train_df['Loan_Status']

**Splitting into Training & Validation Sets**

In [8]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

**Scaling Numerical Columns**

In [10]:
X.dtypes

Unnamed: 0                   int64
Loan_ID                     object
ApplicantIncome              int64
CoapplicantIncome          float64
LoanAmount                 float64
Loan_Amount_Term           float64
Credit_History             float64
LoanAmount_Clipped         float64
Gender_Male                   bool
Married_Yes                   bool
Dependents_1                  bool
Dependents_2                  bool
Dependents_3+                 bool
Education_Not Graduate        bool
Self_Employed_Yes             bool
Property_Area_Semiurban       bool
Property_Area_Urban           bool
dtype: object

In [None]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# need to drop LoanID

ValueError: could not convert string to float: 'LP001535'

**Building & Training Baseline Models**

In [None]:
log_reg = LogisticRegression(max_iter = 1000)
log_reg.fit(X_train_scaled, y_train)
y_pred_logreg = log_reg.predict(X_val_scaled)

**Decision Tree**

In [None]:
decision_tree = DecisionTreeClassifier(max_depth = 4, random_state = 42)
decision_tree.fit(X_train, y_train)
y_pred_decision_tree = decision_tree.predict(X_val)

**Random Forest**

In [None]:
random_forest = RandomForestClassifier(n_estimators = 100, random_state = 42)
random_forest.fit(X_train, y_train)
y_pred_random_forest = random_forest.predict(X_val)

**Evaluating Model Performance**

In [None]:
def evaluate_model(name, y_true, y_pred):
    print(f"{name} Model Performance: ")
    print(f"Accuary: {round(accuracy_score(y_true, y_pred), 3)}")
    print(f"Precision: {round(precision_score(y_true, y_pred), 3)}")
    print(f"Recall: {round(recall_score(y_true, y_pred), 3)}")
    print(f"F1 Score: {round(f1_score(y_true, y_pred), 3)}")
    print(f"\nConfusion Matrix:\n{confusion_matrix(y_true, y_pred)}")
    print(f"\nClassification Report:\n{classification_report(y_true, y_pred)}")

evaluate_model("Logistic Regression", y_val, y_pred_logreg)
evaluate_model("Decision Tree", y_val, y_pred_decision_tree)
evaluate_model("Random Forest", y_val, y_pred_random_forest)