<a href="https://colab.research.google.com/github/jorden17/loan_approval_prediction/blob/main/loan_approval_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [3]:
train_data = pd.read_csv('/content/Training Dataset.csv')
test_data = pd.read_csv('/content/Test Dataset.csv')

In [6]:
print("train\n",train_data.head())
print("test\n",test_data.head())

train
     Loan_ID Gender Married Dependents     Education Self_Employed  \
0  LP001002   Male      No          0      Graduate            No   
1  LP001003   Male     Yes          1      Graduate            No   
2  LP001005   Male     Yes          0      Graduate           Yes   
3  LP001006   Male     Yes          0  Not Graduate            No   
4  LP001008   Male      No          0      Graduate            No   

   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0             5849                0.0         NaN             360.0   
1             4583             1508.0       128.0             360.0   
2             3000                0.0        66.0             360.0   
3             2583             2358.0       120.0             360.0   
4             6000                0.0       141.0             360.0   

   Credit_History Property_Area Loan_Status  
0             1.0         Urban           Y  
1             1.0         Rural           N  
2            

In [7]:
print(train_data.isnull().sum())
print(test_data.isnull().sum())

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64
Loan_ID               0
Gender               11
Married               0
Dependents           10
Education             0
Self_Employed        23
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            5
Loan_Amount_Term      6
Credit_History       29
Property_Area         0
dtype: int64


In [8]:
for col in ['Gender', 'Married', 'Dependents', 'Self_Employed', 'Credit_History']:
    train_data[col].fillna(train_data[col].mode()[0], inplace=True)
    test_data[col].fillna(test_data[col].mode()[0], inplace=True)

In [10]:
train_data['LoanAmount'].fillna(train_data['LoanAmount'].median(), inplace=True)
test_data['LoanAmount'].fillna(test_data['LoanAmount'].median(), inplace=True)

In [11]:
train_data['Loan_Amount_Term'].fillna(train_data['Loan_Amount_Term'].median(), inplace=True)
test_data['Loan_Amount_Term'].fillna(test_data['Loan_Amount_Term'].median(), inplace=True)

In [12]:
# Encode categorical features
train_data = pd.get_dummies(train_data, columns=['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area'], drop_first=True)
test_data = pd.get_dummies(test_data, columns=['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area'], drop_first=True)

In [13]:
# Ensure the columns in the test set match the train set
missing_cols = set(train_data.columns) - set(test_data.columns)
for col in missing_cols:
    test_data[col] = 0
test_data = test_data[train_data.columns.drop('Loan_Status')]

In [14]:
# Define target and features
X_train = train_data.drop(['Loan_ID', 'Loan_Status'], axis=1)
y_train = train_data['Loan_Status'].map({'Y': 1, 'N': 0})

X_test = test_data.drop(['Loan_ID'], axis=1)

In [15]:
X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)


In [16]:
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'Support Vector Machine': SVC(),
    'k-Nearest Neighbors': KNeighborsClassifier()
}

In [17]:
# Evaluate models using cross-validation
for name, model in models.items():
    model.fit(X_train_split, y_train_split)
    val_preds = model.predict(X_val)
    accuracy = accuracy_score(y_val, val_preds)
    print(f"{name}: Validation Accuracy = {accuracy}")

Logistic Regression: Validation Accuracy = 0.7886178861788617
Random Forest: Validation Accuracy = 0.7804878048780488
Decision Tree: Validation Accuracy = 0.6829268292682927
Support Vector Machine: Validation Accuracy = 0.6504065040650406
k-Nearest Neighbors: Validation Accuracy = 0.5772357723577236


In [18]:
best_model = LogisticRegression()
best_model.fit(X_train, y_train)


In [19]:
test_preds = best_model.predict(X_test)

In [20]:
submission = pd.DataFrame({'Loan_ID': test_data['Loan_ID'], 'Loan_Status': test_preds})
submission['Loan_Status'] = submission['Loan_Status'].map({1: 'Y', 0: 'N'})
submission.to_csv('loan_approval_predictions.csv', index=False)


In [21]:
print("Loan approval prediction completed and saved to 'loan_approval_predictions.csv'")

Loan approval prediction completed and saved to 'loan_approval_predictions.csv'


In [24]:
print(train_data.head())

    Loan_ID  ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0  LP001002             5849                0.0       128.0             360.0   
1  LP001003             4583             1508.0       128.0             360.0   
2  LP001005             3000                0.0        66.0             360.0   
3  LP001006             2583             2358.0       120.0             360.0   
4  LP001008             6000                0.0       141.0             360.0   

   Credit_History Loan_Status  Gender_Male  Married_Yes  Dependents_1  \
0             1.0           Y         True        False         False   
1             1.0           N         True         True          True   
2             1.0           Y         True         True         False   
3             1.0           Y         True         True         False   
4             1.0           Y         True        False         False   

   Dependents_2  Dependents_3+  Education_Not Graduate  Self_Employed_Yes 