# **Loan Status Prediction**

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

from xgboost import XGBClassifier

In [None]:
# Read the loan dataset from the project-relative CSV and preview the
# first ten rows.
df = pd.read_csv(filepath_or_buffer='data/data.csv')
df.head(n=10)

In [None]:
# Dimensions of the freshly loaded frame as a (rows, columns) tuple.
df.shape

In [None]:
# Summary statistics produced by DataFrame.describe() with default settings.
df.describe()

In [None]:
# Print the frame's column overview (dtypes and non-null counts) to spot
# columns that need cleaning.
df.info()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [7]:
# Discard every row that contains at least one missing value.
# NOTE: this rebinds `df`, so the raw frame is only recoverable by re-running
# the load cell.
df = df.dropna(axis=0, how='any')

In [None]:
# Dimensions after rows with missing values were dropped.
df.shape

In [None]:
# Names of the columns stored with the generic 'object' dtype.
categorial_col = [
    name for name, dtype in df.dtypes.items() if dtype == 'object'
]
categorial_col

In [None]:
# Names of every column whose dtype is anything other than 'object'.
numerical_col = [
    name for name, dtype in df.dtypes.items() if dtype != 'object'
]
numerical_col

In [None]:
# Show the frequency of each distinct value for every object-typed column,
# leaving blank lines between the tables.
for col in categorial_col:
    print(df[col].value_counts(), '\n', sep='\n')

In [12]:
# Swap the literal '3+' for the integer 4 wherever it occurs in the frame.
# NOTE(review): the replacement is not restricted to a single column, and the
# other values of the affected column presumably remain strings, leaving a
# mixed-type column; confirm this is intended.
df = df.replace('3+', 4)

## **Data Visualization**

In [None]:
# One count plot per object-typed column, split by loan status. The
# identifier column is skipped.
for col in categorial_col:
    if col == 'Loan_ID':
        continue
    sns.countplot(data=df, x=col, hue='Loan_Status')
    plt.show()

In [None]:
# Per-column mapping from category labels to integer codes.
encoding_map = {
    'Married': {'No': 0, 'Yes': 1},
    'Gender': {'Male': 1, 'Female': 0},
    'Self_Employed': {'No': 0, 'Yes': 1},
    'Property_Area': {'Rural': 0, 'Semiurban': 1, 'Urban': 2},
    'Education': {'Graduate': 1, 'Not Graduate': 0},
}

# Apply the mapping to `df` in place; 'Loan_Status' is not part of the
# mapping and is left untouched.
df.replace(encoding_map, inplace=True)

## **Model Training and Evaluation**

In [15]:
# Target is the loan status; features are everything except the target and
# the row identifier.
y = df['Loan_Status']
X = df.drop(columns=['Loan_ID', 'Loan_Status'])

# Hold out 20% of the rows for testing, with a fixed seed for a repeatable
# split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Dimensions of the training feature matrix produced by the split above.
X_train.shape

### **Logistic Regression**

In [None]:
# Hyper-parameter grid for an L1-penalised logistic regression. Named with an
# `_lr` suffix, like `param_grid_rf`, so that later cells cannot silently
# overwrite it.
param_grid_lr = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1'],
    'solver': ['liblinear', 'saga'],
    'max_iter': [100, 200, 500],
    'class_weight': [None, 'balanced'],
}

# Both solvers in the grid shuffle the data internally, so the estimator is
# seeded (same seed as the other models in this notebook) to make the search
# reproducible across runs.
# NOTE(review): no feature scaling is visible in this notebook; 'saga' may emit
# convergence warnings on unscaled inputs. Consider scaling if that happens.
lr_model = LogisticRegression(random_state=42)
grid_search_lr = GridSearchCV(
    lr_model,
    param_grid_lr,
    cv=3,
    n_jobs=-1,
    verbose=1,
    scoring='accuracy',
)
grid_search_lr.fit(X_train, y_train)

# GridSearchCV.predict delegates to the best estimator found by the search.
prediction_data_lr = grid_search_lr.predict(X_test)

print("Best parameters found: ", grid_search_lr.best_params_)
print("Best cross-validation accuracy: {:.3f}".format(grid_search_lr.best_score_))
print("Test accuracy: {:.3f}".format(accuracy_score(y_test, prediction_data_lr)))

### **Random Forest**

In [None]:
# Search space for the random forest: tree count and the two minimum-sample
# constraints vary; depth, bootstrapping and class weighting are held fixed.
param_grid_rf = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True],
    class_weight=['balanced'],
)

# Seeded forest wrapped in a 3-fold, accuracy-scored grid search.
model_rf = RandomForestClassifier(random_state=42)
grid_search_rf = GridSearchCV(
    estimator=model_rf,
    param_grid=param_grid_rf,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_search_rf.fit(X_train, y_train)

# Predict the held-out rows with the search's fitted estimator.
prediction_data_rf = grid_search_rf.predict(X_test)

best_cv_accuracy_rf = grid_search_rf.best_score_
test_accuracy_rf = accuracy_score(y_test, prediction_data_rf)

print("Best parameters found: ", grid_search_rf.best_params_)
print("Best cross-validation accuracy: {:.3f}".format(best_cv_accuracy_rf))
print("Test accuracy: {:.3f}".format(test_accuracy_rf))


### **Gradient Boosting**

In [None]:
# Hyper-parameter grid for gradient boosting. Named with a `_gb` suffix, like
# `param_grid_rf`, so it does not overwrite another model's grid.
param_grid_gb = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
}

# Seeded booster wrapped in a 3-fold, accuracy-scored grid search.
grid_search_gb = GridSearchCV(
    GradientBoostingClassifier(random_state=42),
    param_grid_gb,
    cv=3,
    n_jobs=-1,
    verbose=1,
    scoring='accuracy',
)
grid_search_gb.fit(X_train, y_train)
predictions_gb = grid_search_gb.predict(X_test)

# Report the gradient boosting search itself. The earlier version printed the
# random forest's parameters and scores here by mistake.
print("Best parameters found: ", grid_search_gb.best_params_)
print("Best cross-validation accuracy: {:.3f}".format(grid_search_gb.best_score_))
print("Test accuracy: {:.3f}".format(accuracy_score(y_test, predictions_gb)))

### **SVC**

In [None]:
# Fit a linear-kernel support vector classifier on the training split; no
# hyper-parameter search is run for this model.
svc = SVC(kernel='linear')
svc.fit(X_train, y_train)

# Score the fitted classifier on the held-out rows.
predictions_svc_test = svc.predict(X_test)
svc_test_accuracy = accuracy_score(y_test, predictions_svc_test)

print("Test accuracy (Test): {:.3f}".format(svc_test_accuracy))