In [137]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [138]:
train_df = pd.read_csv("https://raw.githubusercontent.com/subashgandyer/datasets/main/loan_train.csv")
test_df = pd.read_csv("https://raw.githubusercontent.com/subashgandyer/datasets/main/loan_test.csv")

In [139]:
train_X = train_df.drop(['Loan_ID','Loan_Status'], axis=1)
train_y = train_df[['Loan_Status']]

test_X = test_df.drop(['Loan_ID'], axis=1)

In [140]:
train_X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             601 non-null    object 
 1   Married            611 non-null    object 
 2   Dependents         599 non-null    object 
 3   Education          614 non-null    object 
 4   Self_Employed      582 non-null    object 
 5   ApplicantIncome    614 non-null    int64  
 6   CoapplicantIncome  614 non-null    float64
 7   LoanAmount         592 non-null    float64
 8   Loan_Amount_Term   600 non-null    float64
 9   Credit_History     564 non-null    float64
 10  Property_Area      614 non-null    object 
dtypes: float64(4), int64(1), object(6)
memory usage: 52.9+ KB


In [141]:
train_cat = train_X[['Gender','Married','Dependents','Education','Self_Employed','Property_Area']]
catcolumns = train_cat.columns

train_cat = pd.get_dummies(train_X[catcolumns], dtype=int)

In [142]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
train_num = train_X.drop(catcolumns, axis=1)
numcolumns = train_num.columns

In [143]:
train_num = scaler.fit_transform(train_num)
train_num = pd.DataFrame(train_num, columns = numcolumns)
train_X = pd.concat([train_num, train_cat], axis = 1)

In [144]:
train_X.head(5)

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Female,Gender_Male,Married_No,Married_Yes,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Education_Graduate,Education_Not Graduate,Self_Employed_No,Self_Employed_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
0,0.070489,0.0,,0.74359,1.0,0,1,1,0,1,0,0,0,1,0,1,0,0,0,1
1,0.05483,0.036192,0.172214,0.74359,1.0,0,1,0,1,0,1,0,0,1,0,1,0,1,0,0
2,0.03525,0.0,0.082489,0.74359,1.0,0,1,0,1,1,0,0,0,1,0,0,1,0,0,1
3,0.030093,0.056592,0.160637,0.74359,1.0,0,1,0,1,1,0,0,0,0,1,1,0,0,0,1
4,0.072356,0.0,0.191027,0.74359,1.0,0,1,1,0,1,0,0,0,1,0,1,0,0,0,1


In [145]:
cols = train_X.columns
train_X.isna().sum()

ApplicantIncome             0
CoapplicantIncome           0
LoanAmount                 22
Loan_Amount_Term           14
Credit_History             50
Gender_Female               0
Gender_Male                 0
Married_No                  0
Married_Yes                 0
Dependents_0                0
Dependents_1                0
Dependents_2                0
Dependents_3+               0
Education_Graduate          0
Education_Not Graduate      0
Self_Employed_No            0
Self_Employed_Yes           0
Property_Area_Rural         0
Property_Area_Semiurban     0
Property_Area_Urban         0
dtype: int64

In [146]:
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values = np.nan, strategy = 'mean')
imp.fit(train_X)
X_transformed = imp.transform(train_X)
X_transformed = pd.DataFrame(X_transformed, columns=cols)
X_transformed.isna().sum() # This should be 0 after transformation

ApplicantIncome            0
CoapplicantIncome          0
LoanAmount                 0
Loan_Amount_Term           0
Credit_History             0
Gender_Female              0
Gender_Male                0
Married_No                 0
Married_Yes                0
Dependents_0               0
Dependents_1               0
Dependents_2               0
Dependents_3+              0
Education_Graduate         0
Education_Not Graduate     0
Self_Employed_No           0
Self_Employed_Yes          0
Property_Area_Rural        0
Property_Area_Semiurban    0
Property_Area_Urban        0
dtype: int64

In [147]:
X_transformed.head(5)

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Female,Gender_Male,Married_No,Married_Yes,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Education_Graduate,Education_Not Graduate,Self_Employed_No,Self_Employed_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
0,0.070489,0.0,0.19886,0.74359,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
1,0.05483,0.036192,0.172214,0.74359,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
2,0.03525,0.0,0.082489,0.74359,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
3,0.030093,0.056592,0.160637,0.74359,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
4,0.072356,0.0,0.191027,0.74359,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0


DT - Decision Tree

KNN - K Nearest Neighbour

LR - Logistic Regression

SVM - Support Vector Machine

RF - Random Forest

In [148]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [149]:
models = [DecisionTreeClassifier(),
          KMeans(),
          LogisticRegression(),
          SVC(),
          RandomForestClassifier(),
          KNeighborsClassifier()]

dt_params = {'criterion': ['entropy','gini','log_loss'],
             'splitter': ['random','best'],
             'max_depth': [1,2,3,4,5],
             'min_samples_split': [2,4,6,8,10],
             'min_samples_leaf': [1,2,3,4,5]}

km_params = {'n_clusters': [2],
             'max_iter': [100,200,500,1000],
             'tol': [0.0001,0.001,0.01,0.1]}

lr_params = {'max_iter': [50,100,150,200],
             'penalty': ['l1','l2','none'],
             'tol': [0.001,0.01,0.1],
             'C': [1, 10,20, 50, 100]}

svm_params = {'C': [0.01,0.1,1, 10, 100, 100], 
              'gamma': [10,1,0.1,0.01,0.001,0.0001], 
              'kernel': ['rbf','poly','linear']}

rf_params = {'n_estimators': [10,20,30,40,50,100,150,200],
             'max_depth': [1,2,3,4,5],
             'min_samples_leaf': [1,2,3,4,5],
             'min_samples_split': [2,4,6,8,10]}

knn_params = {'n_neighbors': [2,3],
              'weights': ['uniform','distance'],
              'algorithm': ['auto','ball_tree','kd_tree'],
              }

param_grid = [dt_params, km_params, lr_params, svm_params, rf_params, knn_params]

In [150]:
train_X = X_transformed

In [151]:
train_y.Loan_Status.replace(('Y', 'N'), (1, 0), inplace=True)
train_y.head(5)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_y.Loan_Status.replace(('Y', 'N'), (1, 0), inplace=True)


Unnamed: 0,Loan_Status
0,1
1,0
2,1
3,1
4,1


In [152]:
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score

In [153]:
%%capture
param = 0
accuracies = []
precisions = []
recalls = []
f1_scores = []
best_params = []
best_estimator = []
for m in models:
    grid = GridSearchCV(estimator=m, param_grid=param_grid[param] ,refit=True,verbose=3)
    grid.fit(train_X, train_y)
    predict_y = grid.predict(train_X)

    acc = accuracy_score(train_y, predict_y)
    accuracies.append(acc)

    pre = precision_score(train_y, predict_y)
    precisions.append(pre)

    rec = recall_score(train_y, predict_y)
    recalls.append(rec)

    f1s = f1_score(train_y, predict_y)
    f1_scores.append(f1s)


    best_params.append(grid.best_params_ )
    best_estimator.append(grid.best_estimator_)
    param = param + 1

In [154]:
precisions

[0.7938931297709924,
 0.6308411214953271,
 0.7942307692307692,
 0.7904761904761904,
 0.7900763358778626,
 0.8421052631578947]

In [155]:
results = pd.DataFrame(columns=['Algorithm','Accuracy','Precision','Recall','F1_Score','Best Parameters'])
results['Algorithm'] = models
results['Accuracy'] = accuracies
results['Precision'] = precisions
results['Recall'] = recalls
results['F1_Score'] = f1_scores
results['Best Parameters'] = best_params

pd.set_option('max_colwidth', 400)

results

Unnamed: 0,Algorithm,Accuracy,Precision,Recall,F1_Score,Best Parameters
0,DecisionTreeClassifier(),0.814332,0.793893,0.985782,0.879493,"{'criterion': 'log_loss', 'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 8, 'splitter': 'random'}"
1,KMeans(),0.403909,0.630841,0.319905,0.424528,"{'max_iter': 100, 'n_clusters': 2, 'tol': 0.0001}"
2,LogisticRegression(),0.811075,0.794231,0.978673,0.876858,"{'C': 1, 'max_iter': 100, 'penalty': 'none', 'tol': 0.1}"
3,SVC(),0.809446,0.790476,0.983412,0.876452,"{'C': 0.1, 'gamma': 10, 'kernel': 'linear'}"
4,RandomForestClassifier(),0.807818,0.790076,0.981043,0.875264,"{'max_depth': 5, 'min_samples_leaf': 3, 'min_samples_split': 8, 'n_estimators': 10}"
5,KNeighborsClassifier(),0.84202,0.842105,0.947867,0.891862,"{'algorithm': 'ball_tree', 'n_neighbors': 3, 'weights': 'uniform'}"


In [156]:
max_accuracy = []
max_accuracy_algo = []
max_accuracy_params = []


max_accuracy.append(max(accuracies))
max_accuracy_index = accuracies.index(max(accuracies))
max_accuracy_algo.append(models[max_accuracy_index])
max_accuracy_params.append(best_params[max_accuracy_index])

In [157]:
max_accuracy

[0.8420195439739414]

In [158]:
best_results = pd.DataFrame(columns=['Algorithm','Accuracy','Best Parameters'])
best_results['Algorithm'] = max_accuracy_algo
best_results['Accuracy'] = max_accuracy
best_results['Best Parameters'] = max_accuracy_params

pd.set_option('max_colwidth', 400)

best_results

Unnamed: 0,Algorithm,Accuracy,Best Parameters
0,KNeighborsClassifier(),0.84202,"{'algorithm': 'ball_tree', 'n_neighbors': 3, 'weights': 'uniform'}"
