In [8]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np

# Visualization
import matplotlib.pylab as plt
from matplotlib import font_manager, rc
import seaborn as sns
plt.style.use(['fivethirtyeight'])

%matplotlib inline

#EDA
#pip install -U pandas-profiling[notebook]
from pandas_profiling import ProfileReport
# how to use it
#profile = ProfileReport(df, title='Pandas Profiling Report')

import klib

#How to use it
#klib.missingval_plot(features)
#klib.corr_plot(features, annot=False, figsize=(15,12))
#klib.corr_plot(features, split='high', annot=False, figsize=(15,12))
#klib.cat_plot(features)

#default theme
sns.set(context='notebook', style='darkgrid', palette='deep', font='sans-serif', font_scale=1, color_codes=False, rc=None)

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Hyperparameter Optimization
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from bayes_opt import BayesianOptimization

# Modeling
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

#Cross-validation
from sklearn.model_selection import StratifiedKFold, KFold

#Ensembling
from sklearn.ensemble import VotingClassifier
from vecstack import StackingTransformer
from vecstack import stacking

# Evaluation
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

# Utility
import os
import time
import random
import warnings; warnings.filterwarnings("ignore")
from IPython.display import Image
import pickle
from tqdm import tqdm
import platform

# <span style="color:green"> Objective: </span>
### <span style="color:green"> Predict which of the customers will have their loan approved. </span>

### Let's get the data

In [2]:
train = pd.read_csv(r"C:\Users\MSI\Desktop\Data Science\Python\Projects\Loan Prediction\train.csv")
test = pd.read_csv(r"C:\Users\MSI\Desktop\Data Science\Python\Projects\Loan Prediction\test.csv")

### Briefly check the data

In [3]:
print("train:", train.size, "\ntest:", test.size)

train: 7982 
test: 4404


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [75]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 367 entries, 0 to 366
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            367 non-null    object 
 1   Gender             356 non-null    object 
 2   Married            367 non-null    object 
 3   Dependents         357 non-null    object 
 4   Education          367 non-null    object 
 5   Self_Employed      344 non-null    object 
 6   ApplicantIncome    367 non-null    int64  
 7   CoapplicantIncome  367 non-null    int64  
 8   LoanAmount         362 non-null    float64
 9   Loan_Amount_Term   361 non-null    float64
 10  Credit_History     338 non-null    float64
 11  Property_Area      367 non-null    object 
dtypes: float64(3), int64(2), object(7)
memory usage: 34.5+ KB


#### We have up to 7! object type features. We will have to deal with them later.

In [16]:
train.head(10)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
5,LP001011,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y
6,LP001013,Male,Yes,0,Not Graduate,No,2333,1516.0,95.0,360.0,1.0,Urban,Y
7,LP001014,Male,Yes,3+,Graduate,No,3036,2504.0,158.0,360.0,0.0,Semiurban,N
8,LP001018,Male,Yes,2,Graduate,No,4006,1526.0,168.0,360.0,1.0,Urban,Y
9,LP001020,Male,Yes,1,Graduate,No,12841,10968.0,349.0,360.0,1.0,Semiurban,N


In [17]:
test.head(10)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban
5,LP001054,Male,Yes,0,Not Graduate,Yes,2165,3422,152.0,360.0,1.0,Urban
6,LP001055,Female,No,1,Not Graduate,No,2226,0,59.0,360.0,1.0,Semiurban
7,LP001056,Male,Yes,2,Not Graduate,No,3881,0,147.0,360.0,0.0,Rural
8,LP001059,Male,Yes,2,Graduate,,13633,0,280.0,240.0,1.0,Urban
9,LP001067,Male,No,0,Not Graduate,No,2400,2400,123.0,360.0,1.0,Semiurban


In [39]:
train.describe(include="all")

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
count,614,601,611,599.0,614,582,614.0,614.0,592.0,600.0,564.0,614,614
unique,614,2,2,4.0,2,2,,,,,,3,2
top,LP001907,Male,Yes,0.0,Graduate,No,,,,,,Semiurban,Y
freq,1,489,398,345.0,480,500,,,,,,233,422
mean,,,,,,,5403.459283,1621.245798,146.412162,342.0,0.842199,,
std,,,,,,,6109.041673,2926.248369,85.587325,65.12041,0.364878,,
min,,,,,,,150.0,0.0,9.0,12.0,0.0,,
25%,,,,,,,2877.5,0.0,100.0,360.0,1.0,,
50%,,,,,,,3812.5,1188.5,128.0,360.0,1.0,,
75%,,,,,,,5795.0,2297.25,168.0,360.0,1.0,,


In [40]:
test.describe(include= "all")

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
count,367,356,367,357.0,367,344,367.0,367.0,362.0,361.0,338.0,367
unique,367,2,2,4.0,2,2,,,,,,3
top,LP002165,Male,Yes,0.0,Graduate,No,,,,,,Urban
freq,1,286,233,200.0,283,307,,,,,,140
mean,,,,,,,4805.599455,1569.577657,136.132597,342.537396,0.825444,
std,,,,,,,4910.685399,2334.232099,61.366652,65.156643,0.38015,
min,,,,,,,0.0,0.0,28.0,6.0,0.0,
25%,,,,,,,2864.0,0.0,100.25,360.0,1.0,
50%,,,,,,,3786.0,1025.0,125.0,360.0,1.0,
75%,,,,,,,5060.0,2430.5,158.0,360.0,1.0,


## (quick) Data Visualization

In [9]:
#check the EDA part where all the packages are
profile = ProfileReport(train, title = "Train data", dark_mode= True)
profile

HBox(children=(HTML(value='Summarize dataset'), FloatProgress(value=0.0, max=27.0), HTML(value='')))




HBox(children=(HTML(value='Generate report structure'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Render HTML'), FloatProgress(value=0.0, max=1.0), HTML(value='')))






    The great thing about this tool is that is quickly lets you see all the analysis we did before.It presents it to you in an simple and easy to read way. Of course, it gives you an overall report, not a specific relationship you can find by making your own graphs. 

In [37]:
#Nan values
train.isna().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [38]:
test.isna().sum()

Loan_ID               0
Gender               11
Married               0
Dependents           10
Education             0
Self_Employed        23
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            5
Loan_Amount_Term      6
Credit_History       29
Property_Area         0
dtype: int64

#### Okay! there are some Nan values and we have the object type.  That just means one thing:

## Data cleaning!

### Strings

From the .head function, We have four string columns: Gender, Married,Education and Property area. Let's check how they are.

In [72]:
print(f'TRAIN DATA \nGender: \n{train["Gender"].value_counts()},\nMarried: {train["Married"].value_counts()},\nEducation: {train["Education"].value_counts()}, \nProperty: {train["Property_Area"].value_counts()}')

TRAIN DATA 
Gender: 
Male      489
Female    112
Name: Gender, dtype: int64,
Married: Yes    398
No     213
Name: Married, dtype: int64,
Education: Graduate        480
Not Graduate    134
Name: Education, dtype: int64, 
Property: Semiurban    233
Urban        202
Rural        179
Name: Property_Area, dtype: int64


In [73]:
print(f'TEST DATA \nGender: \n{test["Gender"].value_counts()},\nMarried: {test["Married"].value_counts()},\nEducation: {test["Education"].value_counts()}, \nProperty: {test["Property_Area"].value_counts()}')

TEST DATA 
Gender: 
Male      286
Female     70
Name: Gender, dtype: int64,
Married: Yes    233
No     134
Name: Married, dtype: int64,
Education: Graduate        283
Not Graduate     84
Name: Education, dtype: int64, 
Property: Urban        140
Semiurban    116
Rural        111
Name: Property_Area, dtype: int64


We will deal with them with the excellent code from Yonatan Rabinovich, on his "Loan Prediction Dataset ML Project" notebook.


check it here: https://www.kaggle.com/yonatanrabinovich/loan-prediction-dataset-ml-project

We could use panda's pd.get_dummies to create dummies out of the categorical values. We may make a comparison between that method and assigning a number to the categories.

In [10]:
#converting categorical values to numbers

to_numeric = {'Male': 1, 'Female': 2,
'Yes': 1, 'No': 2,
'Graduate': 1, 'Not Graduate': 2,
'Urban': 3, 'Semiurban': 2,'Rural': 1,
'Y': 1, 'N': 0,
'3+': 3}

# adding the new numeric values from the to_numeric variable to both datasets
train = train.applymap(lambda lable: to_numeric.get(lable) if lable in to_numeric else lable)
test = test.applymap(lambda lable: to_numeric.get(lable) if lable in to_numeric else lable)

# convertind the Dependents column
Dependents_ = pd.to_numeric(train.Dependents)
Dependents__ = pd.to_numeric(test.Dependents)

# dropping the previous Dependents column
train.drop(['Dependents'], axis = 1, inplace = True)
test.drop(['Dependents'], axis = 1, inplace = True)

# concatination of the new Dependents column with both datasets
train = pd.concat([train, Dependents_], axis = 1)
test = pd.concat([test, Dependents__], axis = 1)

# checking the our manipulated dataset for validation
print(f"training set (row, col): {train.shape}\n\ntesting set (row, col): {test.shape}\n")
print(train.info(), "\n\n", test.info())

training set (row, col): (614, 13)

testing set (row, col): (367, 12)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    float64
 2   Married            611 non-null    float64
 3   Education          614 non-null    int64  
 4   Self_Employed      582 non-null    float64
 5   ApplicantIncome    614 non-null    int64  
 6   CoapplicantIncome  614 non-null    float64
 7   LoanAmount         592 non-null    float64
 8   Loan_Amount_Term   600 non-null    float64
 9   Credit_History     564 non-null    float64
 10  Property_Area      614 non-null    int64  
 11  Loan_Status        614 non-null    int64  
 12  Dependents         599 non-null    float64
dtypes: float64(8), int64(4), object(1)
memory usage: 62.5+ KB
<class 'pandas.core.frame.DataFrame'>
Ran

### Nan

For this, we can: 

1. Get rid of the corresponding nan values.
2. Get rid of the whole feature.
3. Set the values to some value (zero, the mean, the median, etc.).

Remember:
    
    Theoretically, 25 to 30% is the maximum missing values are allowed, beyond which we might want to drop the variable from analysis. 
    
 in this case is no problem, but is a nice reminder

In [11]:
from sklearn.impute import SimpleImputer

In [12]:
#let's imput with the meadian for numeric features using the median
for_numeric = SimpleImputer(strategy= 'median')
a = for_numeric.fit_transform(train[["LoanAmount", "Loan_Amount_Term", "Credit_History"]])
a = pd.DataFrame(a,columns= ["LoanAmount", "Loan_Amount_Term", "Credit_History"])
train[["LoanAmount", "Loan_Amount_Term", "Credit_History"]] = a

In [13]:
train.isna().sum()

Loan_ID               0
Gender               13
Married               3
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            0
Loan_Amount_Term      0
Credit_History        0
Property_Area         0
Loan_Status           0
Dependents           15
dtype: int64

In [14]:
#Now let's imput the object type features with most_frequent categorical varaibles
for_object = SimpleImputer(strategy= "most_frequent")
b = for_object.fit_transform(train[["Gender", 'Married', "Dependents", "Self_Employed"]])
b = pd.DataFrame(b, columns= ["Gender", 'Married', "Dependents", "Self_Employed"])
train[["Gender", 'Married', "Dependents", "Self_Employed"]] = b

In [15]:
train.isna().sum()

Loan_ID              0
Gender               0
Married              0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
Dependents           0
dtype: int64

Now, the same but with the TEST dataset

In [16]:
for_numeric = SimpleImputer(strategy= 'median')
a = for_numeric.fit_transform(test[["LoanAmount", "Loan_Amount_Term", "Credit_History"]])
a = pd.DataFrame(a,columns= ["LoanAmount", "Loan_Amount_Term", "Credit_History"])
test[["LoanAmount", "Loan_Amount_Term", "Credit_History"]] = a

In [17]:
for_object = SimpleImputer(strategy= "most_frequent")
b = for_object.fit_transform(test[["Gender", 'Married', "Dependents", "Self_Employed"]])
b = pd.DataFrame(b, columns= ["Gender", 'Married', "Dependents", "Self_Employed"])
test[["Gender", 'Married', "Dependents", "Self_Employed"]] = b

In [18]:
test.isna().sum()

Loan_ID              0
Gender               0
Married              0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Dependents           0
dtype: int64

In [19]:
#Let's quickly drop "Loan_ID" since we don't need it
train = train.drop("Loan_ID", axis= 1)
train

Unnamed: 0,Gender,Married,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Dependents
0,1.0,2.0,1,2.0,5849,0.0,128.0,360.0,1.0,3,1,0.0
1,1.0,1.0,1,2.0,4583,1508.0,128.0,360.0,1.0,1,0,1.0
2,1.0,1.0,1,1.0,3000,0.0,66.0,360.0,1.0,3,1,0.0
3,1.0,1.0,2,2.0,2583,2358.0,120.0,360.0,1.0,3,1,0.0
4,1.0,2.0,1,2.0,6000,0.0,141.0,360.0,1.0,3,1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
609,2.0,2.0,1,2.0,2900,0.0,71.0,360.0,1.0,1,1,0.0
610,1.0,1.0,1,2.0,4106,0.0,40.0,180.0,1.0,1,1,3.0
611,1.0,1.0,1,2.0,8072,240.0,253.0,360.0,1.0,3,1,1.0
612,1.0,1.0,1,2.0,7583,0.0,187.0,360.0,1.0,3,1,2.0


In [20]:
test = test.drop("Loan_ID", axis = 1)
test

Unnamed: 0,Gender,Married,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Dependents
0,1.0,1.0,1,2.0,5720,0,110.0,360.0,1.0,3,0.0
1,1.0,1.0,1,2.0,3076,1500,126.0,360.0,1.0,3,1.0
2,1.0,1.0,1,2.0,5000,1800,208.0,360.0,1.0,3,2.0
3,1.0,1.0,1,2.0,2340,2546,100.0,360.0,1.0,3,2.0
4,1.0,2.0,2,2.0,3276,0,78.0,360.0,1.0,3,0.0
...,...,...,...,...,...,...,...,...,...,...,...
362,1.0,1.0,2,1.0,4009,1777,113.0,360.0,1.0,3,3.0
363,1.0,1.0,1,2.0,4158,709,115.0,360.0,1.0,3,0.0
364,1.0,2.0,1,2.0,3250,1993,126.0,360.0,1.0,2,0.0
365,1.0,1.0,1,2.0,5000,2393,158.0,360.0,1.0,1,0.0


#### okay, we are set, we can go to the 
## Models!

In [26]:
#let's divide in X and y. Since we are going to predict the "Loan_status", let's take it out
X = train.drop(["Loan_Status"], axis = 1)
X

Unnamed: 0,Gender,Married,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Dependents
0,1.0,2.0,1,2.0,5849,0.0,128.0,360.0,1.0,3,0.0
1,1.0,1.0,1,2.0,4583,1508.0,128.0,360.0,1.0,1,1.0
2,1.0,1.0,1,1.0,3000,0.0,66.0,360.0,1.0,3,0.0
3,1.0,1.0,2,2.0,2583,2358.0,120.0,360.0,1.0,3,0.0
4,1.0,2.0,1,2.0,6000,0.0,141.0,360.0,1.0,3,0.0
...,...,...,...,...,...,...,...,...,...,...,...
609,2.0,2.0,1,2.0,2900,0.0,71.0,360.0,1.0,1,0.0
610,1.0,1.0,1,2.0,4106,0.0,40.0,180.0,1.0,1,3.0
611,1.0,1.0,1,2.0,8072,240.0,253.0,360.0,1.0,3,1.0
612,1.0,1.0,1,2.0,7583,0.0,187.0,360.0,1.0,3,2.0


In [27]:
y = train[["Loan_Status"]]
y

Unnamed: 0,Loan_Status
0,1
1,0
2,1
3,1
4,1
...,...
609,1
610,1
611,1
612,1


In [28]:
#Divide the train data set into train and test to teach and test the models
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size =0.03, random_state = 42)

Let's try many different models first

In [29]:
#We will use 6 different models with random sets of hyperparameters 
#and will pass all of them in a dictionary that will be inside a lsit
clfs = [
    (
        KNeighborsClassifier(n_jobs= -1),
        {'n_neighbors' : [5, 6, 7],
         'weights': ['uniform', 'distance']
        }
    ),
    (
      XGBClassifier(n_jobs= -1, random_state= 42),
        {'learning_rate' : [0.002, 0.001, 0.01],
         'max_depth' : [5, 10, 15, 20],
         'n_estimators' : [7000, 6500, 7500],
         'reg_alpha' : [0.9, 0.8, 1],
         'reg_lambda' : [0.9, 0.8, 1],
         'subsample' : [0.9, 0.8, 1],
         'metric_period' : [50, 100, 50]       
        }
    ),
    (
        LGBMClassifier(n_jobs = -1, random_state = 42),
        {'learning_rate': [0.002, 0.0045, 0.02],
         'num_iteration': [10000, 9000, 11000],
         'n_estimators' : [50, 100, 150, 200],
         'boosting_type' : ['gbdt', 'dart', 'goss'],
         'lambda_l1': [4.6, 5, 6],
         'lambda_l2': [1.9, 2, 3],
         'num_leaves' : [50, 102, 150],
         'min_child_samples' : [10, 20, 30]
         }
    ),
    (
       GradientBoostingClassifier(),
        {'learning_rate':[0.01, 0.02, 0.05],
         'n_estimators' : [5000, 6000, 7000],
         'max_depth' :[5, 10, 15, 20],
         'subsample' : [0.7, 0.8, 0.9]
        }    
    ),
    (
        MLPClassifier(random_state= 42),
        {'hidden_layer_sizes' : [50, 100, 200],
         'activation' : ['identity', 'logistic', 'tanh', 'relu'],
         'solver': ['lbfgs', 'sgd', 'adam'],
         'alpha' : [0.002, 0.0001, 0.01],
         'learning_rate' : ['constant', 'invscaling', 'adaptive'],
         'learning_rate_init' : [0.002, 0.005, 0.01, 0.1],
         'max_iter' : [100, 500, 1000],
         'momentum' : [0.7, 0.64, 0.8, 0.9]  
        }  
    )
]

In [30]:
#Let's use stratified cross validation for improving our score.
stra = StratifiedKFold(n_splits= 5, random_state= 42)

In [31]:
clfs_tuned = []  
for clf, param_grid in tqdm(clfs):
    start = time.time()
    iterations =  15 if clfs in ['GradientBoostingClassifier'] else 30 
    rand_search = RandomizedSearchCV(clf, param_grid,  n_iter= iterations, random_state=42,
                                     scoring='roc_auc', return_train_score= True,
                                     cv= stra, n_jobs=-1)
    rand_search.fit(x_train, y_train)
    clf_name = type(clf).__name__
    clf_score = rand_search.score(x_test, y_test)
    print('{:30s} {:30f} {:.1f}'.format(clf_name, clf_score, time.time() - start))
    clfs_tuned.append((clf_name, rand_search.best_params_, clf_score)) #storing the name of the model, 
                                                                        #best hyperparameters and score

 20%|████████████████▊                                                                   | 1/5 [00:07<00:30,  7.61s/it]

KNeighborsClassifier                                 0.750000 7.6


 40%|█████████████████████████████████▏                                                 | 2/5 [06:40<06:09, 123.25s/it]

XGBClassifier                                        0.958333 393.1


 60%|█████████████████████████████████████████████████▊                                 | 3/5 [10:12<04:59, 149.95s/it]

LGBMClassifier                                       0.770833 212.3


 80%|██████████████████████████████████████████████████████████████████▍                | 4/5 [37:58<10:04, 604.67s/it]

GradientBoostingClassifier                           0.958333 1665.7


100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [38:12<00:00, 458.53s/it]

MLPClassifier                                        0.604167 14.0





In [36]:
#becaus we are going to get a table to check the best results, let's make in a way so we see all the content of the table, 
#by setting the max display
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', -1)

In [37]:
pd.DataFrame(clfs_tuned)

Unnamed: 0,0,1,2
0,KNeighborsClassifier,"{'weights': 'distance', 'n_neighbors': 7}",0.75
1,XGBClassifier,"{'subsample': 0.8, 'reg_lambda': 1, 'reg_alpha': 0.9, 'n_estimators': 6500, 'metric_period': 50, 'max_depth': 5, 'learning_rate': 0.002}",0.958333
2,LGBMClassifier,"{'num_leaves': 102, 'num_iteration': 10000, 'n_estimators': 50, 'min_child_samples': 30, 'learning_rate': 0.002, 'lambda_l2': 1.9, 'lambda_l1': 6, 'boosting_type': 'goss'}",0.770833
3,GradientBoostingClassifier,"{'subsample': 0.7, 'n_estimators': 5000, 'max_depth': 15, 'learning_rate': 0.01}",0.958333
4,MLPClassifier,"{'solver': 'sgd', 'momentum': 0.7, 'max_iter': 100, 'learning_rate_init': 0.01, 'learning_rate': 'invscaling', 'hidden_layer_sizes': 200, 'alpha': 0.0001, 'activation': 'logistic'}",0.604167


    Now that we now the best models we can construct and fit the best model

In [38]:
best_model = XGBClassifier(subsample =  0.8, reg_lambda = 1, reg_alpha = 0.9, n_estimators= 6500, 
                           metric_period= 50, max_depth= 5, learning_rate= 0.002)
best_model.fit(x_train, y_train)
prediction = best_model.predict(x_test)

#### Again, I found the kernel from Rabinovich quite useful with this little piece of code that I didn't know of:

In [40]:
print(classification_report(y_test, prediction))
XGB_report = accuracy_score(prediction, y_test)
print(f"{round(XGB_report*100,2)}% Accurate")

              precision    recall  f1-score   support

           0       0.67      0.67      0.67         3
           1       0.94      0.94      0.94        16

    accuracy                           0.89        19
   macro avg       0.80      0.80      0.80        19
weighted avg       0.89      0.89      0.89        19

89.47% Accurate


#### Thanks for this small piece of code to get a confusion matrix and accuracy so easily!

It appears we are doing some over-fitting with our model. We will have to check that later. For now, we have an almost 90% accuracy model. 

## Export

In [41]:
output = pd.DataFrame.from_dict([{'y_test':y_test, 'prediction': prediction}]) #due to the nature of y_test we pass it as a list inside dict
output.to_csv('prediction.csv', index=False)