# Load and Import Library




In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from scipy.stats import chi2_contingency
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support
import warnings
import os


In [None]:
# Load the dataset
url1 = "https://docs.google.com/spreadsheets/d/1Uvpal_TZdr6dEvScP5ulhaICI3MGgwEv/export?format=xlsx"
url2 = "https://docs.google.com/spreadsheets/d/1RxIn0q6vRET9185H2Bvf3r1mG75--oYc/export?format=xlsx"

a1 = pd.read_excel(url1)
a2 = pd.read_excel(url2)


# Data Processing / Data Cleaning

In [None]:
df1=a1.copy()
df2=a2.copy()

In [None]:
df1.info()
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51336 entries, 0 to 51335
Data columns (total 26 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   PROSPECTID            51336 non-null  int64  
 1   Total_TL              51336 non-null  int64  
 2   Tot_Closed_TL         51336 non-null  int64  
 3   Tot_Active_TL         51336 non-null  int64  
 4   Total_TL_opened_L6M   51336 non-null  int64  
 5   Tot_TL_closed_L6M     51336 non-null  int64  
 6   pct_tl_open_L6M       51336 non-null  float64
 7   pct_tl_closed_L6M     51336 non-null  float64
 8   pct_active_tl         51336 non-null  float64
 9   pct_closed_tl         51336 non-null  float64
 10  Total_TL_opened_L12M  51336 non-null  int64  
 11  Tot_TL_closed_L12M    51336 non-null  int64  
 12  pct_tl_open_L12M      51336 non-null  float64
 13  pct_tl_closed_L12M    51336 non-null  float64
 14  Tot_Missed_Pmnt       51336 non-null  int64  
 15  Auto_TL            

In [None]:
df1.sample(3)

Unnamed: 0,PROSPECTID,Total_TL,Tot_Closed_TL,Tot_Active_TL,Total_TL_opened_L6M,Tot_TL_closed_L6M,pct_tl_open_L6M,pct_tl_closed_L6M,pct_active_tl,pct_closed_tl,...,CC_TL,Consumer_TL,Gold_TL,Home_TL,PL_TL,Secured_TL,Unsecured_TL,Other_TL,Age_Oldest_TL,Age_Newest_TL
13878,13879,4,3,1,0,0,0.0,0.0,0.25,0.75,...,0,3,0,0,0,1,3,0,44,9
20175,20176,1,1,0,0,0,0.0,0.0,0.0,1.0,...,0,1,0,0,0,0,1,0,17,17
7710,7711,1,0,1,0,0,0.0,0.0,1.0,0.0,...,0,0,0,0,1,0,1,0,12,12


In [None]:
df2.sample(3)

Unnamed: 0,PROSPECTID,time_since_recent_payment,time_since_first_deliquency,time_since_recent_deliquency,num_times_delinquent,max_delinquency_level,max_recent_level_of_deliq,num_deliq_6mts,num_deliq_12mts,num_deliq_6_12mts,...,pct_CC_enq_L6m_of_L12m,pct_PL_enq_L6m_of_ever,pct_CC_enq_L6m_of_ever,max_unsec_exposure_inPct,HL_Flag,GL_Flag,last_prod_enq2,first_prod_enq2,Credit_Score,Approved_Flag
26726,26727,95,-99999,-99999,0,-99999,0,0,0,0,...,0.0,0.0,0.0,-99999.0,0,0,ConsumerLoan,AL,683,P2
1597,1598,99,-99999,-99999,0,-99999,0,0,0,0,...,0.0,0.667,0.0,1.12,0,0,PL,ConsumerLoan,682,P2
10013,10014,81,-99999,-99999,0,-99999,0,0,0,0,...,0.0,0.0,0.0,6.667,0,0,ConsumerLoan,ConsumerLoan,657,P4


### `Remove Column `

In [None]:
df1.loc[df1["Age_Oldest_TL"] != -99999].shape

(51296, 26)

In [None]:
remove_col1=[]
for i in df1.columns:
  if df1.loc[df1[i] == -99999].shape[0] > 10000:
    remove_col1.append(i)
remove_col1

[]

In [None]:
remove_col2=[]
for i in df2.columns:
  if df2.loc[df2[i] == -99999].shape[0] > 10000:
    remove_col2.append(i)
remove_col2

['time_since_first_deliquency',
 'time_since_recent_deliquency',
 'max_delinquency_level',
 'max_deliq_6mts',
 'max_deliq_12mts',
 'CC_utilization',
 'PL_utilization',
 'max_unsec_exposure_inPct']

In [None]:
df1 = df1.drop(remove_col1, axis =1)
df2 = df2.drop(remove_col2, axis =1)

In [None]:
df1.shape

(51336, 26)

In [None]:
df2.shape

(51336, 54)

## Marge DataFrame

In [None]:
for i in df1.columns:
  if i in df2.columns:
    print(i)

PROSPECTID


In [None]:
df=pd.merge(df1,df2,how="inner",left_on=["PROSPECTID"],right_on=["PROSPECTID"])

In [None]:
df

Unnamed: 0,PROSPECTID,Total_TL,Tot_Closed_TL,Tot_Active_TL,Total_TL_opened_L6M,Tot_TL_closed_L6M,pct_tl_open_L6M,pct_tl_closed_L6M,pct_active_tl,pct_closed_tl,...,pct_PL_enq_L6m_of_L12m,pct_CC_enq_L6m_of_L12m,pct_PL_enq_L6m_of_ever,pct_CC_enq_L6m_of_ever,HL_Flag,GL_Flag,last_prod_enq2,first_prod_enq2,Credit_Score,Approved_Flag
0,1,5,4,1,0,0,0.000,0.00,0.200,0.800,...,0.0,0.0,0.0,0.0,1,0,PL,PL,696,P2
1,2,1,0,1,0,0,0.000,0.00,1.000,0.000,...,0.0,0.0,0.0,0.0,0,0,ConsumerLoan,ConsumerLoan,685,P2
2,3,8,0,8,1,0,0.125,0.00,1.000,0.000,...,0.0,0.0,0.0,0.0,1,0,ConsumerLoan,others,693,P2
3,4,1,0,1,1,0,1.000,0.00,1.000,0.000,...,0.0,0.0,0.0,0.0,0,0,others,others,673,P2
4,5,3,2,1,0,0,0.000,0.00,0.333,0.667,...,0.0,0.0,0.0,0.0,0,0,AL,AL,753,P1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51331,51332,3,0,3,1,0,0.333,0.00,1.000,0.000,...,0.0,0.0,0.0,0.0,0,0,ConsumerLoan,ConsumerLoan,650,P4
51332,51333,4,2,2,0,1,0.000,0.25,0.500,0.500,...,0.0,0.0,0.0,0.0,0,0,others,others,702,P1
51333,51334,2,1,1,1,1,0.500,0.50,0.500,0.500,...,1.0,0.0,1.0,0.0,0,0,ConsumerLoan,others,661,P3
51334,51335,2,1,1,0,0,0.000,0.00,0.500,0.500,...,0.0,0.0,0.0,0.0,0,0,ConsumerLoan,others,686,P2


In [None]:
df.isna().sum().sum()

0

## Colineartity & MultiColinerity

In [None]:
categorical_col=[]
for i in df.columns:
  if df[i].dtype=="object":
    if i != 'Approved_Flag':
      categorical_col.append(i)
categorical_col

['MARITALSTATUS', 'EDUCATION', 'GENDER', 'last_prod_enq2', 'first_prod_enq2']

In [None]:

# Chi-square test
for i in categorical_col:
    chi2, pval, _, _ = chi2_contingency(pd.crosstab(df[i], df['Approved_Flag']))
    print(i, '---', pval)

MARITALSTATUS --- 2.7588304433709322e-257
EDUCATION --- 8.464676085080213e-38
GENDER --- 0.0002450667989541709
last_prod_enq2 --- 0.0
first_prod_enq2 --- 0.0


In [None]:

# VIF for numerical columns
numeric_columns = []
for i in df.columns:
    if df[i].dtype != 'object' and i not in ['PROSPECTID','Approved_Flag']:
        numeric_columns.append(i)

In [None]:
# VIF sequentially check

vif_data = df[numeric_columns]
total_columns = vif_data.shape[1]
columns_to_be_kept = []
column_index = 0



for i in range (0,total_columns):

    vif_value = variance_inflation_factor(vif_data, column_index)
    print (column_index,'---',vif_value)


    if vif_value <= 6:
        columns_to_be_kept.append( numeric_columns[i] )
        column_index = column_index+1

    else:
        vif_data = vif_data.drop([ numeric_columns[i] ] , axis=1)


  vif = 1. / (1. - r_squared_i)


0 --- inf


  vif = 1. / (1. - r_squared_i)


0 --- inf
0 --- 10.96068589191104
0 --- 8.24461560613237
0 --- 6.428613717880285
0 --- 5.451510401660112
1 --- 2.4898053449118254


  vif = 1. / (1. - r_squared_i)


2 --- inf
2 --- 1778.9062772903958
2 --- 8.232558641419335
2 --- 3.6593447956743064
3 --- 5.183256544583922
4 --- 4.652174265990606
5 --- 2.001906321371565


  vif = 1. / (1. - r_squared_i)


6 --- inf
6 --- 4.785432356660499
7 --- 21.056776779452655
7 --- 32.01737181129891
7 --- 4.45184259488371
8 --- 2.9983391165480078
9 --- 2.7694192997303704
10 --- 4.066854752318726
11 --- 2.1800916362719684
12 --- 9675.573576511995
12 --- 1.0125588353417991
13 --- 1.274596130532195
14 --- 8.003796745409083
14 --- 5.3887639499912545


  vif = 1. / (1. - r_squared_i)


15 --- inf
15 --- 7.270722749921508
15 --- 1.4144356222819836
16 --- 8.503496122471631
16 --- 1.6273985406305966
17 --- 7.052949078874394
17 --- 15.403170327589034
17 --- 1.817343402254699
18 --- 1.5629944959731699
19 --- 2.545972180934038
20 --- 3.1032035213035476
21 --- 2.1910641812592044
22 --- 7.383083673663966
22 --- 2.0740907035010894
23 --- 2.7229459690916533
24 --- 6.292803558762637
24 --- 2.7094129524003265
25 --- 4.921818064568873
26 --- 342823420.6794107
26 --- 2990944434.3965917
26 --- 11947186901.945559
26 --- 1411445580.8237245
26 --- 1785758392.2029078
26 --- 5337691287.578271
26 --- 492798648.9475172
26 --- 8272.414182009576
26 --- 667708491.3666091
26 --- 783622576.9085875
26 --- 1.306845533263155
27 --- 21.171169488329728
27 --- 2.868921831811679
28 --- 3.3466179463308707
29 --- 7.809714834107825
29 --- 6.303310933645005
29 --- 1.005861451692548
30 --- 3.0663217183277425
31 --- 2.7189910179442407
32 --- 20.731544700032806
32 --- 16.175674449378114
32 --- 1.43204510381

In [None]:
# check Anova for columns_to_be_kept

from scipy.stats import f_oneway

columns_to_be_kept_numerical = []

for i in columns_to_be_kept:
    a = list(df[i])
    b = list(df['Approved_Flag'])

    group_P1 = [value for value, group in zip(a, b) if group == 'P1']
    group_P2 = [value for value, group in zip(a, b) if group == 'P2']
    group_P3 = [value for value, group in zip(a, b) if group == 'P3']
    group_P4 = [value for value, group in zip(a, b) if group == 'P4']


    f_statistic, p_value = f_oneway(group_P1, group_P2, group_P3, group_P4)

    if p_value <= 0.05:
        columns_to_be_kept_numerical.append(i)


In [None]:
len(columns_to_be_kept_numerical)

33

In [None]:
df.shape

(51336, 79)

In [None]:
# listing all the final features
features = columns_to_be_kept_numerical + categorical_col
df = df[features + ['Approved_Flag']]
df.shape

(51336, 39)

##level Encodeing & One hot Encodeing

In [None]:
# Label encoding for the categorical features


for i in categorical_col:
  k=df[i].unique()
  print(k)



['Married' 'Single']
['12TH' 'GRADUATE' 'SSC' 'POST-GRADUATE' 'UNDER GRADUATE' 'OTHERS'
 'PROFESSIONAL']
['M' 'F']
['PL' 'ConsumerLoan' 'others' 'AL' 'CC' 'HL']
['PL' 'ConsumerLoan' 'others' 'AL' 'HL' 'CC']


In [None]:
# Ordinal feature -- EDUCATION
# SSC            : 1
# 12TH           : 2
# GRADUATE       : 3
# UNDER GRADUATE : 3
# POST-GRADUATE  : 4
# OTHERS         : 1
# PROFESSIONAL   : 3


# Others has to be verified by the business end user




df.loc[df['EDUCATION'] == 'SSC',['EDUCATION']]              = 1
df.loc[df['EDUCATION'] == '12TH',['EDUCATION']]             = 2
df.loc[df['EDUCATION'] == 'GRADUATE',['EDUCATION']]         = 3
df.loc[df['EDUCATION'] == 'UNDER GRADUATE',['EDUCATION']]   = 3
df.loc[df['EDUCATION'] == 'POST-GRADUATE',['EDUCATION']]    = 4
df.loc[df['EDUCATION'] == 'OTHERS',['EDUCATION']]           = 1
df.loc[df['EDUCATION'] == 'PROFESSIONAL',['EDUCATION']]     = 3



In [None]:
df['EDUCATION'].value_counts()
df['EDUCATION'] = df['EDUCATION'].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51336 entries, 0 to 51335
Data columns (total 39 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   pct_tl_open_L6M            51336 non-null  float64
 1   pct_tl_closed_L6M          51336 non-null  float64
 2   Tot_TL_closed_L12M         51336 non-null  int64  
 3   pct_tl_open_L12M           51336 non-null  float64
 4   pct_tl_closed_L12M         51336 non-null  float64
 5   Tot_Missed_Pmnt            51336 non-null  int64  
 6   CC_TL                      51336 non-null  int64  
 7   Home_TL                    51336 non-null  int64  
 8   PL_TL                      51336 non-null  int64  
 9   Secured_TL                 51336 non-null  int64  
 10  Unsecured_TL               51336 non-null  int64  
 11  Other_TL                   51336 non-null  int64  
 12  Age_Newest_TL              51336 non-null  int64  
 13  time_since_recent_payment  51336 non-null  int

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['EDUCATION'] = df['EDUCATION'].astype(int)


In [None]:

df_encoded = pd.get_dummies(df, columns=['MARITALSTATUS','GENDER', 'last_prod_enq2' ,'first_prod_enq2'])


In [None]:
df_encoded.info()
k = df_encoded.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51336 entries, 0 to 51335
Data columns (total 51 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   pct_tl_open_L6M               51336 non-null  float64
 1   pct_tl_closed_L6M             51336 non-null  float64
 2   Tot_TL_closed_L12M            51336 non-null  int64  
 3   pct_tl_open_L12M              51336 non-null  float64
 4   pct_tl_closed_L12M            51336 non-null  float64
 5   Tot_Missed_Pmnt               51336 non-null  int64  
 6   CC_TL                         51336 non-null  int64  
 7   Home_TL                       51336 non-null  int64  
 8   PL_TL                         51336 non-null  int64  
 9   Secured_TL                    51336 non-null  int64  
 10  Unsecured_TL                  51336 non-null  int64  
 11  Other_TL                      51336 non-null  int64  
 12  Age_Newest_TL                 51336 non-null  int64  
 13  t

# Model Building

In [None]:
# 1. Random Forest

y = df_encoded['Approved_Flag']
x = df_encoded. drop ( ['Approved_Flag'], axis = 1 )




x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)





rf_classifier = RandomForestClassifier(n_estimators = 200, random_state=42)





rf_classifier.fit(x_train, y_train)



y_pred = rf_classifier.predict(x_test)





accuracy = accuracy_score(y_test, y_pred)
print ()
print(f'Accuracy: {accuracy}')
print ()
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred)


for i, v in enumerate(['p1', 'p2', 'p3', 'p4']):
    print(f"Class {v}:")
    print(f"Precision: {precision[i]}")
    print(f"Recall: {recall[i]}")
    print(f"F1 Score: {f1_score[i]}")
    print()



Accuracy: 0.737923646279704

Class p1:
Precision: 0.7700617283950617
Recall: 0.439647577092511
F1 Score: 0.559730790802019

Class p2:
Precision: 0.762936528167219
Recall: 0.9383915974290641
F1 Score: 0.8416168717047452

Class p3:
Precision: 0.44901315789473684
Recall: 0.177734375
F1 Score: 0.25466417910447764

Class p4:
Precision: 0.7024013722126929
Recall: 0.6724137931034483
F1 Score: 0.6870805369127517



In [None]:
# 2. xgboost

import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

xgb_classifier = xgb.XGBClassifier(objective='multi:softmax',  num_class=4)



y = df_encoded['Approved_Flag']
x = df_encoded. drop ( ['Approved_Flag'], axis = 1 )


label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)


x_train, x_test, y_train, y_test = train_test_split(x, y_encoded, test_size=0.2, random_state=42)




xgb_classifier.fit(x_train, y_train)
y_pred = xgb_classifier.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
print ()
print(f'Accuracy: {accuracy:.2f}')
print ()

precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred)

for i, v in enumerate(['p1', 'p2', 'p3', 'p4']):
    print(f"Class {v}:")
    print(f"Precision: {precision[i]}")
    print(f"Recall: {recall[i]}")
    print(f"F1 Score: {f1_score[i]}")
    print()



Accuracy: 0.74

Class p1:
Precision: 0.7430025445292621
Recall: 0.5145374449339207
F1 Score: 0.6080166579906299

Class p2:
Precision: 0.7847036839295248
Recall: 0.9216178084339238
F1 Score: 0.8476677961214043

Class p3:
Precision: 0.43337334933973587
Recall: 0.23502604166666666
F1 Score: 0.3047699451245251

Class p4:
Precision: 0.7113223854796888
Recall: 0.6756978653530378
F1 Score: 0.6930526315789474



In [None]:
# 3. Decision Tree
from sklearn.tree import DecisionTreeClassifier


y = df_encoded['Approved_Flag']
x = df_encoded. drop ( ['Approved_Flag'], axis = 1 )

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)


dt_model = DecisionTreeClassifier(max_depth=20, min_samples_split=10)
dt_model.fit(x_train, y_train)
y_pred = dt_model.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
print ()
print(f"Accuracy: {accuracy:.2f}")
print ()

precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred)

for i, v in enumerate(['p1', 'p2', 'p3', 'p4']):
    print(f"Class {v}:")
    print(f"Precision: {precision[i]}")
    print(f"Recall: {recall[i]}")
    print(f"F1 Score: {f1_score[i]}")
    print()




Accuracy: 0.67

Class p1:
Precision: 0.5123513266239708
Recall: 0.4933920704845815
F1 Score: 0.5026929982046678

Class p2:
Precision: 0.7711902279159839
Recall: 0.8115692114751528
F1 Score: 0.7908646501680415

Class p3:
Precision: 0.3316618911174785
Recall: 0.3014322916666667
F1 Score: 0.3158253751705321

Class p4:
Precision: 0.6416510318949343
Recall: 0.5615763546798029
F1 Score: 0.5989492119089317



## Hyperparameter Tuning

In [62]:
df_encoded.columns

Index(['pct_tl_open_L6M', 'pct_tl_closed_L6M', 'Tot_TL_closed_L12M',
       'pct_tl_open_L12M', 'pct_tl_closed_L12M', 'Tot_Missed_Pmnt', 'CC_TL',
       'Home_TL', 'PL_TL', 'Secured_TL', 'Unsecured_TL', 'Other_TL',
       'Age_Newest_TL', 'time_since_recent_payment',
       'max_recent_level_of_deliq', 'num_deliq_6_12mts', 'num_times_60p_dpd',
       'num_std_12mts', 'num_sub', 'num_sub_12mts', 'num_dbt', 'num_dbt_12mts',
       'num_lss', 'recent_level_of_deliq', 'enq_L3m', 'NETMONTHLYINCOME',
       'Time_With_Curr_Empr', 'CC_Flag', 'PL_Flag', 'pct_PL_enq_L6m_of_ever',
       'pct_CC_enq_L6m_of_ever', 'HL_Flag', 'GL_Flag', 'EDUCATION',
       'Approved_Flag', 'MARITALSTATUS_Married', 'MARITALSTATUS_Single',
       'GENDER_F', 'GENDER_M', 'last_prod_enq2_AL', 'last_prod_enq2_CC',
       'last_prod_enq2_ConsumerLoan', 'last_prod_enq2_HL', 'last_prod_enq2_PL',
       'last_prod_enq2_others', 'first_prod_enq2_AL', 'first_prod_enq2_CC',
       'first_prod_enq2_ConsumerLoan', 'first_prod_e

In [None]:
df_encoded.sample(3)

Unnamed: 0,pct_tl_open_L6M,pct_tl_closed_L6M,Tot_TL_closed_L12M,pct_tl_open_L12M,pct_tl_closed_L12M,Tot_Missed_Pmnt,CC_TL,Home_TL,PL_TL,Secured_TL,...,last_prod_enq2_ConsumerLoan,last_prod_enq2_HL,last_prod_enq2_PL,last_prod_enq2_others,first_prod_enq2_AL,first_prod_enq2_CC,first_prod_enq2_ConsumerLoan,first_prod_enq2_HL,first_prod_enq2_PL,first_prod_enq2_others
15133,0.0,0.0,0,1.0,0.0,0,0,0,0,0,...,False,False,False,True,False,False,False,False,False,True
34647,0.0,0.0,0,0.0,0.0,0,0,0,0,1,...,False,False,False,True,False,False,False,False,False,True
30947,0.333,0.0,0,1.0,0.0,3,0,0,1,2,...,True,False,False,False,False,False,False,False,False,True


In [None]:
# Apply standard scaler

"""from sklearn.preprocessing import StandardScaler

columns_to_be_scaled = ['Age_Oldest_TL','Age_Newest_TL','time_since_recent_payment',
'max_recent_level_of_deliq','recent_level_of_deliq',
'time_since_recent_enq','NETMONTHLYINCOME','Time_With_Curr_Empr']

for i in columns_to_be_scaled:
    column_data = df_encoded[i].values.reshape(-1, 1)
    scaler = StandardScaler()
    scaled_column = scaler.fit_transform(column_data)
    df_encoded[i] = scaled_column """


"from sklearn.preprocessing import StandardScaler\n\ncolumns_to_be_scaled = ['Age_Oldest_TL','Age_Newest_TL','time_since_recent_payment',\n'max_recent_level_of_deliq','recent_level_of_deliq',\n'time_since_recent_enq','NETMONTHLYINCOME','Time_With_Curr_Empr']\n\nfor i in columns_to_be_scaled:\n    column_data = df_encoded[i].values.reshape(-1, 1)\n    scaler = StandardScaler()\n    scaled_column = scaler.fit_transform(column_data)\n    df_encoded[i] = scaled_column "

In [None]:
# Hyperparameter tuning in xgboost
from sklearn.model_selection import GridSearchCV
x_train, x_test, y_train, y_test = train_test_split(x, y_encoded, test_size=0.2, random_state=42)

# Define the XGBClassifier with the initial set of hyperparameters
xgb_model = xgb.XGBClassifier(objective='multi:softmax', num_class=4)

# Define the parameter grid for hyperparameter tuning

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
}

grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid_search.fit(x_train, y_train)


In [None]:
# Print the best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

# Evaluate the model with the best hyperparameters on the test set
best_model = grid_search.best_estimator_
accuracy = best_model.score(x_test, y_test)
print("Test Accuracy:", accuracy)


Best Hyperparameters: {'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 100}
Test Accuracy: 0.7456174522789248


In [52]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
import pandas as pd

# Define the hyperparameter grid
param_grid = {
    'colsample_bytree': [0.3,0.5, 0.7],
    'learning_rate': [0.001, 0.01, 0.1],
    'max_depth': [3,5, 7, 9],
    'alpha': [10,50, 100],
    'n_estimators': [20,60, 100]
}

index = 0

# Initialize lists to collect results
results = {
    'combination': [],
    'train_Accuracy': [],
    'test_Accuracy': [],
    'colsample_bytree': [],
    'learning_rate': [],
    'max_depth': [],
    'alpha': [],
    'n_estimators': []
}

# Define the target and features
y = df_encoded['Approved_Flag']
x = df_encoded.drop(['Approved_Flag'], axis=1)

# Encode the target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y_encoded, test_size=0.2, random_state=42)
# Loop through each combination of hyperparameters
for colsample_bytree in param_grid['colsample_bytree']:
    for learning_rate in param_grid['learning_rate']:
        for max_depth in param_grid['max_depth']:
            for alpha in param_grid['alpha']:
                for n_estimators in param_grid['n_estimators']:
                    index += 1

                    # Define and train the XGBoost model
                    model = xgb.XGBClassifier(objective='multi:softmax',
                                              num_class=4,
                                              colsample_bytree=colsample_bytree,
                                              learning_rate=learning_rate,
                                              max_depth=max_depth,
                                              alpha=alpha,
                                              n_estimators=n_estimators)

                    model.fit(x_train, y_train)

                    # Predict on training and testing sets
                    y_pred_train = model.predict(x_train)
                    y_pred_test = model.predict(x_test)

                    # Calculate train and test accuracies
                    train_accuracy = accuracy_score(y_train, y_pred_train)
                    test_accuracy = accuracy_score(y_test, y_pred_test)

                    # Include results into the lists
                    results['combination'].append(index)
                    results['train_Accuracy'].append(train_accuracy)
                    results['test_Accuracy'].append(test_accuracy)
                    results['colsample_bytree'].append(colsample_bytree)
                    results['learning_rate'].append(learning_rate)
                    results['max_depth'].append(max_depth)
                    results['alpha'].append(alpha)
                    results['n_estimators'].append(n_estimators)

                    # Print results for this combination
                    print(f"Combination {index}")
                    print(f"colsample_bytree: {colsample_bytree}, learning_rate: {learning_rate}, max_depth: {max_depth}, alpha: {alpha}, n_estimators: {n_estimators}")
                    print(f"Train Accuracy: {train_accuracy:.2f}")
                    print(f"Test Accuracy : {test_accuracy:.2f}")
                    print("-" * 30)

# Create a DataFrame from the results
results_df = pd.DataFrame(results)

# Display the DataFrame
print(results_df)

"""# Loop through each combination of hyperparameters
for colsample_bytree in param_grid['colsample_bytree']:
    for learning_rate in param_grid['learning_rate']:
        for max_depth in param_grid['max_depth']:
            for alpha in param_grid['alpha']:
                for n_estimators in param_grid['n_estimators']:
                    index += 1"""
                    # Define and train the XGBoost model
"""xg_cv = xgb.XGBClassifier(objective='multi:softmax',
                                              num_class=4,
                                              colsample_bytree=colsample_bytree,
                                              learning_rate=learning_rate,
                                              max_depth=max_depth,
                                              alpha=alpha,
                                              n_estimators=n_estimators)
xg_cv = RandomizedSearchCV(xg_cv, param_grid, cv=5)
xg_cv.fit(x_train, y_train)
print("Tuned Decision Tree Parameters: {}".format(xg_cv.best_params_))
print("Best score is {}".format(xg_cv.best_score_))"""



Combination 1
colsample_bytree: 0.3, learning_rate: 0.001, max_depth: 3, alpha: 10, n_estimators: 20
Train Accuracy: 0.64
Test Accuracy : 0.64
------------------------------
Combination 2
colsample_bytree: 0.3, learning_rate: 0.001, max_depth: 3, alpha: 10, n_estimators: 60
Train Accuracy: 0.65
Test Accuracy : 0.65
------------------------------
Combination 3
colsample_bytree: 0.3, learning_rate: 0.001, max_depth: 3, alpha: 10, n_estimators: 100
Train Accuracy: 0.65
Test Accuracy : 0.65
------------------------------
Combination 4
colsample_bytree: 0.3, learning_rate: 0.001, max_depth: 3, alpha: 50, n_estimators: 20
Train Accuracy: 0.64
Test Accuracy : 0.63
------------------------------
Combination 5
colsample_bytree: 0.3, learning_rate: 0.001, max_depth: 3, alpha: 50, n_estimators: 60
Train Accuracy: 0.65
Test Accuracy : 0.64
------------------------------
Combination 6
colsample_bytree: 0.3, learning_rate: 0.001, max_depth: 3, alpha: 50, n_estimators: 100
Train Accuracy: 0.65
Test A

'xg_cv = xgb.XGBClassifier(objective=\'multi:softmax\',\n                                              num_class=4,\n                                              colsample_bytree=colsample_bytree,\n                                              learning_rate=learning_rate,\n                                              max_depth=max_depth,\n                                              alpha=alpha,\n                                              n_estimators=n_estimators)\nxg_cv = RandomizedSearchCV(xg_cv, param_grid, cv=5)\nxg_cv.fit(x_train, y_train)\nprint("Tuned Decision Tree Parameters: {}".format(xg_cv.best_params_))\nprint("Best score is {}".format(xg_cv.best_score_))'

In [58]:
file_name = 'Results.xlsx'


results_df.to_excel(file_name)
print('DataFrames are written to Excel File successfully.')

DataFrames are written to Excel File successfully.


In [59]:
# best combination
#Combination 309
#colsample_bytree: 0.7, learning_rate: 0.1, max_depth: 7, alpha: 10, n_estimators: 100
#Train Accuracy: 0.78
#Test Accuracy : 0.75

In [60]:
model = xgb.XGBClassifier(objective='multi:softmax',
                                              num_class=4,
                                              colsample_bytree=0.7,
                                              learning_rate=0.1,
                                              max_depth=7,
                                              alpha=10,
                                              n_estimators=100)
model.fit(x_train, y_train)

                    # Predict on training and testing sets
y_pred_train = model.predict(x_train)
y_pred_test = model.predict(x_test)

                    # Calculate train and test accuracies
train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred_test)
print(f"train_accuracy :{train_accuracy},test_accuracy : {test_accuracy}")

train_accuracy :0.7774179409759423,test_accuracy : 0.7464939618231399


In [61]:
#Accuracy not changeing when we change parameter