# Final model

In [1]:
import pickle

import numpy as np
import pandas as pd
import xgboost as xgb
from feature_engine.categorical_encoders import CountFrequencyCategoricalEncoder
from feature_engine.categorical_encoders import OrdinalCategoricalEncoder
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score as rsq
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split

In [3]:
# Load the dataset. Per the author's note, missing values were already
# imputed upstream (cluster imputation), so this file should contain no nulls.
data = pd.read_csv(
    r"data_no_null.csv",
    encoding='latin1',
)

In [4]:
# Columns that are not used for prediction.
cols_to_drop = [
    'Unnamed: 0',
    'sub_grade',
    'State',
    'Emp_designation',
    'last_week_pay',
]
# Work on a copy without those columns; `data` itself is left untouched.
df_xg = data.drop(columns=cols_to_drop)

In [None]:
#df_xg

In [5]:
# Names of all object-dtype columns; this list is what the frequency
# encoder is later told to encode.
categorical = []
for col in df_xg.columns:
    if df_xg[col].dtypes == 'O':
        categorical.append(col)

categorical

['terms',
 'grade',
 'home_ownership',
 'verification_status',
 'purpose',
 'initial_list_status',
 'application_type',
 'Experience']

In [6]:
# Fold the less common loan purposes into broader buckets.
purpose_merge = {
    'car': 'major_purchase',
    'house': 'major_purchase',
    'renewable_energy': 'small_business',
    'wedding': 'other',
    'vacation': 'other',
    'moving': 'other',
    'medical': 'medical(or)education',
    'educational': 'medical(or)education',
}
df_xg['purpose'] = df_xg['purpose'].replace(purpose_merge)

In [7]:
# Category sizes after merging, smallest first.
df_xg['purpose'].value_counts().sort_values()

medical(or)education      8963
small_business           10952
major_purchase           29847
home_improvement         51829
other                    55391
credit_card             206182
debt_consolidation      524215
Name: purpose, dtype: int64

In [8]:
# Despite the variable name, this is a *frequency* encoder: each category in
# the `categorical` columns is replaced by a frequency learned from the data.
# NOTE(review): the encoder is fitted on the full frame before the train/test
# split, so test rows contribute to the learned frequencies.
ordinal_enc = CountFrequencyCategoricalEncoder(encoding_method='frequency', variables=categorical)
ordinal_enc.fit(df_xg)

df_xg_fre_enc = ordinal_enc.transform(df_xg)

In [9]:
# Predictor columns for the first model. Note that 'loan_amnt ' is spelled
# with a trailing space here and must be kept that way.
predictors_f = [
    'loan_amnt ', 'terms', 'Rate_of_intrst', 'grade', 'home_ownership',
    'annual_inc', 'verification_status', 'purpose', 'debt_income_ratio',
    'delinq_2yrs', 'inq_last_6mths', 'numb_credit', 'pub_rec',
    'total_credits', 'initial_list_status',
    'total_rec_int', 'total_rec_late_fee', 'recoveries',
    'collection_recovery_fee', 'collections_12_mths_ex_med',
    'application_type', 'acc_now_delinq', 'Experience',
    'mths_since_last_delinq', 'tot_curr_bal', 'tot_colle_amt',
]
X_xg_f = df_xg_fre_enc[predictors_f]
# Target, kept as a single-column DataFrame.
Y_xg_f = df_xg_fre_enc[['total revol_bal']]

In [10]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
split_f = train_test_split(
    X_xg_f,
    Y_xg_f,
    test_size=0.2,
    random_state=2,
)
X_train_xg_f, X_test_xg_f, y_train_xg_f, y_test_xg_f = split_f

X_train_xg_f.shape, X_test_xg_f.shape

((709903, 26), (177476, 26))

In [11]:
# Randomised hyperparameter search for the XGBoost regressor:
# 5 sampled candidates x 5-fold CV = 25 fits on the training split.
xgb_mod_f = xgb.XGBRegressor()
params_f = {
    'learning_rate': [0.03, 0.05, 0.08, 0.10, 0.15, 0.20, 0.25, 0.30],
    'max_depth': [3, 4, 5, 6, 8, 10, 12, 15, 20, 25],
    'min_child_weight': [1, 3, 5, 7],
    'gamma': [0.0, 0.1, 0.2, 0.3, 0.4],
}
random_search_f = RandomizedSearchCV(
    xgb_mod_f,
    param_distributions=params_f,
    n_iter=5,
    n_jobs=-1,
    cv=5,
    verbose=3,
    random_state=2,  # fix which candidates get sampled so re-runs repeat the same search
)
random_search_f.fit(X_train_xg_f, y_train_xg_f)
# Only the last expression of a cell is displayed, so show just the winning parameters.
random_search_f.best_params_

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  19 out of  25 | elapsed: 25.5min remaining:  8.1min
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed: 33.9min finished


{'min_child_weight': 5, 'max_depth': 4, 'learning_rate': 0.1, 'gamma': 0.0}

In [48]:
# Hyperparameters for the first model.
# NOTE(review): these values are not the best_params_ recorded for the random
# search in this notebook (max_depth=4, learning_rate=0.1, gamma=0.0) —
# confirm which search run they were taken from.
params_xg_f = dict(min_child_weight=5, max_depth=8, learning_rate=0.05, gamma=0.2)
xgb_mod_f = xgb.XGBRegressor(**params_xg_f)
xgb_mod_f.fit(X_train_xg_f, y_train_xg_f)


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0.2, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.05, max_delta_step=0, max_depth=8,
             min_child_weight=5, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [13]:
# In-sample fit: R^2 of the first model on the training split.
xg_pred_train_f = xgb_mod_f.predict(X_train_xg_f)
xg_train_f_r2 = rsq(y_true=y_train_xg_f, y_pred=xg_pred_train_f)
xg_train_f_r2

0.5071931802986467

In [14]:
# Out-of-sample fit: R^2 of the first model on the held-out split.
xg_pred_test_f = xgb_mod_f.predict(X_test_xg_f)
xg_test_f_r2 = rsq(y_true=y_test_xg_f, y_pred=xg_pred_test_f)
xg_test_f_r2

0.44321916395818395

In [15]:
# Train and test R^2 side by side for the first model.
for label, score in (("train R^2 :", xg_train_f_r2), ("test R^2 :", xg_test_f_r2)):
    print(label, score)

train R^2 : 0.5071931802986467
test R^2 : 0.44321916395818395


In [16]:
# RMSE of the first model on both splits, in the units of the target.
# `mean_squared_error` comes from the notebook's import cell, so no
# per-cell import is needed here.
print(' train RMSE :', np.sqrt(mean_squared_error(y_train_xg_f, xg_pred_train_f)))
print(' test RMSE :', np.sqrt(mean_squared_error(y_test_xg_f, xg_pred_test_f)))

 train RMSE : 15685.77820466817
 test RMSE : 16978.15613094253


In [17]:
# One row per predictor: column name next to the importance score the
# fitted first model reports for it.
d_xg_f = pd.DataFrame({
    'columns': list(X_train_xg_f.columns),
    'feature importance': list(xgb_mod_f.feature_importances_),
})
d_xg_f

Unnamed: 0,columns,feature importance
0,loan_amnt,0.203006
1,terms,0.013979
2,Rate_of_intrst,0.018982
3,grade,0.064061
4,home_ownership,0.031795
5,annual_inc,0.07225
6,verification_status,0.030308
7,purpose,0.02399
8,debt_income_ratio,0.062628
9,delinq_2yrs,0.015159


#### Here I am dropping the features whose importance is less than 1%:
collection_recovery_fee

collections_12_mths_ex_med

application_type

acc_now_delinq

In [18]:
# Reduced predictor set: the first model's columns minus
# collection_recovery_fee, collections_12_mths_ex_med, application_type
# and acc_now_delinq.
predictors_f_1 = [
    'loan_amnt ', 'terms', 'Rate_of_intrst', 'grade', 'home_ownership',
    'annual_inc', 'verification_status', 'purpose', 'debt_income_ratio',
    'delinq_2yrs', 'inq_last_6mths', 'numb_credit', 'pub_rec',
    'total_credits', 'initial_list_status', 'total_rec_int',
    'total_rec_late_fee', 'recoveries', 'Experience',
    'mths_since_last_delinq', 'tot_curr_bal', 'tot_colle_amt',
]
X_xg_f_1 = df_xg_fre_enc[predictors_f_1]
# Same target as before, kept as a single-column DataFrame.
Y_xg_f_1 = df_xg_fre_enc[['total revol_bal']]

In [131]:
# Split the reduced feature set into training and test portions.
# NOTE(review): this uses a 30% hold-out whereas the first model used 20%,
# so the two models are not scored on the same test rows.
split_f_1 = train_test_split(
    X_xg_f_1,
    Y_xg_f_1,
    test_size=0.30,
    random_state=2,
)
X_train_xg_f_1, X_test_xg_f_1, y_train_xg_f_1, y_test_xg_f_1 = split_f_1

X_train_xg_f_1.shape, X_test_xg_f_1.shape

((621165, 22), (266214, 22))

In [132]:
# Second model: same hyperparameters as the first, trained on the reduced
# feature set. It is fitted on plain NumPy arrays rather than DataFrames,
# and the later predict calls pass arrays as well.
train_features_f_1 = np.array(X_train_xg_f_1)
train_target_f_1 = np.array(y_train_xg_f_1)
params_xg_f_1 = dict(min_child_weight=5, max_depth=8, learning_rate=0.05, gamma=0.2)
xgb_mod_f_1 = xgb.XGBRegressor(**params_xg_f_1)
xgb_mod_f_1.fit(train_features_f_1, train_target_f_1)


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0.2, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.05, max_delta_step=0, max_depth=8,
             min_child_weight=5, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [133]:
# In-sample R^2 of the reduced-feature model.
train_features_f_1 = np.array(X_train_xg_f_1)
xg_pred_train_f_1 = xgb_mod_f_1.predict(train_features_f_1)
xg_train_f_1_r2 = rsq(y_true=y_train_xg_f_1, y_pred=xg_pred_train_f_1)

In [134]:
# Held-out R^2 of the reduced-feature model.
test_features_f_1 = np.array(X_test_xg_f_1)
xg_pred_test_f_1 = xgb_mod_f_1.predict(test_features_f_1)
xg_test_f_1_r2 = rsq(y_true=y_test_xg_f_1, y_pred=xg_pred_test_f_1)

In [135]:
# Train and test R^2 side by side for the reduced-feature model.
for label, score in (("train R^2 :", xg_train_f_1_r2), ("test R^2 :", xg_test_f_1_r2)):
    print(label, score)

train R^2 : 0.5170709398659196
test R^2 : 0.408121832094141


In [136]:
# RMSE of the reduced-feature model on both splits, in the units of the target.
# `mean_squared_error` comes from the notebook's import cell, so no
# per-cell import is needed here.
print(' train RMSE :', np.sqrt(mean_squared_error(y_train_xg_f_1, xg_pred_train_f_1)))
print(' test RMSE :', np.sqrt(mean_squared_error(y_test_xg_f_1, xg_pred_test_f_1)))

 train RMSE : 15678.323911768333
 test RMSE : 17010.384081760752


In [123]:
# One row per predictor: column name next to the importance score the
# fitted reduced-feature model reports for it.
d_xg_f_1 = pd.DataFrame({
    'columns': list(X_train_xg_f_1.columns),
    'feature importance': list(xgb_mod_f_1.feature_importances_),
})
d_xg_f_1

Unnamed: 0,columns,feature importance
0,loan_amnt,0.205232
1,terms,0.018222
2,Rate_of_intrst,0.018566
3,grade,0.070319
4,home_ownership,0.029284
5,annual_inc,0.071329
6,verification_status,0.029433
7,purpose,0.022427
8,debt_income_ratio,0.060378
9,delinq_2yrs,0.014879


In [26]:
# Persist the reduced-feature model to disk. The context manager ensures the
# file handle is flushed and closed even if pickling raises.
with open('finalized_model.pkl', 'wb') as model_file:
    pickle.dump(xgb_mod_f_1, model_file)

In [137]:
# Round-trip check: reload the pickled model and predict on the held-out rows.
# The context manager closes the file handle even if unpickling raises.
with open('finalized_model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)
prediction = loaded_model.predict(np.array(X_test_xg_f_1))

print(prediction)

[27230.615 14566.864 52932.723 ... 18900.77  20298.232 23070.572]


In [138]:
# Empty output table: a serial number plus the predicted target.
csv_columns = ['S.NO', 'total revol_bal']
df_csv = pd.DataFrame(columns=csv_columns)
df_csv

Unnamed: 0,S.NO,total revol_bal


In [139]:
# 1-based serial numbers, one per prediction. Built directly from a range
# instead of appending element by element in a Python loop.
len_pred = prediction.size + 1
list_1 = list(range(1, len_pred))

In [140]:
# Fill the serial-number column from list_1 (1-based row numbers).
df_csv['S.NO'] = list_1

In [141]:
# Attach the reloaded model's predictions as the 'total revol_bal' column.
df_csv['total revol_bal'] = prediction

In [142]:
# Display the assembled prediction table.
df_csv

Unnamed: 0,S.NO,total revol_bal
0,1,27230.615234
1,2,14566.864258
2,3,52932.722656
3,4,17604.496094
4,5,7046.140137
...,...,...
266209,266210,7584.052246
266210,266211,10931.376953
266211,266212,18900.769531
266212,266213,20298.232422


In [145]:
# Write the predictions to CSV; index=False leaves out the DataFrame index
# because 'S.NO' already numbers the rows.
df_csv.to_csv('final_prediction_csv.csv', index=False)

##### The output above shows that our saved model is working fine.

### Saving frequency-encoding of "terms" to disk ----- later used in the Deployment

In [146]:
# Frequency-encode 'terms' on its own and persist the lookup as
# {'terms': {raw label: encoded value}}.
a = CountFrequencyCategoricalEncoder(encoding_method='frequency', variables=['terms'])
temp_keys = df_xg[['terms']].values
temp_values = a.fit_transform(df_xg[['terms']])
# Pair each raw label with its encoded value; repeated labels collapse to one entry.
dict_temp = dict(zip(np.array(temp_keys).ravel(), np.array(temp_values).ravel()))
dict_all = {'terms': dict_temp}
# The context manager closes the file even if pickling raises.
with open("terms.obj", "wb") as filehandler:
    pickle.dump(dict_all, filehandler)

In [147]:
# Reload the saved 'terms' mapping to confirm it round-trips; the context
# manager closes the file even if unpickling raises.
with open("terms.obj", 'rb') as file:
    terms = pickle.load(file)
print(terms)

{'terms': {'36 months': 0.699954585357553, '60 months': 0.30004541464244705}}


### Saving frequency-encoding of "grade" to disk ----- later used in the Deployment

In [148]:
# Frequency-encode 'grade' on its own and persist the lookup as
# {'grade': {raw label: encoded value}}.
b = CountFrequencyCategoricalEncoder(encoding_method='frequency', variables=['grade'])
temp_keys = df_xg[['grade']].values
temp_values = b.fit_transform(df_xg[['grade']])
# Pair each raw label with its encoded value; repeated labels collapse to one entry.
dict_temp = dict(zip(np.array(temp_keys).ravel(), np.array(temp_values).ravel()))
dict_all = {'grade': dict_temp}
# The context manager closes the file even if pickling raises.
with open("grade.obj", "wb") as filehandler:
    pickle.dump(dict_all, filehandler)

In [149]:
# Reload the saved 'grade' mapping to confirm it round-trips. It is stored in
# `terms`, the name this notebook reuses for every reloaded mapping.
with open("grade.obj", 'rb') as file:
    terms = pickle.load(file)
print(terms)

{'grade': {'E': 0.07967846883913186, 'B': 0.2868391070782608, 'A': 0.16701093895618446, 'D': 0.15725186194399463, 'C': 0.27706312635300134, 'F': 0.025970864760153214, 'G': 0.00618563206927367}}


### Saving frequency-encoding of "home_ownership" to disk ----- later used in the Deployment

In [150]:
# Frequency-encode 'home_ownership' on its own and persist the lookup as
# {'home_ownership': {raw label: encoded value}}.
c = CountFrequencyCategoricalEncoder(encoding_method='frequency', variables=['home_ownership'])
temp_keys = df_xg[['home_ownership']].values
temp_values = c.fit_transform(df_xg[['home_ownership']])
# Pair each raw label with its encoded value; repeated labels collapse to one entry.
dict_temp = dict(zip(np.array(temp_keys).ravel(), np.array(temp_values).ravel()))
dict_all = {'home_ownership': dict_temp}
# The context manager closes the file even if pickling raises.
with open("home_ownership.obj", "wb") as filehandler:
    pickle.dump(dict_all, filehandler)

In [151]:
# Reload the saved 'home_ownership' mapping to confirm it round-trips. It is
# stored in `terms`, the name this notebook reuses for every reloaded mapping.
with open("home_ownership.obj", 'rb') as file:
    terms = pickle.load(file)
print(terms)

{'home_ownership': {'OWN': 0.09857118547993586, 'MORTGAGE': 0.49985068386788506, 'RENT': 0.4013133058140885, 'OTHER': 0.00020509838524463618, 'NONE': 5.6345710232042906e-05, 'ANY': 3.3807426139225743e-06}}


In [152]:
# Raw category counts, for comparison with the frequencies printed above.
df_xg['home_ownership'].value_counts().sort_index()

ANY              3
MORTGAGE    443557
NONE            50
OTHER          182
OWN          87470
RENT        356117
Name: home_ownership, dtype: int64

### Saving frequency-encoding of "verification_status" to disk ----- later used in the Deployment

In [153]:
# Frequency-encode 'verification_status' on its own and persist the lookup as
# {'verification_status': {raw label: encoded value}}.
d = CountFrequencyCategoricalEncoder(encoding_method='frequency', variables=['verification_status'])
temp_keys = df_xg[['verification_status']].values
temp_values = d.fit_transform(df_xg[['verification_status']])
# Pair each raw label with its encoded value; repeated labels collapse to one entry.
dict_temp = dict(zip(np.array(temp_keys).ravel(), np.array(temp_values).ravel()))
dict_all = {'verification_status': dict_temp}
# The context manager closes the file even if pickling raises.
with open("verification_status.obj", "wb") as filehandler:
    pickle.dump(dict_all, filehandler)

In [154]:
# Reload the saved 'verification_status' mapping to confirm it round-trips. It
# is stored in `terms`, the name this notebook reuses for every reloaded mapping.
with open("verification_status.obj", 'rb') as file:
    terms = pickle.load(file)
print(terms)

{'verification_status': {'Source Verified': 0.3713835914530319, 'Not Verified': 0.3006043640879489, 'Verified': 0.3280120444590192}}


In [155]:
# Raw category counts, for comparison with the frequencies printed above.
df_xg['verification_status'].value_counts().sort_index()

Not Verified       266750
Source Verified    329558
Verified           291071
Name: verification_status, dtype: int64

### Saving frequency-encoding of "purpose" to disk ----- later used in the Deployment

In [156]:
# Frequency-encode 'purpose' on its own and persist the lookup as
# {'purpose': {raw label: encoded value}}.
e = CountFrequencyCategoricalEncoder(encoding_method='frequency', variables=['purpose'])
temp_keys = df_xg[['purpose']].values
temp_values = e.fit_transform(df_xg[['purpose']])
# Pair each raw label with its encoded value; repeated labels collapse to one entry.
dict_temp = dict(zip(np.array(temp_keys).ravel(), np.array(temp_values).ravel()))
dict_all = {'purpose': dict_temp}
# The context manager closes the file even if pickling raises.
with open("purpose.obj", "wb") as filehandler:
    pickle.dump(dict_all, filehandler)

In [157]:
# Reload the saved 'purpose' mapping to confirm it round-trips. It is stored
# in `terms`, the name this notebook reuses for every reloaded mapping.
with open("purpose.obj", 'rb') as file:
    terms = pickle.load(file)
print(terms)

{'purpose': {'debt_consolidation': 0.5907453297858074, 'home_improvement': 0.05840683631233103, 'credit_card': 0.2323494245412614, 'other': 0.06242090470926177, 'major_purchase': 0.03363500826591569, 'small_business': 0.012341964369226677, 'medical(or)education': 0.01010053201619601}}


In [158]:
# Raw category counts, for comparison with the frequencies printed above.
df_xg['purpose'].value_counts().sort_index()

credit_card             206182
debt_consolidation      524215
home_improvement         51829
major_purchase           29847
medical(or)education      8963
other                    55391
small_business           10952
Name: purpose, dtype: int64

### Saving frequency-encoding of "initial_list_status" to disk ----- later used in the Deployment

In [159]:
# Frequency-encode 'initial_list_status' on its own and persist the lookup as
# {'initial_list_status': {raw label: encoded value}}.
f = CountFrequencyCategoricalEncoder(encoding_method='frequency', variables=['initial_list_status'])
temp_keys = df_xg[['initial_list_status']].values
temp_values = f.fit_transform(df_xg[['initial_list_status']])
# Pair each raw label with its encoded value; repeated labels collapse to one entry.
dict_temp = dict(zip(np.array(temp_keys).ravel(), np.array(temp_values).ravel()))
dict_all = {'initial_list_status': dict_temp}
# The context manager closes the file even if pickling raises.
with open("initial_list_status.obj", "wb") as filehandler:
    pickle.dump(dict_all, filehandler)

In [160]:
# Reload the saved 'initial_list_status' mapping to confirm it round-trips. It
# is stored in `terms`, the name this notebook reuses for every reloaded mapping.
with open("initial_list_status.obj", 'rb') as file:
    terms = pickle.load(file)
print(terms)

{'initial_list_status': {'f': 0.5148285005617668, 'w': 0.4851714994382333}}


In [161]:
# Raw category counts, for comparison with the frequencies printed above.
df_xg['initial_list_status'].value_counts().sort_index()

f    456848
w    430531
Name: initial_list_status, dtype: int64

### Saving frequency-encoding of "Experience" to disk ----- later used in the Deployment

In [162]:
# Frequency-encode 'Experience' on its own and persist the lookup as
# {'Experience': {raw label: encoded value}}.
g = CountFrequencyCategoricalEncoder(encoding_method='frequency', variables=['Experience'])
temp_keys = df_xg[['Experience']].values
temp_values = g.fit_transform(df_xg[['Experience']])
# Pair each raw label with its encoded value; repeated labels collapse to one entry.
dict_temp = dict(zip(np.array(temp_keys).ravel(), np.array(temp_values).ravel()))
dict_all = {'Experience': dict_temp}
# The context manager closes the file even if pickling raises.
with open("Experience.obj", "wb") as filehandler:
    pickle.dump(dict_all, filehandler)

In [163]:
# Reload the saved 'Experience' mapping to confirm it round-trips. It is
# stored in `terms`, the name this notebook reuses for every reloaded mapping.
with open("Experience.obj", 'rb') as file:
    terms = pickle.load(file)
print(terms)

{'Experience': {'9 years': 0.03905546559023822, '< 1 year': 0.07956577741866779, '2 years': 0.08887972332002447, '10+ years': 0.37908717695595684, '5 years': 0.06277362885531436, '8 years': 0.049533513864988915, '7 years': 0.05025361204175442, '4 years': 0.059195676255579636, '1 year': 0.06434116651396979, '3 years': 0.07891329409418073, '6 years': 0.04840096508932486}}


In [164]:
# Raw category counts, for comparison with the frequencies printed above.
df_xg['Experience'].value_counts().sort_index()

1 year        57095
10+ years    336394
2 years       78870
3 years       70026
4 years       52529
5 years       55704
6 years       42950
7 years       44594
8 years       43955
9 years       34657
< 1 year      70605
Name: Experience, dtype: int64