# Bank Loan Term Prediction
---

## Import packages & read data.

In [None]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

# Visualization imports
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# %config InlineBackend.figure_format = 'svg'

# Modeling imports
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score, roc_auc_score,log_loss, confusion_matrix, precision_score, recall_score, accuracy_score 
from sklearn import linear_model, ensemble , tree 
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier , VotingClassifier
from sklearn.linear_model import LogisticRegression
import imblearn.over_sampling
from sklearn.svm import SVC  
from sklearn.utils import class_weight
from sklearn.model_selection import cross_val_score
import statsmodels.api as sm
from sklearn.pipeline import Pipeline, make_pipeline
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline, make_pipeline 
from sklearn.metrics import precision_recall_curve
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from mlxtend.classifier import StackingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingRegressor
import xgboost

In [None]:
df = pd.read_csv('credit_train.csv')
df

In [None]:
df.shape

In [None]:
df.tail()

In [None]:
df.isna().sum()

In [None]:
df.info() 

In [None]:
duplicate = df.duplicated()
print(f'Duplicate in df :', duplicate.sum())

**Rename columns for easier code writing**

In [None]:
df.columns

In [None]:
df.columns = df.columns.str.replace(' ','_')

In [None]:
df.columns

## Split the data into train and test sets (validation is done with cross-validation on the train set)

In [None]:
# X = df.drop(columns='Term')
# y = pd.DataFrame(df['Term'])

# cross val
df_train, df_test  = train_test_split(df, test_size=0.2, random_state=42)

# # split the data for train and test
# df_Train, df_test = train_test_split(df, test_size = 0.2, random_state = 30 )

# # split the train for train and val
# df_train, df_val = train_test_split(df_Train, test_size = 0.2, random_state = 30 )

In [None]:
print(f'Shape of train:', df_train.shape)
# print(f'Shape of validation:', df_val.shape)
print(f'Shape of test:', df_test.shape)

## Data Pre-processing

### Cleaning data

In [None]:
# reset index for train
df_train = df_train.reset_index(drop=True)

# # reset index for val
# df_val = df_val.reset_index(drop=True)

# reset index for val
df_test = df_test.reset_index(drop=True)

In [None]:
# dope nulls in Loan_ID

# for train
df_train = df_train.dropna(subset = ['Loan_ID'])

# # for val
# df_val = df_val.dropna(subset = ['Loan_ID'])

# for test
df_test = df_test.dropna(subset = ['Loan_ID'])

In [None]:
print(f'Shape of train:', df_train.shape)
# print(f'Shape of validation:', df_val.shape)
print(f'Shape of test:', df_test.shape)

In [None]:
df_train.sample(20)

In [None]:
df_train.info()

In [None]:
df_train.isna().sum()

In [None]:
# check for dublicate

# for train
duplicate = df_train.duplicated()
print(f'Duplicate in train :', duplicate.sum())

# # for val
# duplicate = df_val.duplicated()
# print(f'Duplicate in validation :', duplicate.sum())

# for test
duplicate = df_test.duplicated()
print(f'Duplicate in test :', duplicate.sum())
print(f'Shape of train:', df_train.shape)
# print(f'Shape of validation:', df_val.shape)
print(f'Shape of test:', df_test.shape)

In [None]:
# drop duplicates rows
# train
df_train.drop_duplicates(inplace=True)

# val
# df_val.drop_duplicates(inplace=True)

# test
df_test.drop_duplicates(inplace=True)

In [None]:
# check for dublicate

# for train
duplicate = df_train.duplicated()
print(f'Duplicate in train :', duplicate.sum())

# # for val
# duplicate = df_val.duplicated()
# print(f'Duplicate in validation :', duplicate.sum())

# for test
duplicate = df_test.duplicated()
print(f'Duplicate in test :', duplicate.sum())
print(f'Shape of train:', df_train.shape)
# print(f'Shape of validation:', df_val.shape)
print(f'Shape of test:', df_test.shape)

**Duplicate in Loan ID**

In [None]:
df_train['Loan_ID'].value_counts().sort_values(ascending=False)

In [None]:
df_train[df_train.Loan_ID.duplicated()]

In [None]:
df_train[df_train['Loan_ID'] == '7830a00a-20c4-4480-9cf0-fe2f86b5266b']

In [None]:
df_train[df_train['Loan_ID'] == '5a90cbe3-8fee-4582-8823-1f31546dec6e']

We can see a data-entry error: some loan IDs appear more than once, and the copies differ only in the current loan amount or in containing null values.

**Now we fix it.**

In [None]:
df_train[(df_train.Loan_ID.duplicated() & (df_train['Current_Loan_Amount'] == 99999999.0))]

In [None]:
# drop duplicate in Loan_ID and Current_Loan_Amount = 99999999.0

# for train
df_train = df_train[~(df_train.Loan_ID.duplicated() & (df_train['Current_Loan_Amount'] == 99999999.0))]

# for val
# df_val = df_val[~(df_val.Loan_ID.duplicated() & (df_val['Current_Loan_Amount'] == 99999999.0))]

# for test
df_test = df_test[~(df_test.Loan_ID.duplicated() & (df_test['Current_Loan_Amount'] == 99999999.0))]

In [None]:
df_train[(df_train.Loan_ID.duplicated())]

In [None]:
df_train[df_train['Loan_ID'] == 'ff486b10-f97d-4dff-bb98-436ef48d8ab1']

In [None]:
# dope nulls in Loan_Status

# for train
df_train = df_train.dropna(subset = ['Annual_Income'])

# # for val
# df_val = df_val.dropna(subset = ['Annual_Income'])

# for test
df_test = df_test.dropna(subset = ['Annual_Income'])

In [None]:
df_train[df_train['Loan_ID'] == 'ff486b10-f97d-4dff-bb98-436ef48d8ab1']

In [None]:
#df_train
df_train.Purpose.unique()
# #df_val
# df_val.Purpose.unique()
#da_test
df_test.Purpose.unique()

In [None]:
df_train.Purpose.value_counts()

In [None]:
#df_train
df_train.Purpose = df_train.Purpose.str.replace('other','Other')
# #df_val
# df_val.Purpose = df_val.Purpose.str.replace('other','Other')
#df_test
df_test.Purpose = df_test.Purpose.str.replace('other','Other')

In [None]:
df_train.Purpose.value_counts()

In [None]:
df_train.Purpose.unique()

In [None]:
df_train.isnull().sum() # train

In [None]:
# dope duplicated in Loan_ID

# for train
df_train = df_train.drop_duplicates(subset = ['Loan_ID'])

# # for val
# df_val = df_val.drop_duplicates(subset = ['Loan_ID'])

# for test
df_test = df_test.drop_duplicates(subset = ['Loan_ID'])

In [None]:
df_train.isnull().sum() # train

In [None]:
print(f'Shape of train:', df_train.shape)
# print(f'Shape of validation:', df_val.shape)
print(f'Shape of test:', df_test.shape)

In [None]:
# Distribution of Years_in_current_job in train.
# The column is passed by keyword: newer seaborn releases treat the first
# positional argument as `data`, not as the x variable.
plt.figure(figsize=(10,5))
sns.countplot(x='Years_in_current_job', data=df_train, palette='pink_r');

In [None]:
# fill nulls in Years_in_current_job 

# for train
df_train['Years_in_current_job'] = df_train['Years_in_current_job'].fillna('10+ years')

# # for val
# df_val['Years_in_current_job'] = df_val['Years_in_current_job'].fillna('10+ years')

# for test
df_test['Years_in_current_job'] = df_test['Years_in_current_job'].fillna('10+ years')

In [None]:
# drop Months_since_last_delinquent bc the null > 50&

# train
df_train = df_train.drop(columns='Months_since_last_delinquent')

# test
df_test = df_test.drop(columns='Months_since_last_delinquent')

In [None]:
df_train.isnull().sum()

In [None]:
# drop nulls 

# for train
df_train = df_train.dropna()

# # for val
# df_val = df_val.dropna()

# for test
df_test = df_test.dropna()

In [None]:
df_train.isnull().sum()

In [None]:
df_train.duplicated().sum()

## Feature Engineering
---

In [None]:
df_train.isnull().sum()

In [None]:
df_train.info()

### Get Dummies

In [None]:
# train
bank_lone_train = pd.get_dummies(df_train, columns =['Term','Home_Ownership','Purpose','Loan_Status', 'Years_in_current_job'], drop_first=True) ###

# # val
# bank_lone_val = pd.get_dummies(df_val, columns =['Term','Home_Ownership','Purpose','Loan_Status', 'Years_in_current_job'], drop_first=True) ###

# test
bank_lone_test = pd.get_dummies(df_test, columns =['Term','Home_Ownership','Purpose','Loan_Status' , 'Years_in_current_job'], drop_first=True) ###

In [None]:
bank_lone_train.columns

In [None]:
df_train.corr()

In [None]:
plt.figure(figsize=(10,8))

# corr
data_corr = df_train.corr()
# data_corr = bank_lone_train.corr()

# mask
mask = np.triu(np.ones_like(data_corr, dtype=np.bool))

# adjust mask and df
mask = mask[1:, :-1]
corr = data_corr.iloc[1:,:-1].copy()

sns.heatmap(corr, cmap = 'pink_r', annot = True, vmin= -1, vmax= 1, linewidths=1.5, fmt='.2f', mask=mask);
plt.title('CORRELATION BETWEEN FEATURES\n', loc='left', fontsize=18);
# plt.savefig('plot13.png', dpi = 300, bbox_inches = 'tight');

In [None]:
# sns.pairplot(bank_lone_train, hue = 'Term_Short Term', palette = 'pink_r');

## Visualize data
___

In [None]:
c = ['#724949','#cfa691', '#120f0f', '#a06868']
plt.figure(figsize=(7,7))
plt.pie(x = bank_lone_train['Term_Short Term'].value_counts(),
        labels=['Short term','Long term'],autopct='%.2f%%',
        textprops={'fontsize': 12},explode=[0,0.09], colors = ['#724949','#DEDCBB'])
plt.title('Time Period of Taking Loan',fontdict={'fontsize':15});

In [None]:
plt.figure(figsize=(10,9))
sns.countplot(y='Purpose' , data=df_train, order = df_train['Purpose'].value_counts().index,
              hue='Term', palette = 'pink_r')
plt.title('Purpose of taking Loan' , fontdict={'fontsize':20})
plt.legend(title="Loan type", loc="lower right");

In [None]:
plt.figure(figsize=(10,8))
sns.countplot(x='Home_Ownership',data=df_train ,order = df_train['Home_Ownership'].value_counts().index
              ,hue='Term',  palette = 'pink_r')
plt.title('Own Property vs Loan Status',fontdict={'fontsize':20})
plt.legend(title="Loan type", loc="upper right", labels=["Short Term","Long Term"]);

### Dropping outliers

In [None]:
plt.figure(figsize = [15,20])
plt.subplot(3,2,1)
sns.boxplot(x='Term_Short Term',y='Current_Loan_Amount',
            palette='pink_r', data=bank_lone_train.sort_values('Current_Loan_Amount',ascending=False));
plt.title('Before dropping outliers',fontsize = 15 , c = 'r')

bank_lone_train = bank_lone_train[bank_lone_train['Current_Loan_Amount'] != 99999999]
bank_lone_train = bank_lone_train[((bank_lone_train['Current_Loan_Amount'] <= 600000 )
                                   & (bank_lone_train['Term_Short Term']==1))
                                  | (bank_lone_train['Term_Short Term']==0)]

plt.subplot(3,2,2)
sns.boxplot(x='Term_Short Term',y='Current_Loan_Amount',
            palette='pink_r', data=bank_lone_train.sort_values('Current_Loan_Amount',ascending=False));
plt.title('After dropping outliers',fontsize = 15 );

In [None]:
# bank_lone_val = bank_lone_val[bank_lone_val['Current_Loan_Amount'] != 99999999]
# # bank_lone_val = bank_lone_val[((bank_lone_val['Current_Loan_Amount'] <= 600000 )
#                                    & (bank_lone_val['Term_Short Term']==1))
#                                   | (bank_lone_val['Term_Short Term']==0)]

In [None]:
bank_lone_test = bank_lone_test[bank_lone_test['Current_Loan_Amount'] != 99999999]
bank_lone_test = bank_lone_test[((bank_lone_test['Current_Loan_Amount'] <= 600000 )
                                   & (bank_lone_test['Term_Short Term']==1))
                                  | (bank_lone_test['Term_Short Term']==0)]

In [None]:
plt.figure(figsize = [15,20])
plt.subplot(3,2,1)
sns.boxplot(x='Term_Short Term',y='Credit_Score',
            palette='pink_r', data = bank_lone_train.sort_values('Credit_Score',ascending=False));
plt.title('Before dropping outliers',fontsize = 15 , c = 'r')

bank_lone_train = bank_lone_train.loc[bank_lone_train['Credit_Score'] <= 1500,:]
bank_lone_train = bank_lone_train.loc[bank_lone_train['Credit_Score'] >= 620 ,:]
bank_lone_train = bank_lone_train[((bank_lone_train['Credit_Score'] >= 680 )
                                   & (bank_lone_train['Term_Short Term']==1))| 
                                  (bank_lone_train['Term_Short Term']==0)]

plt.subplot(3,2,2)
sns.boxplot(x='Term_Short Term',y='Credit_Score',
            palette='pink_r', data = bank_lone_train.sort_values('Credit_Score',ascending=False));
plt.title('After dropping outliers',fontsize = 15 );

In [None]:
# bank_lone_val = bank_lone_val.loc[bank_lone_val['Credit_Score'] <= 1500,:]
# bank_lone_val = bank_lone_val.loc[bank_lone_val['Credit_Score'] >= 620 ,:]
# bank_lone_val = bank_lone_val[((bank_lone_val['Credit_Score'] >= 680 )
#                                    & (bank_lone_val['Term_Short Term']==1))| 
#                                   (bank_lone_val['Term_Short Term']==0)]

In [None]:
bank_lone_test = bank_lone_test.loc[bank_lone_test['Credit_Score'] <= 1500,:]
bank_lone_test = bank_lone_test.loc[bank_lone_test['Credit_Score'] >= 620 ,:]
bank_lone_test = bank_lone_test[((bank_lone_test['Credit_Score'] >= 680 )
                                   & (bank_lone_test['Term_Short Term']==1))| 
                                  (bank_lone_test['Term_Short Term']==0)]

In [None]:
plt.figure(figsize = [15,20])
plt.subplot(3,2,1)
sns.boxplot(x='Term_Short Term',y='Annual_Income',
            palette='pink_r', data = bank_lone_train.sort_values('Annual_Income',ascending=False));
plt.title('Before dropping outliers',fontsize = 15 , c = 'r')

bank_lone_train = bank_lone_train.loc[bank_lone_train['Annual_Income'] <= 2750000,:]
bank_lone_train = bank_lone_train[((bank_lone_train['Annual_Income'] <= 2395000 )
                                   & (bank_lone_train['Term_Short Term']==1))
                                  | (bank_lone_train['Term_Short Term']==0)]

plt.subplot(3,2,2)
sns.boxplot(x='Term_Short Term',y='Annual_Income',
            palette='pink_r', data = bank_lone_train.sort_values('Annual_Income',ascending=False));
plt.title('After dropping outliers',fontsize = 15 );

In [None]:
# bank_lone_val = bank_lone_val.loc[bank_lone_val['Annual_Income'] <= 2750000,:]
# bank_lone_val = bank_lone_val[((bank_lone_val['Annual_Income'] <= 2395000 )
#                                    & (bank_lone_val['Term_Short Term']==1))
#                                   | (bank_lone_val['Term_Short Term']==0)]

In [None]:
bank_lone_test = bank_lone_test.loc[bank_lone_test['Annual_Income'] <= 2750000,:]
bank_lone_test = bank_lone_test[((bank_lone_test['Annual_Income'] <= 2395000 )
                                   & (bank_lone_test['Term_Short Term']==1))
                                  | (bank_lone_test['Term_Short Term']==0)]

In [None]:
plt.figure(figsize = [15,20])
plt.subplot(3,2,1)
sns.boxplot(x='Term_Short Term',y='Monthly_Debt',
            palette='pink_r', data=bank_lone_train.sort_values('Monthly_Debt',ascending=False));
plt.title('Before dropping outliers',fontsize = 15 , c = 'b')

bank_lone_train = bank_lone_train.loc[bank_lone_train['Monthly_Debt'] <= 44500,:]
bank_lone_train = bank_lone_train[((bank_lone_train['Monthly_Debt'] <= 36000 )& (bank_lone_train['Term_Short Term']==1))| 
                                  (bank_lone_train['Term_Short Term']==0)]

plt.subplot(3,2,2)
sns.boxplot(x='Term_Short Term',y='Monthly_Debt',
            palette='pink_r', data=bank_lone_train.sort_values('Monthly_Debt',ascending=False));
plt.title('After dropping outliers',fontsize = 15 );

In [None]:
# bank_lone_val = bank_lone_val.loc[bank_lone_val['Monthly_Debt'] <= 44500,:]
# bank_lone_val = bank_lone_val[((bank_lone_val['Monthly_Debt'] <= 36000 )& 
#                                    (bank_lone_val['Term_Short Term']==1))| 
#                                   (bank_lone_val['Term_Short Term']==0)]

In [None]:
bank_lone_test = bank_lone_test.loc[bank_lone_test['Monthly_Debt'] <= 44500,:]
bank_lone_test = bank_lone_test[((bank_lone_test['Monthly_Debt'] <= 36000 )& 
                                   (bank_lone_test['Term_Short Term']==1))| 
                                  (bank_lone_test['Term_Short Term']==0)]

In [None]:
plt.figure(figsize = [15,20])
plt.subplot(3,2,1)
sns.boxplot(x='Term_Short Term',y='Current_Credit_Balance',
            palette='pink_r', data=bank_lone_train.sort_values('Current_Credit_Balance',ascending=False));
plt.title('Before dropping outliers',fontsize = 15 , c = 'r')

bank_lone_train = bank_lone_train.loc[bank_lone_train['Current_Credit_Balance'] <= 760000,:]
bank_lone_train = bank_lone_train[((bank_lone_train['Current_Credit_Balance'] <= 504000 )& 
                                   (bank_lone_train['Term_Short Term']==1))| (bank_lone_train['Term_Short Term']==0)]

plt.subplot(3,2,2)
sns.boxplot(x='Term_Short Term',y='Current_Credit_Balance',
            palette='pink_r', data=bank_lone_train.sort_values('Current_Credit_Balance',ascending=False));
plt.title('After dropping outliers',fontsize = 15 );

In [None]:
# bank_lone_val = bank_lone_val.loc[bank_lone_val['Current_Credit_Balance'] <= 760000,:]
# bank_lone_val = bank_lone_val[((bank_lone_val['Current_Credit_Balance'] <= 504000 )& 
#                                    (bank_lone_val['Term_Short Term']==1))| 
#                                   (bank_lone_val['Term_Short Term']==0)]

In [None]:
bank_lone_test = bank_lone_test.loc[bank_lone_test['Current_Credit_Balance'] <= 760000,:]
bank_lone_test = bank_lone_test[((bank_lone_test['Current_Credit_Balance'] <= 504000 )& 
                                   (bank_lone_test['Term_Short Term']==1))| 
                                  (bank_lone_test['Term_Short Term']==0)]

In [None]:
plt.figure(figsize = [15,20])
plt.subplot(3,2,1)
sns.boxplot(x='Term_Short Term',y='Maximum_Open_Credit',
            palette='pink_r', data=bank_lone_train.sort_values('Maximum_Open_Credit',ascending=False));
plt.title('Before dropping outliers',fontsize = 15 , c = 'r')

bank_lone_train = bank_lone_train.loc[bank_lone_train['Maximum_Open_Credit'] <= 1400000,:]
bank_lone_train = bank_lone_train[((bank_lone_train['Maximum_Open_Credit'] <= 990000 )& 
                                   (bank_lone_train['Term_Short Term']==1))| (bank_lone_train['Term_Short Term']==0)]

plt.subplot(3,2,2)
sns.boxplot(x='Term_Short Term',y='Maximum_Open_Credit',
            palette='pink_r', data=bank_lone_train.sort_values('Maximum_Open_Credit',ascending=False));
plt.title('After dropping outliers',fontsize = 15 );

In [None]:
# bank_lone_val = bank_lone_val.loc[bank_lone_val['Maximum_Open_Credit'] <= 1400000,:]
# bank_lone_val = bank_lone_val[((bank_lone_val['Maximum_Open_Credit'] <= 990000 )& 
#                                    (bank_lone_val['Term_Short Term']==1))| 
#                                   (bank_lone_val['Term_Short Term']==0)]



In [None]:
bank_lone_test = bank_lone_test.loc[bank_lone_test['Maximum_Open_Credit'] <= 1400000,:]
bank_lone_test = bank_lone_test[((bank_lone_test['Maximum_Open_Credit'] <= 990000 )& 
                                   (bank_lone_test['Term_Short Term']==1))| 
                                  (bank_lone_test['Term_Short Term']==0)]

### Plot the correlation after one-hot encoding

In [None]:
plt.figure(figsize=(8, 12))
# Correlation of every numeric/boolean column with the target, strongest first.
# The text ID columns are excluded because DataFrame.corr() fails on
# non-numeric data in recent pandas versions.
term_corr = (bank_lone_train.select_dtypes(include=['number', 'bool'])
             .corr()[['Term_Short Term']]
             .sort_values(by='Term_Short Term',ascending=False))
heatmap = sns.heatmap(term_corr,
                      vmin=-1, vmax=1, annot=True,
                      cmap = 'pink_r')
plt.title('CORRELATION BETWEEN FEATURES AFTER ONE HOT CODING\n', loc='center', fontsize=18);

In [None]:
X_train = bank_lone_train.drop(['Term_Short Term','Loan_ID','Customer_ID',
                                'Credit_Score', 'Years_of_Credit_History', 
                                'Number_of_Credit_Problems', 'Number_of_Open_Accounts',
                                'Bankruptcies'], axis = 1)
y_train = bank_lone_train['Term_Short Term']
X_test = bank_lone_test.drop(['Term_Short Term','Loan_ID','Customer_ID',
                                'Credit_Score', 'Years_of_Credit_History', 
                                'Number_of_Credit_Problems', 'Number_of_Open_Accounts',
                                'Bankruptcies'], axis = 1)
y_test = bank_lone_test['Term_Short Term']

In [None]:
model = sm.OLS(y_train,X_train)
fit = model.fit()
fit.summary()

In [None]:
# class count
class_count_0, class_count_1 = bank_lone_train['Term_Short Term'].value_counts()

# Separate class
long_term_0 = bank_lone_train[bank_lone_train['Term_Short Term'] == 0]
short_term_1 = bank_lone_train[bank_lone_train['Term_Short Term'] == 1]# print the shape of the class
print('Long term 0:', long_term_0.shape[0])
print('Short term 1:', short_term_1.shape[0])

## Logistic Regression
---

In [None]:
LR = LogisticRegression(solver='liblinear')
lr_balanced = LogisticRegression(class_weight='balanced', solver='liblinear')
# Manual 2:1 weighting, matching the printed label: errors on class 0 (long term)
# count double. NOTE(review): class 0 is presumed to be the minority class —
# confirm the intended direction of the weighting.
lr_4x = LogisticRegression(class_weight={0 : 2, 1 : 1}, solver='liblinear')

# Same shuffled 10-fold split for all three variants.
kf = KFold(n_splits=10, random_state=42, shuffle=True)

# normal
LR.fit(X_train, y_train)
print('Normal Logistic Regression Train F1:', f1_score(y_train, LR.predict(X_train)))
cr_f1 = cross_val_score(LR, X_train, y_train, scoring='f1', cv=kf)
print('Normal Logistic Regression Valdition F1:',cr_f1.mean() )


# balanced
lr_balanced.fit(X_train, y_train)
print('\nBalanced class weights Logistic Regression Train F1:', f1_score(y_train, lr_balanced.predict(X_train)))
cr_balnced_f1 = cross_val_score(lr_balanced, X_train, y_train, scoring='f1', cv=kf)
print('Balanced class weights Logistic Regression Valdition F1:',cr_balnced_f1.mean())


# weighted
lr_4x.fit(X_train, y_train)
print('\n2:1 class weights Logistic Regression Train F1:', f1_score(y_train, lr_4x.predict(X_train)))
cr_weghts_f1 = cross_val_score(lr_4x, X_train, y_train, scoring='f1', cv=kf)
print('2:1 class weights Logistic Regression Valdition F1:',cr_weghts_f1.mean())

In [None]:
precision_curve, recall_curve, threshold_curve = precision_recall_curve(y_train, LR.predict_proba(X_train)[:,1] )
plt.plot(threshold_curve, precision_curve[1:],label='precision', color = '#724949')
plt.plot(threshold_curve, recall_curve[1:], label='recall', color = '#DEDCBB')
plt.legend(loc='lower left')
plt.xlabel('Threshold (above this probability)');
plt.title('Precision and Recall Curves');

In [None]:
y_predict = (LR.predict_proba(X_train)[:, 1] >= 0.65)

print("Default threshold:")
print("Precision: {:6.4f},   Recall: {:6.4f}".format(precision_score(y_train, y_predict), 
                                                     recall_score(y_train, y_predict)))

In [None]:
y_predict = (lr_4x.predict_proba(X_train)[:, 1] >= 0.624)

loan_confusion = confusion_matrix(y_train, y_predict)

sns.heatmap(loan_confusion , cmap = 'pink_r', annot = True , square = True , fmt = 'd',
           xticklabels = ['long term','short term'],
           yticklabels = ['long term','short term'])

plt.xlabel('prediction')
plt.ylabel('actual');

## KNN Model
---

In [None]:
# 5-nearest-neighbours classifier scored with F1 on train and with 10-fold CV.
# NOTE(review): features are not scaled before the distance computation —
# consider a StandardScaler pipeline.
knn = KNeighborsClassifier(n_neighbors = 5)
knn.fit(X_train, y_train)
scores = cross_val_score(knn, X_train, y_train, cv=10, scoring='f1')
print('f1_score for train: ',f1_score(y_train, knn.predict(X_train)))
print('f1_score for validation: ',scores.mean())

## Decision Tree Classifier
---

In [None]:
# normal
Decision_Tree = DecisionTreeClassifier(max_depth = 8)
Decision_Tree.fit(X_train, y_train)
scores = cross_val_score(Decision_Tree, X_train, y_train, cv=10, scoring='f1')
print('Normal Decision Tree Train F1:', f1_score(y_train, Decision_Tree.predict(X_train)))
print('Normal Decision Tree Valdition F1:',scores.mean())


# balanced
dt_bal = DecisionTreeClassifier(max_depth = 8, class_weight='balanced')
dt_bal.fit(X_train, y_train)
scores = cross_val_score(dt_bal, X_train, y_train, cv=10, scoring='f1')
print('\nBalanced class weights Decision Tree Train F1:', f1_score(y_train, dt_bal.predict(X_train)))
print('Balanced class weights Decision Tree Valdition F1:',scores.mean())


# weighted
# Manual 2:1 weighting to match the printed label (class 0 counts double), with
# the same depth limit as the other two trees so the variants are comparable.
# NOTE(review): confirm the intended direction of the weighting.
dt_wtd = DecisionTreeClassifier(max_depth = 8, class_weight= {0 : 2, 1 : 1})
dt_wtd.fit(X_train, y_train)
# Cross-validate this model too; previously the balanced tree's scores were printed again.
scores = cross_val_score(dt_wtd, X_train, y_train, cv=10, scoring='f1')
print('\n2:1 class weights Decision Tree Train F1:', f1_score(y_train, dt_wtd.predict(X_train)))
print('2:1 class weights Decision Tree Valdition F1:',scores.mean())

In [None]:
precision_curve, recall_curve, threshold_curve = precision_recall_curve(y_train, Decision_Tree.predict_proba(X_train)[:,1] )
plt.plot(threshold_curve, precision_curve[1:],label='precision', color = '#724949')
plt.plot(threshold_curve, recall_curve[1:], label='recall', color = '#DEDCBB')
plt.legend(loc='lower left')
plt.xlabel('Threshold (above this probability)');
plt.title('Precision and Recall Curves');

In [None]:
y_predict = (Decision_Tree.predict_proba(X_train)[:, 1] >= 0.5569)

print("Default threshold:")
print("Precision: {:6.4f},   Recall: {:6.4f}".format(precision_score(y_train, y_predict), 
                                                     recall_score(y_train, y_predict)))

In [None]:
y_predict = (Decision_Tree.predict_proba(X_train)[:, 1] >= 0.61)

loan_confusion = confusion_matrix(y_train, y_predict)

sns.heatmap(fraud_confusion , cmap = 'pink_r', annot = True , square = True , fmt = 'd',
           xticklabels = ['long term','short term'],
           yticklabels = ['long term','short term']);
plt.xlabel('prediction')
plt.ylabel('actual');

## Random Forest Classifier
---

In [None]:
# normal
Random_Forest = RandomForestClassifier(n_estimators = 10, random_state=1)
Random_Forest.fit(X_train, y_train)
scores = cross_val_score(Random_Forest, X_train, y_train, cv=10, scoring='f1')
print('Normal Random Forest Train F1: ',f1_score(y_train, Random_Forest.predict(X_train)))
print('Normal Random Forest Valdition F1: ', scores.mean())

# balanced
rf_bal = RandomForestClassifier(n_estimators = 10, random_state=1, class_weight='balanced')
rf_bal.fit(X_train, y_train)
scores = cross_val_score(rf_bal, X_train, y_train, cv=10, scoring='f1')
print('\nBalanced class weights Random Forest Train F1: ', f1_score(y_train, rf_bal.predict(X_train)))
print('Balanced class weights Random Forest Valdition F1: ',scores.mean())


# weighted
# Manual 2:1 weighting to match the printed label: class 0 (long term) counts double.
# NOTE(review): confirm the intended direction of the weighting.
rf_wtd = RandomForestClassifier(n_estimators = 10, random_state=1, class_weight= {0 : 2, 1 : 1})
rf_wtd.fit(X_train, y_train)
scores = cross_val_score(rf_wtd, X_train, y_train, cv=10, scoring='f1')
print('\n2:1 class weights Random Forest Train F1:', f1_score(y_train, rf_wtd.predict(X_train)))
print('2:1 class weights Random Forest Valdition F1:',scores.mean())

In [None]:
precision_curve, recall_curve, threshold_curve = precision_recall_curve(y_train, Random_Forest.predict_proba(X_train)[:,1] )
plt.plot(threshold_curve, precision_curve[1:],label='precision', color = '#724949')
plt.plot(threshold_curve, recall_curve[1:], label='recall', color = '#DEDCBB')
plt.legend(loc='lower left')
plt.xlabel('Threshold (above this probability)');
plt.title('Precision and Recall Curves');

In [None]:
y_predict = (Random_Forest.predict_proba(X_train)[:, 1] >= 0.66)

print("Default threshold:")
print("Precision: {:6.4f},   Recall: {:6.4f}".format(precision_score(y_train, y_predict), 
                                                     recall_score(y_train, y_predict)))

In [None]:
y_predict = (Random_Forest.predict_proba(X_train)[:, 1] >= 0.61)

loan_confusion = confusion_matrix(y_train, y_predict)

sns.heatmap(fraud_confusion , cmap = 'pink_r', annot = True , square = True , fmt = 'd',
           xticklabels = ['long term','short term'],
           yticklabels = ['long term','short term']);
plt.xlabel('prediction')
plt.ylabel('actual');

## Extra Tree
---

In [None]:
Extra_Tree = ExtraTreesClassifier()
Extra_Tree.fit(X_train, y_train)
print('The accuracy for training : ',Extra_Tree.score(X_train,y_train))
print('f1_score for train: ',f1_score(Extra_Tree.predict(X_train), y_train))



In [None]:
# scores = cross_val_score(Extra_Tree,X = X_train, y = y_train, cv =5, scoring = 'accuracy')
# print(scores.mean())

In [None]:
y_predict = (Extra_Tree.predict_proba(X_train)[:, 1] >= 0.1)

print("Default threshold:")
print("Precision: {:6.4f},   Recall: {:6.4f}".format(precision_score(y_train, y_predict), 
                                                     recall_score(y_train, y_predict)))

In [None]:
y_predict = (Extra_Tree.predict_proba(X_val)[:, 1] >= 0.62)

loan_confusion = confusion_matrix(y_val, y_predict)

sns.heatmap(fraud_confusion , cmap = plt.cm.Blues, annot = True , square = True , fmt = 'd',
           xticklabels = ['long term','short term'],
           yticklabels = ['long term','short term']);
plt.xlabel('prediction')
plt.ylabel('actual');

## Stacking
---

In [None]:
# Base learners for the stack.
# NOTE(review): `model_vars` was never defined in the notebook; the models
# fitted in the earlier sections are assumed here — adjust the list as needed.
model_vars = [LR, knn, Decision_Tree, Random_Forest, Extra_Tree]

# The base learners' class predictions (use_probas=False) feed a logistic-regression meta learner.
stacked = StackingClassifier(
    classifiers=model_vars, meta_classifier=LogisticRegression(), use_probas=False)

stacked.fit(X_train, y_train)
y_pred = stacked.predict(X_train)
accuracies = cross_val_score(estimator = stacked, X = X_train, y = y_train, cv = 5)

print('The accuracy for training : ',accuracy_score(y_train, y_pred))
print('f1_score for train: ',f1_score(y_train, y_pred))
print('The accuracy for validation : ',accuracies)

## Bagging
---

In [None]:

# Bagging of 500 decision trees, each trained on 100 bootstrap-sampled rows.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500,
    max_samples=100, bootstrap=True, n_jobs=-1)
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)

# Out-of-Bag Evaluation


bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500,
    bootstrap=True, n_jobs=-1, oob_score=True)

bag_clf.fit(X_train, y_train)
# Printed explicitly: a bare expression in the middle of a cell is not displayed.
print('Bagging out-of-bag score:', bag_clf.oob_score_)

# Variant with randomly split, size-limited trees, each trained on a full-size bootstrap sample.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(splitter="random", max_leaf_nodes=16),
    n_estimators=500, max_samples=1.0, bootstrap=True, n_jobs=-1)
# Fit it so that `bag_clf` never refers to an unfitted estimator.
bag_clf.fit(X_train, y_train)
print('Bagging (random splitter) Train F1:', f1_score(y_train, bag_clf.predict(X_train)))




## Boosting
---

* ### AdaBoost

In [None]:

# AdaBoost over 200 decision stumps (depth-1 trees).
# The `algorithm` argument is left at the library default: the "SAMME.R" option
# is deprecated and rejected by recent scikit-learn releases, while it was the
# default in the releases that still supported it.
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1), n_estimators=200,
    learning_rate=0.5)
ada_clf.fit(X_train, y_train)
print('AdaBoost Train F1:', f1_score(y_train, ada_clf.predict(X_train)))



* ### Gradient Boosting

In [None]:

# Manual gradient boosting with three shallow regression trees: every tree is
# fitted to the residual errors left by the trees before it, and the final
# prediction is the sum of all three trees.
# The 0/1 target is cast to float so residuals can be computed.
boost_target = y_train.astype(float)

tree_reg1 = DecisionTreeRegressor(max_depth=2)
tree_reg1.fit(X_train, boost_target)

y2 = boost_target - tree_reg1.predict(X_train)
tree_reg2 = DecisionTreeRegressor(max_depth=2)
tree_reg2.fit(X_train, y2)

y3 = y2 - tree_reg2.predict(X_train)
tree_reg3 = DecisionTreeRegressor(max_depth=2)
# The third tree must be fitted as well before it can predict.
tree_reg3.fit(X_train, y3)

# Prediction for unseen rows = sum of the three stage predictions.
y_pred = sum(stage.predict(X_test) for stage in (tree_reg1, tree_reg2, tree_reg3))


# NOTE (translated from the author's remark): unsure what the manual gradient code above is.
# The library estimator below appears to be the packaged counterpart of the
# three manual stages (3 trees of depth 2, learning rate 1.0) — confirm.
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1.0)
gbrt.fit(X_train, boost_target)



* ### XGBoost

In [None]:


# The notebook has no validation frame, so one is carved out of the training
# data here; the test set stays untouched for the final evaluation.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)

# Plain float arrays are passed instead of DataFrames.
# NOTE(review): xgboost is believed to reject feature names containing '<'
# (e.g. the 'Years_in_current_job_< 1 year' dummy) — confirm for the installed version.
X_fit_arr, X_val_arr = X_fit.to_numpy(dtype=float), X_val.to_numpy(dtype=float)
y_fit_arr, y_val_arr = y_fit.to_numpy(dtype=float), y_val.to_numpy(dtype=float)

xgb_reg = xgboost.XGBRegressor()
xgb_reg.fit(X_fit_arr, y_fit_arr)
y_pred = xgb_reg.predict(X_val_arr)

# Refit with early stopping: stop once the validation metric has not improved for 2 rounds.
# NOTE(review): newer xgboost releases expect `early_stopping_rounds` in the
# constructor rather than in fit() — adjust to the installed version.
xgb_reg.fit(X_fit_arr, y_fit_arr,
            eval_set=[(X_val_arr, y_val_arr)], early_stopping_rounds=2)
y_pred = xgb_reg.predict(X_val_arr)


## Voting Classifier (HARD)
---

In [None]:
# Hard (majority) voting between a logistic regression and a random forest,
# built once per class-weight variant.

# normal
models = [('lr', LR ), ('rf', Random_Forest )]
VC = VotingClassifier(estimators= models, voting='hard', n_jobs=-1)
VC.fit(X_train, y_train)
scores = cross_val_score(VC, X_train, y_train, cv=10, scoring='f1')
print('Normal Voting Classifier Train F1: ',f1_score(y_train, VC.predict(X_train)))
print('Normal Voting Classifier Valdition F1: ', scores.mean())

# balanced
models = [('lr', lr_balanced ), ('rf', rf_bal )]
vc_bal = VotingClassifier(estimators= models, voting='hard', n_jobs=-1)
vc_bal.fit(X_train, y_train)
scores = cross_val_score(vc_bal, X_train, y_train, cv=10, scoring='f1')
print('\nBalanced class weights Voting Classifier Train F1: ', f1_score(y_train, vc_bal.predict(X_train)))
print('Balanced class weights Voting Classifier Valdition F1: ',scores.mean())


# weighted (labels now name the voting classifier, not the random forest)
models = [('lr', lr_4x ), ('rf', rf_wtd )]
vc_wtd = VotingClassifier(estimators= models, voting='hard', n_jobs=-1)
vc_wtd.fit(X_train, y_train)
scores = cross_val_score(vc_wtd, X_train, y_train, cv=10, scoring='f1')
print('\n2:1 class weights Voting Classifier Train F1:', f1_score(y_train, vc_wtd.predict(X_train)))
print('2:1 class weights Voting Classifier Valdition F1:',scores.mean())

In [None]:
precision_curve, recall_curve, threshold_curve = precision_recall_curve(y_train, VC.predict_proba(X_train)[:,1] )
plt.plot(threshold_curve, precision_curve[1:],label='precision', color = '#724949')
plt.plot(threshold_curve, recall_curve[1:], label='recall', color = '#DEDCBB')
plt.legend(loc='lower left')
plt.xlabel('Threshold (above this probability)');
plt.title('Precision and Recall Curves');

In [None]:
y_predict = (VC.predict_proba(X_train)[:, 1] >= 0.61)

loan_confusion = confusion_matrix(y_train, y_predict)

sns.heatmap(fraud_confusion , cmap = 'pink_r', annot = True , square = True , fmt = 'd',
           xticklabels = ['long term','short term'],
           yticklabels = ['long term','short term']);
plt.xlabel('prediction')
plt.ylabel('actual');