<a href="https://colab.research.google.com/github/harshitbhavnani/Credit-Risk-Analysis-Project/blob/main/Credit_Risk_Analysis_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Dataset: https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients#


**Importing Libraries**

In [None]:
# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation warnings raised by the
# library imports in the following cell will not be shown either.
import warnings
warnings.filterwarnings('ignore')

# Pin scikit-learn to the exact release this notebook was written against.
!pip install scikit-learn==0.22.1



In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors.classification import KNeighborsClassifier
from sklearn.linear_model.stochastic_gradient import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.ensemble.weight_boosting import AdaBoostClassifier
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

from sklearn.model_selection import GridSearchCV

**Loading Dataset**

In [None]:
# Read the credit-card default data from the working directory and preview
# the first five rows (five is the default for head()).
df = pd.read_csv("dataset.csv")
df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,1,20000.0,2,2,1,24,2,2,-1,-1,-2,-2,3913.0,3102.0,689.0,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,2,120000.0,2,2,2,26,-1,2,0,0,0,2,2682.0,1725.0,2682.0,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,3,90000.0,2,2,2,34,0,0,0,0,0,0,29239.0,14027.0,13559.0,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,4,50000.0,2,2,1,37,0,0,0,0,0,0,46990.0,48233.0,49291.0,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,5,50000.0,1,2,1,57,-1,0,-1,0,0,0,8617.0,5670.0,35835.0,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


In [None]:
# Total number of missing cells across the whole frame.
df.isna().sum().sum()

0

In [None]:
# Show the dtype pandas inferred for each column.
df.dtypes

ID                              int64
LIMIT_BAL                     float64
SEX                             int64
EDUCATION                       int64
MARRIAGE                        int64
AGE                             int64
PAY_0                           int64
PAY_2                           int64
PAY_3                           int64
PAY_4                           int64
PAY_5                           int64
PAY_6                           int64
BILL_AMT1                     float64
BILL_AMT2                     float64
BILL_AMT3                     float64
BILL_AMT4                     float64
BILL_AMT5                     float64
BILL_AMT6                     float64
PAY_AMT1                      float64
PAY_AMT2                      float64
PAY_AMT3                      float64
PAY_AMT4                      float64
PAY_AMT5                      float64
PAY_AMT6                      float64
default.payment.next.month      int64
dtype: object

In [None]:
# (rows, columns) of the raw frame.
df.shape

(30000, 25)

In [None]:
# Count the rows in each target class; useful for spotting class imbalance.
df['default.payment.next.month'].value_counts()

0    23364
1     6636
Name: default.payment.next.month, dtype: int64

# **Preprocessing Data**
The column named 'ID' is removed as it is a redundant feature

In [None]:
# Drop the row identifier; it carries no predictive signal.
# errors="ignore" keeps the cell re-runnable: a second execution no longer
# raises KeyError because "ID" is already gone.
df = df.drop(columns=["ID"], errors="ignore")

Checking for Null values

In [None]:
# Number of missing values in each column.
df.isnull().sum()

LIMIT_BAL                     0
SEX                           0
EDUCATION                     0
MARRIAGE                      0
AGE                           0
PAY_0                         0
PAY_2                         0
PAY_3                         0
PAY_4                         0
PAY_5                         0
PAY_6                         0
BILL_AMT1                     0
BILL_AMT2                     0
BILL_AMT3                     0
BILL_AMT4                     0
BILL_AMT5                     0
BILL_AMT6                     0
PAY_AMT1                      0
PAY_AMT2                      0
PAY_AMT3                      0
PAY_AMT4                      0
PAY_AMT5                      0
PAY_AMT6                      0
default.payment.next.month    0
dtype: int64

The column `PAY_0` is renamed to `PAY_1` so that the repayment-status columns are numbered consistently (`PAY_1` … `PAY_6`)

In [None]:
# Rename PAY_0 to PAY_1 by rebinding df instead of mutating it in place.
df = df.rename(columns={'PAY_0': 'PAY_1'})

Collapsing the rarely occurring `EDUCATION` levels (0, 5 and 6) into level 4 to reduce the number of categories

In [None]:
# Frequency of each EDUCATION code before any levels are merged.
df['EDUCATION'].value_counts()

2    14030
1    10585
3     4917
5      280
4      123
6       51
0       14
Name: EDUCATION, dtype: int64

In [None]:
# Codes 0, 5 and 6 are folded into 4; codes 1-4 map to themselves.
education_levels = {0: 4, 1: 1, 2: 2, 3: 3, 4: 4, 5: 4, 6: 4}
df["EDUCATION"] = df["EDUCATION"].map(education_levels)

Merging `MARRIAGE` level 0 into level 3, as both are treated as the same category

In [None]:
# Frequency of each MARRIAGE code before any levels are merged.
df['MARRIAGE'].value_counts()

2    15964
1    13659
3      323
0       54
Name: MARRIAGE, dtype: int64

In [None]:
# Code 0 is folded into 3; codes 1-3 map to themselves.
marriage_levels = {0: 3, 1: 1, 2: 2, 3: 3}
df["MARRIAGE"] = df["MARRIAGE"].map(marriage_levels)

**Scaling Data (Min-Max Normalisation)**

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every column of df with a min-max scaler and display the result.
# NOTE(review): the array returned by fit_transform is only shown as the cell
# output and never assigned, so `df` itself stays unscaled for later cells.
# NOTE(review): the scaler is fitted on the full frame, which at this point
# still contains the target column and has not been split into train/test.
scaler = MinMaxScaler()
scaler.fit_transform(df)

array([[0.01010101, 1.        , 0.33333333, ..., 0.        , 0.        ,
        1.        ],
       [0.11111111, 1.        , 0.33333333, ..., 0.        , 0.00378311,
        1.        ],
       [0.08080808, 1.        , 0.33333333, ..., 0.00234451, 0.00945777,
        0.        ],
       ...,
       [0.02020202, 0.        , 0.33333333, ..., 0.00468901, 0.00586382,
        1.        ],
       [0.07070707, 0.        , 0.66666667, ..., 0.12417444, 0.00341236,
        1.        ],
       [0.04040404, 0.        , 0.33333333, ..., 0.00234451, 0.00189155,
        1.        ]])

**Splitting Data into Train and Test Data**

In [None]:
# The default flag is the target; every remaining column is a feature.
y = df['default.payment.next.month']
X = df.drop(columns=['default.payment.next.month'])

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

# **Comparing Various Machine Learning Models**

In [None]:
# Instantiate one estimator per candidate algorithm.
#
# Three of the variables below (LogisticRegression, GaussianNB, SVC) reuse the
# name of the class they are built from. Calling the bare class name would
# therefore fail the second time this cell is run, because the name would by
# then refer to an instance. Building those three through their module path
# keeps the cell re-runnable while leaving the variable names that later cells
# rely on unchanged.
import sklearn.linear_model
import sklearn.naive_bayes
import sklearn.svm

LogisticRegression = sklearn.linear_model.LogisticRegression(random_state=10, max_iter=1000)
RandomForest = RandomForestClassifier(random_state=10)
GradientBoost = GradientBoostingClassifier(random_state=10)
DecisionTree = DecisionTreeClassifier(random_state=10)
KNN = KNeighborsClassifier()
SGD = SGDClassifier(random_state=10)
GaussianNB = sklearn.naive_bayes.GaussianNB()
LDA = LinearDiscriminantAnalysis()
SVC = sklearn.svm.SVC(random_state=10)
AdaBoost = AdaBoostClassifier(random_state=10)
XGB = XGBClassifier(random_state=10)

In [None]:
# Train every candidate on the same training split, in the original order.
for estimator in (LogisticRegression, RandomForest, GradientBoost, DecisionTree,
                  KNN, SGD, GaussianNB, LDA, SVC, AdaBoost):
    estimator.fit(X_train, y_train)

# Fitted last and kept as the cell's final expression so its repr is displayed.
XGB.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=10,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [None]:
# Empty results table: one column per model, one row per metric.
model_names = [
    'Logistic Regression', 'Random Forest', 'KNN', 'Gradient Boost', 'SGD',
    'Decision Tree', 'LDA', 'GaussianNB', 'XGBoost', 'SVC', 'AdaBoost',
]
metric_names = ['Accuracy', 'Recall', 'Precision', 'F1']

scores = pd.DataFrame(index=metric_names, columns=model_names)

In [None]:
# Test-set predictions, in the same order as the columns of `scores`.
fitted_models = [LogisticRegression, RandomForest, KNN, GradientBoost, SGD,
                 DecisionTree, LDA, GaussianNB, XGB, SVC, AdaBoost]

y_pred = [clf.predict(X_test) for clf in fitted_models]

In [None]:
# Fill the results table column by column.
#
# A single .iloc[row, col] assignment is used instead of the chained
# scores.iloc[row][col] form: chained assignment writes into an intermediate
# Series that pandas may return as a copy, in which case `scores` is left
# untouched.
#
# Note: with average='weighted', recall is mathematically equal to accuracy,
# which is why those two rows of the table are identical.
for i, preds in enumerate(y_pred):
    scores.iloc[0, i] = accuracy_score(y_test, preds)
    scores.iloc[1, i] = recall_score(y_test, preds, average='weighted')
    scores.iloc[2, i] = precision_score(y_test, preds, average='weighted')
    scores.iloc[3, i] = f1_score(y_test, preds, average='weighted')

In [None]:
# Display the filled model-comparison table.
scores

Unnamed: 0,Logistic Regression,Random Forest,KNN,Gradient Boost,SGD,Decision Tree,LDA,GaussianNB,XGBoost,SVC,AdaBoost
Accuracy,0.781167,0.814833,0.756,0.819833,0.781167,0.7295,0.809667,0.3815,0.820833,0.781167,0.816667
Recall,0.781167,0.814833,0.756,0.819833,0.781167,0.7295,0.809667,0.3815,0.820833,0.781167,0.816667
Precision,0.610221,0.795513,0.708622,0.802012,0.610221,0.736821,0.789711,0.741742,0.803417,0.610221,0.798529
F1,0.685193,0.794813,0.721767,0.79783,0.685193,0.73297,0.773427,0.379686,0.798653,0.685193,0.788229


**Hyper-Parameter Tuning**

Since the XGBoost classifier achieved the highest accuracy among the models compared above, its hyper-parameters are tuned with a grid search to find the best-performing configuration.

In [None]:
# Search space for the XGBoost grid search.
max_depth = [1, 2, 3, 4]
min_child_weight = [5, 7, 9]
subsample = [0.3 , 0.5]
colsample_bytree = [0.7, 0.9]
# The target is a binary label, so use the binary classification objective.
# 'reg:squarederror' is a regression loss: it makes the classifier fit raw
# 0/1 values, and predict_proba then returns unbounded scores rather than
# probabilities.
objective = ['binary:logistic']
n_estimators = [150,200,250]

hyperparameters = dict(max_depth=max_depth, min_child_weight=min_child_weight,
                       colsample_bytree=colsample_bytree, n_estimators = n_estimators,
                       subsample=subsample, objective=objective)

xgb_model = XGBClassifier()

# 5-fold cross-validated exhaustive search over the grid above.
clf = GridSearchCV(xgb_model, hyperparameters, cv=5)

best_model = clf.fit(X_train,y_train)

# Report the winning configuration. learning_rate is not part of the grid, so
# the value printed for it is simply the estimator's default.
best_params = best_model.best_estimator_.get_params()
for param_name in ('learning_rate', 'max_depth', 'min_child_weight', 'subsample',
                   'colsample_bytree', 'n_estimators', 'objective'):
    print(f'Best {param_name}:', best_params[param_name])

Best learning_rate: 0.1
Best max_depth: 3
Best min_child_weight: 5
Best subsample: 0.5
Best colsample_bytree: 0.7
Best n_estimators: 200
Best objective: reg:squarederror


**Best Parameters for XGBoost model**

*   Best learning_rate: 0.1
*   Best max_depth: 3
*   Best min_child_weight: 5
*   Best subsample: 0.5
*   Best colsample_bytree: 0.7
*   Best n_estimators: 200
*   Objective: reg:squarederror






















**Comparing Hypertuned Model with Normal model**

In [None]:
# Baseline: XGBoost with library defaults.
normal = XGBClassifier()

# Tuned model. The objective is 'binary:logistic' because the target is a
# binary label and this model is later pickled and queried with
# predict_proba; a regression objective such as 'reg:squarederror' does not
# produce values bounded to [0, 1].
best = XGBClassifier(learning_rate = 0.1, max_depth = 3, min_child_weight = 5,
                     subsample = 0.5, colsample_bytree = 0.7,
                     n_estimators = 200, objective = 'binary:logistic')

normal.fit(X_train,y_train)
best.fit(X_train,y_train)

y_pred1 = normal.predict(X_test)
y_pred2 = best.predict(X_test)

# Recall here uses the default binary averaging (positive class only), unlike
# the weighted recall used in the model-comparison table.
print('Accuracy of Normal XGBoost Model: ',accuracy_score(y_test, y_pred1))
print('Accuracy of Best XGBoost Model: ',accuracy_score(y_test, y_pred2))
print('Recall Score of Normal XGBoost Model: ',recall_score(y_test, y_pred1))
print('Recall Score of Best XGBoost Model: ',recall_score(y_test, y_pred2))

Accuracy of Normal XGBoost Model:  0.8208333333333333
Accuracy of Best XGBoost Model:  0.8223333333333334
Recall Score of Normal XGBoost Model:  0.3541507996953541
Recall Score of Best XGBoost Model:  0.3571972581873572


In [None]:
import pickle

# Persist the tuned classifier to 'model_pkl' in the working directory.
with open('model_pkl', 'wb') as model_file:
    pickle.dump(best, model_file)

**Finding the Most Important Features in the XGBoost Classifier**

In [None]:
# Feature column names, grouped by kind, in training-column order.
features = [
    'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE',
    'PAY_1', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6',
    'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6',
    'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
]

In [None]:
def get_feature_importance(clsf, ftrs):
    """Return a feature-importance table sorted from most to least important.

    Parameters
    ----------
    clsf : fitted estimator exposing a ``feature_importances_`` array.
    ftrs : sequence of feature names, aligned with ``feature_importances_``.

    Returns
    -------
    pandas.DataFrame with columns ``feat`` and ``score``, ordered by
    descending ``score``; the original positional index is kept.
    """
    importances = clsf.feature_importances_.tolist()
    table = pd.DataFrame({'feat': ftrs, 'score': importances})
    return table.sort_values(by='score', ascending=False)

# Rank the tuned model's features; the resulting table is the cell output.
get_feature_importance(best, features)

Unnamed: 0,feat,score
5,PAY_1,0.333233
6,PAY_2,0.135298
8,PAY_4,0.058852
7,PAY_3,0.052785
9,PAY_5,0.038839
10,PAY_6,0.033695
17,PAY_AMT1,0.032621
0,LIMIT_BAL,0.026829
19,PAY_AMT3,0.023864
2,EDUCATION,0.023857


# **Building Deep Learning Models**

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [None]:
# Fully connected network: five ReLU hidden layers narrowing from 50 to 10
# units, followed by a single sigmoid unit that outputs P(default).
model = keras.Sequential()
# Take the input width from the data instead of hard-coding the column count.
model.add(layers.Dense(50, input_dim=X_train.shape[1], activation='relu'))
model.add(layers.Dense(40, activation='relu'))
model.add(layers.Dense(30, activation='relu'))
model.add(layers.Dense(20, activation='relu'))
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1,  activation='sigmoid'))

# A single sigmoid output with 0/1 labels needs binary cross-entropy.
# categorical_crossentropy expects one output unit per class; with one unit
# the loss evaluates to zero, so nothing is learned.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

epochs = 100
batch_size = 512

# Hold out 10% of the training rows for validation and stop once val_loss has
# not improved by at least min_delta for 3 consecutive epochs.
# NOTE: model.predict returns probabilities; threshold them before passing
# them to scikit-learn classification metrics.
history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size,validation_split=0.1,callbacks=[keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100


In [None]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 50)                1200      
_________________________________________________________________
dense_1 (Dense)              (None, 40)                2040      
_________________________________________________________________
dense_2 (Dense)              (None, 30)                1230      
_________________________________________________________________
dense_3 (Dense)              (None, 20)                620       
_________________________________________________________________
dense_4 (Dense)              (None, 10)                210       
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 11        
Total params: 5,311
Trainable params: 5,311
Non-trainable params: 0
______________________________________________________

In [None]:
# Loss and accuracy of the network on the held-out test split.
model.evaluate(X_test.to_numpy(), y_test.to_numpy())



[0.0, 0.781166672706604]

In [None]:
# model.predict returns the sigmoid output (a probability) with shape (n, 1).
# The scikit-learn metrics in the next cell need 1-D class labels, so
# threshold at 0.5 and flatten.
y_pred = (model.predict(np.array(X_test)) > 0.5).astype(int).ravel()

In [None]:
# Report the network's test-set metrics; all but accuracy use weighted averaging.
print('Accuracy Score: ', accuracy_score(y_test, y_pred))

weighted_metrics = (
    ('Recall Score: ', recall_score),
    ('Precision Score', precision_score),
    ('F1 Score', f1_score),
)
for label, metric in weighted_metrics:
    print(label, metric(y_test, y_pred, average='weighted'))

Accuracy Score:  0.7811666666666667
Recall Score:  0.7811666666666667
Precision Score 0.6102213611111111
F1 Score 0.6851928823180812


# Flask App

In [None]:
# Install the packages for the web-app section.
# NOTE(review): versions are not pinned, and gevent, requests and pillow are
# not imported anywhere in the visible notebook; confirm they are needed.
!pip install flask gevent requests pillow

Collecting gevent
  Downloading gevent-21.12.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 3.9 MB/s 
Collecting zope.interface
  Downloading zope.interface-5.4.0-cp37-cp37m-manylinux2010_x86_64.whl (251 kB)
[K     |████████████████████████████████| 251 kB 55.2 MB/s 
Collecting zope.event
  Downloading zope.event-4.5.0-py2.py3-none-any.whl (6.8 kB)
Installing collected packages: zope.interface, zope.event, gevent
Successfully installed gevent-21.12.0 zope.event-4.5.0 zope.interface-5.4.0


In [None]:
# Write the Procfile declaring the web process command.
procfile = 'web: gunicorn app:app'

# The context manager closes the file even if the write raises.
with open("/content/Procfile", "w") as procfiles:
    procfiles.write(procfile)

In [None]:
# Install flask-ngrok and import the helper that is applied to the Flask app
# in a later cell.
# NOTE(review): the version is not pinned.
!pip install flask-ngrok
from flask_ngrok import run_with_ngrok
from flask import Flask

Collecting flask-ngrok
  Downloading flask_ngrok-0.0.25-py3-none-any.whl (3.1 kB)
Installing collected packages: flask-ngrok
Successfully installed flask-ngrok-0.0.25


In [None]:
# Jinja2 template for the prediction form. Each field's `name` is the key the
# POST handler reads from request.form. The {% if %} block sits entirely
# inside the <div>, so the rendered HTML is balanced whether or not `final`
# is set.
a =  """<html>
    <head>
        <title>{{ title }} Credit Risk Analysis Server </title>
        <link rel="stylesheet" type="text/css" href="{{ url_for('static', filename='style.css') }}">
    </head>
    <body>
        <form method="POST">

            <textarea name="LIMIT_BAL" placeholder="Balance Limit" rows="15" cols="10"></textarea><br><br>
            <textarea name="SEX" placeholder="Gender" rows="10" cols="15"></textarea><br><br>
            <textarea name="EDUCATION" placeholder="Educational Qualification" rows="15" cols="10"></textarea><br><br>
            <textarea name="MARRIAGE" placeholder="Marital Status" rows="15" cols="10"></textarea><br><br>
            <textarea name="AGE" placeholder="AGE" rows="15" cols="10"></textarea><br><br>
            <textarea name="PAY_1" placeholder="PAY_1" rows="15" cols="10"></textarea><br><br>
            <textarea name="PAY_2" placeholder="PAY_2" rows="15" cols="10"></textarea><br><br>
            <textarea name="PAY_3" placeholder="PAY_3" rows="15" cols="10"></textarea><br><br>
            <textarea name="PAY_4" placeholder="PAY_4" rows="15" cols="10"></textarea><br><br>
            <textarea name="PAY_5" placeholder="PAY_5" rows="15" cols="10"></textarea><br><br>
            <textarea name="PAY_6" placeholder="PAY_6" rows="15" cols="10"></textarea><br><br>
            <textarea name="BILL_AMT1" placeholder="BILL_AMT1" rows="15" cols="10"></textarea><br><br>
            <textarea name="BILL_AMT2" placeholder="BILL_AMT2" rows="15" cols="10"></textarea><br><br>
            <textarea name="BILL_AMT3" placeholder="BILL_AMT3" rows="15" cols="10"></textarea><br><br>
            <textarea name="BILL_AMT4" placeholder="BILL_AMT4" rows="15" cols="10"></textarea><br><br>
            <textarea name="BILL_AMT5" placeholder="BILL_AMT5" rows="15" cols="10"></textarea><br><br>
            <textarea name="BILL_AMT6" placeholder="BILL_AMT6" rows="15" cols="10"></textarea><br><br>
            <textarea name="PAY_AMT1" placeholder="PAY_AMT1" rows="15" cols="10"></textarea><br><br>
            <textarea name="PAY_AMT2" placeholder="PAY_AMT2" rows="15" cols="10"></textarea><br><br>
            <textarea name="PAY_AMT3" placeholder="PAY_AMT3" rows="15" cols="10"></textarea><br><br>
            <textarea name="PAY_AMT4" placeholder="PAY_AMT4" rows="15" cols="10"></textarea><br><br>
            <textarea name="PAY_AMT5" placeholder="PAY_AMT5" rows="15" cols="10"></textarea><br><br>
            <textarea name="PAY_AMT6" placeholder="PAY_AMT6" rows="15" cols="10"></textarea><br><br>

            <input class="example_a" type="submit">
        </form>
        <div>
            {% if final %}
            <h2>The Probability of the customer defaulting is {{ final }} !</h2>
            {% else %}
            <p></p>
            {% endif %}
        </div>
        <span>by Harshit Bhavnani</span>
    </body>
</html>"""

In [None]:
import os

# Create the directories the Flask app uses. exist_ok=True makes the cell safe
# to re-run, unlike a plain mkdir, which errors once the directory exists.
os.makedirs('/content/templates', exist_ok=True)
os.makedirs('/content/uploads', exist_ok=True)

# Write the form template where Flask's render_template will look for it; the
# context manager closes the file even if the write raises.
with open("/content/templates/form.html", "w") as Html_file:
    Html_file.write(a)

In [None]:
import os
from flask import Flask, render_template, request

# Create the Flask application object used by the route decorators below.
app = Flask(__name__)
# run_with_ngrok comes from flask_ngrok (imported in an earlier cell).
# NOTE(review): presumably it exposes the local server through an ngrok tunnel
# once app.run() is called; confirm against the flask-ngrok documentation.
run_with_ngrok(app)
# NOTE(review): the 'UPLOADS' setting is not read anywhere in the visible notebook.
app.config['UPLOADS'] = 'uploads'

@app.route('/')
def my_form():
    """Render the 'form.html' template without a prediction result."""
    return render_template('form.html')

# Feature names in the column order the pickled model was trained on. The same
# names are used as the form-field names in form.html.
FEATURE_NAMES = [
    'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE',
    'PAY_1', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6',
    'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6',
    'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
]


def predictions(arr):
    """Return the pickled model's predict_proba output for the given input.

    Parameters
    ----------
    arr : a flat sequence of 23 numeric values in FEATURE_NAMES order, or a
        2-D array-like with 23 columns (one row per customer).

    Returns
    -------
    The array returned by ``predict_proba`` (one row per input row).

    Raises
    ------
    ValueError if the values are not numeric or the width is not 23.
    """
    # predict_proba needs a 2-D numeric input; the model was fitted on a
    # DataFrame, so pass the same column names back in.
    row = pd.DataFrame(np.atleast_2d(np.asarray(arr, dtype=float)),
                       columns=FEATURE_NAMES)
    # SECURITY: pickle.load can execute arbitrary code. Only load the
    # 'model_pkl' file written by this notebook, never an untrusted file.
    with open('model_pkl' , 'rb') as f:
        mymodel = pickle.load(f)
    ans = mymodel.predict_proba(row)
    return ans


@app.route('/', methods=['POST'])
def my_form_post():
    """Handle the submitted form and render the predicted default probability.

    Reads one value per entry of FEATURE_NAMES from the posted form, converts
    each to float, and renders 'form.html' with ``final`` set to the model's
    probability for the positive (default) class. A missing field results in
    Flask's standard 400 response; a non-numeric field returns a 400 with a
    short message.
    """
    try:
        # float() tolerates surrounding whitespace/newlines from the textareas.
        arr = [float(request.form[name]) for name in FEATURE_NAMES]
    except ValueError:
        return 'All fields must be numeric.', 400

    answer = predictions(arr)

    # Column 1 of predict_proba is the positive (default) class. Format it as
    # a non-empty string so the template's `{% if final %}` also shows 0%.
    probability = float(answer[0][1])

    return render_template('form.html', final=f'{probability:.2%}')

# Start the Flask server when this code is executed as the main program.
if __name__=='__main__':
    app.run()