In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
pd.pandas.set_option("display.max_columns",None)

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import GridSearchCV

import pickle

In [2]:
df = pd.read_csv('UCI_Credit_Card.csv')
df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,1,20000.0,2,2,1,24,2,2,-1,-1,-2,-2,3913.0,3102.0,689.0,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,2,120000.0,2,2,2,26,-1,2,0,0,0,2,2682.0,1725.0,2682.0,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,3,90000.0,2,2,2,34,0,0,0,0,0,0,29239.0,14027.0,13559.0,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,4,50000.0,2,2,1,37,0,0,0,0,0,0,46990.0,48233.0,49291.0,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,5,50000.0,1,2,1,57,-1,0,-1,0,0,0,8617.0,5670.0,35835.0,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


In [3]:
X=df.drop(['default.payment.next.month',"ID"], axis=1) # splitting the data 
y=df['default.payment.next.month']

In [12]:
# saprating catigorical columns & numerical colums
catigorical_features = X.select_dtypes(include="object").columns
numerical_features = X.select_dtypes(exclude="object").columns
print(catigorical_features)
print(numerical_features)

Index([], dtype='object')
Index(['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2',
       'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6'],
      dtype='object')


In [13]:
## nmerical pipline
num_pipline = Pipeline(
    steps=[
        ("imputer",SimpleImputer(strategy="median")),
        ("scaler",StandardScaler())
    ]
)

## Catigorical pipline
cato_pipline = Pipeline(
    steps=[
        ("imputer",SimpleImputer(strategy="most_frequent")),
        ("scaler",StandardScaler())
    ]
)

## get Preproceessor obj
preprocessor = ColumnTransformer([
    ("num_pipline",num_pipline,numerical_features),
])

In [16]:
# Train test split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.20,random_state=10)

In [17]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(24000, 23)
(24000,)
(6000, 23)
(6000,)


In [18]:
# transform data
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [10]:
models = {
    "Logistic Regression": LogisticRegression(), 
    "SVC Classifier": SVC(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest Classifier": RandomForestClassifier(), 
    "GaussianNB Classifier": GaussianNB()
}

In [19]:
def evaluate_clf(true, predicted):
    '''
    This function takes in true values and predicted values &
    Returns: Accuracy, F1-Score, Precision, Recall, Roc-auc Score
    '''
    acc = accuracy_score(true, predicted) # Calculate Accuracy
    f1 = f1_score(true, predicted) # Calculate F1-score
    precision = precision_score(true, predicted) # Calculate Precision
    recall = recall_score(true, predicted)  # Calculate Recall
    roc_auc = roc_auc_score(true, predicted) #Calculate Roc
    return acc, f1 , precision, recall, roc_auc

In [20]:
def evaluate_models(X_train, y_train,X_test,y_test,models,params):
        report = {}

        for i in range(len(list(models))):
            model = list(models.values())[i]
            para=params[list(models.keys())[i]]

            gs = GridSearchCV(model,para,cv=3)
            gs.fit(X_train,y_train)

            model.set_params(**gs.best_params_)
            model.fit(X_train,y_train)
            
            # Make predictions
            y_train_pred = model.predict(X_train)
            y_test_pred = model.predict(X_test)

        # Training set performance
            model_train_accuracy, model_train_f1,model_train_precision,\
            model_train_recall,model_train_rocauc_score=evaluate_clf(y_train ,y_train_pred)
    


        # Test set performance
            model_test_accuracy,model_test_f1,model_test_precision,\
            model_test_recall,model_test_rocauc_score=evaluate_clf(y_test, y_test_pred)
       

            print(list(models.keys())[i])
            models_list.append(list(models.keys())[i])

            print('Model performance for Training set')
            print("- Accuracy: {:.4f}".format(model_train_accuracy))
            print('- F1 score: {:.4f}'.format(model_train_f1)) 
            print('- Precision: {:.4f}'.format(model_train_precision))
            print('- Recall: {:.4f}'.format(model_train_recall))
            print('- Roc Auc Score: {:.4f}'.format(model_train_rocauc_score))

            print('----------------------------------')

            print('Model performance for Test set')
            print('- Accuracy: {:.4f}'.format(model_test_accuracy))
            print('- F1 score: {:.4f}'.format(model_test_f1))
            print('- Precision: {:.4f}'.format(model_test_precision))
            print('- Recall: {:.4f}'.format(model_test_recall))
            print('- Roc Auc Score: {:.4f}'.format(model_test_rocauc_score))
    
            print('='*35)
            print('\n')

            report[list(models.keys())[i]] = model_test_accuracy

        return report

In [21]:
evaluate_models(X_train, y_train,X_test,y_test,models,params)

Decision Tree
Model performance for Training set
- Accuracy: 0.7670
- F1 score: 0.4986
- Precision: 0.4765
- Recall: 0.5228
- Roc Auc Score: 0.6797
----------------------------------
Model performance for Test set
- Accuracy: 0.7697
- F1 score: 0.5025
- Precision: 0.4778
- Recall: 0.5300
- Roc Auc Score: 0.6835


Random Forest Classifier
Model performance for Training set
- Accuracy: 0.9739
- F1 score: 0.9426
- Precision: 0.9198
- Recall: 0.9665
- Roc Auc Score: 0.9713
----------------------------------
Model performance for Test set
- Accuracy: 0.8145
- F1 score: 0.4952
- Precision: 0.6149
- Recall: 0.4146
- Roc Auc Score: 0.6708


Logistic Regression
Model performance for Training set
- Accuracy: 0.7790
- F1 score: 0.5094
- Precision: 0.5014
- Recall: 0.5178
- Roc Auc Score: 0.6856
----------------------------------
Model performance for Test set
- Accuracy: 0.7833
- F1 score: 0.5109
- Precision: 0.5063
- Recall: 0.5156
- Roc Auc Score: 0.6871


SVC Classifier
Model performance for T

{'Decision Tree': 0.7696666666666667,
 'Random Forest Classifier': 0.8145,
 'Logistic Regression': 0.7833333333333333,
 'SVC Classifier': 0.824,
 'GaussianNB': 0.7865}