## Credit Card approval prediction

#### Importing dependencies

In [3]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV

In [4]:
cc_apps = pd.read_csv('cc_approvals.data', header=None)
cc_apps.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+


## Getting to know the data and cleaning it

#### We get some statistics and information on the data

In [5]:
cc_apps_description = cc_apps.describe()
print(cc_apps_description)
print("\n")

cc_apps_info = cc_apps.info()
print(cc_apps_info)
print("\n")

cc_apps.tail(20)

               2           7          10             14
count  690.000000  690.000000  690.00000     690.000000
mean     4.758725    2.223406    2.40000    1017.385507
std      4.978163    3.346513    4.86294    5210.102598
min      0.000000    0.000000    0.00000       0.000000
25%      1.000000    0.165000    0.00000       0.000000
50%      2.750000    1.000000    0.00000       5.000000
75%      7.207500    2.625000    3.00000     395.500000
max     28.000000   28.500000   67.00000  100000.000000


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       690 non-null    object 
 1   1       690 non-null    object 
 2   2       690 non-null    float64
 3   3       690 non-null    object 
 4   4       690 non-null    object 
 5   5       690 non-null    object 
 6   6       690 non-null    object 
 7   7       690 non-null    float64
 8   8       690 no

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
670,b,47.17,5.835,u,g,w,v,5.5,f,f,0,f,g,465,150,-
671,b,25.83,12.835,u,g,cc,v,0.5,f,f,0,f,g,0,2,-
672,a,50.25,0.835,u,g,aa,v,0.5,f,f,0,t,g,240,117,-
673,?,29.5,2.0,y,p,e,h,2.0,f,f,0,f,g,256,17,-
674,a,37.33,2.5,u,g,i,h,0.21,f,f,0,f,g,260,246,-
675,a,41.58,1.04,u,g,aa,v,0.665,f,f,0,f,g,240,237,-
676,a,30.58,10.665,u,g,q,h,0.085,f,t,12,t,g,129,3,-
677,b,19.42,7.25,u,g,m,v,0.04,f,t,1,f,g,100,1,-
678,a,17.92,10.21,u,g,ff,ff,0.0,f,f,0,f,g,0,50,-
679,a,20.08,1.25,u,g,c,v,0.0,f,f,0,f,g,0,0,-


#### We replace the missing values with NaNs

In [6]:
cc_apps = cc_apps.replace('?', np.NaN)

cc_apps.tail(20)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
670,b,47.17,5.835,u,g,w,v,5.5,f,f,0,f,g,465,150,-
671,b,25.83,12.835,u,g,cc,v,0.5,f,f,0,f,g,0,2,-
672,a,50.25,0.835,u,g,aa,v,0.5,f,f,0,t,g,240,117,-
673,,29.5,2.0,y,p,e,h,2.0,f,f,0,f,g,256,17,-
674,a,37.33,2.5,u,g,i,h,0.21,f,f,0,f,g,260,246,-
675,a,41.58,1.04,u,g,aa,v,0.665,f,f,0,f,g,240,237,-
676,a,30.58,10.665,u,g,q,h,0.085,f,t,12,t,g,129,3,-
677,b,19.42,7.25,u,g,m,v,0.04,f,t,1,f,g,100,1,-
678,a,17.92,10.21,u,g,ff,ff,0.0,f,f,0,f,g,0,50,-
679,a,20.08,1.25,u,g,c,v,0.0,f,f,0,f,g,0,0,-


In [7]:
cc_apps.fillna(cc_apps.mean())

cc_apps.isnull().sum()

0     12
1     12
2      0
3      6
4      6
5      9
6      9
7      0
8      0
9      0
10     0
11     0
12     0
13    13
14     0
15     0
dtype: int64

#### We check the column datatypes and impute missing values with the mode

In [8]:
for col in cc_apps.columns:
    if cc_apps[col].dtypes == 'object':
        cc_apps = cc_apps.fillna(cc_apps[col].value_counts().index[0])

cc_apps.isna().sum()

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
dtype: int64

### Feature engineering

#### Imputing missing values with the column mode

In [9]:
le = LabelEncoder()

for col in cc_apps.columns.values:
    if cc_apps[col].dtypes=='object':
        cc_apps[col]=le.fit_transform(cc_apps[col])

#### We drop 2 columns and perform a train-test split

In [10]:
cc_apps = cc_apps.drop([11,13], axis=1)
cc_apps = cc_apps.values

X,y = cc_apps[:,0:-1] , cc_apps[:,-1]

X_train, X_test, y_train, y_test = train_test_split(X,
                                y,
                                test_size=.33,
                                random_state=42)

#### Scaling the data

In [11]:
scaler = MinMaxScaler(feature_range=(0, 1))
rescaledX_train = scaler.fit_transform(X_train)
rescaledX_test = scaler.transform(X_test)

#### Fit a logistic regression model

In [12]:
logreg = LogisticRegression()

logreg.fit(rescaledX_train, y_train)

LogisticRegression()

#### How does our model perform?

In [15]:
y_pred = logreg.predict(rescaledX_test)

print(f"Accuracy of logistic regression classifier: {logreg.score(rescaledX_test, y_test):.3f}")

confusion_matrix(y_test, y_pred)

Accuracy of logistic regression classifier: 0.842


array([[94,  9],
       [27, 98]], dtype=int64)

#### Perform a GridSearch hyperparameter tuning

In [16]:
tol = [0.01, 0.001, 0.0001]
max_iter = [100, 150, 200]

param_grid = dict({'tol': tol, 'max_iter': max_iter})

In [19]:
grid_model = GridSearchCV(estimator=logreg, param_grid=param_grid, cv=5)

rescaledX = scaler.fit_transform(X)

grid_model_result = grid_model.fit(rescaledX, y)

best_score, best_params = grid_model.best_score_, grid_model.best_params_
print(f"Best: {best_score:.3f} using {best_params}")

Best: 0.851 using {'max_iter': 100, 'tol': 0.01}


Conclusion: this model enables us to predict credit card approval with a 85% accuracy. What would be better is to have a model that tells us when the approval process has to be handled by human amployees and when it can handle it.