In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV


In [2]:
cc_apps = pd.read_csv("credit-approval_csv.csv", header=None)
cc_apps.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,class
1,b,30.83,0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
2,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
3,a,24.50,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
4,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+


The output may appear a bit confusing at its first sight, but let's try to figure out the most important features of a credit card application. The features of this dataset have been anonymized to protect the privacy, but this [blog](http://rstudio-pubs-static.s3.amazonaws.com/73039_9946de135c0a49daa7a0a9eda4a67a72.html) gives us a pretty good overview of the probable features. 

The probable features in a typical credit card application are Gender, Age, Debt, Married, BankCustomer, EducationLevel, Ethnicity, YearsEmployed, PriorDefault, Employed, CreditScore, DriversLicense, Citizen, ZipCode, Income and finally the ApprovalStatus. 

This gives us a pretty good starting point, and we can map these features with respect to the columns in the output.

As we can see from our first glance at the data, the dataset has a mixture of numerical and non-numerical features. This can be fixed with some preprocessing, but before we do that, let's learn a bit more about the dataset a bit more to see if there are other dataset issues that need to be fixed.



In [3]:
cc_apps.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
count,679,679.0,691.0,685,685,682,682,691,691,691,691,691,691,678,691,691
unique,3,350.0,216.0,4,4,15,10,133,3,3,24,3,4,171,241,3
top,b,22.67,1.5,u,g,c,v,0,t,f,0,f,g,0,0,-
freq,468,9.0,21.0,519,519,137,399,70,361,395,395,374,625,132,295,383


In [4]:
cc_apps_info = cc_apps.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 691 entries, 0 to 690
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       679 non-null    object
 1   1       679 non-null    object
 2   2       691 non-null    object
 3   3       685 non-null    object
 4   4       685 non-null    object
 5   5       682 non-null    object
 6   6       682 non-null    object
 7   7       691 non-null    object
 8   8       691 non-null    object
 9   9       691 non-null    object
 10  10      691 non-null    object
 11  11      691 non-null    object
 12  12      691 non-null    object
 13  13      678 non-null    object
 14  14      691 non-null    object
 15  15      691 non-null    object
dtypes: object(16)
memory usage: 86.5+ KB


In [5]:
cc_apps.tail(17)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
674,,29.5,2.0,y,p,e,h,2.0,f,f,0,f,g,256,17,-
675,a,37.33,2.5,u,g,i,h,0.21,f,f,0,f,g,260,246,-
676,a,41.58,1.04,u,g,aa,v,0.665,f,f,0,f,g,240,237,-
677,a,30.58,10.665,u,g,q,h,0.085,f,t,12,t,g,129,3,-
678,b,19.42,7.25,u,g,m,v,0.04,f,t,1,f,g,100,1,-
679,a,17.92,10.21,u,g,ff,ff,0.0,f,f,0,f,g,0,50,-
680,a,20.08,1.25,u,g,c,v,0.0,f,f,0,f,g,0,0,-
681,b,19.5,0.29,u,g,k,v,0.29,f,f,0,f,g,280,364,-
682,b,27.83,1.0,y,p,d,h,3.0,f,f,0,f,g,176,537,-
683,b,17.08,3.29,u,g,i,v,0.335,f,f,0,t,g,140,2,-


## Handling the missing values

We replaced all the question marks with NaNs. This is going to help us in the next missing value treatment that we are going to perform.

An important question that gets raised here is why are we giving so much importance to missing values? Can't they be just ignored? Ignoring missing values can affect the performance of a machine learning model heavily. While ignoring the missing values our machine learning model may miss out on information about the dataset that may be useful for its training. Then, there are many models which cannot handle missing values implicitly such as LDA.

So, to avoid this problem, we are going to impute the missing values with a strategy called mean imputation.

In [6]:
cc_apps = cc_apps.fillna(cc_apps.mean())


In [7]:
print(cc_apps.isnull().values.sum())


67


In [9]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for col in cc_apps.columns:
    # Compare if the dtype is object
    if cc_apps[col].dtype=='object':
    # Use LabelEncoder to do the numeric transformation
        cc_apps[col]=le.fit_transform(cc_apps[col].astype(str))


We have successfully converted all the non-numeric values to numeric ones.

Now, let's try to understand what these scaled values mean in the real world. Let's use CreditScore as an example. The creidt score of a person is their creditworthiness based on their credit history. The higher this number, the more financially trustworthy a person is considered to be. So, a CreditScore of 1 is the highest since we're rescaling all the values to the range of 0-1.

Also, features like DriversLicense and ZipCode are not as important as the other features in the dataset for predicting credit card approvals. We should drop them to design our machine learning model with the best set of features. This is often called feature engineering or, more specifically, feature selection.



In [10]:
from sklearn.preprocessing import MinMaxScaler

# Drop features 10 and 13 and convert the DataFrame to a NumPy array
cc_apps = cc_apps.drop([cc_apps.columns[10],cc_apps.columns[13]], axis=1)
cc_apps = cc_apps.values

# Segregate features and labels into separate variables
X,y = cc_apps[:,0:13], cc_apps[:,13]


# Instantiate MinMaxScaler and use it to rescale
scaler = MinMaxScaler(feature_range=(0,1))
rescaledX = scaler.fit_transform(X)


In [11]:
from sklearn.model_selection import train_test_split

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(rescaledX,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=42)


In [12]:
from sklearn.linear_model import LogisticRegression

# Instantiate a LogisticRegression classifier with default parameter values
logreg = LogisticRegression()

# Fit logreg to the train set
# ... YOUR CODE FOR TASK 9 ...
logreg.fit(X_train,y_train)


LogisticRegression()

In [13]:
from sklearn.metrics import confusion_matrix

# Use logreg to predict instances from the test set and store it
y_pred = logreg.predict(X_test)

# Get the accuracy score of logreg model and print it
print("Accuracy of logistic regression classifier: ", logreg.score(X_test, y_test))
confusion_matrix(y_test, y_pred)


Accuracy of logistic regression classifier:  0.834061135371179


array([[93, 10],
       [28, 98]], dtype=int64)

##  Grid searching and making the model perform better

In [14]:
from sklearn.model_selection import GridSearchCV

# Define the grid of values for tol and max_iter
tol = [0.01, 0.001, 0.0001]
max_iter = [100, 150, 200]

# Create a dictionary where tol and max_iter are keys and the lists of their values are corresponding values
param_grid = dict(tol=tol, max_iter=max_iter)
print(param_grid)


{'tol': [0.01, 0.001, 0.0001], 'max_iter': [100, 150, 200]}


In [15]:
grid_model = GridSearchCV(estimator=logreg, param_grid=param_grid, cv=5)

# Fit data to grid_model
grid_model_result = grid_model.fit(rescaledX, y)

# Summarize results
best_score, best_params = grid_model_result.best_score_,grid_model_result.best_params_
print("Best: %f using %s" % (best_score, best_params))




Best: 0.841122 using {'max_iter': 100, 'tol': 0.01}
