## Predicting credit card approval

In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn import svm, tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

In [102]:
df = pd.read_csv("dataset.csv")
df.head(10)

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,Education,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,DriversLicense,Citizen,ZipCode,Income,ApprovalStatus
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+
5,b,32.08,4.0,u,g,m,v,2.5,t,f,0,t,g,360,0,+
6,b,33.17,1.04,u,g,r,h,6.5,t,f,0,t,g,164,31285,+
7,a,22.92,11.585,u,g,cc,v,0.04,t,f,0,f,g,80,1349,+
8,b,54.42,0.5,y,p,k,h,3.96,t,f,0,f,g,180,314,+
9,b,42.5,4.915,y,p,w,v,3.165,t,f,0,t,g,52,1442,+


In [103]:
df.describe()

Unnamed: 0,Debt,YearsEmployed,CreditScore,Income
count,690.0,690.0,690.0,690.0
mean,4.758725,2.223406,2.4,1017.385507
std,4.978163,3.346513,4.86294,5210.102598
min,0.0,0.0,0.0,0.0
25%,1.0,0.165,0.0,0.0
50%,2.75,1.0,0.0,5.0
75%,7.2075,2.625,3.0,395.5
max,28.0,28.5,67.0,100000.0


In [104]:
df.isnull().sum()

Gender            0
Age               0
Debt              0
Married           0
BankCustomer      0
Education         0
Ethnicity         0
YearsEmployed     0
PriorDefault      0
Employed          0
CreditScore       0
DriversLicense    0
Citizen           0
ZipCode           0
Income            0
ApprovalStatus    0
dtype: int64

In [105]:
print('Garbage value at index 79 is: ', df.Age.unique()[78])
df.Age.replace('?', 0.00, inplace=True)
df['Age'] = pd.to_numeric(df['Age'])
df.Age[78] = df.Age.mean()

Garbage value at index 79 is:  ?


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [106]:
df.head()

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,Education,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,DriversLicense,Citizen,ZipCode,Income,ApprovalStatus
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+


In [107]:
numeric_cols = ['Debt', 'YearsEmployed', 'CreditScore', 'Income', 'Age','ZipCode']
categorical_cols = list(set(df.columns) - set(numeric_cols))
categorical_cols

['Gender',
 'Married',
 'DriversLicense',
 'ApprovalStatus',
 'BankCustomer',
 'Ethnicity',
 'Employed',
 'PriorDefault',
 'Citizen',
 'Education']

In [108]:
le = LabelEncoder()

for col in categorical_cols:
    df[col]=le.fit_transform(df[col])

In [109]:
df.head()

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,Education,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,DriversLicense,Citizen,ZipCode,Income,ApprovalStatus
0,2,30.83,0.0,2,1,13,8,1.25,1,1,1,0,0,202,0,0
1,1,58.67,4.46,2,1,11,4,3.04,1,1,6,0,0,43,560,0
2,1,24.5,0.5,2,1,11,4,1.5,1,0,0,0,0,280,824,0
3,2,27.83,1.54,2,1,13,8,3.75,1,1,5,1,0,100,3,0
4,2,20.17,5.625,2,1,13,8,1.71,1,0,0,0,2,120,0,0


In [110]:
df = df.drop(['DriversLicense', 'ZipCode'], axis=1)
df.head()

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,Education,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,Citizen,Income,ApprovalStatus
0,2,30.83,0.0,2,1,13,8,1.25,1,1,1,0,0,0
1,1,58.67,4.46,2,1,11,4,3.04,1,1,6,0,560,0
2,1,24.5,0.5,2,1,11,4,1.5,1,0,0,0,824,0
3,2,27.83,1.54,2,1,13,8,3.75,1,1,5,0,3,0
4,2,20.17,5.625,2,1,13,8,1.71,1,0,0,2,0,0


In [111]:
data_array = df.values
X,y = data_array[:,0:13] , data_array[:,13]

### Scaling and dividing test-train values

In [112]:
df = df.apply(pd.to_numeric)

X = df.values[:,0:13]
y = df.values[:,13]

scaler = MinMaxScaler(feature_range=(0, 1))
X_scaled = scaler.fit_transform(X)

In [159]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=7, shuffle=True)

### Checking the accuracy and confusion matrix for different models

In [185]:
classifiers = []

model1 = LogisticRegression(solver='lbfgs')
classifiers.append(model1)
model2 = svm.SVC(gamma='auto')
classifiers.append(model2)
model3 = tree.DecisionTreeClassifier()
classifiers.append(model3)
model4 = RandomForestClassifier(n_estimators=100)
classifiers.append(model4)
model5 = GradientBoostingClassifier(n_estimators=100)
classifiers.append(model5)

In [186]:
for clf in classifiers:
    clf.fit(X_train, y_train)
    y_pred= clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print("Accuracy of model is %s"%(acc))
    cm = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix of model is:\n %s\n"%(cm))

Accuracy of model is 0.8768115942028986
Confusion Matrix of model is:
 [[51  6]
 [11 70]]

Accuracy of model is 0.8623188405797102
Confusion Matrix of model is:
 [[51  6]
 [13 68]]

Accuracy of model is 0.8188405797101449
Confusion Matrix of model is:
 [[47 10]
 [15 66]]

Accuracy of model is 0.8695652173913043
Confusion Matrix of model is:
 [[50  7]
 [11 70]]

Accuracy of model is 0.8405797101449275
Confusion Matrix of model is:
 [[49  8]
 [14 67]]



### Increasing the accuracy of random forest classifier by using cross validation for different hyperparameters 

In [193]:
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 200, num = 20)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(5, 50, num = 10)]
max_depth.append(None)
min_samples_split = [2, 5]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [194]:
rf_random = RandomizedSearchCV(estimator = model4,
                               param_distributions = random_grid,
                               n_iter = 100, cv = 3, verbose=2,
                               random_state=42, n_jobs = -1)
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    8.7s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   15.8s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   24.9s finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    n_estimators=100,
                                                    n_jobs=None,
 

In [195]:
rf_random.best_params_

{'n_estimators': 115,
 'min_samples_split': 5,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 25,
 'bootstrap': False}

In [198]:
rf_random.best_score_

0.8768115942028986

### Thus accuracy is incresed from 86.9% to 87.68%, which is equal to logistic regression classifier. Thus these two classifiers are best for this model