In [4]:
#Commercial banks receive a lot of applications for credit cards. 
#Many of them get rejected for many reasons, like high loan balances, low income levels, 
#or too many inquiries on an individual's credit report, for example. 
#Manually analyzing these applications is mundane, error-prone, and time-consuming (and time is money!). 
#Luckily, this task can be automated with the power of machine learning and pretty much every commercial bank does so nowadays. 
#In this notebook, we will build an automatic credit card approval predictor using machine learning techniques, 
#just like the real banks do.


# Import pandas
#import pandas as pd

# Load dataset
#cc_apps = pd.read_csv("datasets/cc_approvals.data", header=None)

# Inspect data
#cc_apps.head()

In [5]:



# Print summary statistics
#cc_apps_description = cc_apps.describe()
#print(cc_apps_description)

# Emit two newline characters to leave blank space in the output.
print()
print()

# Print DataFrame information
#cc_apps_info = cc_apps.info()
#print(cc_apps_info)

#print('\n')

# Inspect missing values in the dataset
#cc_apps.tail(17)





In [6]:
# Import train_test_split
#from sklearn.model_selection import train_test_split

# Drop the features 11 and 13
#cc_apps = cc_apps.drop([11, 13], axis=1)

# Split into train and test sets
#cc_apps_train, cc_apps_test = train_test_split(cc_apps, test_size=0.33, random_state=42)

In [7]:
# Import numpy
#import numpy as np

# Replace the '?'s with NaN in the train and test sets
#cc_apps_train = cc_apps_train.replace('?', np.NaN)
#cc_apps_test = cc_apps_test.replace('?', np.NaN)

In [8]:
# Impute the missing values with mean imputation
#cc_apps_train.fillna(cc_apps_train.mean(), inplace=True)
#cc_apps_test.fillna(cc_apps_train.mean(), inplace=True)

# Count the number of NaNs in the datasets and print the counts to verify
#print(cc_apps_train.isnull().sum())
#print(cc_apps_test.isnull().sum())

In [9]:
# Iterate over each column of cc_apps_train
#for col in cc_apps_train.columns:
    # Check if the column is of object type
#    if cc_apps_train[col].dtypes == 'object':
        # Impute with the most frequent value
#        cc_apps_train = cc_apps_train.fillna(cc_apps_train[col].value_counts().index[0])
#        cc_apps_test = cc_apps_test.fillna(cc_apps_train[col].value_counts().index[0])

# Count the number of NaNs in the dataset and print the counts to verify
#print(cc_apps_train.isnull().sum())
#print(cc_apps_test.isnull().sum())

In [10]:
# Convert the categorical features in the train and test sets independently
#cc_apps_train = pd.get_dummies(cc_apps_train)
#cc_apps_test = pd.get_dummies(cc_apps_test)

# Reindex the columns of the test set aligning with the train set
#cc_apps_test = cc_apps_test.reindex(columns=cc_apps_train.columns, fill_value=0)

In [11]:
# Import MinMaxScaler
#from sklearn.preprocessing import MinMaxScaler

# Segregate features and labels into separate variables
#X_train, y_train = cc_apps_train.iloc[:, :-1].values, cc_apps_train.iloc[:, [-1]].values
#X_test, y_test = cc_apps_test.iloc[:, :-1].values, cc_apps_test.iloc[:, [-1]].values

# Instantiate MinMaxScaler and use it to rescale X_train and X_test
#scaler = MinMaxScaler(feature_range=(0, 1))
#rescaledX_train = scaler.fit_transform(X_train)
#rescaledX_test = scaler.transform(X_test)

In [13]:
#Which model should we pick? 
#A question to ask is: are the features that affect the credit card approval decision process correlated with each other? 
#Although we can measure correlation, that is outside the scope of this notebook, 
#so we'll rely on our intuition that they indeed are correlated for now. 
#Because of this correlation, we'll take advantage of the fact that generalized linear models perform well in these cases. 
#Let's start our machine learning modeling with a Logistic Regression model (a generalized linear model).

# Import LogisticRegression
#from sklearn.linear_model import LogisticRegression

# Instantiate a LogisticRegression classifier with default parameter values
#logreg = LogisticRegression()

# Fit logreg to the train set
#logreg.fit(rescaledX_train,y_train)

In [14]:
#But how well does our model perform?
#We will now evaluate our model on the test set with respect to classification accuracy. 
#But we will also take a look at the model's confusion matrix.
#In the case of predicting credit card applications, 
#it is important to see if our machine learning model is equally capable of predicting approved and denied status, 
#in line with the frequency of these labels in our original dataset. 
#If our model is not performing well in this aspect, 
#then it might end up approving an application that should have been denied.
#The confusion matrix helps us to view our model's performance from these aspects.


# Import confusion_matrix
#from sklearn.metrics import confusion_matrix

# Use logreg to predict instances from the test set and store it
#y_pred = logreg.predict(rescaledX_test)

# Get the accuracy score of logreg model and print it
#print("Accuracy of logistic regression classifier: ", logreg.score(rescaledX_test,y_test))

# Print the confusion matrix of the logreg model
#confusion_matrix(y_test,y_pred)

In [15]:
#Our model was pretty good! In fact it was able to yield an accuracy score of 100%.
#For the confusion matrix, 
#the first element of the first row of the confusion matrix denotes the true negatives
#meaning the number of negative instances (denied applications) predicted by the model correctly. 
#And the last element of the second row of the confusion matrix denotes the true positives 
#meaning the number of positive instances (approved applications) predicted by the model correctly.
#But if we hadn't got a perfect score, what is to be done?
#We can perform a grid search of the model parameters to improve the model's ability to predict credit card approvals.
#scikit-learn's implementation of logistic regression exposes several hyperparameters, but we will grid search over the following two:
#tol
#max_iter

# Import GridSearchCV
#from sklearn.model_selection import GridSearchCV

# Define the grid of values for tol and max_iter
#tol = [0.01, 0.001 ,0.0001]
#max_iter = [100, 150, 200]

# Create a dictionary where tol and max_iter are keys and the lists of their values are the corresponding values
#param_grid = dict(tol=tol, max_iter=max_iter)


In [17]:
#We have defined the grid of hyperparameter values and converted them into a single dictionary format which GridSearchCV() expects as one of its parameters. 
#Now, we will begin the grid search to see which values perform best.
#We will instantiate GridSearchCV() with our earlier logreg model and fit it on the training data.
#We will also instruct GridSearchCV() to perform a cross-validation of five folds.
#We'll end the notebook by storing the best-achieved score and the respective best parameters.

# Instantiate GridSearchCV with the required parameters
#grid_model = GridSearchCV(estimator=logreg, param_grid=param_grid, cv=5)

# Fit grid_model to the data
#grid_model_result = grid_model.fit(rescaledX_train, y_train)

# Summarize results
#best_score, best_params = grid_model_result.best_score_, grid_model_result.best_params_
#print("Best: %f using %s" % (best_score, best_params))

# Extract the best model and evaluate it on the test set
#best_model = grid_model_result.best_estimator_
#print("Accuracy of logistic regression classifier: ", best_model.score(rescaledX_test,y_test))
