# <center><font color=blue>Credit Approval</font>

<font color=blue size=4>1-Needed Packages

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import sklearn.metrics as sk
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.linear_model import LinearRegression
import os
from collections import defaultdict
from itertools import chain, combinations
# pip install datawig
import datawig

pd.options.mode.chained_assignment = None  # default='warn'

<font color=blue size=4>2- Load Data</font> 

In [7]:
## In case of using Google Colab
# from google.colab import drive
# drive.mount('/content/drive')
# file = '/content/drive/MyDrive/Colab Notebooks/crx.data.csv'
file = os.path.abspath("data/crx.data.csv")
dataset = pd.read_csv(file, header=None)
print('Dataset initial state:')
dataset.head()

Dataset initial state:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+


<font color=blue size=4>3- Analyze Data</font> 

In [8]:
# Humanize data by giving variables working names based on the type of data.
dataset.columns = ["sex", "age", "debt", "marital_status", "customer_type", "edu_level", 
                   "race", "years_employed", "prior_default", "employed", "credit_score",
                   "driver_license", "citizen", "balance", "income", "is_approved"]

# Analyze data types based on type of data, as its not always correctly recognized.
floatType = 'float'
intType = 'int'
nonObjectDataTypes = {
   "age": floatType,
   "debt": floatType,
   "years_employed": floatType,
   "credit_score": intType,
   "balance": intType,
   "income": intType,
}

# Make data more suitable for learning by converting labels to 0,1
dataset['is_approved'] = LabelEncoder().fit_transform(dataset['is_approved'])

# Gather missing data columns and their data types
missingDataColumns = {}
continuousType = 'continuous'
categoricalType = 'categorical'
for column in dataset.columns:
    if (dataset[column] == '?').any():
        if column in nonObjectDataTypes:
            dataType = continuousType
        else:
            dataType = categoricalType
        missingDataColumns[column] = dataType
        # replace missing value with NaN
        dataset[column] = dataset[column].replace('?', np.nan)

# all variations will be collected here, beside saving in csv files
datasets = {
    "original": dataset
}

print('\nMissing data columns:')
missingDataColumns


Missing data columns:


  result = method(y)


{'sex': 'categorical',
 'age': 'continuous',
 'marital_status': 'categorical',
 'customer_type': 'categorical',
 'edu_level': 'categorical',
 'race': 'categorical',
 'balance': 'continuous'}

<font color=blue size=4>Missing data feasible handling options -there are non-feasible ones as well-</font>
    
<font color=grey size=3>a- Remove rows with missing data.</font>

<font color=grey size=3>b- Fill continuous data with columns mean/median values and categorical data with columns most frequent category.</font> 

<font color=grey size=3>c- Fill based on predictive models between mostly correlated columns.</font> 

<font color=grey size=3>d- Fill using unsupervised learning.</font> 

<font color=grey size=3>e- Fill using deep learning.</font> 

<font color=blue size=4>4- Remove rows with missing data</font>

In [9]:
# a. Remove rows with missing data
# Pros:
#  A model trained with the removal of all missing values creates a robust model.
# Cons:
#  Loss of a lot of information.
#  Works poorly if the percentage of missing values is excessive in comparison to the complete dataset.
datasetWithRemovedRows = dataset.copy(deep=True)
# remove rows where any column has value NaN
datasetWithRemovedRows = datasetWithRemovedRows.dropna()
# set data types correctly after removing rows with NaN
for column in nonObjectDataTypes:
    datasetWithRemovedRows[column] = datasetWithRemovedRows[column].astype(nonObjectDataTypes[column])
# save data to a new csv
datasetWithRemovedRows.to_csv("data/crx.data_removed_missing.csv", index=False, encoding='utf8')
datasets['after removing missing rows'] = datasetWithRemovedRows
# validate rows removal is done correctly
assert datasetWithRemovedRows.isnull().any().any() == False
assert dataset.shape[0] > datasetWithRemovedRows.shape[0]

print('\nContinuous columns info:')
datasetWithRemovedRows.describe()


Continuous columns info:


Unnamed: 0,age,debt,years_employed,credit_score,balance,income,is_approved
count,653.0,653.0,653.0,653.0,653.0,653.0,653.0
mean,31.503813,4.829533,2.244296,2.502297,180.359877,1013.761103,0.546708
std,11.838267,5.027077,3.37112,4.968497,168.296811,5253.278504,0.498195
min,13.75,0.0,0.0,0.0,0.0,0.0,0.0
25%,22.58,1.04,0.165,0.0,73.0,0.0,0.0
50%,28.42,2.835,1.0,0.0,160.0,5.0,1.0
75%,38.25,7.5,2.625,3.0,272.0,400.0,1.0
max,76.75,28.0,28.5,67.0,2000.0,100000.0,1.0


<font color=blue size=4>5- Fill continuous data with columns mean values and categorical data with columns most frequent category</font>

In [10]:
# b. Fill continuous data with columns mean values and categorical data with columns most frequent category
# Pros:
#  Prevent data loss which results in deletion of rows or columns.
#  Works well with a small dataset and easy to implement.
# Cons:
#  Can cause data leakage.
#  Does not factor the covariance between features (for continuous ones).
datasetWithCalculatedRows = dataset.copy(deep=True)
for column in missingDataColumns:
    if missingDataColumns[column] == continuousType:
        imp = SimpleImputer(missing_values=np.nan, strategy='median')
        imp.fit(datasetWithRemovedRows[column].values.reshape(-1, 1))
        datasetWithCalculatedRows[column] = imp.transform(datasetWithCalculatedRows[column].values.reshape(-1, 1))
    else:
        imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
        imp.fit(datasetWithRemovedRows[column].values.reshape(-1, 1))
        datasetWithCalculatedRows[column] = imp.transform(datasetWithCalculatedRows[column].values.reshape(-1, 1))
# set data types correctly after calculating rows with NaN
for column in nonObjectDataTypes:
    datasetWithCalculatedRows[column] = datasetWithCalculatedRows[column].astype(nonObjectDataTypes[column])
# save data to a new csv
datasetWithCalculatedRows.to_csv("data/crx.data_calculated_missing.csv", index=False, encoding='utf8')
datasets['after calculating missing rows'] = datasetWithCalculatedRows
# validate rows calculation is done correctly
assert datasetWithCalculatedRows.isnull().any().any() == False
assert dataset.shape[0] == datasetWithCalculatedRows.shape[0]

print('\nContinuous columns correlation:')
corr = datasetWithRemovedRows.corr()
corr.style.background_gradient(cmap='coolwarm').set_precision(2)


Continuous columns correlation:


Unnamed: 0,age,debt,years_employed,credit_score,balance,income,is_approved
age,1.0,0.22,0.42,0.2,-0.085,0.029,-0.18
debt,0.22,1.0,0.3,0.27,-0.22,0.12,-0.21
years_employed,0.42,0.3,1.0,0.33,-0.065,0.052,-0.33
credit_score,0.2,0.27,0.33,1.0,-0.12,0.058,-0.41
balance,-0.085,-0.22,-0.065,-0.12,1.0,0.073,0.085
income,0.029,0.12,0.052,0.058,0.073,1.0,-0.17
is_approved,-0.18,-0.21,-0.33,-0.41,0.085,-0.17,1.0


<font color=blue size=4>6- Fill based on predictive models between mostly correlated columns</font>

In [11]:
# c. Fill based on predictive models between mostly correlated columns
# Pros:
#  Gives a better result than earlier methods.
#  Takes into account the covariance between missing value column and other columns.
# Cons:
#  Considered only as a proxy for the true values.
datasetWithPredictedRows = dataset.copy(deep=True)
# Fill continuous-data-type columns based on predictive models between mostly correlated columns
continuousDataColumnsWithoutMissingData = np.append(np.setdiff1d(list(nonObjectDataTypes.keys()),list(missingDataColumns.keys())), 'is_approved')
continuousDataColumnsWithMissingData = np.intersect1d(list(nonObjectDataTypes.keys()), list(missingDataColumns.keys()))
for column in continuousDataColumnsWithMissingData:
    continuousDataColumnsWithoutMissingData = np.append(continuousDataColumnsWithoutMissingData,column)
    continuousDatasetWithRemovedRows = datasetWithRemovedRows[continuousDataColumnsWithoutMissingData]
    continuousDatasetWithPredictedRows = datasetWithPredictedRows[continuousDataColumnsWithoutMissingData]

    xTrain = continuousDatasetWithRemovedRows.drop(column, axis=1)
    yTrain = continuousDatasetWithRemovedRows[column]

    testData = continuousDatasetWithPredictedRows[continuousDatasetWithPredictedRows[column].isnull()]
    xTest = testData.drop(column, axis=1)

    model = LinearRegression()
    model.fit(xTrain, yTrain)
    yPred = model.predict(xTest)
    datasetWithPredictedRows[column][datasetWithPredictedRows[column].isnull()] = np.around(yPred, 2)

# Fill nominal-categoricies-data-type columns
# Apriori algorithm is a straight-forward Association rule mining technique
# to identify underlying relations between different items.
#
# Support in Apriori is the Fraction of transactions that contain an itemset.
# So, the support of item I is defined as the number of transactions containing I divided by the total number of transactions.
#
# Confidence in Apriori is how often items in Y appear in transactions that contain X.
# So, the confidence It’s calculated as the number of transactions containing X and Y divided by the number of transactions containing X.
#
# Frequent Item Set in Apriori is simply all the itemsets that the support satisfies the minimum support threshold.
#
# It’s a bottom-up approach. We started from every single item in the itemset list.
# Then, the candidates are generated by self-joining.
# We extend the length of the itemsets one item at a time.
# The subset test is performed at each stage and the itemsets that contain infrequent subsets are pruned.
# We repeat the process until no more successful itemsets can be derived from the data.
def getAboveMinSup(itemSet, itemSetList, minSup, globalItemSetWithSup):
    freqItemSet = set()
    localItemSetWithSup = defaultdict(int)

    for item in itemSet:
        for itemSet in itemSetList:
            if item.issubset(itemSet):
                globalItemSetWithSup[item] += 1
                localItemSetWithSup[item] += 1

    for item, supCount in localItemSetWithSup.items():
        support = float(supCount / len(itemSetList))
        if(support >= minSup):
            freqItemSet.add(item)

    return freqItemSet


def getUnion(itemSet, length):
    return set([i.union(j) for i in itemSet for j in itemSet if len(i.union(j)) == length])


def pruning(candidateSet, prevFreqSet, length):
    tempCandidateSet = candidateSet.copy()
    for item in candidateSet:
        subsets = combinations(item, length)
        for subset in subsets:
            # if the subset is not in previous K-frequent get, then remove the set
            if(frozenset(subset) not in prevFreqSet):
                tempCandidateSet.remove(item)
                break
    return tempCandidateSet

def powerset(s):
    return chain.from_iterable(combinations(s, r) for r in range(1, len(s)))

def associationRule(freqItemSet, itemSetWithSup, minConf):
    rules = []
    for k, itemSet in freqItemSet.items():
        for item in itemSet:
            subsets = powerset(item)
            for s in subsets:
                confidence = float(
                    itemSetWithSup[item] / itemSetWithSup[frozenset(s)])
                if(confidence > minConf):
                    rules.append([set(s), set(item.difference(s)), confidence])
    return rules


def getItemSetFromList(itemSetList):
    tempItemSet = set()

    for itemSet in itemSetList:
        for item in itemSet:
            tempItemSet.add(frozenset([item]))

    return tempItemSet

def apriori(itemSetList, minSup, minConf):
    C1ItemSet = getItemSetFromList(itemSetList)
    # Final result global frequent itemset
    globalFreqItemSet = dict()
    # Storing global itemset with support count
    globalItemSetWithSup = defaultdict(int)

    L1ItemSet = getAboveMinSup(
        C1ItemSet, itemSetList, minSup, globalItemSetWithSup)
    currentLSet = L1ItemSet
    k = 2

    # Calculating frequent item set
    while(currentLSet):
        # Storing frequent itemset
        globalFreqItemSet[k-1] = currentLSet
        # Self-joining Lk
        candidateSet = getUnion(currentLSet, k)
        # Perform subset testing and remove pruned supersets
        candidateSet = pruning(candidateSet, currentLSet, k-1)
        # Scanning itemSet for counting support
        currentLSet = getAboveMinSup(
            candidateSet, itemSetList, minSup, globalItemSetWithSup)
        k += 1

    rules = associationRule(globalFreqItemSet, globalItemSetWithSup, minConf)
    rules.sort(key=lambda x: x[2]*-1)

    return globalFreqItemSet, rules

categoricalDataColumnsWithoutMissingData = np.setdiff1d(np.setdiff1d(dataset.columns,list(nonObjectDataTypes.keys())),list(missingDataColumns.keys()))
categoricalDataColumnsWithMissingData = np.setdiff1d(list(missingDataColumns.keys()), list(nonObjectDataTypes.keys()))


for missingDataColumn in categoricalDataColumnsWithMissingData:
    categoricalDataColumnsWithoutMissingData = np.append(categoricalDataColumnsWithoutMissingData,missingDataColumn)
    categoricalDatasetWithRemovedRows = datasetWithRemovedRows[categoricalDataColumnsWithoutMissingData]
    categoricalDatasetWithPredictedRows = datasetWithPredictedRows[categoricalDataColumnsWithoutMissingData]
    # Make each column unique even if there are common values between columns like `t` and `f`
    for column in categoricalDatasetWithRemovedRows:
        categoricalDatasetWithRemovedRows[column] = [column + '_' + str(rowValue) for rowValue in categoricalDatasetWithRemovedRows[column]]
    # Use apriori to collect association rules
    globalFreqItemSet, rules = apriori(categoricalDatasetWithRemovedRows.to_numpy(), 0.5, 0.5)
    usefulRules = []
    columnUniqueValues = categoricalDatasetWithRemovedRows[column].unique()
    for rule in rules:
        ruleValues = list(rule[0]) + list(rule[1])
        for columnValue in columnUniqueValues:
            # keep rules that include the column with missing data
            if columnValue in ruleValues:
                usefulRules.append(ruleValues)
    # Rename prediction data similarly to training data to be able to match 
    for column in categoricalDatasetWithPredictedRows[categoricalDatasetWithPredictedRows[column].isnull()]:
        if missingDataColumn != column:
            categoricalDatasetWithPredictedRows[column] = [column + '_' + str(rowValue) for rowValue in categoricalDatasetWithPredictedRows[column]]
    # Predict missing data
    # Keep in mind, some rows might still have no clear association rules to predict.
    # We will use most frequent value for those.
    yPred = [];
    stillHaveMissingValues = False
    for row in categoricalDatasetWithPredictedRows[categoricalDatasetWithPredictedRows[column].isnull()].drop(column, axis=1).to_numpy():
        r = len(row)
        matched = False
        # collect all possible combinations of row starting from the whole row to single column values.
        # first match with association rules is used to predict missing column value.
        while(r >= 1 and matched == False):
            for combination in combinations(row, r):
                if matched == False:
                    for rule in usefulRules:
                        if matched == False and len(np.intersect1d(combination,rule)) == len(combination):
                            # get missing column value from association rule
                            matching = [s for s in rule if missingDataColumn in s]
                            # remove added unique string from matched value
                            yPred.append(matching[0].split('_')[-1])
                            # stop looking for matched values for that row
                            matched = True
            r -= 1
        # if no match found, keep value as `NaN`
        if matched == False:
            stillHaveMissingValues = True
            yPred.append(np.nan)
    datasetWithPredictedRows[missingDataColumn][datasetWithPredictedRows[missingDataColumn].isnull()] = yPred
    if stillHaveMissingValues == True:
        imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
        imp.fit(datasetWithPredictedRows[missingDataColumn].values.reshape(-1, 1))
        datasetWithPredictedRows[missingDataColumn] = imp.transform(datasetWithPredictedRows[missingDataColumn].values.reshape(-1, 1))
# set data types correctly after predicting rows with NaN
for column in nonObjectDataTypes:
    datasetWithPredictedRows[column] = datasetWithPredictedRows[column].astype(nonObjectDataTypes[column])
# save data to a new csv
datasetWithPredictedRows.to_csv("data/crx.data_predicted_missing.csv", index=False, encoding='utf8')
datasets['after predicting missing rows'] = datasetWithPredictedRows
# validate rows prediction is done correctly
assert datasetWithPredictedRows.isnull().any().any() == False
assert dataset.shape[0] == datasetWithPredictedRows.shape[0]

<font color=blue size=4>7- Fill using unsupervised learning.</font>

In [12]:
# d. Fill using unsupervised learning
# Pros:
# Support more non-linearity between data, as it doesn't need strong correlation  
#  Takes into account the covariance between missing value column and other columns.
# Cons:
#  Considered only as a proxy for the true values.
datasetWithUnsupervisedLearntRows = dataset.copy(deep=True)

continuousDataColumns = list(nonObjectDataTypes.keys())
categoricalDataColumns = np.setdiff1d(dataset.columns,continuousDataColumns)

categoriesMap = {}
# Convert categorical columns data to ordinal integers
for column in categoricalDataColumns:
    columnUniqueValues = datasetWithRemovedRows[column].unique()
    categoryMap = ({ index: v for index, v in enumerate(columnUniqueValues) })
    keyList = list(categoryMap.keys())
    valList = list(categoryMap.values())
    categoriesMap[column] = {'keys': keyList, 'values': valList, 'ordinalMap': categoryMap}
    datasetWithUnsupervisedLearntRows[column][datasetWithUnsupervisedLearntRows[column] == datasetWithUnsupervisedLearntRows[column]] = [keyList[valList.index(rowValue)] for rowValue in datasetWithUnsupervisedLearntRows[column] if rowValue == rowValue]
    
imputer = KNNImputer(n_neighbors=2, weights="uniform")
transformedData = imputer.fit_transform(datasetWithUnsupervisedLearntRows)
rotatedTransformedData = transformedData.T
for index, column in enumerate(datasetWithUnsupervisedLearntRows.columns):
    datasetWithUnsupervisedLearntRows[column] = rotatedTransformedData[index]

# Convert ordinal integers to original categorical columns data
for column in categoricalDataColumns:
    datasetWithUnsupervisedLearntRows[column] = [categoriesMap[column]['ordinalMap'][round(rowValue,0)] for rowValue in datasetWithUnsupervisedLearntRows[column]]

# set data types correctly after unsupervised learning rows with NaN
for column in nonObjectDataTypes:
    datasetWithUnsupervisedLearntRows[column] = datasetWithUnsupervisedLearntRows[column].astype(nonObjectDataTypes[column])
# save data to a new csv
datasetWithUnsupervisedLearntRows.to_csv("data/crx.data_unsupervised_learnt_missing.csv", index=False, encoding='utf8')
datasets['after unsupervised learning missing rows'] = datasetWithUnsupervisedLearntRows
# validate rows unsupervised learning is done correctly
assert datasetWithUnsupervisedLearntRows.isnull().any().any() == False
assert dataset.shape[0] == datasetWithUnsupervisedLearntRows.shape[0]

<font color=blue size=4>8- Fill using deep learning.</font>

In [15]:
# e. Fill using deep learning
# Pros:
#  Quite accurate compared to other methods.
#  It supports both CPUs and GPUs.
# Cons:
#  Still can be quite slow with large datasets.
datasetWithDeepLearntRows = dataset.copy(deep=True)

continuousDataColumns = list(nonObjectDataTypes.keys())
categoricalDataColumns = np.setdiff1d(dataset.columns,continuousDataColumns)

categoriesMap = {}
# Convert categorical columns data to ordinal integers
for column in categoricalDataColumns:
    columnUniqueValues = datasetWithRemovedRows[column].unique()
    categoryMap = ({ index: v for index, v in enumerate(columnUniqueValues) })
    keyList = list(categoryMap.keys())
    valList = list(categoryMap.values())
    categoriesMap[column] = {'keys': keyList, 'values': valList, 'ordinalMap': categoryMap}
    datasetWithDeepLearntRows[column][datasetWithDeepLearntRows[column] == datasetWithDeepLearntRows[column]] = [keyList[valList.index(rowValue)] for rowValue in datasetWithDeepLearntRows[column] if rowValue == rowValue]

columns = dataset.columns.values
for column in columns:
    datasetWithDeepLearntRows[column] = datasetWithDeepLearntRows[column].astype(floatType)


for column in missingDataColumns:
    df_train = datasetWithDeepLearntRows.dropna()
    df_test = datasetWithDeepLearntRows[datasetWithDeepLearntRows[column].isnull()]
    # Initialize a SimpleImputer model
    imputer = datawig.SimpleImputer(
        input_columns=np.delete(columns,np.where(columns == column)), # column(s) containing information about the column we want to impute
        output_column= column, # the column we'd like to impute values for
        output_path = 'imputer_model' # stores model data and metrics
        )

    # Fit an imputer model on the train data
    imputer.fit(train_df=df_train, num_epochs=50)

    # Impute missing values and return original dataframe with predictions
    imputed = imputer.predict(df_test)
    datasetWithDeepLearntRows[column][datasetWithDeepLearntRows[column].isnull()] = np.around(imputed[column+'_imputed'], 2)

# Convert ordinal integers to original categorical columns data
for column in categoricalDataColumns:
    datasetWithDeepLearntRows[column] = [categoriesMap[column]['ordinalMap'][np.abs(round(rowValue,0))] for rowValue in datasetWithDeepLearntRows[column]]

# set data types correctly after deep learning rows with NaN
for column in nonObjectDataTypes:
    datasetWithDeepLearntRows[column] = datasetWithDeepLearntRows[column].astype(nonObjectDataTypes[column])
# save data to a new csv
datasetWithDeepLearntRows.to_csv("data/crx.data_deep_learnt_missing.csv", index=False, encoding='utf8')
datasets['after deep learning missing rows'] = datasetWithDeepLearntRows
# validate rows deep learning is done correctly
assert datasetWithDeepLearntRows.isnull().any().any() == False
assert dataset.shape[0] == datasetWithDeepLearntRows.shape[0]

  return np.log(probas)
  return np.log(probas)
  return np.log(probas)
  return np.log(probas)
  return np.log(probas)


<font color=blue size=4>9- Normalize Data.</font>