Warning: If you attempt to run this notebook on a local machine, it will most likely take more than a day to finish. Running it in the cloud is strongly recommended.

# Importing packages

In [0]:
# Importing necessary packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, accuracy_score, make_scorer, precision_score, recall_score,f1_score
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, GridSearchCV, StratifiedKFold, RandomizedSearchCV
import statsmodels.api as sm

# Dimensionality Reduction Packages
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.decomposition import KernelPCA

# Models packages
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# writing to CSV  
import csv 

# 1. Data Exploration

In [0]:
# Let pandas display every column of a frame (the dataset is wide).
pd.set_option('display.max_columns', 999)

# Read the credit-card default dataset from the working directory.
df = pd.read_csv('UCI_Credit_Card.csv')

# Preview the first five rows.
df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,1,20000.0,2,2,1,24,2,2,-1,-1,-2,-2,3913.0,3102.0,689.0,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,2,120000.0,2,2,2,26,-1,2,0,0,0,2,2682.0,1725.0,2682.0,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,3,90000.0,2,2,2,34,0,0,0,0,0,0,29239.0,14027.0,13559.0,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,4,50000.0,2,2,1,37,0,0,0,0,0,0,46990.0,48233.0,49291.0,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,5,50000.0,1,2,1,57,-1,0,-1,0,0,0,8617.0,5670.0,35835.0,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


##### There are a total of 25 columns, all holding numeric values.

### 1.1 Features Reference

__ID:__ ID of each customer
<br>
__LIMIT_BAL:__ Amount of the given credit (NT dollar). It includes both the individual consumer credit and his/her family (supplementary) credit.
<br>
__SEX:__ Gender (__1 =__ male, __2 =__ female)
<br>
__EDUCATION:__ (__1 =__ graduate school, __2 =__ university, __3 =__ high school, __4 =__ others, __5 =__ unknown, __6 =__ unknown, __0 =__ unknown)
<br>
__MARRIAGE:__ (__1 =__ married, __2 =__ single, __3 =__ others, __0 =__ unknown)
<br>
__AGE:__ Age (year)
<br>
__PAY_0:__ Repayment status in September 2005 (__-2 =__ no consumption, __-1 =__ paid in full, __0 =__ use of revolving credit, __1 =__ payment delay for one month, __2 =__ payment delay for two months, __3 =__ payment delay for three months ... __8 =__ payment delay for eight months, __9 =__ payment delay for nine months and above)
<br>
__PAY_2:__ Repayment status in August 2005 (__scale same as above__)
<br>
__PAY_3:__ Repayment status in July 2005 (__scale same as above__)
<br>
__PAY_4:__ Repayment status in June 2005 (__scale same as above__)
<br>
__PAY_5:__ Repayment status in May 2005 (__scale same as above__)
<br>
__PAY_6:__ Repayment status in April 2005 (__scale same as above__)
<br>
__BILL_AMT1:__ Amount of bill statement in September, 2005 (NT dollar)
<br>
__BILL_AMT2:__ Amount of bill statement in August, 2005 (NT dollar)
<br>
__BILL_AMT3:__ Amount of bill statement in July, 2005 (NT dollar)
<br>
__BILL_AMT4:__ Amount of bill statement in June, 2005 (NT dollar)
<br>
__BILL_AMT5:__ Amount of bill statement in May, 2005 (NT dollar)
<br>
__BILL_AMT6:__ Amount of bill statement in April, 2005 (NT dollar)
<br>
__PAY_AMT1:__ Amount of previous payment in September, 2005 (NT dollar)
<br>
__PAY_AMT2:__ Amount of previous payment in August, 2005 (NT dollar)
<br>
__PAY_AMT3:__ Amount of previous payment in July, 2005 (NT dollar)
<br>
__PAY_AMT4:__ Amount of previous payment in June, 2005 (NT dollar)
<br>
__PAY_AMT5:__ Amount of previous payment in May, 2005 (NT dollar)
<br>
__PAY_AMT6:__ Amount of previous payment in April, 2005 (NT dollar)
<br>
__default.payment.next.month:__ Default payment (__1 =__ yes, __0 =__ no)

### 1.2 Dataset Information

In [0]:
# Print a concise summary of df: each column with its non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 25 columns):
ID                            30000 non-null int64
LIMIT_BAL                     30000 non-null float64
SEX                           30000 non-null int64
EDUCATION                     30000 non-null int64
MARRIAGE                      30000 non-null int64
AGE                           30000 non-null int64
PAY_0                         30000 non-null int64
PAY_2                         30000 non-null int64
PAY_3                         30000 non-null int64
PAY_4                         30000 non-null int64
PAY_5                         30000 non-null int64
PAY_6                         30000 non-null int64
BILL_AMT1                     30000 non-null float64
BILL_AMT2                     30000 non-null float64
BILL_AMT3                     30000 non-null float64
BILL_AMT4                     30000 non-null float64
BILL_AMT5                     30000 non-null float64
BILL_AMT6   

##### There are a total of 30,000 rows and 25 columns. The columns have integer and float data types. There are no missing values in the dataset.

In [0]:
# Summary statistics, transposed so that each feature is a row.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ID,30000.0,15000.5,8660.398374,1.0,7500.75,15000.5,22500.25,30000.0
LIMIT_BAL,30000.0,167484.322667,129747.661567,10000.0,50000.0,140000.0,240000.0,1000000.0
SEX,30000.0,1.603733,0.489129,1.0,1.0,2.0,2.0,2.0
EDUCATION,30000.0,1.853133,0.790349,0.0,1.0,2.0,2.0,6.0
MARRIAGE,30000.0,1.551867,0.52197,0.0,1.0,2.0,2.0,3.0
AGE,30000.0,35.4855,9.217904,21.0,28.0,34.0,41.0,79.0
PAY_0,30000.0,-0.0167,1.123802,-2.0,-1.0,0.0,0.0,8.0
PAY_2,30000.0,-0.133767,1.197186,-2.0,-1.0,0.0,0.0,8.0
PAY_3,30000.0,-0.1662,1.196868,-2.0,-1.0,0.0,0.0,8.0
PAY_4,30000.0,-0.220667,1.169139,-2.0,-1.0,0.0,0.0,8.0


### 1.4 Visualizations

In [0]:
# Histogram of the target column with one bin centred on 0 and one on 1.
class_fig, class_ax = plt.subplots()
class_ax.hist(df['default.payment.next.month'], bins=[-.5, .5, 1.5], ec="k")
class_ax.set_xticks((0, 1))
class_ax.set_title('Distribution of Class Label. \n Not Default = 0 || Default = 1')
class_ax.set_ylabel('Count')
plt.show()

##### As we see, this is an imbalanced dataset with only 22% Default cases and 78% Non-default cases. It means that if we make a model that always predicts 0 (non-default), we still get 78% accuracy rate. Of course, this kind of model would be useless for the banks to predict the defaulters next month. 

In [0]:
# Count the customers per SEX code and draw the counts as bars.
sex_ax = df['SEX'].value_counts().plot(kind='bar')
sex_ax.set_title('SEX \n 1 = male || 2 = female')
plt.show()

##### There are more females than males in the dataset. We can convert the values to binary later (one-hot encoding and dropping one column). We will address this in the Data Preparation section of this notebook.


In [0]:
# Count the customers per MARRIAGE code and draw the counts as bars.
marriage_ax = df['MARRIAGE'].value_counts().plot(kind='bar')
marriage_ax.set_title('MARRIAGE \n 0 = unknown || 1 = married || 2 = single || 3 = others')
plt.show()

##### There are only a few cases of labels '0' and '3'. We can combine the rare cases into a single value later. We will address this in the Data Preparation section of this notebook.

In [0]:
# Count the customers per EDUCATION code and draw the counts as bars.
# (Title typo "univeristy" corrected to "university".)
df.EDUCATION.value_counts().plot(kind='bar')
plt.title('EDUCATION \n 0 = unknown || 1 = graduate school || 2 = university || 3 = high school || 4 = others || 5 and 6 = unknown')
plt.show()

##### There are a few cases of undocumented values in the "EDUCATION" column. We can combine all these instances under a single value, "others". For instance, we can merge labels "4", "5", "6" and "0". We will address this in the Data Preparation section of this notebook.

In [0]:
# Histogram of the credit limit (LIMIT_BAL) using 60 bins.
limit_fig, limit_ax = plt.subplots()
limit_ax.hist(df['LIMIT_BAL'], bins=60)
limit_ax.set_title('Distribution of Credit Limits')
limit_ax.set_xlabel('Credit Limit')
limit_ax.set_ylabel('Count')
plt.show()

##### The distribution is skewed to the right. The most common credit limit issued is 50,000 NT Dollars (around 2,200 US Dollars today with inflation). There also appear to be some outliers, such as a limit of 1,000,000 NT Dollars issued to only 1 customer.

In [0]:
# Data preparation.
#
# Each recode assigns the result back to the column. The previous form,
# `df[col].replace(..., inplace=True)`, mutates a column selection in place;
# pandas deprecates that pattern and it leaves `df` unchanged when
# Copy-on-Write is enabled.

# SEX: recode 2 -> 0 so the column is binary (0 = female, 1 = male).
df["SEX"] = df["SEX"].replace({2: 0})

# MARRIAGE: values 3 and 0 are rare, so fold 3 into 0 (0 = "others").
df["MARRIAGE"] = df["MARRIAGE"].replace({3: 0})

# EDUCATION: fold 4, 5 and 6 into 0 (0 = "others").
df["EDUCATION"] = df["EDUCATION"].replace({4: 0, 5: 0, 6: 0})

# One-hot encode EDUCATION and MARRIAGE. dtype=int keeps the dummy columns as
# 0/1 integers regardless of the installed pandas' default dummy dtype.
df = pd.get_dummies(df, prefix=['EDUCATION'], columns=['EDUCATION'], dtype=int)
df = pd.get_dummies(df, prefix=['MARRIAGE'], columns=['MARRIAGE'], dtype=int)

# Shorten the target column name; the original is tedious to type.
df = df.rename(columns={"default.payment.next.month": "Class"})

# Drop rows whose six bill amounts and six payment amounts are all 0 while the
# label says "default" (Class == 1): a customer with no billed or paid amount
# in any month should not be able to default. The vectorised mask replaces a
# row-by-row Python loop and keeps the surviving rows' original index labels.
amount_columns = ['BILL_AMT{}'.format(month) for month in range(1, 7)] + \
                 ['PAY_AMT{}'.format(month) for month in range(1, 7)]
no_activity = (df[amount_columns] == 0).all(axis=1)
df = df.loc[~(no_activity & (df['Class'] == 1))]

# Finally, reorder the columns and keep only the ones the models need. ID is
# not a feature, and one dummy per categorical feature is dropped.
df = df[['LIMIT_BAL', 'SEX', 'EDUCATION_1', 'EDUCATION_2', 'EDUCATION_3', 'MARRIAGE_1', 'MARRIAGE_2', 'AGE',
         'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6',
         'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6',
         'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6', 'Class']]

# Excluded columns from Dataset: ID, EDUCATION_0, MARRIAGE_0

In [0]:
# Preview the prepared frame. A bare last expression uses the notebook's rich
# table display instead of the wrapped plain text that print() produces.
df.head()

   LIMIT_BAL  SEX  EDUCATION_1  EDUCATION_2  EDUCATION_3  MARRIAGE_1  \
0    20000.0    0            0            1            0           1   
1   120000.0    0            0            1            0           0   
2    90000.0    0            0            1            0           0   
3    50000.0    0            0            1            0           1   
4    50000.0    1            0            1            0           1   

   MARRIAGE_2  AGE  PAY_0  PAY_2  PAY_3  PAY_4  PAY_5  PAY_6  BILL_AMT1  \
0           0   24      2      2     -1     -1     -2     -2     3913.0   
1           1   26     -1      2      0      0      0      2     2682.0   
2           1   34      0      0      0      0      0      0    29239.0   
3           0   37      0      0      0      0      0      0    46990.0   
4           0   57     -1      0     -1      0      0      0     8617.0   

   BILL_AMT2  BILL_AMT3  BILL_AMT4  BILL_AMT5  BILL_AMT6  PAY_AMT1  PAY_AMT2  \
0     3102.0      689.0        0.0  

In [0]:
# Split the prepared frame into features and target: the last column is the
# label, every column before it is a feature.
features_frame = df.iloc[:, :-1]
target_series = df.iloc[:, -1]
X = features_frame.values
y = target_series.values

In [0]:
# Standardise every feature column of X.
# NOTE(review): the scaler is fitted on the full dataset before any
# cross-validation split happens - confirm that this is intended.
sc_X = StandardScaler()
sc_X.fit(X)
X = sc_X.transform(X)

In [0]:
import random

# Fixed seed so the notebook gives repeatable results under Restart & Run All.
# (Previously the seed itself was drawn with random.randint on every run, so
# no two executions shared a seed.)
random_state = 42

# Seed the global generators of the standard library and NumPy for code that
# draws from them without an explicit random_state.
random.seed(random_state)
np.random.seed(random_state)

# A 70/30 train/test split was drafted here but is currently disabled:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = random_state)

# X_train = sc_X.fit_transform(X_train)
# X_test = sc_X.transform(X_test)

In [0]:
# Number of cross-validation folds used inside the randomized search.
cvNumber = 5

# Name of the metric column; also used as the key for the score in result rows.
ScoreCriteria = 'Accuracy Score'

# CSV file that tracks the best score of every (data method, model) pair.
file_name = 'ScoreTracker.csv'
Column_Header = [
    'Data Extraction/Selection Method',
    'Model Name',
    ScoreCriteria,
    'Hyperparameters Setting',
]

In [0]:
# Start a fresh score-tracker file that contains only the header row.
with open(file_name, mode='w', newline='') as tracker_file:
    csv.writer(tracker_file).writerow(Column_Header)

# Supporting Function Section

In [0]:
# Backward Elimination Function
def backwardElimination(x, sl, y_data=None):
    """Drop predictors one at a time until every OLS p-value is <= ``sl``.

    On each pass an ordinary-least-squares model is fitted and the column with
    the largest p-value is removed, then the model is refitted. Elimination
    stops as soon as no p-value exceeds ``sl``. The last remaining column is
    never removed, so the result always has at least one column.

    Parameters
    ----------
    x : 2-D array-like
        Design matrix (rows = observations, columns = predictors).
    sl : float
        Significance level; columns whose p-value exceeds it are candidates
        for removal.
    y_data : array-like, optional
        Regression target. When omitted, the notebook-level ``y`` is used,
        which is what the original two-argument call relied on.

    Returns
    -------
    numpy.ndarray
        ``x`` restricted to the retained columns.

    Side effects
    ------------
    Prints the summary of the final fitted model, i.e. the model that matches
    the returned columns.
    """
    target = y if y_data is None else y_data
    x = np.asarray(x)

    while True:
        obj_OLS = sm.OLS(target, x).fit()
        pvalues = np.asarray(obj_OLS.pvalues, dtype=float)
        worst = int(np.argmax(pvalues))
        # Stop once everything is significant, or when only one column is left.
        if pvalues[worst] <= sl or x.shape[1] == 1:
            break
        # Remove exactly one column per fit so that indices and p-values never
        # go stale relative to the current matrix.
        x = np.delete(x, worst, axis=1)

    print(obj_OLS.summary())
    return x

In [0]:
#PCA
def myPCA(X_Values):
    """Return ``X_Values`` projected onto whitened principal components.

    ``n_components=0.99`` keeps enough components to retain 99% of the
    variance in the data.
    """
    return PCA(n_components=0.99, whiten=True).fit_transform(X_Values)

In [0]:
#LDA
def myLDA(X_Values):
    """Return ``X_Values`` transformed by Linear Discriminant Analysis.

    The transformer is fitted against the notebook-level target ``y``, so
    ``X_Values`` must have one row per entry of ``y``.
    """
    reducer = LDA(n_components=None)
    reducer.fit(X_Values, y)
    return reducer.transform(X_Values)

In [0]:
#Kernel PCA.  DO NOT RUN THIS, unless you have 64GB+ Ram it will crash, and it will be super slow!!!

def mykernelPCA(X_Values):
    """Return ``X_Values`` reduced to two components by RBF-kernel PCA."""
    return KernelPCA(n_components=2, kernel='rbf').fit_transform(X_Values)

In [0]:
#Function for writing files.
def writingFiles(DataProcess, ModelName,Hyperparameters, Scores):
    """Write one final-result row through the global ``FinalModelWriter``.

    The row holds the data-processing method, the model name, the
    hyperparameter description, and then the accuracy, recall, precision and
    F1 entries of ``Scores`` (in that order), each converted to a string.
    ``FinalModelWriter`` must already be bound to an open csv writer.
    """
    metric_cells = [str(Scores[key]) for key in ('ac_score', 'rec_score', 'pre_score', 'F1_score')]
    FinalModelWriter.writerow([DataProcess, ModelName, Hyperparameters] + metric_cells)

In [0]:
#Function for doing kFold.  
def kFoldAccuracy(estimator_obj, X_data, y_data):
    """Cross-validate ``estimator_obj`` and return its mean test metrics.

    Uses a shuffled 10-fold ``KFold`` with ``random_state=0``.

    Returns
    -------
    dict
        Maps 'ac_score', 'pre_score', 'rec_score' and 'F1_score' to the mean
        of the corresponding per-fold test scores.
    """
    # Scorer callables handed to cross_validate, keyed by the result name.
    scorers = {
        'ac_score': make_scorer(accuracy_score),
        'pre_score': make_scorer(precision_score),
        'rec_score': make_scorer(recall_score),
        'F1_score': make_scorer(f1_score),
    }
    folds = KFold(n_splits=10, shuffle=True, random_state=0)

    cv_results = cross_validate(estimator=estimator_obj, X=X_data, y=y_data, cv=folds, scoring=scorers)

    # cross_validate reports each metric under the key 'test_<name>'.
    return {metric: np.mean(cv_results['test_' + metric]) for metric in scorers}

In [0]:
#Function for doing RandomizedSearchCV

def randomSearch(X_Data,estimatorObj, parameters):
    """Tune ``estimatorObj`` with a randomized hyperparameter search.

    The search scores by accuracy, uses ``cvNumber`` cross-validation folds,
    runs with ``n_jobs=-1`` and fits against the notebook-level target ``y``.

    Returns
    -------
    list
        ``[best_params, best_score]`` with the score rounded to 4 decimals.
    """
    # TODO: possibly score on ['accuracy', 'precision', 'recall', 'f1'] later.
    search = RandomizedSearchCV(
        estimator=estimatorObj,
        param_distributions=parameters,
        scoring='accuracy',
        cv=cvNumber,
        n_jobs=-1,
    )
    search.fit(X_Data, y)
    best_score = round(search.best_score_, 4)
    return [search.best_params_, best_score]

In [0]:
#Output to CSV file and also save it to the dictionary here.
def writeFiles(DataProcessMethod, rowDict):
    """Record one search result in ``resultDict`` and append it to the CSV.

    The dictionary key is '<method>_<model name>_<parameter settings>' and the
    value is the score stored under ``ScoreCriteria``. The same information is
    appended as one row to the file named by ``file_name``.
    """
    model_name = rowDict['name']
    settings = rowDict['Param_Settings']

    result_key = '_'.join([DataProcessMethod, str(model_name), str(settings)])
    score = rowDict[ScoreCriteria]
    resultDict[result_key] = score

    with open(file_name, 'a', newline='') as tracker_file:
        csv.writer(tracker_file).writerow([DataProcessMethod, model_name, str(score), settings])

Using Regression_And_WriteFiles to automate the process

In [0]:
def Regression_And_WriteFiles(X_Data, DataProcessName, RegressionType):
    """Tune one model on ``X_Data`` and record the result.

    ``RegressionType`` selects the model:
        1: Logistic Regression
        2: kNN
        3: SVM
        4: Naive Bayes
        5: Decision Tree
        any other value: Random Forest

    The SVM / 'LDA' combination is skipped and nothing is written for it.
    """
    # Skipped because it simply takes way too long: the VM ran for 5 hours
    # without producing a result for this combination.
    if RegressionType == 3 and DataProcessName == 'LDA':
        return

    if RegressionType == 1:
        row = Logistic_Regression(X_Data)
    elif RegressionType == 2:
        row = kNN_Classifier(X_Data)
    elif RegressionType == 3:
        row = SVM_Classifier(X_Data)
    elif RegressionType == 4:
        row = NaiveBaynes(X_Data)
    elif RegressionType == 5:
        row = DecisionTreeClassifier1(X_Data)
    else:
        row = RandomForestClassifier1(X_Data)

    writeFiles(DataProcessName, row)


# Various machine learning models and hyperparameters tuning

In [0]:
def Logistic_Regression(X):
    """Randomized search over logistic-regression settings on ``X``.

    Returns a dict with the model name, the best parameter settings and the
    best score (stored under the ``ScoreCriteria`` key).
    """
    search_space = {
        'C': [0.001, 0.01, 0.09, 1, 5, 10, 25],
        'class_weight': [None, 'balanced', {0:1, 1:2}, {0:1, 1:3}, {0:2, 1:1}, {0:3, 1:1}, {0:2, 1:3}, {0:3, 1:2}],
    }

    best_parameters, score = randomSearch(X, LogisticRegression(), search_space)
    return {
        'name': "Logistic Regression",
        'Param_Settings': best_parameters,
        ScoreCriteria: score,
    }

In [0]:
def kNN_Classifier(X):
    """Randomized search over k-nearest-neighbours settings on ``X``.

    Returns a dict with the model name, the best parameter settings and the
    best score (stored under the ``ScoreCriteria`` key).
    """
    search_space = {
        'n_neighbors': [2, 5, 10, 15, 20, 25],
        # Author's note: varying the metric probably isn't needed.
        'metric': ['minkowski', 'manhattan', 'euclidean'],
    }

    best_parameters, score = randomSearch(X, KNeighborsClassifier(), search_space)
    return {
        'name': "kNN",
        'Param_Settings': best_parameters,
        ScoreCriteria: score,
    }

In [0]:
def SVM_Classifier(X):
    """Randomized search over SVC kernel and degree settings on ``X``.

    Returns a dict with the model name, the best parameter settings and the
    best score (stored under the ``ScoreCriteria`` key).
    """
    search_space = {
        'kernel': ['linear', 'rbf', 'poly'],
        'degree': [2, 3, 4, 5, 8],
    }

    best_parameters, score = randomSearch(X, SVC(), search_space)
    return {
        'name': "SVM",
        'Param_Settings': best_parameters,
        ScoreCriteria: score,
    }

In [0]:
def NaiveBaynes(X):
    """Evaluate Gaussian Naive Bayes on ``X`` through ``randomSearch``.

    The parameter space is empty, so the reported settings are an empty dict.
    Returns a dict with the model name, the best parameter settings and the
    best score (stored under the ``ScoreCriteria`` key).
    """
    nb_estimator = GaussianNB()
    search_space = {}

    best_parameters, score = randomSearch(X, nb_estimator, search_space)
    return {
        'name': "Naive Baynes",
        'Param_Settings': best_parameters,
        ScoreCriteria: score,
    }

In [0]:
def DecisionTreeClassifier1(X):
    """Randomized search over decision-tree settings on ``X``.

    Returns a dict with the model name, the best parameter settings and the
    best score (stored under the ``ScoreCriteria`` key).
    """
    search_space = {
        'criterion': ['gini', 'entropy'],
        'min_samples_leaf': [1, 10, 20, 30, 40],
        'min_samples_split': [2, 10, 16, 18],
        'class_weight': [None, {0:1, 1:2}, {0:1, 1:3}, {0:2, 1:1}, {0:3, 1:1}, {0:2, 1:3}],
    }

    best_parameters, score = randomSearch(X, DecisionTreeClassifier(), search_space)
    return {
        'name': "Decision Tree",
        'Param_Settings': best_parameters,
        ScoreCriteria: score,
    }

In [0]:
def RandomForestClassifier1(X):
    """Randomized search over random-forest settings on ``X``.

    Returns a dict with the model name, the best parameter settings and the
    best score (stored under the ``ScoreCriteria`` key).
    """
    search_space = {
        'n_estimators': [50, 100, 150],
        'criterion': ['gini', 'entropy'],
        'min_samples_leaf': [1, 20, 30, 40],
        'min_samples_split': [2, 10, 16],
        'class_weight': [None, {0:1, 1:2}, {0:1, 1:3}, {0:2, 1:1}, {0:2, 1:3}],
        'bootstrap': [True, False],
    }

    best_parameters, score = randomSearch(X, RandomForestClassifier(), search_space)
    return {
        'name': "Random Forest",
        'Param_Settings': best_parameters,
        ScoreCriteria: score,
    }

# Making the model

In [0]:
# Build the three alternative inputs for the models: PCA, LDA and a
# backward-elimination feature selection. (The variables are not named
# `pca` / `lda` so that they cannot be confused with the library objects.)
X_PCA = myPCA(X)
X_LDA = myLDA(X)

# Prepend a column of ones to X before running backward elimination.
ones_column = np.ones((X.shape[0], 1), dtype=int)
X1 = np.concatenate((ones_column, X), axis=1)
X_backward = backwardElimination(X1, 0.05)
#myX_kernel=(X) #Do not run this...

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.137
Model:                            OLS   Adj. R-squared:                  0.136
Method:                 Least Squares   F-statistic:                     361.2
Date:                Tue, 05 May 2020   Prob (F-statistic):               0.00
Time:                        00:04:50   Log-Likelihood:                -13459.
No. Observations:               29701   AIC:                         2.695e+04
Df Residuals:                   29687   BIC:                         2.706e+04
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.2134      0.002     96.569      0.0

In [0]:
# Best score per "<method>_<model>_<settings>" key; filled in by writeFiles.
resultDict = {}

# Each transformed dataset paired with the label written to the results.
experiments = [
    (X_PCA, 'PCA'),
    (X_LDA, 'LDA'),
    (X_backward, 'Backward Elimination'),
]

for transformed_X, method_label in experiments:
    # Model ids:
    #   1: Logistic Regression   2: kNN             3: SVM
    #   4: Naive Bayes           5: Decision Tree   6: Random Forest
    for model_id in range(1, 7):
        Regression_And_WriteFiles(transformed_X, method_label, model_id)
            



In [0]:
# List every recorded experiment as "<key>:<score>".
for run_key, run_score in resultDict.items():
    print(run_key + ":" + str(run_score))

PCA_Random Forest_{'class_weight': None, 'bootstrap': True, 'criterion': 'entropy', 'min_samples_leaf': 1, 'min_samples_split': 16, 'n_estimators': 150}:0.8239
Backward Elimination_Random Forest_{'class_weight': None, 'bootstrap': True, 'criterion': 'entropy', 'min_samples_leaf': 30, 'min_samples_split': 16, 'n_estimators': 150}:0.8279
Backward Elimination_SVM_{'degree': 2, 'kernel': 'poly'}:0.8274
Backward Elimination_Naive Baynes_{}:0.8038
Backward Elimination_kNN_{'n_neighbors': 20, 'metric': 'manhattan'}:0.8222
LDA_Random Forest_{'class_weight': {0: 2, 1: 3}, 'bootstrap': False, 'criterion': 'entropy', 'min_samples_leaf': 40, 'min_samples_split': 2, 'n_estimators': 150}:0.8163
PCA_Naive Baynes_{}:0.7239
LDA_kNN_{'n_neighbors': 20, 'metric': 'minkowski'}:0.8167
PCA_Decision Tree_{'min_samples_leaf': 40, 'min_samples_split': 16, 'class_weight': {0: 2, 1: 1}, 'criterion': 'gini'}:0.8102
LDA_Logistic Regression_{'class_weight': {0: 2, 1: 3}, 'C': 0.09}:0.8224
LDA_Naive Baynes_{}:0.8198

# Looking at the results, it can be seen that the following 3 models have the highest accuracies

1: Backward Elimination - Random Forest(class_weight: None, bootstrap: True, criterion: entropy, min_samples_leaf: 30, min_samples_split: 16, n_estimators: 150)

2: Backward Elimination - SVC(kernel=poly, degree=2)

3: PCA - SVC(kernel='rbf')

Because the accuracies of these models are very close, we evaluate each of them with 10-fold cross-validation (kFold) on accuracy, precision, recall and F1 score.

In [0]:
# Re-evaluate the two best backward-elimination models with 10-fold CV.

# Random Forest with the settings found by the randomized search.
RfModel = RandomForestClassifier(
    class_weight=None,
    bootstrap=True,
    criterion='entropy',
    min_samples_leaf=30,
    min_samples_split=16,
    n_estimators=150,
)
RfScore = kFoldAccuracy(RfModel, X_backward, y)

# SVM with a degree-2 polynomial kernel.
SVMPolyModel = SVC(kernel='poly', degree=2)
SVMPolyScore = kFoldAccuracy(SVMPolyModel, X_backward, y)



NameError: name 'SVM' is not defined

In [0]:
# SVC with its default settings, evaluated on the PCA-transformed data.
SVMRbf = SVC()
SVMScore = kFoldAccuracy(SVMRbf, X_PCA, y)



In [0]:
# Column titles of the final comparison file.
Headings = [
    "Data Extraction/Selection Method",
    "Model Method",
    "Hyperparameters",
    "Accuracy Score",
    "Recall Score",
    "Precision Score",
    "F1 Score",
]

# writingFiles writes through the global FinalModelWriter, so that name must
# be bound while the file is open.
with open('FinalResult.csv', 'w', newline='') as csvfile:
    FinalModelWriter = csv.writer(csvfile)
    FinalModelWriter.writerow(Headings)
    writingFiles(
        DataProcess="Backward Elimination",
        ModelName="Random Forest",
        Hyperparameters="class_weight=None, bootstrap=True, criterion='entropy', min_samples_leaf=30, min_samples_split=16, n_estimators=150",
        Scores=RfScore,
    )
    writingFiles(
        DataProcess="Backward Elimination",
        ModelName="SVM",
        Hyperparameters="kernel=poly, degree=2",
        Scores=SVMPolyScore,
    )
    writingFiles(
        DataProcess="PCA",
        ModelName="RBF SVC",
        Hyperparameters="kernel=RBF",
        Scores=SVMScore,
    )