## **Interpretable Classification Algorithm (Cross Val - Accuracy Build)**

#### **Import Libraries**

In [1]:
import numpy as np
import pandas as pd
from pandas.api.types import is_numeric_dtype

import sklearn
from sklearn import preprocessing
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
from sklearn import metrics

from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

In [2]:
class Node:
    """Minimal binary-tree node: one payload slot plus left/right child links."""

    def __init__(self):
        # A fresh node is empty and has no children.
        self.data = self.left = self.right = None

    def addValue(self, data):
        """Store ``data`` as this node's payload, replacing any previous payload."""
        self.data = data

    def addLeft(self, leftNode):
        """Attach ``leftNode`` as the left child, replacing any previous one."""
        self.left = leftNode

    def addRight(self, rightNode):
        """Attach ``rightNode`` as the right child, replacing any previous one."""
        self.right = rightNode

In [3]:
# Load the dataset and turn every non-numeric column into integer codes.
df = pd.read_csv("wineqt.csv")

# One encoder object is re-fitted for each column it touches, so after the loop
# it only remembers the classes of the last column that was encoded.
le = preprocessing.LabelEncoder()
for col_name in df.columns:
    if is_numeric_dtype(df[col_name]):
        continue  # numeric columns are used as they are
    df[col_name] = le.fit_transform(df[col_name])

df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,class
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [13]:
# Shared 10-fold splitter (shuffled with a fixed seed) used by every phase below.
kf = KFold(random_state=4, shuffle=True, n_splits=10)

# Whole table as an array: every column except the last is a feature,
# the last column is the label.
data = np.array(df)
x, y = data[:, :-1], data[:, -1]

#### **Phase One - Complete Learning Set**

In [14]:
def phaseOne():
    """Find the best single classifier on the complete learning set.

    For every outer training fold yielded by the notebook-level ``kf`` splitter,
    each candidate classifier is scored as the mean accuracy of an inner
    ``cross_val_score`` run on that fold's training rows (also split with ``kf``).
    The highest per-fold mean seen for each classifier is kept. This is the same
    "best across folds" aggregation that ``phaseTwoLeft`` / ``phaseTwoRight``
    apply, so phase-one and phase-two scores are compared like for like.
    (Previously every fold overwrote the previous one and only the last fold's
    scores were returned.)

    Reads the module-level ``x``, ``y`` and ``kf``.

    Returns:
        tuple: ``(BestSingleModel, BestSingleModelValue)`` where the first item is
        one of ``"Decision_Tree"``, ``"Naive_Bayes"``, ``"K_Nearest_Neighbour"``
        and the second is that model's best mean accuracy.
    """
    accScoreDict = dict(Decision_Tree=0, Naive_Bayes=0, K_Nearest_Neighbour=0)

    # Only the training indices are needed; the held-out part of each outer
    # fold is never scored here.
    for train_index, _ in kf.split(x):
        x_train, y_train = x[train_index], y[train_index]

        candidates = dict(
            # Re-use the splitter's seed so the tree (which permutes features at
            # every split) gives the same score on every run.
            Decision_Tree=DecisionTreeClassifier(random_state=kf.random_state),
            Naive_Bayes=GaussianNB(),
            K_Nearest_Neighbour=KNeighborsClassifier(n_neighbors=3),
        )

        for modelName, clf in candidates.items():
            foldAccuracy = cross_val_score(clf, x_train, y_train, cv=kf, scoring='accuracy').mean()
            if foldAccuracy > accScoreDict[modelName]:
                accScoreDict[modelName] = foldAccuracy

    # Find model with the best accuracy
    BestSingleModel = max(accScoreDict, key=accScoreDict.get)
    BestSingleModelValue = accScoreDict[BestSingleModel]

    return BestSingleModel, BestSingleModelValue

#### **Phase Two - Left Branch Split**

In [15]:
def phaseTwoLeft(col, th, P2RootNode):
    """Score the candidate classifiers on the rows where ``df[col] < th``.

    The subset is only evaluated when it has more than 20 rows. For each outer
    training fold of the notebook-level ``kf`` splitter, every classifier gets
    the mean accuracy of an inner ``cross_val_score`` run (also split with
    ``kf``); the best per-fold mean of each classifier is kept.

    Args:
        col: Name of the ``df`` column to split on.
        th: Threshold; rows strictly below it form the left branch.
        P2RootNode: ``Node`` for the split. When the subset is evaluated, its
            left child is replaced by a new ``Node`` holding the training rows
            of the most recent outer fold.

    Returns:
        tuple: ``(BestSingleModel, BestSingleModelValue, LPercent)``: the name of
        the best classifier, its score, and the share of all rows (``len(data)``)
        that fall in the left branch. If the subset has 20 rows or fewer, the
        score is ``0`` and the name is simply the first key, ``"Decision_Tree"``.
    """
    lesserThan = np.array(df.loc[df[col] < th])
    x = lesserThan[:, :-1]
    y = lesserThan[:, -1]

    accScoreDict = dict(Decision_Tree=0, Naive_Bayes=0, K_Nearest_Neighbour=0)

    if len(lesserThan) > 20:
        # Only the training indices of each outer fold are used.
        for train_index, _ in kf.split(x):
            x_train, y_train = x[train_index], y[train_index]

            P2LNode = Node()
            P2LNode.addValue(x_train)
            P2RootNode.addLeft(P2LNode)

            # Seed the tree with the splitter's seed: an unseeded tree permutes
            # features at each split, so repeated calls with identical arguments
            # could otherwise return different scores.
            DTclf = DecisionTreeClassifier(random_state=kf.random_state)
            NBclf = GaussianNB()
            foldScores = {
                "Decision_Tree": cross_val_score(DTclf, x_train, y_train, cv=kf, scoring='accuracy').mean(),
                "Naive_Bayes": cross_val_score(NBclf, x_train, y_train, cv=kf, scoring='accuracy').mean(),
            }

            try:
                KNNclf = KNeighborsClassifier(n_neighbors=3)
                foldScores["K_Nearest_Neighbour"] = cross_val_score(KNNclf, x_train, y_train, cv=kf, scoring='accuracy').mean()
            except ValueError:
                # Best effort: if KNN cannot be evaluated on this fold
                # (e.g. too few samples for 3 neighbours) it scores 0.
                foldScores["K_Nearest_Neighbour"] = 0

            # Keep the best per-fold score of every classifier.
            for modelName, foldAccuracy in foldScores.items():
                if foldAccuracy > accScoreDict[modelName]:
                    accScoreDict[modelName] = foldAccuracy

    BestSingleModel = max(accScoreDict, key=accScoreDict.get)
    BestSingleModelValue = accScoreDict[BestSingleModel]

    LPercent = len(lesserThan)/len(data)

    return BestSingleModel, BestSingleModelValue, LPercent

#### **Phase Two - Right Branch Split**

In [16]:
def phaseTwoRight(col, th, P2RootNode):
    """Score the candidate classifiers on the rows where ``df[col] >= th``.

    The subset is only evaluated when it has more than 20 rows. For each outer
    training fold of the notebook-level ``kf`` splitter, every classifier gets
    the mean accuracy of an inner ``cross_val_score`` run (also split with
    ``kf``); the best per-fold mean of each classifier is kept.

    Args:
        col: Name of the ``df`` column to split on.
        th: Threshold; rows at or above it form the right branch.
        P2RootNode: ``Node`` for the split. When the subset is evaluated, its
            right child is replaced by a new ``Node`` holding the training rows
            of the most recent outer fold.

    Returns:
        tuple: ``(BestSingleModel, BestSingleModelValue, RPercent)``: the name of
        the best classifier, its score, and the share of all rows (``len(data)``)
        that fall in the right branch. If the subset has 20 rows or fewer, the
        score is ``0`` and the name is simply the first key, ``"Decision_Tree"``.
    """
    greaterThan = np.array(df.loc[df[col] >= th])
    x = greaterThan[:, :-1]
    y = greaterThan[:, -1]

    accScoreDict = dict(Decision_Tree=0, Naive_Bayes=0, K_Nearest_Neighbour=0)

    if len(greaterThan) > 20:
        # Only the training indices of each outer fold are used.
        for train_index, _ in kf.split(x):
            x_train, y_train = x[train_index], y[train_index]

            P2RNode = Node()
            P2RNode.addValue(x_train)
            P2RootNode.addRight(P2RNode)

            # Seed the tree with the splitter's seed: an unseeded tree permutes
            # features at each split, so repeated calls with identical arguments
            # could otherwise return different scores.
            DTclf = DecisionTreeClassifier(random_state=kf.random_state)
            NBclf = GaussianNB()
            foldScores = {
                "Decision_Tree": cross_val_score(DTclf, x_train, y_train, cv=kf, scoring='accuracy').mean(),
                "Naive_Bayes": cross_val_score(NBclf, x_train, y_train, cv=kf, scoring='accuracy').mean(),
            }

            try:
                KNNclf = KNeighborsClassifier(n_neighbors=3)
                foldScores["K_Nearest_Neighbour"] = cross_val_score(KNNclf, x_train, y_train, cv=kf, scoring='accuracy').mean()
            except ValueError:
                # Best effort: if KNN cannot be evaluated on this fold
                # (e.g. too few samples for 3 neighbours) it scores 0.
                foldScores["K_Nearest_Neighbour"] = 0

            # Keep the best per-fold score of every classifier.
            for modelName, foldAccuracy in foldScores.items():
                if foldAccuracy > accScoreDict[modelName]:
                    accScoreDict[modelName] = foldAccuracy

    BestSingleModel = max(accScoreDict, key=accScoreDict.get)
    BestSingleModelValue = accScoreDict[BestSingleModel]

    RPercent = len(greaterThan)/len(data)

    return BestSingleModel, BestSingleModelValue, RPercent

#### **Phase One Vs Two Results**

In [17]:
# Exhaustive search over (feature, threshold) splits. For every split the best
# model of each branch is combined into one weighted score; the best split is
# then compared with the best single model from phase one.
bestModelDict = dict()
for col in df.drop(columns="class").columns:
    minValue = df[col].min()
    maxValue = df[col].max()

    P2RootNode = Node()
    P2RootNode.addValue(col)

    # Thresholds advance from the column minimum in steps of 1 (np.arange default).
    for th in np.arange(minValue, maxValue):
        # Evaluate each branch exactly once per threshold. Each call is a full
        # nested cross-validation, and calling it repeatedly let the printed
        # score, the stored score and the weighted value come from different runs.
        leftModel, leftValue, leftWeight = phaseTwoLeft(col, th, P2RootNode)
        rightModel, rightValue, rightWeight = phaseTwoRight(col, th, P2RootNode)

        # A score of 0 means that branch was too small to evaluate; only report
        # splits where both branches were scored.
        if leftValue != 0 and rightValue != 0:
            print(f"{col} < {th}: {leftModel}, {round(leftValue,3)}")
            print(f"{col} >= {th}: {rightModel}, {round(rightValue,3)}")

        leftBranch = leftModel, leftValue
        rightBranch = rightModel, rightValue
        LRWeight = leftWeight, rightWeight
        modelPair = leftModel + " & " + rightModel
        # Branch scores weighted by the share of rows that fall in each branch.
        value = (leftValue*leftWeight) + (rightValue*rightWeight)
        modelValueList = [modelPair, value, leftBranch, rightBranch, col, th, LRWeight]
        bestModelDict[col+str(th)] = modelValueList
    print("\n")

# bestList keeps the [key, entry] layout that later cells index into.
# max() returns the first entry with the highest weighted score, which matches
# the strict ">" scan it replaces.
bestKey = max(bestModelDict, key=lambda k: bestModelDict[k][1])
bestList = [bestKey, bestModelDict[bestKey]]

print(f"Model (Left): {bestList[1][4]} {bestList[1][5]}, {bestList[1][2][0]}, {round(bestList[1][2][1],3)}, {bestList[1][6][0]}")
print(f"Model (Right): {bestList[1][4]} {bestList[1][5]}, {bestList[1][3][0]}, {round(bestList[1][3][1],3)}, {bestList[1][6][1]}")

Phase1BestModel = phaseOne()
print(f"Model (Phase 1): {Phase1BestModel[0]}, {round(Phase1BestModel[1],3)}\n")

if bestList[1][1] > Phase1BestModel[1]:
    print(f"Best Model (Phase 2): {bestList[1][4]} {bestList[1][5]}, {bestList[1][0]}, {round(bestList[1][1],3)}")
else:
    print(f"Best Model (Phase 1): {Phase1BestModel}")

fixed acidity < 5.6: K_Nearest_Neighbour, 0.65
fixed acidity >= 5.6: Decision_Tree, 0.597
fixed acidity < 6.6: Naive_Bayes, 0.577
fixed acidity >= 6.6: Decision_Tree, 0.611
fixed acidity < 7.6: Decision_Tree, 0.603
fixed acidity >= 7.6: Decision_Tree, 0.585
fixed acidity < 8.6: Decision_Tree, 0.611
fixed acidity >= 8.6: Decision_Tree, 0.61
fixed acidity < 9.6: Decision_Tree, 0.609
fixed acidity >= 9.6: Decision_Tree, 0.532
fixed acidity < 10.6: Decision_Tree, 0.608
fixed acidity >= 10.6: Decision_Tree, 0.643
fixed acidity < 11.6: Decision_Tree, 0.614
fixed acidity >= 11.6: Decision_Tree, 0.679
fixed acidity < 12.6: Decision_Tree, 0.608
fixed acidity >= 12.6: Decision_Tree, 0.633






residual sugar < 1.9: Decision_Tree, 0.601
residual sugar >= 1.9: Decision_Tree, 0.602
residual sugar < 2.9: Decision_Tree, 0.61
residual sugar >= 2.9: Decision_Tree, 0.609
residual sugar < 3.8999999999999995: Decision_Tree, 0.615
residual sugar >= 3.8999999999999995: Decision_Tree, 0.581
residual sugar <

total sulfur dioxide >= 41.0: Decision_Tree, 0.655
total sulfur dioxide < 42.0: Decision_Tree, 0.558
total sulfur dioxide >= 42.0: Decision_Tree, 0.632
total sulfur dioxide < 43.0: Decision_Tree, 0.577
total sulfur dioxide >= 43.0: Decision_Tree, 0.678
total sulfur dioxide < 44.0: Decision_Tree, 0.556
total sulfur dioxide >= 44.0: Decision_Tree, 0.656
total sulfur dioxide < 45.0: Decision_Tree, 0.586
total sulfur dioxide >= 45.0: Decision_Tree, 0.637
total sulfur dioxide < 46.0: Decision_Tree, 0.576
total sulfur dioxide >= 46.0: Decision_Tree, 0.667
total sulfur dioxide < 47.0: Decision_Tree, 0.596
total sulfur dioxide >= 47.0: Decision_Tree, 0.645
total sulfur dioxide < 48.0: Decision_Tree, 0.574
total sulfur dioxide >= 48.0: Decision_Tree, 0.652
total sulfur dioxide < 49.0: Decision_Tree, 0.58
total sulfur dioxide >= 49.0: Decision_Tree, 0.663
total sulfur dioxide < 50.0: Decision_Tree, 0.599
total sulfur dioxide >= 50.0: Decision_Tree, 0.674
total sulfur dioxide < 51.0: Decision_Tre

total sulfur dioxide < 123.0: Decision_Tree, 0.579
total sulfur dioxide >= 123.0: Decision_Tree, 0.967
total sulfur dioxide < 124.0: Decision_Tree, 0.573
total sulfur dioxide >= 124.0: Decision_Tree, 1.0
total sulfur dioxide < 125.0: Decision_Tree, 0.594
total sulfur dioxide >= 125.0: Decision_Tree, 0.942
total sulfur dioxide < 126.0: Decision_Tree, 0.589
total sulfur dioxide >= 126.0: Decision_Tree, 0.933
total sulfur dioxide < 127.0: Decision_Tree, 0.585
total sulfur dioxide >= 127.0: Decision_Tree, 0.933
total sulfur dioxide < 128.0: Decision_Tree, 0.599
total sulfur dioxide >= 128.0: Decision_Tree, 0.933
total sulfur dioxide < 129.0: Decision_Tree, 0.576
total sulfur dioxide >= 129.0: Decision_Tree, 0.967
total sulfur dioxide < 130.0: Decision_Tree, 0.589
total sulfur dioxide >= 130.0: Decision_Tree, 0.933
total sulfur dioxide < 131.0: Decision_Tree, 0.59
total sulfur dioxide >= 131.0: Decision_Tree, 0.917
total sulfur dioxide < 132.0: Decision_Tree, 0.586
total sulfur dioxide >= 1

#### **Phase Three - Trial On Training Data**

In [18]:
def phaseThreeLeft(col, th, lmodel):
    """Cross-validate the chosen left-branch model on the rows where ``df[col] < th``.

    Args:
        col: Name of the ``df`` column to split on.
        th: Threshold; rows strictly below it form the left branch.
        lmodel: Model name: ``"Decision_Tree"``, ``"Naive_Bayes"`` or
            ``"K_Nearest_Neighbour"``.

    Returns:
        tuple: ``(returnModelVal, LPercent)``: the mean ``kf`` cross-validation
        accuracy of the model on the left subset, and the share of all rows
        (``len(data)``) in that subset. The accuracy is ``0`` when the subset
        has 20 rows or fewer or when ``lmodel`` is not a known name.
    """
    lesserThan = np.array(df.loc[df[col] < th])
    x = lesserThan[:, :-1]
    y = lesserThan[:, -1]

    LPercent = len(lesserThan)/len(data)

    # Classifiers are built lazily so only the requested one is constructed.
    modelFactories = {
        # Seeded with the splitter's seed so repeated calls with the same
        # arguments return the same score (an unseeded tree permutes features
        # at each split).
        "Decision_Tree": lambda: DecisionTreeClassifier(random_state=kf.random_state),
        "Naive_Bayes": lambda: GaussianNB(),
        "K_Nearest_Neighbour": lambda: KNeighborsClassifier(n_neighbors=3),
    }

    returnModelVal = 0
    if len(lesserThan) > 20 and lmodel in modelFactories:
        clf = modelFactories[lmodel]()
        returnModelVal = cross_val_score(clf, x, y, cv=kf, scoring='accuracy').mean()

    return returnModelVal, LPercent

In [19]:
def phaseThreeRight(col, th, rmodel):
    """Cross-validate the chosen right-branch model on the rows where ``df[col] >= th``.

    Args:
        col: Name of the ``df`` column to split on.
        th: Threshold; rows at or above it form the right branch.
        rmodel: Model name: ``"Decision_Tree"``, ``"Naive_Bayes"`` or
            ``"K_Nearest_Neighbour"``.

    Returns:
        tuple: ``(returnModelVal, RPercent)``: the mean ``kf`` cross-validation
        accuracy of the model on the right subset, and the share of all rows
        (``len(data)``) in that subset. The accuracy is ``0`` when the subset
        has 20 rows or fewer or when ``rmodel`` is not a known name.
    """
    greaterThan = np.array(df.loc[df[col] >= th])
    x = greaterThan[:, :-1]
    y = greaterThan[:, -1]

    RPercent = len(greaterThan)/len(data)

    # Classifiers are built lazily so only the requested one is constructed.
    modelFactories = {
        # Seeded with the splitter's seed so repeated calls with the same
        # arguments return the same score (an unseeded tree permutes features
        # at each split).
        "Decision_Tree": lambda: DecisionTreeClassifier(random_state=kf.random_state),
        "Naive_Bayes": lambda: GaussianNB(),
        "K_Nearest_Neighbour": lambda: KNeighborsClassifier(n_neighbors=3),
    }

    returnModelVal = 0
    if len(greaterThan) > 20 and rmodel in modelFactories:
        clf = modelFactories[rmodel]()
        returnModelVal = cross_val_score(clf, x, y, cv=kf, scoring='accuracy').mean()

    return returnModelVal, RPercent

In [20]:
# Re-evaluate the split chosen in phase two, using the model picked for each branch.
lmodel = bestList[1][2][0]
rmodel = bestList[1][3][0]
col = bestList[1][4]
th = bestList[1][5]

# Score each branch once and reuse the result, so the printed numbers are the
# ones that go into the overall score. The right branch is scored with its own
# model (rmodel); it was previously scored with lmodel in the overall figure.
leftScore, leftWeight = phaseThreeLeft(col, th, lmodel)
rightScore, rightWeight = phaseThreeRight(col, th, rmodel)

print("Based on Phase Two")
print(f"Best Left ({col} < {th}): {round(leftScore,3)}, {leftWeight}")
print(f"Best Right ({col} >= {th}): {round(rightScore,3)}, {rightWeight}")

# Overall accuracy: branch accuracies weighted by each branch's share of the rows.
phaseThreeBest = (leftScore * leftWeight) + (rightScore * rightWeight)
print(f"Overall Best ({col} {th}): {round(phaseThreeBest,3)}")

Based on Phase Two
Best Left (total sulfur dioxide < 16.0): 0.583, 0.1321084864391951
Best Right (total sulfur dioxide >= 16.0): 0.6, 0.8678915135608049
Overall Best (total sulfur dioxide 16.0): 0.599


#### **Baseline Test Comparison**

In [21]:
# Baselines: each classifier cross-validated on the full feature matrix with the
# same splitter, for comparison against the phase-three score.
DTclf = DecisionTreeClassifier()
NBclf = GaussianNB()
KNNclf = KNeighborsClassifier(n_neighbors=3)

baselineDT, baselineNB, baselineKNN = (
    cross_val_score(baselineClf, x, y, cv=kf, scoring='accuracy').mean()
    for baselineClf in (DTclf, NBclf, KNNclf)
)

print(f"Proposed Approach: {round(phaseThreeBest,3)}\nBaseline DT: {round(baselineDT,3)}\nBaseline NB: {round(baselineNB,3)}\nBaseline KNN: {round(baselineKNN,3)}")


Proposed Approach: 0.599
Baseline DT: 0.579
Baseline NB: 0.549
Baseline KNN: 0.497
