In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import string
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, confusion_matrix
from statistics import mean
from collections import Counter
from copy import deepcopy
import operator
from math import log2
from sklearn.model_selection import train_test_split, KFold

%matplotlib inline

In [2]:
df = pd.read_csv('ensemble_data.csv')
df.head()

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [3]:
db = []

for i ,r in df.iterrows():
    db.append(r)

[type                        p
 cap_shape                   x
 cap_surface                 s
 cap_color                   n
 bruises                     t
 odor                        p
 gill_attachment             f
 gill_spacing                c
 gill_size                   n
 gill_color                  k
 stalk_shape                 e
 stalk_root                  e
 stalk_surface_above_ring    s
 stalk_surface_below_ring    s
 stalk_color_above_ring      w
 stalk_color_below_ring      w
 veil_type                   p
 veil_color                  w
 ring_number                 o
 ring_type                   p
 spore_print_color           k
 population                  s
 habitat                     u
 Name: 0, dtype: object, type                        e
 cap_shape                   x
 cap_surface                 s
 cap_color                   y
 bruises                     t
 odor                        a
 gill_attachment             f
 gill_spacing                c
 gill_size     

In [4]:
Y = df.iloc[:, 0].values
X = df
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=100)

In [5]:
header = ['Label', 'Text', 'Length', 'Unigram', 'Bigram', 'Trigram']
class Question:
    def __init__(self, column, value):
        self.column = column
        self.value = value

    def match(self, example):
        val = example[self.column]        
        return self.value == val

    def __repr__(self):
        condition = "contains"
        return "Does %s %s %s?" % (
            "Col" + str(self.column), condition, str(self.value))

In [6]:
def class_counts(rows):
    counts = {}
    for row in rows:
        label = row[0]
        if label not in counts:
            counts[label] = 0
        counts[label] += 1
    return counts

In [7]:
def gini(rows):
    counts = class_counts(rows)
    impurity = 1
    for lbl in counts:
        prob_of_lbl = counts[lbl] / float(len(rows))
        impurity -= prob_of_lbl**2
    return impurity

In [8]:
def info_gain(left, right, current_uncertainty, func=gini):
    p = float(len(left)) / (len(left) + len(right))
    return current_uncertainty - p * func(left) - (1 - p) * func(right)

In [9]:
class Leaf:
    def __init__(self, rows):
        self.predictions = class_counts(rows)

In [10]:
class Decision_Node:
    def __init__(self,
                 question,
                 true_branch,
                 false_branch):
        self.question = question
        self.true_branch = true_branch
        self.false_branch = false_branch

In [11]:
def partition(rows, question):
    trueRows = []
    falseRows = []
    
    for r in rows:
        if question.match(r):
            trueRows.append(r)
        else:
            falseRows.append(r)
    
    return trueRows, falseRows

In [12]:
def findBestSplit(rows, questions, func):   
    best_gain = 0
    best_question = None
    current_uncertainty = func(rows)
    
    for q in questions:
        trueRows, falseRows = partition(rows, q)
        if len(trueRows) == 0 or len(falseRows) == 0:
            continue
        
        gain = info_gain(trueRows, falseRows, current_uncertainty, func)
        
        if gain >= best_gain:
            best_gain, best_question = gain, q
    
    return best_gain, best_question   

In [13]:
def formTree(rows, questions, func):
    gain, question = findBestSplit(rows, questions, func)
       
    if gain == 0:
        return Leaf(rows)
    
    trueRows, falseRows = partition(rows, question)
    questions.remove(question)
    
    trueBranch = formTree(trueRows, questions, func)
    falseBranch = formTree(falseRows, questions, func)
    
    return Decision_Node(question, trueBranch, falseBranch)

In [14]:
def classifyRow(node, row):
    if isinstance(node, Leaf):
        return node.predictions
    
    if node.question.match(row):
        return classifyRow(node.true_branch, row)
    else:
        return classifyRow(node.false_branch, row)

In [15]:
def train(data, questions, func):
    return formTree(data, deepcopy(questions), func)

In [16]:
def classify(root, rows):
    predictions = []
    for r in rows:
        predictions.append(max(classifyRow(root, r).items(), key=operator.itemgetter(1))[0])
    return predictions

In [17]:
def getDataInIndex(data, index):
    l = []
    for i in range(len(data)):
        if i in index:
            l.append(data[i])
    return l

In [18]:
def getActualLabels(act_data):
    act_labels = []
    for d in act_data:
        act_labels.append(d[0])
    return act_labels

In [19]:
def get_unique_vals(X_train, index):
    ans = []
    for i, r in X_train.iterrows():
#         print(r)
        ans.append(r[index])
    
    return set(ans)

In [20]:
questions = []

for i in range(1, X_train.shape[1]):
    unique_vals = get_unique_vals(X_train,int(i))
    for val in unique_vals:
        questions.append(Question(i,val))

In [21]:
len(questions)

117

In [33]:
db_train = X_train.values.tolist()
db_test = X_test.values.tolist()

In [34]:
kfold = KFold(5, True, 1)
precision = []
recall = []
f_score = []
i = 0

for trainInd,testInd in kfold.split(db_train):
    train_data = getDataInIndex(db_train, trainInd)
    test_data = getDataInIndex(db_train, testInd)
    
    root = train(train_data, questions, gini)
    
    prediction = classify(root, test_data)
        
    actual = getActualLabels(test_data)
    predicted = prediction
    
    precision.append(precision_score(actual, predicted, average='macro'))
    recall.append(recall_score(actual, predicted, average='macro'))
    f_score.append(f1_score(actual, predicted, average='macro'))
     
    print("Training...")

print("Precision Score = " + str(mean(precision)))
print("Recall Score = " + str(mean(recall)))
print("F Score = " + str(mean(f_score)))

Training...
Training...
Training...
Training...
Training...
Precision Score = 0.9997093023255814
Recall Score = 0.9996742671009772
F Score = 0.9996913074553999
