In [1]:
import csv
import numpy as np
import math
import pandas as pd

In [2]:
def read_data(filename):
    with open(filename, 'r') as csvfile:
        datareader = csv.reader(csvfile, delimiter=',')
        headers = next(datareader)
        metadata = []
        traindata = []
        for name in headers:
            metadata.append(name)
        for row in datareader:
            traindata.append(row)
    return (metadata, traindata)


In [3]:
class Node:
    def __init__(self, attribute):
        self.attribute = attribute
        self.children = []
        self.answer = "" #answer is the resultant classification

    def __str__(self):
        return self.attribute

In [4]:
def subtables(data, col, delete):
    dict = {}
    items = np.unique(data[:, col]) 
    #find the classes in that column of data into items[]
    #now items is a 1-D array hence items[0] contains no. of rows each representing an unique class
    count = np.zeros((items.shape[0], 1), dtype=np.int32)    
    #make a matrix of zeroes
    
    for x in range(items.shape[0]):
        #for every class in the column
        for y in range(data.shape[0]):
            #for example in the data 
            if data[y, col] == items[x]:
                #count the no. of examples representing each class in that column
                count[x] += 1
                
    for x in range(items.shape[0]):
        #for every class 
        dict[items[x]] = np.empty((int(count[x]), data.shape[1]), dtype="|S32") # "S32" means array containing string
        #np.empty(shape tuple, datatype) create an array specified shape in the default of that datatype
        #shape = (int(count[x]), data.shape[1]) is (integer(no. of examples for that class) , total_cols in data)
        pos = 0
        for y in range(data.shape[0]):
            #for every example in data
            if data[y, col] == items[x]:
                #if the class of that example in that attribut(col) matches 
                dict[items[x]][pos] = data[y]
                #overwrite the empty matrix's row value of that key in dictionary with whole example
                pos += 1    
                #traverse to the next row in the matrix
        if delete:
            #delete was a parameter from the calling method set to default false
            #this case executes only if ....
            dict[items[x]] = np.delete(dict[items[x]], col, 1)
            #np.delete(array , col , axis) returns the result after removing either a row(axis = 0) or a column(axis=1)
            #in this case an entire column is delete for it is processed
            #the result of delete operation is updated as value in the dictionary
            
            #finally return the list of classes and the associated dictionary
    return items, dict

In [5]:
def entropy(S):
    items = np.unique(S)


    if items.size == 1:
        return 0
    counts = np.zeros((items.shape[0], 1))
    sums = 0
    for x in range(items.shape[0]):
        counts[x] = sum(S == items[x]) / (S.size * 1.0)
    for count in counts:
        sums += -1 * count * math.log(count, 2)
    return sums

In [6]:
def gain_ratio(data, col):
    items, dict = subtables(data, col, delete=False) #first find the unique classes for that col hence make a subtable
    #items = list of unique classes in that columns
    #dict is a dictionary of key,value where key is a class and value is the matrix of example representing the key at that column
    
    total_size = data.shape[0] #total no. of examples
    entropies = np.zeros((items.shape[0], 1))
    #creating a zeroes matrix as [[0],[0],[0],[0],...n] shape = list of n arrays coantaining only one element 0
    #created to represent entropy for every class
    intrinsic = np.zeros((items.shape[0], 1))
    #created to represent intinsic for every class
    
    for x in range(items.shape[0]):
        #for every class in that col
        ratio = dict[items[x]].shape[0]/(total_size * 1.0) #no of examples representing that class in the col / no. of examples in data
        entropies[x] = ratio * entropy(dict[items[x]][:, -1]) #multiply ratio to every row of entropy 2-D array except for last col
                                                              #entropy() user defined method return the entropy value
        intrinsic[x] = ratio * math.log(ratio, 2) 

    total_entropy = entropy(data[:, -1]) #assign the resultant entropy of all the classes
    iv = -1 * sum(intrinsic) #summation of all values in a 1-D array
    for x in range(entropies.shape[0]):
        #for every class subtract the actual entropy form the total
        total_entropy -= entropies[x] #Note: total was a sum of ratio*entropy 
        #total = sum(ratio * entropy) - entropy
        
    #finally return the profitablility score for the attribute(i.e. col)
    return total_entropy / iv

In [7]:
def create_node(data, metadata):
        #unique values in the last column of the matrix 
        #representing 2-D matrix data[every row][last] as data[:,-1]
    if (np.unique(data[:, -1])).shape[0] == 1:
        #case represents only a constant value for the attribute throughout the experienced dataset
        node = Node("")#Node's attribute currently null string
        node.answer = np.unique(data[:, -1])[0]#overwriting the answer attribute with the only class value hence [0]
        return node 
    
    gains = np.zeros((data.shape[1] - 1, 1))
    #shape is an attribute of that particular datatype that returns a tuple(rows,columns)
    #tuple[1] extracts the value at index 1
    #np.zeros fills the marix of shape mentioned in tuple with zeroes
    
    for col in range(data.shape[1] - 1):
        #range is from 0 to (mentioned_no - 1)
        gains[col] = gain_ratio(data, col)
        #to find out the most profitable columns in the table
        #gain_ratio returned the profitablility score for that particular attribute(i.e. col)
        
    split = np.argmax(gains)
    # return the index( in this case the col pos) of the max value 1-D array. https://docs.scipy.org/doc/numpy/reference/generated/numpy.argmax.html

    node = Node(metadata[split])    
    #Create a new node with column name of the max entropy i.e. variable split
    metadata = np.delete(metadata, split, 0)
    #delete the column of max entropy i.e. split which just been processed. The remainder array is overwritten into metadata
    
    items, dict = subtables(data, split, delete=True)
    #now retrieve all the classes of the most profitabel column 
    for x in range(items.shape[0]):
        #every class is appended as child
        child = create_node(dict[items[x]], metadata) #child node is created
        node.children.append((items[x], child)) #append the child node into the children list of the most profitable col node
        
    #finally return the node with its children
    return node

In [8]:
def empty(size):
    s = ""
    for x in range(size):
        s += "   " # this is mostly for user friendly spacing for clarity in tree structuring
    return s

In [9]:
def print_tree(node, level):
    if node.answer != "":
        #if the node's classification is null string
        print(empty(level), node.answer)
        #conclude that the decision tree is empty
        return
    
    print(empty(level), node.attribute)
    #with printing every level's attribute of the tree, empty the level 
    for value, n in node.children:
        #for every attribute and node of the child
        print(empty(level + 1), value)
        print_tree(n, level + 2)

In [10]:
metadata, traindata = read_data("tennis.data")
print(metadata)
print("\n",traindata,"\n")
data = np.array(traindata)

node = create_node(data, metadata)
#the most profitable col node is crreated appended to its children
print_tree(node, 0)
#now print the total tree with the most profitable column as a root

['outlook', 'temperature', 'humidity', 'wind', 'answer']

 [['sunny', 'hot', 'high', 'weak', 'no'], ['sunny', 'hot', 'high', 'strong', 'no'], ['overcast', 'hot', 'high', 'weak', 'yes'], ['rain', 'mild', 'high', 'weak', 'yes'], ['rain', 'cool', 'normal', 'weak', 'yes'], ['rain', 'cool', 'normal', 'strong', 'no'], ['overcast', 'cool', 'normal', 'strong', 'yes'], ['sunny', 'mild', 'high', 'weak', 'no'], ['sunny', 'cool', 'normal', 'weak', 'yes'], ['rain', 'mild', 'normal', 'weak', 'yes'], ['sunny', 'mild', 'normal', 'strong', 'yes'], ['overcast', 'mild', 'high', 'strong', 'yes'], ['overcast', 'hot', 'normal', 'weak', 'yes'], ['rain', 'mild', 'high', 'strong', 'no']] 

 outlook
    overcast
       b'yes'
    rain
       wind
          b'strong'
             b'no'
          b'weak'
             b'yes'
    sunny
       humidity
          b'high'
             b'no'
          b'normal'
             b'yes'


In [11]:
data

array([['sunny', 'hot', 'high', 'weak', 'no'],
       ['sunny', 'hot', 'high', 'strong', 'no'],
       ['overcast', 'hot', 'high', 'weak', 'yes'],
       ['rain', 'mild', 'high', 'weak', 'yes'],
       ['rain', 'cool', 'normal', 'weak', 'yes'],
       ['rain', 'cool', 'normal', 'strong', 'no'],
       ['overcast', 'cool', 'normal', 'strong', 'yes'],
       ['sunny', 'mild', 'high', 'weak', 'no'],
       ['sunny', 'cool', 'normal', 'weak', 'yes'],
       ['rain', 'mild', 'normal', 'weak', 'yes'],
       ['sunny', 'mild', 'normal', 'strong', 'yes'],
       ['overcast', 'mild', 'high', 'strong', 'yes'],
       ['overcast', 'hot', 'normal', 'weak', 'yes'],
       ['rain', 'mild', 'high', 'strong', 'no']], dtype='<U8')

In [12]:
print("Numpy Unique",np.unique(data[:, -1]))

Numpy Unique ['no' 'yes']


In [13]:
type(data) , data.shape

(numpy.ndarray, (14, 5))

shape is an attribute of the datatype

In [14]:
np.unique(data[1]).shape

(5,)