# Assignment 2 - Building a decision tree

This notebook builds a decision tree model for the housing price data set in `data/housing_price_train.csv` and applies it to the records in `data/housing_price_test.csv`.

In [51]:
import csv
import math
from statistics import median, mode, mean
from collections import Counter
from enum import Enum
import numpy as np
import scipy.stats as st


The input filenames are hard-coded.

In [52]:
# Load both CSV files as arrays of fixed-width strings ('<U20': at most 20
# characters per cell); columns are converted to float later, where needed.
# NOTE(review): cells longer than 20 characters would presumably be truncated
# by this dtype - confirm that no attribute value is that long.
trainingSet = np.loadtxt('data/housing_price_train.csv',dtype='<U20',delimiter=',')
testSet = np.loadtxt('data/housing_price_test.csv',dtype='<U20',delimiter=',')

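The attribute labels are read from the header row of the file (so they are in the same order as in the file!), and the records are transposed so that each row holds the values of one attribute.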

In [53]:
# Header row of the test file: one label per attribute column.
test_attributes = testSet[0]
# Drop the header row and transpose, so that each row holds one attribute and
# each column one record.
# NOTE: despite its name, trainingTarget holds the WHOLE training table; the
# target attribute is only its last row.
trainingTarget = np.transpose(np.copy(trainingSet[1:]))
# A single copy is enough here; the second np.copy of the same array was redundant.
testData = np.transpose(np.copy(testSet[1:]))
test_attr_names = np.copy(test_attributes)

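The target values: the last attribute (assumed to be the target) is converted to float and log10-transformed. Despite its name, `IDX_TARGET` holds these values rather than an index.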

In [54]:
# NOTE: despite its name, IDX_TARGET is not an index. It holds the log10 of the
# last row of trainingTarget (converted to float64), i.e. one value per data
# row of the training file. It is passed to buildTree() as the labels.
IDX_TARGET = np.log10(np.float64(trainingTarget[-1]))

The main class `DT` represents a node of the decision tree. Together with the helper functions defined in the same cell, it could provide:

  - a given impurity measure;
  - the search for the best attribute to split with;
  - the addition of a node to the tree;
  - a convenient model printer;
  - the recursive call for obtaining a tree;
  - a builder and an applier.

In [55]:
def stringToInt(attribute):
    '''
    Encode the distinct values of an attribute as consecutive integer codes.

    The distinct values are sorted (as done by ``np.unique``) and numbered
    0, 1, 2, ... in that order.

    :param attribute: 1-D numpy array of attribute values (typically strings)
    :return: list ``[uniq_vals, map_ints, val_counts, map_attr]`` with the
             sorted distinct values, their integer codes (a ``range``), the
             number of occurrences of each value, and a copy of ``attribute``
             in which every value is replaced by its code. The copy keeps the
             dtype of ``attribute``, so for a string array the codes are
             stored as strings such as ``'0'`` and ``'1'``.
    '''
    uniq_vals, val_counts = np.unique(attribute, return_counts=True)
    map_ints = range(len(uniq_vals))
    map_attr = np.copy(attribute)
    for code, value in zip(map_ints, uniq_vals):
        # Match against the untouched input, so that a code already written
        # into map_attr can never be mistaken for an original value.
        map_attr[attribute == value] = code
    return [uniq_vals, map_ints, val_counts, map_attr]
def probabilities(attribute):
    '''
    Relative frequency of every distinct value of an attribute.

    :param attribute: numpy array of attribute values
    :return: tuple ``(uniq_vals, val_probs)`` - the sorted distinct values and
             the fraction of entries of ``attribute`` equal to each of them
    '''
    distinct, occurrences = np.unique(attribute, return_counts=True)
    total = np.float64(attribute.size)
    # np.float64(...) is applied to the counts on purpose: the type check in
    # infogain() may depend on what this call returns.
    return distinct, np.float64(occurrences) / total
def entropy(attribute):
    '''
    Shannon entropy (in bits) of the value distribution of an attribute.

    Used by infogain() to score candidate split attributes.

    :param attribute: numpy array of attribute (or label) values
    :return: sum over the distinct values of ``p * log2(1 / p)``
    '''
    _, probs = probabilities(attribute)
    surprisal = np.log2(1. / probs)
    return np.sum(probs * surprisal)
def infogain(attribute, labels):
    '''
    Information gain obtained by partitioning ``labels`` on ``attribute``.

    Computed as ``H(labels) - sum_v p(v) * H(labels | attribute == v)``.

    :param attribute: numpy array with one attribute value per record
    :param labels: numpy array with one label per record (same length)
    :return: the information gain in bits; 0 for an attribute that has a
             single distinct value, because such an attribute does not
             partition the records at all
    '''
    uniq_vals, val_probs = probabilities(attribute)
    # probabilities() may hand back a scalar instead of an array when there is
    # only one distinct value (the previous code guarded against a non-array
    # here and returned the full label entropy, which made constant attributes
    # look like the best split). Normalising to 1-D lets the general formula
    # handle that case and yield the correct gain of 0.
    val_probs = np.atleast_1d(val_probs)
    label_entropy = entropy(labels)
    label_given_attr = [
        prob * entropy(labels[attribute == value])
        for value, prob in zip(uniq_vals, val_probs)
    ]
    return label_entropy - np.sum(label_given_attr)
def bestAttr(data, attribute_names, labels):
    '''
    Select the attribute whose split yields the largest information gain.

    :param data: iterable of attribute rows (one array of values per attribute)
    :param attribute_names: attribute labels, in the same order as ``data``
    :param labels: numpy array with one label per record
    :return: tuple ``(name, index)`` of the winning attribute; on ties the
             first attribute with the maximum gain wins
    '''
    gains = np.array([infogain(column, labels) for column in data], dtype=np.float64)
    winners = np.flatnonzero(gains == gains.max())
    maxind = winners[0]
    best_gain = gains[maxind]
    best_name = attribute_names[maxind]
    print('Max Gain:', best_gain)
    return best_name, maxind




class DT:
    '''
    A node of the decision tree.

    Besides the attributes set in ``__init__``, the tree-building and
    prediction functions of this notebook attach further attributes to nodes
    (for example ``split_attr``, ``threshold_val`` and ``avg_label``).
    '''

    def __init__(self, attr_split=None, parent=None):
        '''
        :param attr_split: label of the attribute this node splits on
        :param parent: parent node, or None for the root
        '''
        self.attr_split = attr_split
        self.parent = parent
        # Child nodes, in the order in which they were added.
        self.child = []

    def add_child(self, child):
        '''
        Append ``child`` to this node's children and make this node its parent.
        '''
        self.child.append(child)
        child.parent = self

    def __mean_squared_error(self, records):
        """
        Calculates mean squared error for a selection of records.

        NOTE(review): this method relies on ``self.data``, ``self.predict`` and
        on ``IDX_TARGET`` being usable as a column index; none of these is
        provided by this class - confirm before using it.

        :param records: Data records (given by indices)
        :return: mean of the squared differences between the target column of
                 ``self.data`` and of the predictions, over ``records``
        """
        predictions = self.predict(records)
        sqe = np.zeros(len(records))  # squared error per record
        for i, rec in enumerate(records):
            # Index with the single record `rec`: indexing with the whole
            # `records` collection yields an array that cannot be stored in
            # the scalar slot sqe[i].
            sqe[i] = (self.data[rec, IDX_TARGET] - predictions[rec, IDX_TARGET]) ** 2
        return mean(sqe)  # mean squared error
    
    
def pureAttr(vals):
    '''
    Tell whether all entries of ``vals`` share one and the same value.

    :param vals: array-like of attribute values
    :return: True if there is exactly one distinct value, False otherwise
             (including for empty input)
    '''
    distinct = np.unique(vals)
    return distinct.size == 1

def isFinite(arr1, arr2):
    '''
    Keep the positions at which both input arrays hold finite numbers.

    A position is dropped if either array holds NaN or +/-infinity there
    (previously only NaN was filtered, so infinities slipped through).

    :param arr1: 1-D numpy array of floats
    :param arr2: 1-D numpy array of floats, same length as ``arr1``
    :return: tuple ``(fin_arr1, fin_arr2, finds)`` - both arrays restricted to
             the kept positions, and the indices of those positions
    '''
    both_finite = np.isfinite(arr1) & np.isfinite(arr2)
    finds = np.where(both_finite)[0]
    return arr1[finds], arr2[finds], finds

def _slope_ratio_threshold(attr_sort, label_sort):
    '''
    Choose a numeric threshold by comparing linear fits on both sides of a cut.

    For every cut position, excluding the outer tenth of the records on each
    side, a straight line is fitted to the records left of the cut and to the
    records right of it. The cut with the largest ratio |left slope| /
    |right slope| wins.

    :param attr_sort: attribute values as floats, sorted ascending
    :param label_sort: labels as floats, in the same order as ``attr_sort``
    :return: one-element array holding the attribute value at the chosen cut
    :raises ValueError: if no cut yields a usable slope ratio
    '''
    n_records = len(attr_sort)
    margin = n_records // 10
    slopes_left = []
    slopes_right = []
    for cut in range(margin, n_records - margin):
        m_l, _ = np.polyfit(attr_sort[:cut], label_sort[:cut], 1)
        m_r, _ = np.polyfit(attr_sort[cut:], label_sort[cut:], 1)
        slopes_left.append(abs(m_l))
        slopes_right.append(abs(m_r))
    finite_left, finite_right, finite_inds = isFinite(
        np.array(slopes_left), np.array(slopes_right))
    with np.errstate(divide='ignore', invalid='ignore'):
        ratio = finite_left / finite_right
    # nanargmax raises ValueError for an empty or all-NaN ratio array.
    best = np.nanargmax(ratio)
    # finite_inds maps the filtered position back to a position in the slope
    # lists, which in turn start at cut position `margin`.
    cut = finite_inds[best] + margin
    return attr_sort[[cut]]


def split(attr, labels, attr_name, depth=0):
    '''
    Partition the records of a node on the values of one attribute.

    Nominal attributes, and numeric attributes with at most 10 distinct
    values, get one partition per distinct value. Other numeric attributes get
    a binary split at a threshold.

    :param attr: numpy array with the raw attribute value of every record
    :param labels: numpy array with the label of every record
    :param attr_name: label of the attribute (not used by the computation)
    :param depth: depth of the node being split (not used by the computation)
    :return: tuple ``(splits, threshold_val, threshold_type, split_ints, typ)``
             where ``splits`` is a list of one-element lists, each holding the
             boolean record mask of one partition; ``threshold_val`` is a
             one-element array (``[-99]`` for value splits);
             ``threshold_type`` is ``'NA'`` for value splits and
             ``'linsplit'`` for threshold splits; ``split_ints`` lists the
             attribute value of each partition (empty for threshold splits);
             ``typ`` is ``'num'`` or ``'nom'``
    '''
    try:
        attr_dat = np.float64(attr)
    except (ValueError, TypeError):
        # Not convertible to float: treat as nominal and work on integer codes.
        uniq_vals, map_ints, val_counts, map_attr = stringToInt(attr)
        attr_dat = np.copy(np.float64(map_attr))
        typ = 'nom'
        print('Map Attr', map_attr)
    else:
        uniq_vals, probs = probabilities(attr)
        typ = 'num'

    if typ == 'nom' or len(uniq_vals) <= 10:
        # One partition per distinct raw value.
        splits = [[attr == value] for value in uniq_vals]
        split_ints = list(uniq_vals)
        return splits, np.array([-99]), 'NA', split_ints, typ

    try:
        labels = np.float64(labels)
        order = np.argsort(attr_dat)
        threshold_val = _slope_ratio_threshold(attr_dat[order], labels[order])
    except Exception:
        # Best effort: when the regression-based search fails for any reason
        # (degenerate fits, no usable ratio, non-numeric labels), split at the
        # mean of the attribute instead.
        threshold_val = np.array([np.mean(attr_dat)])
    # Same comparison operators as predictlabels() uses for numeric
    # attributes, so that training and prediction route records identically.
    split_l = [attr_dat <= threshold_val]
    split_r = [attr_dat > threshold_val]
    return [split_l, split_r], threshold_val, 'linsplit', [], typ

def print_tree(tree):
    '''
    Print the split description of a single tree node.

    Prints, one per line, the node's split attribute, its threshold value and
    its average label.

    :param tree: node carrying ``split_attr``, ``threshold_val`` and
                 ``avg_label`` attributes
    '''
    for field in ('split_attr', 'threshold_val', 'avg_label'):
        print(getattr(tree, field))

def buildTree(dat, attribute_names, labels, root=None, depth=0):
    '''
    Recursively build a decision tree.

    At every node the attribute with the highest information gain is chosen,
    the records are partitioned on it, and one child is built per partition
    from the remaining attributes. A node becomes a leaf when it holds fewer
    than 5 records, when no attribute is left, or when the chosen attribute
    has a single distinct value. Every node stores the mean label of its
    records in ``avg_label``; a node without records inherits its parent's.

    :param dat: 2-D numpy array, one row per attribute and one column per record
    :param attribute_names: numpy array of attribute labels, aligned with the
                            rows of ``dat``
    :param labels: numpy array with one numeric label per record
    :param root: node to populate; a new ``DT`` node is created when omitted
    :param depth: depth of ``root`` in the tree
    :return: the populated node (``root``)
    :raises ValueError: if called without records for a node that has no parent
    '''
    if root is None:
        root = DT()

    n_records = len(dat[0]) if len(dat) else len(labels)
    if n_records == 0:
        if root.parent is None:
            raise ValueError('cannot build a decision tree from an empty data set')
        # Nothing to average over: reuse the parent's prediction.
        root.avg_label = root.parent.avg_label
        return root
    if n_records < 5 or len(dat) == 0:
        # Too few records, or no attribute left to split on.
        root.avg_label = np.mean(labels)
        return root

    bestattr, maxind = bestAttr(dat, attribute_names, labels)
    root.split_attr = bestattr
    root.split_attr_ind = maxind
    attr_data = np.copy(dat[maxind])
    if pureAttr(attr_data):
        root.avg_label = np.mean(labels)
        return root

    splits, threshold_val, threshold_type, split_ints, typ = split(
        attr_data, labels, bestattr, depth=depth)
    root.split_ints = split_ints
    root.threshold_val = threshold_val
    root.threshold_type = threshold_type
    root.typ = typ
    # Set before recursing: children without records read it from their parent.
    root.avg_label = np.mean(labels)

    children = []
    for partition in splits:
        child = DT(parent=root)
        root.add_child(child)
        # Each entry of `splits` is a one-element list holding a boolean mask.
        child.inds_filt = partition[0]
        children.append(child)

    # Plain boolean mask: wrapping it in a list is not a valid array index.
    remaining = attribute_names != bestattr
    red_attr_names = attribute_names[remaining]
    red_data = dat[remaining]
    for child in children:
        buildTree(red_data[:, child.inds_filt], red_attr_names,
                  labels[child.inds_filt], root=child, depth=depth + 1)
    return root
# Global lookup filled by predictlabels(): maps each value of the first data
# row (presumably the record id - confirm) to its predicted label.
targetId = {}
# NOTE(review): `data` and `attr_names` are not defined in any cell visible in
# this notebook - presumably the training attribute rows and their labels.
# Define them explicitly so that Restart & Run All works.
tree = buildTree(data,attr_names, IDX_TARGET)


def _lookup_labels(tids):
    '''
    Collect the predictions recorded in the global ``targetId`` for ``tids``.

    An id without a recorded prediction gets the median of all predictions
    recorded so far.
    '''
    fallback = None
    labs = []
    for tid in tids:
        if tid in targetId:
            labs.append(targetId[tid])
        else:
            if fallback is None:
                fallback = np.median(list(targetId.values()))
            labs.append(fallback)
    return labs


def predictlabels(dat, attr_names, dTree, predTree=None, labels=None):
    '''
    Predict target/class values by routing records down a trained tree.

    The records are partitioned at every node in the same way as during
    training, until they reach a leaf, whose ``avg_label`` becomes their
    prediction. Predictions are also recorded in the global ``targetId``
    dictionary, keyed by the values of the first row of ``dat``.

    :param dat: 2-D numpy array, one row per attribute and one column per
                record; the first row identifies the records
    :param attr_names: numpy array of attribute labels, aligned with the rows
                       of ``dat``
    :param dTree: node of the trained tree to start from
    :param predTree: node of a parallel tree that records how the records were
                     routed; a new ``DT`` node is created when omitted
    :param labels: passed through to the recursive calls unchanged
    :return: list with the prediction of every record of ``dat``, in order
             (empty if ``dat`` holds no records)
    '''
    tids = dat[0]
    if len(tids) == 0:
        return []
    if labels is None or len(labels) == 0:
        labels = np.ones_like(tids)
    if len(dTree.child) == 0:
        # Leaf: every record that arrived here gets the leaf's mean label.
        for tid in tids:
            targetId[tid] = dTree.avg_label
        return _lookup_labels(tids)
    if predTree is None:
        predTree = DT()

    attr_split = dTree.split_attr
    attr_split_ind = np.where(attr_names == attr_split)[0]
    attr_dat = dat[attr_split_ind][0]
    thresh_type = dTree.threshold_type
    predTree.attr_split = attr_split

    if thresh_type == 'NA':
        # Value split: training partitioned on the RAW attribute values and
        # stored them in split_ints, so the raw values must be compared here
        # too. Comparing float-converted data against them never matches.
        splits = [np.where(attr_dat == val)[0] for val in dTree.split_ints]
    else:
        thresh_val = np.float64(dTree.threshold_val[0])
        try:
            attr_data = np.float64(attr_dat)
        except (ValueError, TypeError):
            # Not convertible to float: fall back to integer codes.
            attr_data = np.copy(np.float64(stringToInt(attr_dat)[3]))
        if thresh_type == '0':
            splits = [np.where(attr_data == 0)[0],
                      np.where(attr_data > 0)[0]]
        elif thresh_type == 'linsplit':
            if dTree.typ == 'nom':
                splits = [np.where(attr_data < thresh_val)[0],
                          np.where(attr_data >= thresh_val)[0]]
            else:
                splits = [np.where(attr_data <= thresh_val)[0],
                          np.where(attr_data > thresh_val)[0]]
        else:
            # Unknown split type: route nothing; all records fall back to the
            # median prediction below.
            splits = []

    for child_node, record_inds in zip(dTree.child, splits):
        branch = DT(parent=predTree)
        predTree.add_child(branch)
        branch.label_inds = record_inds
        branch.dtree = child_node
        branch.avg_label = dTree.avg_label
        predictlabels(dat[:, record_inds], attr_names, child_node,
                      predTree=branch, labels=labels)
    return _lookup_labels(tids)
# Predicted label (on the log10 scale of IDX_TARGET) for every test record, in
# the order of testData[0]; main() converts it back with 10**p[i].
p = predictlabels(testData, test_attr_names,tree)




Max Gain: 8.045796138342293




Max Gain: 7.446101489756081
Max Gain: 6.524561605715065
Max Gain: 6.497662618424865
Max Gain: 5.65645677569764
Max Gain: 5.631615665225583
Max Gain: 5.8574319218048085
Max Gain: 6.595891163108825
Max Gain: 7.149981774453494
Max Gain: 7.43752996644357
Max Gain: 7.331770310806017
Max Gain: 7.21647843135994
Max Gain: 7.813184524595746
Max Gain: 7.126460214904276
Max Gain: 6.247495789386386
Max Gain: 5.94770277922009
Max Gain: 6.058984089445426
Max Gain: 6.87496155606346
Max Gain: 7.164965687925738
Max Gain: 6.457972024550777
Max Gain: 6.11249200111032
Max Gain: 5.304682449772211
Max Gain: 4.566108939837479
Max Gain: 4.436605434317882
Max Gain: 3.321928094887362
Max Gain: 3.7004397181410926
Max Gain: 6.364489555653822
Max Gain: 5.720049960644811
Max Gain: 5.155399311574898


In [56]:
import pandas as pd
def createSubmission(test_ids, predictions):
    '''
    Write the predictions to ``submission.csv`` in the working directory.

    The file has a header line and the two columns ``Id`` and ``SalePrice``;
    no index column is written.

    :param test_ids: list of record ids, one per prediction
    :param predictions: list of predicted values, aligned with ``test_ids``
    '''
    columns = {'Id': test_ids, 'SalePrice': predictions}
    pd.DataFrame(columns).to_csv('submission.csv', index=False)

Finally, the main function writes the predictions for the unseen test records to a submission file and prints the root node of the decision tree model.

In [57]:
def main():
    '''
    Write the test predictions to the submission file and print the root node.

    Reads the notebook-level variables ``testData`` (record ids in its first
    row), ``p`` (log10 predictions, one per test record) and ``tree`` (the
    trained decision tree).
    '''
    test_ids = [str(record_id) for record_id in testData[0]]
    # p holds log10 values; raise 10 to them to get back to the original scale.
    predictions = [str(10 ** p[i]) for i in range(len(test_ids))]

    createSubmission(test_ids, predictions)

    # The trained tree is stored in `tree`; `dT` is not defined anywhere in
    # this notebook.
    print_tree(tree)

if __name__ == "__main__":
    main()

LotArea
[13860.]
5.221978956475628


### 