# BT2101 Tutorial 1: Decision Tree

### Import required libraries

In [100]:
## Import required libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from math import sqrt, log
from __future__ import division
from collections import defaultdict
%matplotlib inline

## Questions 1 & 2

### Function to calculate entropy with 2 classes

In [101]:
def entropy_two_only(sample_labels):
    '''Entropy (base 2) of the labels in one tree node, for a two-class problem.

    The input is the label vector of a single segment of the attribute being
    split on, e.g. when splitting by gender, the labels of all males.

    Inputs:
    1) sample_labels: Labels for samples in the current tree node, such as
       (1, 0, 0, 1, 0) or (1, -1, -1, 1). Only the first two distinct values
       (in np.unique order) enter the probability computation.
    Outputs:
    1) entropy: Entropy value of the labels in the current tree node;
       0 for an empty node or a node whose labels are all the same.
    '''
    labels = np.array(sample_labels)

    # An empty node carries no uncertainty.
    if labels.size == 0:
        return 0

    # Distinct classes present; usually (0, 1), sometimes (-1, 1).
    distinct = np.unique(labels)

    # Head-count for the first class and, when present, the second class.
    first_count = sum(1 for value in labels if value == distinct[0])
    if distinct.size > 1:
        second_count = sum(1 for value in labels if value == distinct[1])
    else:
        second_count = 0

    # A pure node (every sample in one class) has zero entropy.
    if labels.size in (first_count, second_count):
        return 0

    # Class probabilities, then the two-term entropy formula.
    first_share = first_count / (first_count + second_count)
    second_share = 1 - first_share
    return -(first_share*log(first_share, 2) + second_share*log(second_share, 2))

### General function to calculate entropy

In [102]:
def entropy(sample_labels):
    """Compute the Shannon entropy (base 2) of a vector of class labels.

    Input: A vector of sample labels e.g. [A,B,A,C,C,C,A,B,B,A,C,B,A,A].
       This has to be the labels for one particular segment of the attribute
       we are splitting on, e.g. if splitting on the gender attribute, the
       labels for all males, OR for all females, etc.
    Output: The entropy of the labels. It is 0 for an empty vector or a
       vector holding a single class, and at most log2(k) when k distinct
       classes are present (so at most 1 only in the two-class case).

    Note: classes are counted with np.unique, so clean missing values (NaN)
    beforehand; how they are grouped follows np.unique's semantics.
    """
    sample_labels = np.asarray(sample_labels)

    # Return 0 if there are no labels at all.
    total = sample_labels.size
    if total == 0:
        return 0

    # One vectorised pass yields the count of every class that is present,
    # instead of re-scanning the whole vector once per class.
    _, class_counts = np.unique(sample_labels, return_counts=True)

    # Accumulate -p*log2(p) class by class (in np.unique order) with math.log,
    # so results are numerically identical to a plain per-class loop.
    total_entropy = 0
    for count in class_counts.tolist():
        proportion = count / total
        total_entropy -= proportion*log(proportion, 2)

    return total_entropy

### Manually calculate entropy for genders (experimental section - can ignore)

In [103]:
# Load the Titanic data set.
df = pd.read_csv('titanic_final.csv')

# Entropy of the survival labels before any split.
overall_entropy = entropy(df["Survived"])

# Partition the passengers on the Gender attribute.
male = df.loc[df["Gender"].eq("Male")]
female = df.loc[df["Gender"].eq("Female")]

# Entropy of the survival labels inside each partition.
entropy_male = entropy(male["Survived"])
entropy_female = entropy(female["Survived"])

# Average of the two partition entropies, weighted by partition size.
weighted_entropy = (len(male)*entropy_male + len(female)*entropy_female) / len(df)


### Function to calculate information gain

In [104]:
def calc_info_gain(df, selected_feature, label, method):
    """Weighted impurity after splitting on one feature, and the resulting gain.

       df: The whole dataframe.
       selected_feature: Name of the column which we want to split on.
       label: Name of the column which contains the labels.
       method: The impurity function to use, e.g. entropy or gini_index. It is
           called with a vector of labels and must return a number.

       Returns a tuple (weighted_measure, info_gain), where weighted_measure is
       the size-weighted average of method(...) over the segments of the
       selected feature, and info_gain is method(all labels) minus that
       average. Rows whose feature value is missing (NaN/None) are treated as
       one extra segment, so that the segment sizes always add up to len(df).
       For an empty dataframe the result is (0, method(all labels))."""

    total_rows = len(df)
    initial_measure = method(df[label])    # impurity of the labels before splitting

    # Nothing to split on; returning here also avoids dividing by zero below.
    if total_rows == 0:
        return (0, initial_measure)

    feature_values = df[selected_feature]
    is_missing = feature_values.isna()

    weighted_measure = 0   # running sum of (segment impurity * segment size)

    # One segment per distinct non-missing value, e.g. [male, female].
    for segment in feature_values[~is_missing].unique():
        labels_for_segment = df.loc[feature_values == segment, label]
        weighted_measure += method(labels_for_segment) * len(labels_for_segment)

    # `value == NaN` never matches, so rows with a missing feature value must be
    # collected explicitly; otherwise they would count towards total_rows while
    # contributing nothing to the sum, which inflates the reported gain.
    if is_missing.any():
        labels_for_missing = df.loc[is_missing, label]
        weighted_measure += method(labels_for_missing) * len(labels_for_missing)

    weighted_measure /= total_rows
    info_gain = initial_measure - weighted_measure

    return (weighted_measure, info_gain)


    

### Calculate information gain for all attributes using entropy

In [105]:
# Reload the data set; the first column is skipped and the last column is used as the label.
df = pd.read_csv("titanic_final.csv")
attributes = df.columns[1:-1]

# Report (weighted entropy, information gain) for every candidate attribute.
for attribute in attributes:
    print("{} : {}".format(attribute, calc_info_gain(df, attribute, df.columns[-1], entropy)))

Age Class : (0.8675277182987636, 0.10342287615590495)
Passenger Class : (0.8172018848211989, 0.15374870963346965)
Gender : (0.5619639695247292, 0.4089866249299394)
No of Siblings or Spouses on Board : (0.7834701673945229, 0.18748042706014567)
No of Parents or Children on Board : (0.9654839523229165, 0.005466642131752075)


## Question 3

### General function to calculate Gini Index

In [106]:
def gini_index(sample_labels):
    """Compute the Gini index (1 - sum of squared class proportions) of a label vector.

    Input: A vector of sample labels e.g. [A,B,A,C,C,C,A,B,B,A,C,B,A,A].
       This has to be the labels for one particular segment of the attribute
       we are splitting on, e.g. if splitting on the gender attribute, the
       labels for all males, OR for all females, etc.
    Output: The Gini index of the labels. It is 0 for an empty vector or a
       vector holding a single class.

    Note: classes are counted with np.unique, so clean missing values (NaN)
    beforehand; how they are grouped follows np.unique's semantics.
    """
    sample_labels = np.asarray(sample_labels)

    # Return 0 if there are no labels at all.
    total = sample_labels.size
    if total == 0:
        return 0

    # One vectorised pass yields the count of every class that is present,
    # instead of re-scanning the whole vector once per class.
    _, class_counts = np.unique(sample_labels, return_counts=True)

    # Sum the squared class proportions in np.unique order, so results are
    # numerically identical to a plain per-class loop.
    sum_of_squares = 0
    for count in class_counts.tolist():
        proportion = count / total
        sum_of_squares += proportion**2

    return 1 - sum_of_squares

### Calculate information gain for all attributes using Gini Index

In [107]:
# Reload the data set; the first column is skipped and the last column is used as the label.
df = pd.read_csv("titanic_final.csv")
attributes = df.columns[1:-1]

# Report (weighted Gini index, reduction in Gini index) for every candidate attribute.
for attribute in attributes:
    print("{} : {}".format(attribute, calc_info_gain(df, attribute, df.columns[-1], gini_index)))

Age Class : (0.4277777777777778, 0.05222222222222217)
Passenger Class : (0.38055555555555554, 0.09944444444444445)
Gender : (0.22962962962962957, 0.2503703703703704)
No of Siblings or Spouses on Board : (0.3576252723311547, 0.12237472766884527)
No of Parents or Children on Board : (0.4763285024154589, 0.0036714975845411058)
