# Decision Tree
---

In [27]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing
from sklearn.tree import export_graphviz
import pydotplus



# Decision Tree
- Decision Trees are considered one of the most mature, traditional algorithms in predictive analytics
- They are most likely used for classification problems

# Why and where we need Decision Tree ?
- When features are Categorical
# Practical Examples of Decision Tree
- You’ll take a small dataset and see if you can learn anything from it

- You’ll see if a decision tree can give you any insight as to how the eye doctor prescribes contact lenses

- You can predict the type of lenses people will use and understand the underlying processes with a decision tree

- Predict does a player plays tennis outside based on weather conditions

# Lens Dataset

# Attribute Information:

- 3 Classes

- 1 : the patient should be fitted with hard contact lenses,

- 2 : the patient should be fitted with soft contact lenses,

- 3 : the patient should not be fitted with contact lenses.

# 4 Features

- age of the patient: (1) young, (2) pre-presbyopic, (3) presbyopic

- spectacle prescription: (1) myope, (2) hypermetrope

- astigmatic: (1) no, (2) yes

- tear production rate: (1) reduced, (2) normal



In [2]:
# How to choose the root Node?

# The root and the leafs for Decision Tree are obtained based on
- Conditional Probability

- Entropy

- Information Gain

# Calcualte the entropy for fair coin.
1. We can formally quantify uncertainty measure: 
    Entropy(coin = )

In [3]:
import numpy as np
p1 = [0.5, 0.5]
p2 = [0.9, 0.1]
p3 = [0.1, 0.9]

def calculate_entropy(case):
    total = 0
    for value in case:
        total += value * np.log2(value)
    return total * -1

In [4]:
print(calculate_entropy(p1))
print(calculate_entropy(p2))
print(calculate_entropy(p3))

tennis_entropy = [5/14, 9/14]
print(calculate_entropy(tennis_entropy))

1.0
0.4689955935892812
0.4689955935892812
0.9402859586706311


In [5]:
# # Milad's version:
# def entropy(p):
#     H = np.array([-i*np.log2(i) for i in p]).sum()
#     return H
    
# p = [.5, .5]
# entropy(p)

In [6]:
PATH = '../Notebooks/Datasets/tennis.txt'
# data = pd.read_csv(PATH, delimiter="\t", header=None, names=['Day', 'Outlook', 'Temperature', 'Humidity', 'Wind', 'PlayTennis'])

# data

Unnamed: 0,Day,Outlook,Temperature,Humidity,Wind,PlayTennis
0,1,Sunny,Hot,High,Weak,No
1,2,Sunny,Hot,High,Strong,No
2,3,Overcast,Hot,High,Weak,Yes
3,4,Rain,Mild,High,Weak,Yes
4,5,Rain,Cool,Normal,Weak,Yes
5,6,Rain,Cool,Normal,Strong,No
6,7,Overcast,Cool,Normal,Strong,Yes
7,8,Sunny,Mild,High,Weak,No
8,9,Sunny,Cool,Normal,Weak,Yes
9,10,Rain,Mild,Normal,Weak,Yes


In [28]:
data = pd.read_csv(PATH, delimiter="\t", header=None, names=['a', 'b', 'c', 'd', 'e'])
print(data)

data_encoded = data.apply(preprocessing.LabelEncoder().fit_transform)
print(data_encoded)

#
clf = DecisionTreeClassifier(criterion='entropy', max_depth=3)

# one_hot_data = pd.get_dummies(data[['a', 'b', 'c', 'd']], drop_first=True)
# print(one_hot_data)
clf.fit(data_encoded[['a', 'b', 'c', 'd']], data_encoded['e'])


dot_data = export_graphviz(clf, out_file=None, feature_names=['Outlook', 'Temp.', 'Humidity', 'Wind'])

# Draw graph
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_png('tennis_tree.png')

           a     b       c       d    e
1      Sunny   Hot    High    Weak   No
2      Sunny   Hot    High  Strong   No
3   Overcast   Hot    High    Weak  Yes
4       Rain  Mild    High    Weak  Yes
5       Rain  Cool  Normal    Weak  Yes
6       Rain  Cool  Normal  Strong   No
7   Overcast  Cool  Normal  Strong  Yes
8      Sunny  Mild    High    Weak   No
9      Sunny  Cool  Normal    Weak  Yes
10      Rain  Mild  Normal    Weak  Yes
11     Sunny  Mild  Normal  Strong  Yes
12  Overcast  Mild    High  Strong  Yes
13  Overcast   Hot  Normal    Weak  Yes
14      Rain  Mild    High  Strong   No
    a  b  c  d  e
1   2  1  0  1  0
2   2  1  0  0  0
3   0  1  0  1  1
4   1  2  0  1  1
5   1  0  1  1  1
6   1  0  1  0  0
7   0  0  1  0  1
8   2  2  0  1  0
9   2  0  1  1  1
10  1  2  1  1  1
11  2  2  1  0  1
12  0  2  0  0  1
13  0  1  1  1  1
14  1  2  0  0  0


True

In [7]:
data[data['Wind']=="Weak"]

Unnamed: 0,Day,Outlook,Temperature,Humidity,Wind,PlayTennis
0,1,Sunny,Hot,High,Weak,No
2,3,Overcast,Hot,High,Weak,Yes
3,4,Rain,Mild,High,Weak,Yes
4,5,Rain,Cool,Normal,Weak,Yes
7,8,Sunny,Mild,High,Weak,No
8,9,Sunny,Cool,Normal,Weak,Yes
9,10,Rain,Mild,Normal,Weak,Yes
12,13,Overcast,Hot,Normal,Weak,Yes


In [8]:
# [no, yes]
wind_weak = [0.25, 0.75]

In [9]:
data[data['Wind']=="Strong"]

Unnamed: 0,Day,Outlook,Temperature,Humidity,Wind,PlayTennis
1,2,Sunny,Hot,High,Strong,No
5,6,Rain,Cool,Normal,Strong,No
6,7,Overcast,Cool,Normal,Strong,Yes
10,11,Sunny,Mild,Normal,Strong,Yes
11,12,Overcast,Mild,High,Strong,Yes
13,14,Rain,Mild,High,Strong,No


In [10]:
# [no, yes]
wind_strong = [0.5, 0.5]

In [11]:
print("entropy for playing tennis wind weak: ", calculate_entropy(wind_weak))

entropy for playing tennis wind weak:  0.8112781244591328


In [12]:
print("entropy for wind is strong: ", calculate_entropy(wind_strong))

entropy for wind is strong:  1.0


In [13]:
len(data[data["Wind"] == "Weak"])

8

In [14]:
# Information Gain = Mutaul Information

In [15]:
# TODO: Write a function to calculate Information Gain


In [16]:
# The highest information gain value will be chosen for root for the Decision Tree 

In [None]:
# look up better "Decision Tree" visualization 