In [189]:
from math import *
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

#### Common functions

In [190]:
def calculateEntropy(dataset, targetAttribute):
    values = list(set(dataset[targetAttribute]))
    sum = 0
    for value in values:
        fraction = len(dataset[dataset[targetAttribute] == value])/len(dataset)
        entropy = fraction * log2(fraction)
        sum += entropy
    return -sum

In [191]:
def calculateInformationGain(dataset, attribute, targetAttribute):
    datasetEntropy = calculateEntropy(dataset, targetAttribute)
    attributeValues = list(set(dataset[targetAttribute]))
    sum = 0
    for value in attributeValues:
        fraction = len(dataset[dataset[attribute] == value])//len(dataset)
        E = fraction * entropy(dataset[dataset[attribute] == value], targetAttribute)
        sum += E
    return datasetEntropy - sum

In [192]:
def findBestFeature(dataset, features, target):
    best = None
    maxInfoGain = 0
    for f in features:
        infoGain = calculateInformationGain(dataset, f, target)
        if infoGain > maxInfoGain:
            maxInfoGain = infoGain
            best = f
    return best

In [193]:
def removeAttribute(attributes, attribute):
    newAttributes = []
    for x in attributes:
        if x != attribute:
            newAttributes.append(x)
    return newAttributes

def mostCommonValue(dataset, attribute):
    values = list(set(dataset[attribute]))
    mostCommon = None
    maxCount = 0
    for value in values:
        count = len(dataset[dataset[attribute] == value])
        
        if count > maxCount:
            maxCount = count
            mostCommon = value
    return mostCommon

In [194]:
def buildDecisionTree(dataset, attributes, target, depth):

    values = list(set(dataset[target]))
    if len(values) == 1:
        return values[0]
    if len(attributes) == 0 or depth > 2:
        return mostCommonValue(dataset, target)
    
    bestAttribute = findBestFeature(dataset, attributes, target)
    tree = {bestAttribute:{}}
    
    for value in list(set(dataset[bestAttribute])):
        
        newAttributes = removeAttribute(attributes, bestAttribute)
        subtree = buildDecisionTree(dataset[dataset[bestAttribute] == value], newAttributes, target, depth+1)
        tree[bestAttribute][value] = subtree
        
    return tree

In [195]:
def printTree(tree, depth):
    if type(tree) is dict:
        for attribute in tree.keys():
            print('|' + ' '*depth + str(attribute))
            printTree(tree[attribute], depth+1)
    else:
        print('|' + ' '*depth + tree)

In [196]:
def predict(example, tree):
    if type(tree) is not dict:
        return tree
    attribute = list(tree.keys())[0]
    branch = tree[attribute]
    for key in branch.keys():
        if example[attribute] == key:
            return predict(example, branch[key])

In [197]:
def calculateAccuracy(dataset, tree, target):
    accurate = 0
    for index, row in dataset.iterrows():
        if predict(row, tree) == row[target]:
            accurate += 1
    return accurate/len(dataset)*100

In [198]:
def reducedErrorPrune(tree, minError):
    if isinstance(tree, dict):
        for attribute in list(tree.keys()):
            for value in list(tree[attribute].keys()):
                subtree = tree[attribute][value]
                reducedErrorPrune(subtree, minError)
        if len(tree) == 1:
            return tree
        else:
            subtree_errors = [reducedErrorPrune(subtree) for subtree in list(trevalues())]
            if min(subtree_errors) < minError:
                return min(subtree_errors)
            else:
                return tree
    else:
        return tree

prunedTree = reducedErrorPrune(DT, 0.5)
calculateAccuracy(test, prunedTree, 4)

40.0

#### 1. Consider Iris dataset and implement Decission tree and perform pruning as needed.

In [199]:
iris = pd.read_csv("iris.data", header=None)
iris.head()

Unnamed: 0,0,1,2,3,4
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [200]:
"""
  0 - 'sepal length (cm)',
  1 - 'sepal width (cm)',
  2 - 'petal length (cm)',
  3 - 'petal width (cm)'
  
"""
features = ['sepal length (cm)',
  'sepal width (cm)',
  'petal length (cm)',
  'petal width (cm)',
    'target']

In [201]:
# preprocessing
iris.dropna()


for col in iris.columns:
    iris = iris[iris[col]!='?']

attributes = list(iris.columns)
attributes.remove(4)

iris.head()

Unnamed: 0,0,1,2,3,4
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [202]:
train, test = train_test_split(iris, test_size=0.2)
DT = buildDecisionTree(train, attributes, 4, 0)
printTree(DT, 0)

|0
| 4.6
|  Iris-setosa
| 5.5
|  1
|   2.4
|    Iris-versicolor
|   2.5
|    Iris-versicolor
|   4.2
|    Iris-setosa
|   2.6
|    Iris-versicolor
|   2.3
|    Iris-versicolor
|   3.5
|    Iris-setosa
| 6.8
|  Iris-versicolor
| 6.6
|  Iris-versicolor
| 5.0
|  1
|   2.3
|    Iris-versicolor
|   3.5
|    Iris-setosa
|   3.0
|    Iris-setosa
|   3.4
|    Iris-setosa
|   3.2
|    Iris-setosa
|   3.6
|    Iris-setosa
| 6.1
|  Iris-versicolor
| 7.2
|  Iris-virginica
| 6.5
|  1
|   3.2
|    Iris-virginica
|   2.8
|    Iris-versicolor
|   3.0
|    Iris-virginica
| 5.1
|  1
|   3.8
|    Iris-setosa
|   2.5
|    Iris-versicolor
|   3.5
|    Iris-setosa
|   3.4
|    Iris-setosa
| 6.4
|  1
|   2.9
|    Iris-versicolor
|   3.2
|    2
|     4.5
|      Iris-versicolor
|     5.3
|      Iris-virginica
|   2.8
|    Iris-virginica
|   3.1
|    Iris-virginica
|   2.7
|    Iris-virginica
| 5.6
|  1
|   2.9
|    Iris-versicolor
|   2.5
|    Iris-versicolor
|   3.0
|    Iris-versicolor
|   2.8
|    Iris-virg

In [203]:
print("Accuracy:", calculateAccuracy(test, DT, 4))

Accuracy: 30.0


#### 2. Considering the weather dataset, implement Decission tree and perform pruning as needed.

In [204]:
weather = pd.read_csv("weather.csv")
weather.head()

Unnamed: 0,Date,TempHighF,TempAvgF,TempLowF,DewPointHighF,DewPointAvgF,DewPointLowF,HumidityHighPercent,HumidityAvgPercent,HumidityLowPercent,...,SeaLevelPressureAvgInches,SeaLevelPressureLowInches,VisibilityHighMiles,VisibilityAvgMiles,VisibilityLowMiles,WindHighMPH,WindAvgMPH,WindGustMPH,PrecipitationSumInches,Events
0,2013-12-21,74,60,45,67,49,43,93,75,57,...,29.68,29.59,10,7,2,20,4,31,0.46,"Rain , Thunderstorm"
1,2013-12-22,56,48,39,43,36,28,93,68,43,...,30.13,29.87,10,10,5,16,6,25,0,
2,2013-12-23,58,45,32,31,27,23,76,52,27,...,30.49,30.41,10,10,10,8,3,12,0,
3,2013-12-24,61,46,31,36,28,21,89,56,22,...,30.45,30.3,10,10,7,12,4,20,0,
4,2013-12-25,58,50,41,44,40,36,86,71,56,...,30.33,30.27,10,10,7,10,2,16,T,


In [205]:
results = 'PrecipitationSumInches'
features = ['TempAvgF', 'DewPointAvgF', 'HumidityAvgPercent', 'SeaLevelPressureAvgInches', 'VisibilityAvgMiles', 'WindAvgMPH']
for col in weather.columns:
    if col not in results and col not in features:
        weather.drop(col, axis=1, inplace=True)

weather = weather[weather[results]!="T"]
for col in features:
    weather = weather[weather[col]!="-"]

attributes = list(weather.columns)
attributes.remove('PrecipitationSumInches')

weather.head()

Unnamed: 0,TempAvgF,DewPointAvgF,HumidityAvgPercent,SeaLevelPressureAvgInches,VisibilityAvgMiles,WindAvgMPH,PrecipitationSumInches
0,60,49,75,29.68,7,4,0.46
1,48,36,68,30.13,10,6,0.0
2,45,27,52,30.49,10,3,0.0
3,46,28,56,30.45,10,4,0.0
5,48,36,63,30.4,9,3,0.0


In [209]:
weatherTrain, weatherTest = train_test_split(weather, test_size=0.2)
weatherDT = buildDecisionTree(weatherTrain, attributes, 'PrecipitationSumInches', 0)
printTree(weatherDT, 0)

|TempAvgF
| 29
|  0
| 32
|  0
| 33
|  0.05
| 34
|  DewPointAvgF
|   31
|    HumidityAvgPercent
|     93
|      0.1
|     76
|      0.3
|   29
|    HumidityAvgPercent
|     87
|      0.01
|     84
|      0
|   12
|    0
|   24
|    0
| 35
|  DewPointAvgF
|   29
|    0.1
|   11
|    0
| 36
|  DewPointAvgF
|   26
|    0.02
|   20
|    0.08
| 37
|  DewPointAvgF
|   18
|    0
|   35
|    0.63
| 38
|  0
| 39
|  DewPointAvgF
|   25
|    0.13
|   38
|    0.09
|   36
|    0
| 40
|  DewPointAvgF
|   30
|    0.13
|   26
|    0
|   20
|    0
|   33
|    0
| 41
|  DewPointAvgF
|   39
|    0.2
|   24
|    0
|   27
|    0
|   23
|    0
| 42
|  DewPointAvgF
|   37
|    0.26
|   26
|    0
|   39
|    HumidityAvgPercent
|     91
|      0.25
|     90
|      0.17
|   23
|    0
|   45
|    0.01
|   29
|    0.06
|   28
|    0
| 43
|  DewPointAvgF
|   40
|    0
|   30
|    0
|   36
|    0.03
|   23
|    0
| 44
|  DewPointAvgF
|   31
|    0
|   30
|    0
|   33
|    0.11
|   39
|    0.02
|   23
|    0
|   37


In [207]:
print("Accuracy:", calculateAccuracy(weatherTest, weatherDT, 'PrecipitationSumInches'))

Accuracy: 40.92827004219409
