# Code to be used to classify data
# using a decision tree algorithm
# called ID3

## Andrea Khoury

Warning: the data set needs to be cleaned as follows.
(1) Two identical set of values cannot have a different label. If that happens in the data, choose the label that happens more oftern.
(2) All the values must be as expected. No blanks or spelling errors.

The dataset called "weather.csv" is the same data as
in the ID3 tutorial provided.

In [1]:
import pandas as pd
import numpy as np
import math

In [7]:
df1=pd.read_csv("weather-train.csv") # This is the data from the ID3 tutorial

In [8]:
testdata=pd.read_csv('weather-test.csv')

In [9]:
#These are the names of the features. 
#The [:-1] at the end removes the "play" column.
df1.columns[:-1] 

Index(['Outlook', 'temperature', 'humidity', 'windy'], dtype='object')

In [12]:
#This is the second data set in df1(df1 is the training set )
df1.iloc[1] 

Outlook        sunny
temperature      hot
humidity        high
windy           True
play              no
Name: 1, dtype: object

In [13]:
df1["Outlook"] #These are the values for the column "outlook" in df1

0        sunny
1        sunny
2     overcast
3        rainy
4        rainy
5        rainy
6     overcast
7        sunny
8        sunny
9        rainy
10       sunny
11    overcast
12    overcast
13       rainy
Name: Outlook, dtype: object

In [14]:
# Entropy function for a numpy array of labels
def H(dataf):
    a, b = np.unique(dataf.values.T[-1], return_counts=True)
    #print([a,b])
    c=-sum([(i/sum(b))*math.log(i/sum(b),2) for i in b])
    return c

In [15]:
H(df1) # Entropy for the labels

0.9402859586706309

In [16]:
# Information gain for the attributes. 
# The argument "dataf" is assumed to be the dataframe 
# of attributes and labels. The label is "play"
def Ha(dataf): 
    label="play"
    n=len(dataf)
    T=H(dataf)
    if H(dataf)==0:
        return [dataf[label].values[0],"leaf-node"]
    max=[" ",0]
    # Each column is an attribute below
    for column in dataf.columns.values[:-1]:
        #print(column)
        ent=0
        for entry in np.unique(dataf[column]):
            #print(" ",entry)
            ent+=(len(dataf.loc[dataf[column]==entry])/n)*(H(dataf.loc[dataf[column]==entry]))
        if max[1]<(T-ent):
            max=[column,T-ent]
# Now we print the attribute with highest gain and its values
    return [max[0],np.unique(dataf[max[0]]).tolist()]

In [17]:
### I WILL USE THE DATA IN ONE ROW TO TEST THE ALGORITHM
nrow=testdata.loc[3][:-1]
print(nrow)
print(" ")
print(nrow.values)
print(" ")
print("nrow['Outlook']=",nrow[0])

Outlook        overcast
temperature        cool
humidity         normal
Name: 3, dtype: object
 
['overcast' 'cool' 'normal']
 
nrow['Outlook']= overcast


In [18]:
out= Ha(df1) # This reveals the next step in the tree
print("The attribute with highest information gain is: ")
print("out= ",out)
print(" ")
print("out[0]= ",out[0])
print(" ")
print("out[1]= ", out[1])

The attribute with highest information gain is: 
out=  ['Outlook', ['overcast', 'rainy', 'sunny']]
 
out[0]=  Outlook
 
out[1]=  ['overcast', 'rainy', 'sunny']


### The attribute above is the next node in the decision tree
The list of its unique values determines what goes next.
When classifying a row of values from the training set,
we select the branch corresponding to it.

In [19]:
dataentry=nrow[out[0]]
df2=df1.loc[df1[out[0]]==dataentry]
#df2=df2.drop(["Outlook"],axis=1)
df2

Unnamed: 0,Outlook,temperature,humidity,windy,play
2,overcast,hot,high,False,yes
6,overcast,cool,normal,True,yes
11,overcast,mild,high,True,yes
12,overcast,hot,normal,False,yes


In [20]:
out=Ha(df2)
print(out)

['yes', 'leaf-node']


In [21]:
dataentry=nrow[out[0]]
df3=df2.loc[df1[out[0]]==dataentry]
#df3=df3.drop(["Outlook"],axis=1)
print(df3)
out=Ha(df3)
print(out)

KeyError: 'yes'

In [22]:
#Train and test are pandas dataframes
def classify(train,test):
    #print(len(test))
    testlabel=[]
    for i in range(len(test)):
        #print(test.loc[i])
        #print(Ha(train))
        #print(" ")
        traincp=train
        out=Ha(traincp)
        while out[1]!="leaf-node":
            print(out[0])
            print(" ")
            dataentry=test.loc[i][out[0]]
            print("testdata:: ",test.loc[i])
            print(" ")
            print("testdata:: ",dataentry)
            newtrain=traincp.loc[train[out[0]]==dataentry]
            print(newtrain)
            out=Ha(newtrain)
            traincp=newtrain
        #print("out[0] ",out[0])
        testlabel.append(out[0])
        #print("testlabel", testlabel)
        print("----------------------")
    print(testlabel)
    test['label(play)']=np.array(testlabel)
    return test

In [23]:
testdata

Unnamed: 0,Outlook,temperature,humidity,windy
0,overcast,mild,normal,False
1,rainy,hot,high,True
2,rainy,cool,normal,False
3,overcast,cool,normal,True
4,rainy,mild,high,False
5,sunny,hot,normal,True
6,rainy,cool,normal,True
7,sunny,cool,high,False
8,sunny,hot,high,True
9,overcast,hot,high,False


In [24]:
classify(df1,testdata)

Outlook
 
testdata::  Outlook        overcast
temperature        mild
humidity         normal
windy             False
Name: 0, dtype: object
 
testdata::  overcast
     Outlook temperature humidity  windy play
2   overcast         hot     high  False  yes
6   overcast        cool   normal   True  yes
11  overcast        mild     high   True  yes
12  overcast         hot   normal  False  yes
----------------------
Outlook
 
testdata::  Outlook        rainy
temperature      hot
humidity        high
windy           True
Name: 1, dtype: object
 
testdata::  rainy
   Outlook temperature humidity  windy play
3    rainy        mild     high  False  yes
4    rainy        cool   normal  False  yes
5    rainy        cool   normal   True   no
9    rainy        mild   normal  False  yes
13   rainy        mild     high   True   no
windy
 
testdata::  Outlook        rainy
temperature      hot
humidity        high
windy           True
Name: 1, dtype: object
 
testdata::  True
   Outlook temperature h

windy
 
testdata::  Outlook         rainy
temperature      cool
humidity       normal
windy           False
Name: 13, dtype: object
 
testdata::  False
  Outlook temperature humidity  windy play
3   rainy        mild     high  False  yes
4   rainy        cool   normal  False  yes
9   rainy        mild   normal  False  yes
----------------------
Outlook
 
testdata::  Outlook         sunny
temperature      cool
humidity       normal
windy           False
Name: 14, dtype: object
 
testdata::  sunny
   Outlook temperature humidity  windy play
0    sunny         hot     high  False   no
1    sunny         hot     high   True   no
7    sunny        mild     high  False   no
8    sunny        cool   normal  False  yes
10   sunny        mild   normal   True  yes
humidity
 
testdata::  Outlook         sunny
temperature      cool
humidity       normal
windy           False
Name: 14, dtype: object
 
testdata::  normal
   Outlook temperature humidity  windy play
8    sunny        cool   normal  Fa

Unnamed: 0,Outlook,temperature,humidity,windy,label(play)
0,overcast,mild,normal,False,yes
1,rainy,hot,high,True,no
2,rainy,cool,normal,False,yes
3,overcast,cool,normal,True,yes
4,rainy,mild,high,False,yes
5,sunny,hot,normal,True,yes
6,rainy,cool,normal,True,no
7,sunny,cool,high,False,no
8,sunny,hot,high,True,no
9,overcast,hot,high,False,yes
