In [23]:
import numpy as nm
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import seaborn as sns
import math
from collections import Counter

In [24]:
df=pd.read_csv("Iris.csv")
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [25]:
df.shape

(150, 6)

In [26]:
print(df.isnull().sum())

Id               0
SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64


In [27]:
num_col=df.select_dtypes(include=['number']).columns
for i in num_col:
    mean=df[i].mean()
    df[i]=(df[i]>mean).astype(int)        

In [28]:
print(df)

     Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm  \
0     0              0             1              0             0   
1     0              0             0              0             0   
2     0              0             1              0             0   
3     0              0             1              0             0   
4     0              0             1              0             0   
..   ..            ...           ...            ...           ...   
145   1              1             0              1             1   
146   1              1             0              1             1   
147   1              1             0              1             1   
148   1              1             1              1             1   
149   1              1             0              1             1   

            Species  
0       Iris-setosa  
1       Iris-setosa  
2       Iris-setosa  
3       Iris-setosa  
4       Iris-setosa  
..              ...  
145  Iris-virgini

In [29]:
X=df.drop(['Species'],axis=1)
Y=df['Species']
print(X)

     Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm
0     0              0             1              0             0
1     0              0             0              0             0
2     0              0             1              0             0
3     0              0             1              0             0
4     0              0             1              0             0
..   ..            ...           ...            ...           ...
145   1              1             0              1             1
146   1              1             0              1             1
147   1              1             0              1             1
148   1              1             1              1             1
149   1              1             0              1             1

[150 rows x 5 columns]


In [30]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,random_state=42)
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

In [31]:
print(X_train)


     Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm
22    0              0             1              0             0
15    0              0             1              0             0
65    0              1             1              1             1
11    0              0             1              0             0
42    0              0             1              0             0
..   ..            ...           ...            ...           ...
71    0              1             0              1             1
106   1              0             0              1             1
14    0              0             1              0             0
92    1              0             0              1             1
102   1              1             0              1             1

[120 rows x 5 columns]


In [32]:
def entropy(data, target_attr):
    values = [row[target_attr] for row in data]
    counts = Counter(values)
    total = len(values)
    return -sum((count / total) * math.log2(count / total) for count in counts.values())

In [33]:
def information_gain(data, target_attr, attr):
    total_entropy = entropy(data, target_attr)
    values = set(row[attr] for row in data)
    subset_entropy = 0

    for value in values:
        subset = [row for row in data if row[attr] == value]
        weight = len(subset) / len(data)
        subset_entropy += weight * entropy(subset, target_attr)

    return total_entropy - subset_entropy

In [34]:
def build_decision_tree(data, target_attr, attributes):
    target_values = [row[target_attr] for row in data]
    if len(set(target_values)) == 1:
        return target_values[0]
    if not attributes:
        return Counter(target_values).most_common(1)[0][0]

    
    best_attr = max(attributes, key=lambda attr: information_gain(data, target_attr, attr))
    tree = {best_attr: {}}

    for value in set(row[best_attr] for row in data):
        subset = [row for row in data if row[best_attr] == value]
        if not subset:
            tree[best_attr][value] = Counter(target_values).most_common(1)[0][0]
        else:
            remaining_attrs = [attr for attr in attributes if attr != best_attr]
            tree[best_attr][value] = build_decision_tree(subset, target_attr, remaining_attrs)

    return tree

In [35]:
def predict(tree, instance):
    if not isinstance(tree, dict):
        return tree
    attribute = next(iter(tree))
    value = instance.get(attribute)
    subtree = tree.get(attribute, {}).get(value, None)
    if subtree is None:
        return None  
    return predict(subtree, instance)

In [36]:
def calculate_accuracy(tree, test_data, target_attr):
    correct_predictions = sum(
        1 for instance in test_data if predict(tree, instance) == instance[target_attr]
    )
    return correct_predictions / len(test_data)

In [37]:
train_data = train_data.to_dict(orient="records")
test_data = test_data.to_dict(orient="records")

In [38]:
attributes = ["SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"]
target = 'Species'

In [39]:
decision_tree = build_decision_tree(train_data, target, attributes)

In [40]:
print("Decision Tree:", decision_tree)

Decision Tree: {'PetalLengthCm': {0: {'SepalWidthCm': {0: {'SepalLengthCm': {0: {'PetalWidthCm': {0: 'Iris-versicolor'}}}}, 1: 'Iris-setosa'}}, 1: {'SepalLengthCm': {0: {'PetalWidthCm': {0: 'Iris-versicolor', 1: {'SepalWidthCm': {0: 'Iris-versicolor'}}}}, 1: {'PetalWidthCm': {0: 'Iris-versicolor', 1: {'SepalWidthCm': {0: 'Iris-virginica', 1: 'Iris-virginica'}}}}}}}}


In [41]:
accuracy = calculate_accuracy(decision_tree, test_data, target)
print(f"Accuracy on test data: {accuracy * 100:.2f}%")

Accuracy on test data: 73.33%
