In [1]:
import numpy as np
import pandas as pd
from collections import Counter
from decision_tree import ID3 as Tree

In [2]:
# Load the iris CSV and discard every row that contains a missing value.
data = pd.read_csv("iris.csv").dropna()
# Bare expression: the notebook renders the DataFrame as this cell's output.
data

Unnamed: 0,sepallength,sepalwidth,petallength,petalwidth,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [3]:
# Every column (features and label alike) is converted to the pandas
# 'category' dtype, one column at a time.
categorical_feats = data.columns.tolist()
for c in categorical_feats:
    data[c] = data[c].astype('category')

# The last column is the label to predict; everything before it is a feature.
features = categorical_feats[:-1]
determiner = categorical_feats[-1]

print(data.dtypes)
print("Determiner Class: ", determiner)

sepallength    category
sepalwidth     category
petallength    category
petalwidth     category
class          category
dtype: object
Determiner Class:  class


In [4]:
from sklearn.model_selection import train_test_split
# Separate the predictors (every column but the last) from the label (last column).
X = data.iloc[:, :-1].to_numpy()
y = data.iloc[:, -1].to_numpy()

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the tree on the training split only.
tree = Tree()
tree.fit(X_train, y_train)

# Report the information gain of each feature column.
# Note: the gain is computed on the full X / y, not just the training split.
for i, feature_name in enumerate(features):
    print("Feature: ", feature_name)
    print("Gain info: ", tree.info_gain(X, i, y))

Feature:  sepallength
Gain info:  0.8769376208910578
Feature:  sepalwidth
Gain info:  0.5108699641236061
Feature:  petallength
Gain info:  1.4463165236458
Feature:  petalwidth
Gain info:  1.4358978386754417


In [5]:
def print_tree(node, feature_names, depth=0):
    """Print a fitted tree as indented text, one line per split branch or leaf.

    Parameters
    ----------
    node : object
        Tree node exposing ``value`` (non-None on leaves), ``feature_index``
        and ``children`` (an iterable of ``(branch_value, child_node)`` pairs).
    feature_names : sequence of str
        Names looked up by ``node.feature_index``.
    depth : int, default 0
        Current nesting level; each level indents by two spaces.
    """
    pad = "  " * depth

    # Leaves carry a value and end the recursion.
    if node.value is not None:
        print(f"{pad}Leaf: Class {node.value}")
        return

    split_name = feature_names[node.feature_index]
    print(f"{pad}Feature '{split_name}':")
    for branch_value, subtree in node.children:
        print(f"{pad}  If '{split_name}' == {branch_value}:")
        # Children sit two levels deeper: one for the "If" line, one for the body.
        print_tree(subtree, feature_names, depth + 2)

In [6]:
# Dump the fitted tree starting from its root, labelling each split with the
# column names collected in `features`.
print_tree(tree.root, features)

Feature 'petallength':
  If 'petallength' == 1.0:
    Leaf: Class Iris-setosa
  If 'petallength' == 1.1:
    Leaf: Class Iris-setosa
  If 'petallength' == 1.2:
    Leaf: Class Iris-setosa
  If 'petallength' == 1.3:
    Leaf: Class Iris-setosa
  If 'petallength' == 1.4:
    Leaf: Class Iris-setosa
  If 'petallength' == 1.5:
    Leaf: Class Iris-setosa
  If 'petallength' == 1.6:
    Leaf: Class Iris-setosa
  If 'petallength' == 1.7:
    Leaf: Class Iris-setosa
  If 'petallength' == 1.9:
    Leaf: Class Iris-setosa
  If 'petallength' == 3.0:
    Leaf: Class Iris-versicolor
  If 'petallength' == 3.3:
    Leaf: Class Iris-versicolor
  If 'petallength' == 3.5:
    Leaf: Class Iris-versicolor
  If 'petallength' == 3.7:
    Leaf: Class Iris-versicolor
  If 'petallength' == 3.8:
    Leaf: Class Iris-versicolor
  If 'petallength' == 3.9:
    Leaf: Class Iris-versicolor
  If 'petallength' == 4.0:
    Leaf: Class Iris-versicolor
  If 'petallength' == 4.1:
    Leaf: Class Iris-versicolor
  If 'peta

In [7]:
# Predict a label for every row of the held-out test split.
y_pred = tree.predict(X_test)

In [8]:
def accuracy(y_true, y_pred):
    """Return the fraction of predictions that exactly match the true labels.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, same shape as ``y_true``.

    Returns
    -------
    float
        Number of matching positions divided by the number of labels.

    Raises
    ------
    ValueError
        If the two inputs have different shapes or are empty.
    """
    # Coerce to arrays so plain lists are compared element-wise; `list == list`
    # would collapse to a single bool and yield a silently wrong score.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Reject mismatched shapes instead of letting NumPy broadcast them
    # (e.g. (n,) against (n, 1) would compare every pair of samples).
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"shape mismatch: y_true has shape {y_true.shape}, y_pred has shape {y_pred.shape}"
        )
    if y_true.size == 0:
        raise ValueError("accuracy is undefined for empty input")

    return np.count_nonzero(y_true == y_pred) / y_true.size

In [9]:
# Share of held-out rows whose predicted label equals the true label.
accuracy(y_test, y_pred)

0.9333333333333333

In [10]:
# Load the Boston housing data. 'indus' is coerced to numeric *before* rows
# with missing values are dropped: errors='coerce' turns unparseable entries
# into NaN, and dropping afterwards keeps those NaNs out of the feature matrix.
data = pd.read_csv("BostonHousing.csv")
data['indus'] = pd.to_numeric(data['indus'], errors='coerce')
data = data.dropna()

# Split data into features (X) and target (y)
X = data.iloc[:, :-1].to_numpy()  # Select all rows, all columns except the last one
y = data.iloc[:, -1].to_numpy()   # Select all rows, only the last column


# Split data into train and test sets (20% held out, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the tree in regression mode on the training split.
tree = Tree(mode="reg")
tree.fit(X_train, y_train)

<decision_tree.ID3 at 0x155d19b90>

In [11]:
# Predict the target for every row of the held-out test split.
y_pred = tree.predict(X_test)

In [13]:
def mean_absolute_error(y_true, y_pred):
    """Return the mean of the absolute differences ``|y_true - y_pred|``."""
    residuals = y_true - y_pred
    return np.mean(np.abs(residuals))

def mean_squared_error(y_true, y_pred):
    """Return the mean of the squared differences ``(y_true - y_pred) ** 2``."""
    residuals = y_true - y_pred
    squared = residuals ** 2
    return np.mean(squared)

def root_mean_squared_error(y_true, y_pred):
    """Return the square root of ``mean_squared_error(y_true, y_pred)``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

def r_squared(y_true, y_pred):
    """Return the coefficient of determination, ``1 - SS_res / SS_tot``.

    Parameters
    ----------
    y_true : array-like of numbers
        Observed target values.
    y_pred : array-like of numbers
        Predicted target values.

    Returns
    -------
    float
        ``1 - SS_res / SS_tot``. When ``y_true`` is constant (``SS_tot == 0``)
        the ratio is undefined; following scikit-learn's finite convention the
        result is ``1.0`` for a perfect prediction and ``0.0`` otherwise.

    Raises
    ------
    ValueError
        If ``y_true`` is empty.
    """
    # Accept lists and integer arrays as well as float arrays.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    if y_true.size == 0:
        raise ValueError("r_squared is undefined for empty input")

    ss_res = np.sum((y_true - y_pred) ** 2)
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)

    # Constant targets: avoid dividing by zero (which would give nan / -inf).
    if ss_tot == 0:
        return 1.0 if ss_res == 0 else 0.0
    return 1 - (ss_res / ss_tot)

def evaluate_regression(y_true, y_pred):
    """Compute the notebook's regression metrics in one call.

    Returns a dict keyed by 'MAE', 'MSE', 'RMSE' and 'R-squared' (in that
    order), each holding the corresponding metric of ``y_pred`` against
    ``y_true``.
    """
    metrics = (
        ('MAE', mean_absolute_error),
        ('MSE', mean_squared_error),
        ('RMSE', root_mean_squared_error),
        ('R-squared', r_squared),
    )
    return {label: metric(y_true, y_pred) for label, metric in metrics}

In [14]:
# Summarise MAE / MSE / RMSE / R-squared of the predictions on the held-out split.
evaluate_regression(y_test, y_pred)

{'MAE': 6.221707638279194,
 'MSE': 74.55434651260816,
 'RMSE': 8.63448588583062,
 'R-squared': -0.016643943618956714}

In [40]:
class GradientBoosting:
    """Multiclass gradient boosting with one regression tree per class per round.

    In every boosting round the class probabilities are obtained by a softmax
    over the current raw scores. For each class a tree is fitted to the
    negative gradient ``onehot(y) - p`` of the cross-entropy loss, and the raw
    score of that class is moved by ``learning_rate`` times the tree output.

    Parameters
    ----------
    n_estimators : int
        Number of boosting rounds.
    learning_rate : float
        Shrinkage applied to every tree's contribution.
    n_classes : int
        Number of classes; labels passed to ``fit`` are compared against
        ``0 .. n_classes - 1``.
    tree_factory : callable, optional
        Zero-argument callable returning a fresh base learner that exposes
        ``fit(X, y)`` and ``predict(X)``. Defaults to ``Tree(mode="reg")``,
        because the base learners are fitted to real-valued gradients.
    """

    def __init__(self, n_estimators, learning_rate, n_classes, tree_factory=None):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.n_classes = n_classes

        if tree_factory is None:
            # The targets of the base learners are continuous gradients, so the
            # tree has to run in regression mode rather than its default mode.
            def _default_tree():
                return Tree(mode="reg")

            tree_factory = _default_tree

        # trees[m][k] is the learner for class k in boosting round m.
        self.trees = [[tree_factory() for _ in range(n_classes)] for _ in range(n_estimators)]

    def fit(self, X, y):
        """Fit all trees on ``X`` (2-D array) and integer class labels ``y``.

        Returns ``self`` so calls can be chained.
        """
        y = np.asarray(y)
        n_samples = X.shape[0]

        # Raw (pre-softmax) scores start at zero, i.e. uniform probabilities.
        F = np.zeros((n_samples, self.n_classes))

        for round_trees in self.trees:
            # Probabilities are computed once per round, before any class update.
            p = self.softmax(F)
            for k, tree in enumerate(round_trees):
                # Gradient of the cross-entropy w.r.t. the raw score of class k.
                grad = p[:, k] - (y == k).astype(int)

                # Fit the tree to the negative gradient.
                tree.fit(X, -grad)

                # Move the raw score of class k along the fitted direction.
                F[:, k] += self.learning_rate * tree.predict(X)

        return self

    def predict_proba(self, X):
        """Return class probabilities of shape ``(n_samples, n_classes)``."""
        n_samples = X.shape[0]
        F = np.zeros((n_samples, self.n_classes))

        # Replay every round's contribution, exactly as accumulated in fit().
        for round_trees in self.trees:
            for k, tree in enumerate(round_trees):
                F[:, k] += self.learning_rate * tree.predict(X)

        return self.softmax(F)

    def predict(self, X):
        """Return the index of the most probable class for every row of ``X``."""
        probas = self.predict_proba(X)
        return np.argmax(probas, axis=1)

    def softmax(self, x):
        """
        Compute softmax values for each set of scores in x.

        Parameters:
        x (array-like): Input array of shape (n_samples, n_features)

        Returns:
        array-like: Softmax values of shape (n_samples, n_features)
        """
        # Subtracting the row maximum keeps exp() from overflowing.
        e_x = np.exp(x - np.max(x, axis=1, keepdims=True))
        return e_x / np.sum(e_x, axis=1, keepdims=True)

    @staticmethod
    def cross_entropy_loss(y_true, y_pred):
        """
        Compute the cross-entropy loss for multiclass classification.

        Declared as a static method so it can be called both on the class and
        on an instance (it takes no ``self``).

        Parameters:
        y_true (array-like): True labels (1D array of class indices)
        y_pred (array-like): Predicted probabilities (2D array, shape (n_samples, n_classes))

        Returns:
        float: The average cross-entropy loss
        """
        y_true = np.asarray(y_true)
        n_samples = len(y_true)

        # Clip predicted probabilities to avoid log(0)
        eps = 1e-15
        y_pred = np.clip(y_pred, eps, 1 - eps)

        # Convert y_true to one-hot encoding
        y_true_one_hot = np.zeros_like(y_pred)
        y_true_one_hot[np.arange(n_samples), y_true] = 1

        # Compute cross-entropy loss
        loss = -np.sum(y_true_one_hot * np.log(y_pred)) / n_samples

        return loss

In [43]:
# NOTE(review): this cell's execution count (43) is out of sequence with the
# cells that follow it, and the recorded prediction output holds only three
# values. X_train / y_train / X_test presumably came from kernel state that is
# not reproduced by running the notebook top to bottom — confirm which dataset
# (with integer class labels 0/1) is intended here.
# Booster with 100 rounds, learning rate 0.5 and 2 classes.
gbt = GradientBoosting(100, 0.5, 2)
gbt.fit(X_train, y_train)
# Get probability predictions
y_pred_proba = gbt.predict_proba(X_test)

# Get class predictions
y_pred = gbt.predict(X_test)


In [44]:
# Bare expression: display the predicted class indices as the cell output.
y_pred

array([0, 0, 0])

In [16]:
import pydot
from IPython.display import Image, display

In [17]:
def draw(parent_name, child_name):
    """Add an edge from ``parent_name`` to ``child_name`` to the global ``graph``.

    ``graph`` is looked up at call time, so it must be defined at module level
    before this function is called.
    """
    graph.add_edge(pydot.Edge(parent_name, child_name))

def visit(node, parent=None):
    """Walk a nested-dict tree and add its edges to the graph via ``draw``.

    Parameters
    ----------
    node : dict
        Mapping whose values are either nested dicts (subtrees) or leaf values.
    parent : hashable, optional
        Key under which ``node`` was found; ``None`` for the top-level dict.

    For every key an edge ``parent -> key`` is drawn, unless there is no
    parent. Nested dicts are visited recursively. A leaf value additionally
    gets an edge ``key -> "<key>_<value>"`` so equal leaf values under
    different keys become distinct nodes.
    """
    for k, v in node.items():
        # Link the key to its parent. Compare against None explicitly so a
        # falsy but valid parent key (0, "") is still linked, and so a leaf in
        # the top-level dict does not produce an edge starting at None.
        if parent is not None:
            draw(parent, k)

        if isinstance(v, dict):
            visit(v, k)
        else:
            # Format instead of concatenating so non-string keys/values work.
            draw(k, f"{k}_{v}")

# Create the pydot graph that draw() appends edges to, fill it from a
# nested-dict tree and show the rendered PNG inline.
# NOTE(review): `dtree` is not defined in any cell of this notebook (the
# recorded output of this cell is a NameError). visit() expects a nested dict,
# so a dict representation of the fitted tree presumably has to be built
# first — confirm the intended input.
graph = pydot.Dot(graph_type='graph')
visit(dtree)
tree_plot = Image(graph.create_png())
display(tree_plot)

NameError: name 'dtree' is not defined