In [4]:
from decisiontree import DecisionTree

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [5]:
def categorize_continuous_values(df, cols):
    """Split continuous columns into two categories around their median.

    Each column named in ``cols`` is replaced by a two-level categorical with
    the labels ``'<=M'`` and ``'>M'``, where ``M`` is the column median
    formatted with two decimals. Values equal to the median land in ``'<=M'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame. It is left untouched; the binning is done on a copy so
        that re-running a notebook cell never sees already-binned data.
    cols : iterable
        Names of the numeric columns to bin.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which every column from ``cols`` is categorical.
        Missing values in those columns stay missing.

    Raises
    ------
    ValueError
        If a column has no non-missing values, so no median can be computed.
    """
    binned = df.copy()
    for c in cols:
        # Series.median skips missing values; a NaN median would otherwise
        # end up as a bin edge and make the cut meaningless.
        median = binned[c].median()
        if pd.isna(median):
            raise ValueError(f'column {c!r} has no non-missing values to take a median of')
        median_str = '{:.2f}'.format(median)
        binned[c] = pd.cut(
            binned[c],
            bins=[-np.inf, median, np.inf],  # (-inf, median] and (median, inf)
            labels=[f'<={median_str}', f'>{median_str}'],
        )
    return binned

def factorize(df, col):
    """Encode the distinct values of one column as consecutive integers.

    Codes are assigned in order of first appearance in the column (the first
    distinct value gets 0, the next new value gets 1, ...), so the encoding is
    the same on every run for the same data.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame. It is left untouched; the encoding is applied to a copy.
    col : column label
        The column whose values are replaced by integer codes.

    Returns
    -------
    (pandas.DataFrame, dict)
        A copy of ``df`` with ``col`` holding the integer codes, and a dict
        mapping each code back to the original value.
    """
    # dict.fromkeys keeps insertion order and drops duplicates, which gives a
    # deterministic code per value (iterating a set would not).
    classes = list(dict.fromkeys(df[col]))
    class_dict = {c: i for i, c in enumerate(classes)}
    inverse_class_dict = dict(enumerate(classes))

    encoded = df.copy()
    encoded[col] = encoded[col].map(class_dict)

    return encoded, inverse_class_dict

In [6]:
# Iris experiment: load the data, bin the four measurement columns at their
# medians, fit the project's DecisionTree on a train split, then print and
# evaluate the tree.
df = pd.read_csv('datasets/iris.csv')
df = categorize_continuous_values(df, ['sepallength', 'sepalwidth', 'petallength', 'petalwidth'])

# Features are every column except the first and the last.
# NOTE(review): presumably the first column is an id/index and the last one is
# the 'class' label -- confirm against the CSV header.
X = df.values[:, 1:-1]
y = df['class'].values

# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.33, random_state=42)

# Column names for the same slice as X, so they line up with X's columns.
cols = df.columns[1:-1]
dt = DecisionTree(cols, max_depth=5)
dt.fit(X_train, y_train)

print('iris:\n')
dt.print()
# NOTE(review): the recorded output of this cell stops in the middle of the
# printed tree and ends with "IndexError: index 2 is out of bounds for axis 0
# with size 1". No traceback is kept, so the failure presumably happens inside
# DecisionTree.print -- confirm in the decisiontree module.
dt.evaluate(X_test, y_test)


iris:

[35m<petallength>
[37m    <=4.35:
[35m        <sepalwidth>
[37m            >3.00: [36mIris-setosa (25)
[37m            <=3.00:
[35m                <sepallength>
[37m                    <=5.80:
[35m                        <petalwidth>
[37m                            >1.30: [36mIris-versicolor (16)
[37m                            <=1.30: [36mIris-versicolor (16)
[37m                    >5.80: [36mIris-versicolor (5)
[37m    >4.35:
[35m        <sepalwidth>
[37m            >3.00:
[35m                <sepallength>
[37m                    >5.80:
[35m                        <petalwidth>
[37m                            >1.30: [36mIris-virginica (11)
[37m            <=3.00:
[35m                <sepallength>
[37m                    <=5.80:
[35m                        <petalwidth>
[37m                            >1.30: [36mIris-virginica (5)
[37m                            <=1.30: [36mIris-virginica (5)
[37m                    >5.80:
[35m                   

IndexError: index 2 is out of bounds for axis 0 with size 1

In [3]:
# Penguins experiment: same pipeline as the iris cell, on the penguins data.
# NOTE(review): this cell's execution count (In [3]) is lower than the cells
# before it, so the saved output comes from an out-of-order run; it also
# reuses the names df, X, y, cols and dt from the iris cell.
df = pd.read_csv('datasets/penguins_size.csv')
# Drop every row that has at least one missing value.
df = df.dropna(axis=0)

# Bin the four numeric measurement columns at their medians.
df = categorize_continuous_values(df, ['culmen_length_mm','culmen_depth_mm','flipper_length_mm','body_mass_g'])

# Features are every column except the first and the last; the label is 'species'.
# NOTE(review): presumably 'species' is the first column and the excluded last
# column is another attribute -- confirm against the CSV header.
X = df.values[:, 1:-1]
y = df['species'].values

# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.33, random_state=42)

# Column names for the same slice as X, so they line up with X's columns.
cols = df.columns[1:-1]
# No max_depth is passed here, unlike the iris cell.
dt = DecisionTree(cols)
dt.fit(X_train, y_train)

print('penguins:\n')
dt.print()
# NOTE(review): in the recorded output the tree prints completely and the cell
# then ends with "IndexError: index 0 is out of bounds for axis 0 with size 0",
# so the failure presumably happens inside DecisionTree.evaluate -- confirm in
# the decisiontree module.
dt.evaluate(X_test, y_test)


penguins:

[35m<island>
[37m    Torgersen: [36mAdelie (28)
[37m    Biscoe:
[35m        <culmen_length_mm>
[37m            >44.50: [36mGentoo (72)
[37m            <=44.50: [36mAdelie (38)
[37m    Dream:
[35m        <culmen_length_mm>
[37m            >44.50: [36mChinstrap (44)
[37m            <=44.50: [36mAdelie (41)
[0m


IndexError: index 0 is out of bounds for axis 0 with size 0