In [1]:
from decisiontree import DecisionTree

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
def categorize_continuous_values(df, cols):
    for c in cols:
        median = np.median(df[c].values)
        median_str = '{:.2f}'.format(median)
        df[c] = pd.cut(df[c], bins=[-np.inf, median, np.inf], labels=[f'<={median_str}', f'>{median_str}'])
    return df

def factorize(df, col):
    classes = set(df[col])
    class_dict = dict()
    inverse_class_dict = dict()
    for i, c in enumerate(classes):
        class_dict[c] = i
        inverse_class_dict[i] = c

    df[col] = df[col].apply(lambda x: class_dict[x])
    
    return df, inverse_class_dict

In [4]:
df = pd.read_csv('datasets/iris.csv')
#df = categorize_continuous_values(df, ['sepallength', 'sepalwidth', 'petallength', 'petalwidth'])

X = df.values[:, 1:-1]
y = df['class'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.33, random_state=42)

cols = df.columns[1:-1]
dt = DecisionTree(max_depth=5)
dt.fit(X_train, y_train)

print('iris:\n')
dt.evaluate(X_test, y_test)
dt.predict(X_test)


iris:



['Iris-versicolor',
 'Iris-setosa',
 'Iris-virginica',
 'Iris-virginica',
 'Iris-virginica',
 'Iris-setosa',
 'Iris-versicolor',
 'Iris-virginica',
 'Iris-virginica',
 'Iris-versicolor',
 'Iris-virginica',
 'Iris-versicolor',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-virginica',
 'Iris-virginica',
 'Iris-versicolor',
 'Iris-versicolor',
 'Iris-virginica',
 'Iris-setosa',
 'Iris-virginica',
 'Iris-setosa',
 'Iris-virginica',
 'Iris-virginica',
 'Iris-virginica',
 'Iris-virginica',
 'Iris-virginica',
 'Iris-versicolor',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-virginica',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-virginica',
 'Iris-virginica',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-virginica',
 'Iris-virginica',
 'Iris-virginica',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-versicolor',
 'Iris-virginica',
 'Iris-virginica',
 'Iris-virginica',
 'Iris-virginica']

In [4]:
df = pd.read_csv('datasets/penguins_size.csv')
df = df.dropna(axis=0)

#df = categorize_continuous_values(df, ['culmen_length_mm','culmen_depth_mm','flipper_length_mm','body_mass_g'])

X = df.values[:, 1:-1]
y = df['species'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.33, random_state=42)

cols = df.columns[1:-1]
dt = DecisionTree(cols, max_depth=3)
dt.fit(X_train, y_train)

print('penguins:\n')
dt.print()
dt.evaluate(X_test, y_test)


penguins:

[35m<island>
[37m    Dream:
[35m        <culmen_length_mm>
[37m            >45.20: [36mChinstrap (42)
[37m            <=45.20:
[35m                <culmen_depth_mm>
[37m                    <=17.20: [36mAdelie (10)
[37m                    >17.20: [36mAdelie (33)
[37m    Biscoe:
[35m        <culmen_length_mm>
[37m            >45.20:
[35m                <culmen_depth_mm>
[37m                    <=17.20: [36mGentoo (64)
[37m                    >17.20: [36mAdelie (1)
[37m            <=45.20:
[35m                <flipper_length_mm>
[37m                    >197.00: [36mGentoo (22)
[37m                    <=197.00: [36mAdelie (23)
[37m    Torgersen: [36mAdelie (28)
[0m


0.972972972972973

In [2]:
import openml

openml.config.apikey = 'eb60f811ca0d9f4f846d59be082919f9'

task = openml.tasks.get_task(3913)

dt = DecisionTree(max_depth=8)
run = openml.runs.run_model_on_task(dt, task)

ValueError: No extension registered which can handle model: <decisiontree.DecisionTree object at 0x7f2781913a00>