In [5]:
from decisiontree import DecisionTree

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from random import random

In [6]:
import openml

# Configure the OpenML client with the key stored in apikey.txt.
# The content is stripped because text files usually end with a newline,
# which f.read() would otherwise keep as part of the key.
with open('apikey.txt', 'r') as f:
    openml.config.apikey = f.read().strip()

In [7]:
# Fetch OpenML task 9978 and load its dataset; only the first of the four
# values returned by get_data() is kept.
task = openml.tasks.get_task(9978)
data, _, _, _ = task.get_dataset().get_data()

# Every column except the last forms the feature matrix X; the last column
# is taken as the target vector y.
X, y = data.values[:, :-1], data.values[:, -1]

# Show both shapes, one per line.
print(X.shape, y.shape, sep='\n')

(2534, 72)
(2534,)


In [10]:
# Hold out 33% of the rows for testing. The split is seeded for
# repeatability and stratified on y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.33,
    random_state=42,
    stratify=y,
)

In [11]:
# Depth limit for the trees: the integer part of sqrt(number of feature columns).
n_features = X.shape[1]
max_depth = int(np.sqrt(n_features))
print(max_depth)

8


In [None]:
# Baseline: fit a tree with the desimbalancer option turned off and report
# its evaluation score on the held-out split.
dt = DecisionTree(max_depth=max_depth, desimbalancer=False)
dt.fit(X_train, y_train)

# Bound to `eval_score` rather than `eval` so the builtin eval() is not
# shadowed for the rest of the kernel session.
eval_score = dt.evaluate(X_test, y_test)
print(f'eval: {eval_score}')

# Per-class report on the test split.
dt.ppv(X_test, y_test)

In [None]:
# Fit a tree with the desimbalancer option enabled, then show the per-class
# report on the test split.
dt = DecisionTree(
    max_depth=max_depth,
    desimbalancer=True,
)
dt.fit(X_train, y_train)
dt.ppv(X_test, y_test)

Comparing the available `desimbalancer_func` options

In [12]:
# Baseline run with the desimbalancer turned off, for reference.
dt = DecisionTree(max_depth=max_depth, desimbalancer=False)
dt.fit(X_train, y_train)
# `eval_score` instead of `eval`: avoids shadowing the builtin eval().
eval_score = dt.evaluate(X_test, y_test)
print(f'eval: {eval_score}')
dt.ppv(X_test, y_test)

# One run per desimbalancer function, each followed by a separator line.
# NOTE(review): the output saved with this cell lists 'log' as the third
# function, not 'inv_exp' -- presumably it predates this list; re-run the
# cell to refresh the output.
# The loop variable is `func_name` rather than `f`, so it does not reuse the
# name of the file handle from the API-key cell.
for func_name in ['linear', 'exp', 'inv_exp']:
    dt = DecisionTree(max_depth=max_depth, desimbalancer=True, desimbalancer_func=func_name)
    dt.fit(X_train, y_train)
    print(func_name, end='\n\n')
    eval_score = dt.evaluate(X_test, y_test)
    print(f'eval: {eval_score}')
    dt.ppv(X_test, y_test)
    print('-------------------------')

eval: 0.9366786140979689
1 :
	perc ds: 0.9366786140979689
	precision: 0.9961734693877551
2 :
	perc ds: 0.06332138590203107
	precision: 0.05660377358490566
linear

eval: 0.7084826762246117
1 :
	perc ds: 0.9366786140979689
	precision: 0.7117346938775511
2 :
	perc ds: 0.06332138590203107
	precision: 0.660377358490566
-------------------------
exp

eval: 0.6941457586618877
1 :
	perc ds: 0.9366786140979689
	precision: 0.6964285714285714
2 :
	perc ds: 0.06332138590203107
	precision: 0.660377358490566
-------------------------
log

eval: 0.8805256869772998
1 :
	perc ds: 0.9366786140979689
	precision: 0.9132653061224489
2 :
	perc ds: 0.06332138590203107
	precision: 0.39622641509433965
-------------------------


propositalmente desbalanceando um dataset

In [None]:
# Fetch OpenML task 167120 and load its dataset; only the first of the four
# values returned by get_data() is kept.
task = openml.tasks.get_task(167120)
data, _, _, _ = task.get_dataset().get_data()

# Every column except the last forms the feature matrix X; the last column
# is taken as the target vector y.
X, y = data.values[:, :-1], data.values[:, -1]

# Show both shapes, one per line.
print(X.shape, y.shape, sep='\n')

In [None]:
# Deliberately skew the class balance: each row whose last column equals
# `to_drop` is removed with probability 0.2; every other row is kept.
# NOTE(review): this removes roughly 20% of that class (about 80% of it
# stays) -- confirm this is the intended degree of imbalance.

# First label encountered in y. Unlike list(set(y))[0], this does not depend
# on set iteration order, so the same class is selected on every run.
to_drop = pd.unique(y)[0]

# Seeded generator so the sub-sample, and therefore the scores computed from
# it, are reproducible across runs.
rng = np.random.default_rng(42)

# Vectorised replacement for the row-by-row iterrows() loop; it also avoids
# the deprecated positional `row[-1]` lookup on a label-indexed Series.
last_column = data.iloc[:, -1].to_numpy()
drop_mask = (last_column == to_drop) & (rng.random(len(data)) < .2)

new_data = data.loc[~drop_mask]

In [None]:
# Rebuild features/target from the sub-sampled frame: the last column is the
# target, everything before it is a feature.
X, y = new_data.values[:, :-1], new_data.values[:, -1]

# Seeded 67/33 train/test split (no stratification in this cell).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.33,
    random_state=42,
)

# Depth limit: the integer part of sqrt(number of feature columns).
max_depth = int(np.sqrt(X.shape[1]))

In [None]:
# Tree without the desimbalancer on the sub-sampled data; the evaluation
# result is the cell's displayed value.
dt = DecisionTree(
    max_depth=max_depth,
    desimbalancer=False,
)
dt.fit(X_train, y_train)
dt.evaluate(X_test, y_test)

In [None]:
# Tree with the desimbalancer enabled on the sub-sampled data; the
# evaluation result is the cell's displayed value.
dt = DecisionTree(
    max_depth=max_depth,
    desimbalancer=True,
)
dt.fit(X_train, y_train)
dt.evaluate(X_test, y_test)

-- # --

In [None]:
# Quick check of the tree on the iris CSV.
df = pd.read_csv('datasets/iris.csv')
#df = categorize_continuous_values(df, ['sepallength', 'sepalwidth', 'petallength', 'petalwidth'])

# Features are every column except the first and the last; the target is the
# 'class' column.
# NOTE(review): presumably the first column is a row id and 'class' is the
# last column -- confirm against the CSV.
X = df.values[:, 1:-1]
y = df['class'].values

# Seeded 67/33 train/test split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.33, random_state=42)

# `cols` is computed but not passed to the tree in this cell.
cols = df.columns[1:-1]
dt = DecisionTree(max_depth=5)
dt.fit(X_train, y_train)

print('iris:\n')
# evaluate() is not the cell's last expression, so its return value would be
# discarded without being shown; print it explicitly.
print(f'eval: {dt.evaluate(X_test, y_test)}')
dt.predict(X_test)


In [None]:
# Quick check of the tree on the penguins CSV, keeping only rows without
# missing values.
df = pd.read_csv('datasets/penguins_size.csv').dropna(axis=0)

#df = categorize_continuous_values(df, ['culmen_length_mm','culmen_depth_mm','flipper_length_mm','body_mass_g'])

# Names of the feature columns: everything except the first and last column.
cols = df.columns[1:-1]

# Matching feature matrix, with the 'species' column as the target.
X, y = df.values[:, 1:-1], df['species'].values

# Seeded 67/33 train/test split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.33,
    random_state=42,
)

# The column names are handed to the tree as its first positional argument.
dt = DecisionTree(cols, max_depth=3)
dt.fit(X_train, y_train)

print('penguins:\n')
dt.print()
dt.evaluate(X_test, y_test)


In [None]:
import openml

# Configure the OpenML client with the key stored in apikey.txt.
# The content is stripped because text files usually end with a newline,
# which f.read() would otherwise keep as part of the key.
with open('apikey.txt', 'r') as f:
    openml.config.apikey = f.read().strip()

task = openml.tasks.get_task(3913)

# Run an unfitted tree on OpenML task 3913 through the OpenML runner; the
# resulting run object is kept in `run`.
dt = DecisionTree(max_depth=8)
run = openml.runs.run_model_on_task(dt, task)