In [1]:
import numpy as np
import preprocessing as pre
from decision_tree import DecisionTree
import pandas as pd

In [2]:
#load Training data
df = np.load('../data/fashion_train.npy') 

df = pre.preprocess(df)

X_train = df[:7000,:-1]
y_train = df[:7000,-1]
X_test = df[7000:,:-1]
y_test = df[7000:,-1]



# Find best hyperparameter

In [3]:
depths = [3, 5, 7, 9, 12]

best_HP = {

}

for depth in depths:
    dt = DecisionTree(max_depth=depth, min_samples_split=2)
    dt.fit(X_train, y_train)
    pred = dt.predict(X_test)
    acc = np.sum(pred == y_test) / len(y_test)
    print(f'Accuracy: {acc*100:.2f}%')
    best_HP[acc] = depth


Accuracy: 71.53%
Accuracy: 76.67%
Accuracy: 76.70%
Accuracy: 77.00%
Accuracy: 76.73%


In [4]:
best_hp = best_HP[max(best_HP.keys())]
print(f'Best depth: {best_hp}')

Best depth: 9


# best depth
- it was 9


# Make final model with best parameter

In [5]:
#make final model
df = np.load('../data/fashion_train.npy') 

df = pre.preprocess(df)

X_train = df[:,:-1]
y_train = df[:,-1]


dt = DecisionTree(max_depth=9, min_samples_split=2)
dt.fit(X_train, y_train)

In [6]:
#load test data
df = np.load('../data/fashion_test.npy')

df = pre.preprocess(df)

x = df[:,:-1]
y = df[:,-1]

pred = dt.predict(x)

acc = np.sum(pred == y) / len(y)

print(f'Accuracy: {acc*100:.2f}%')

Accuracy: 77.78%


In [7]:
import pickle

with open('./models/decision_tree_model.pkl', 'wb') as f:
    pickle.dump(dt, f)

# Compute the accuracy and std

In [8]:
df = np.load('../data/fashion_test.npy')

df = pre.preprocess(df)

model = pickle.load(open('models/decision_tree_model.pkl', 'rb'))

df = pd.DataFrame(df)

acc_list = []

for x in range(0, 1000):
    sample = df.sample(1000, replace=True)
    X = sample.iloc[:,:-1].to_numpy()
    y = sample.iloc[:,-1].to_numpy()
    preds = model.predict(X)
    accuracy = np.sum(preds == y) / len(y)
    acc_list.append(accuracy)

print(f'accuracy: {round(np.mean(acc_list)*100, 2)}% ± {round(np.std(acc_list)*100, 2)}%')

accuracy: 77.75% ± 1.32%


# Compare with sklearn Decision tree

# Train

In [9]:
from sklearn.tree import DecisionTreeClassifier

df = np.load('../data/fashion_train.npy') 

df = pre.preprocess(df)

X_train = df[:7000,:-1]
y_train = df[:7000,-1]
X_test = df[7000:,:-1]
y_test = df[7000:,-1]

sklearn_dt = DecisionTreeClassifier(max_depth=9, min_samples_split=2, criterion='gini', random_state=42)
sklearn_dt.fit(X_train, y_train)

pred = sklearn_dt.predict(X_test)

acc = np.sum(pred == y_test) / len(y_test)

print(f'Accuracy: {acc*100:.2f}%')


Accuracy: 76.63%


# Compute the accuracy and std for sklearn model

In [10]:
df = np.load('../data/fashion_test.npy')

df = pre.preprocess(df)

model = sklearn_dt

df = pd.DataFrame(df)

acc_list = []

for x in range(0, 1000):
    sample = df.sample(1000, replace=True)
    X = sample.iloc[:,:-1].to_numpy()
    y = sample.iloc[:,-1].to_numpy()
    preds = model.predict(X)
    accuracy = np.sum(preds == y) / len(y)
    acc_list.append(accuracy)

print(f'accuracy: {round(np.mean(acc_list)*100, 2)}% ± {round(np.std(acc_list)*100, 2)}%')

accuracy: 75.96% ± 1.31%
