In [1]:
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

import preprocessing as pre
from decision_tree import DecisionTree

In [15]:
# Load the training file and run it through the project's preprocessing
# helper, which returns the processed features together with a `pca` object.
x, y = pre.load_data('../data/fashion_train.npy')
x_processed, pca = pre.preprocess(x)

# NOTE(review): preprocessing sees every row before the split below, so the
# hold-out rows may have influenced the `pca` object — confirm whether the
# split should happen first.

# Stratified 80/20 hold-out split with a fixed seed.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(x_processed, y, **split_options)


In [16]:
# Persist the `pca` object returned by pre.preprocess so later cells can load
# it from disk instead of recomputing it.
# open(..., 'wb') does not create missing parent directories, so make sure
# ./models exists first (otherwise a fresh checkout fails with FileNotFoundError).
os.makedirs('./models', exist_ok=True)

with open('./models/pca_65.pkl', 'wb') as f:
    pickle.dump(pca, f)

# Find the best hyperparameters

In [7]:
# Grid search over tree depth and minimum split size.
#
# NOTE(review): the search tunes scikit-learn's DecisionTreeClassifier, while
# the final model is the project's own DecisionTree — this presumes both
# implementations react to these hyperparameters the same way; confirm.

from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit
from sklearn.tree import DecisionTreeClassifier

max_depth = [3, 5, 7, 9, 11]
min_samples_split = [2, 3, 4, 5]

param_grid = dict(max_depth=max_depth, min_samples_split=min_samples_split)

# Five stratified shuffle splits, each holding out 20% of X_train, seeded so
# the folds are the same on every run.
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

# Seed the estimator as well: scikit-learn permutes features at every split,
# so an unseeded tree can pick different tied splits from run to run. This
# also matches the seeded sklearn model trained later in the notebook.
grid = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid=param_grid, cv=cv)
grid.fit(X_train, y_train)

print(
    "The best parameters are %s with a score of %0.4f"
    % (grid.best_params_, grid.best_score_)
)





The best parameters are {'max_depth': 7, 'min_samples_split': 5} with a score of 0.7814


# Train the final model with the best hyperparameters

In [13]:
# Fit the project's DecisionTree with the settings the grid search reported.
tree_settings = {'max_depth': 7, 'min_samples_split': 5}
dt = DecisionTree(**tree_settings)
dt.fit(X_train, y_train)

In [4]:
# Score the fitted tree on the hold-out split produced by train_test_split
# (nothing is loaded from disk in this cell).
pred = dt.predict(X_test)

# Fraction of hold-out rows whose prediction matches the label.
n_correct = np.sum(pred == y_test)
acc = n_correct / len(y_test)

print(f'Accuracy: {acc*100:.2f}%')

Accuracy: 78.55%


In [5]:
# Persist the fitted custom tree so the evaluation cell can reload it.
# open(..., 'wb') does not create missing parent directories, so make sure
# ./models exists first (otherwise a fresh checkout fails with FileNotFoundError).
os.makedirs('./models', exist_ok=True)

with open('./models/decision_tree_model.pkl', 'wb') as f:
    pickle.dump(dt, f)

# Compute the accuracy and std

In [19]:
# Bootstrap estimate of test-set accuracy (mean ± std) for the pickled custom tree.
x, y = pre.load_data('../data/fashion_test.npy')

# Context managers close the pickle files; a bare open() leaks the handles.
# Both pickles are written earlier in this notebook — never unpickle files
# from an untrusted source.
with open('./models/pca_65.pkl', 'rb') as f:
    pca_65 = pickle.load(f)

# Passing the stored object presumably applies the already-computed projection
# instead of fitting a new one — confirm in preprocessing.py.
x_processed = pre.preprocess(x, pca=pca_65)

with open('models/decision_tree_model.pkl', 'rb') as f:
    model = pickle.load(f)

# Features in the leading columns and the label in the last column ('y'), so
# sampling rows keeps every feature vector paired with its label.
df = pd.DataFrame(x_processed)
df['y'] = y

# Seeded generator so the resampling, and therefore the reported numbers,
# are the same on every run.
rng = np.random.RandomState(42)

acc_list = []

# 1000 bootstrap replicates of 1000 rows each, drawn with replacement.
# Loop-local names are kept distinct from x / y so the loaded data survives.
for _ in range(1000):
    sample = df.sample(1000, replace=True, random_state=rng)
    X_boot = sample.iloc[:, :-1].to_numpy()
    y_boot = sample.iloc[:, -1].to_numpy()
    preds = model.predict(X_boot)
    acc_list.append(np.sum(preds == y_boot) / len(y_boot))

print(f'accuracy: {round(np.mean(acc_list)*100, 2)}% ± {round(np.std(acc_list)*100, 2)}%')

accuracy: 76.93% ± 1.33%


# Compare with sklearn Decision tree

# Train

In [20]:
from sklearn.tree import DecisionTreeClassifier

# Reference model: scikit-learn's tree with the same depth / split settings as
# the custom DecisionTree, trained on the same split.
sklearn_settings = dict(max_depth=7, min_samples_split=5, criterion='gini', random_state=42)
sklearn_dt = DecisionTreeClassifier(**sklearn_settings)
sklearn_dt.fit(X_train, y_train)

# Hold-out accuracy, computed the same way as for the custom tree.
pred = sklearn_dt.predict(X_test)
n_correct = np.sum(pred == y_test)
acc = n_correct / len(y_test)

print(f'Accuracy: {acc*100:.2f}%')


Accuracy: 78.50%


# Compute the accuracy and std for sklearn model

In [21]:
# Bootstrap estimate of test-set accuracy (mean ± std) for the scikit-learn tree.
x, y = pre.load_data('../data/fashion_test.npy')

# Context manager closes the pickle file; a bare open() leaks the handle.
# The pickle is written earlier in this notebook — never unpickle files from
# an untrusted source.
with open('./models/pca_65.pkl', 'rb') as f:
    pca_65 = pickle.load(f)

# Passing the stored object presumably applies the already-computed projection
# instead of fitting a new one — confirm in preprocessing.py.
x_processed = pre.preprocess(x, pca=pca_65)

model = sklearn_dt

# Build the frame from the features computed in this cell. The previous
# version wrapped a `df` left over from another cell, which fails on a fresh
# kernel and silently ignores x_processed.
df = pd.DataFrame(x_processed)
df['y'] = y

# Seeded generator so the resampling, and therefore the reported numbers,
# are the same on every run.
rng = np.random.RandomState(42)

acc_list = []

# 1000 bootstrap replicates of 1000 rows each, drawn with replacement.
# Loop-local names are kept distinct from x / y so the loaded data survives.
for _ in range(1000):
    sample = df.sample(1000, replace=True, random_state=rng)
    X_boot = sample.iloc[:, :-1].to_numpy()
    y_boot = sample.iloc[:, -1].to_numpy()
    preds = model.predict(X_boot)
    acc_list.append(np.sum(preds == y_boot) / len(y_boot))

print(f'accuracy: {round(np.mean(acc_list)*100, 2)}% ± {round(np.std(acc_list)*100, 2)}%')

accuracy: 77.03% ± 1.32%
