In [16]:
import numpy as np
import preprocessing as pre
from decision_tree import DecisionTree
import pandas as pd
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [10]:
#load Training data

X,y = pre.load_data("data/fashion_train.npy")
X_processed,pca = pre.preprocess(X)

# split the data
X_train, X_val, y_train, y_val = train_test_split(X_processed,y,stratify=y,test_size=0.3,random_state=42)

# Find best hyperparameter

In [21]:
# Create a Decision Tree classifier
clf = DecisionTreeClassifier()

# Define the parameter grid
param_grid = {
    'max_depth': [None, 2, 5, 6, 7, 8, 9],
    'min_samples_split': [2, 3, 5]
}

# Perform Grid Search with Cross Validation
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best parameters and best estimator
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_

print("Best Parameters:", best_params)

# Evaluate the model with best parameters
y_pred = best_estimator.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Fitting 5 folds for each of 21 candidates, totalling 105 fits
Best Parameters: {'max_depth': 6, 'min_samples_split': 2}
Accuracy: 0.778


# Make final model with best parameter

In [None]:
model = DecisionTree(max_depth=6,min_samples_split=2)
model.fit(X_train,y_train)

# Compute the accuracy and std

In [178]:
df = np.load('../data/fashion_test.npy')

df = pre.preprocess(df)

model = pickle.load(open('models/model_decision_tree.pkl', 'rb'))

df = pd.DataFrame(df)

acc_list = []

for x in range(0, 1000):
    sample = df.sample(1000, replace=True)
    X = sample.iloc[:,:-1].to_numpy()
    y = sample.iloc[:,-1].to_numpy()
    preds = model.predict(X)
    accuracy = np.sum(preds == y) / len(y)
    acc_list.append(accuracy)

print(f'accuracy: {round(np.mean(acc_list)*100, 2)}% ± {round(np.std(acc_list)*100, 2)}%')

accuracy: 77.86% ± 1.27%


# Compare with sklearn Decision tree

# Train

In [183]:
from sklearn.tree import DecisionTreeClassifier

df = np.load('../data/fashion_train.npy') 

df = pre.preprocess(df)

X_train = df[:7000,:-1]
y_train = df[:7000,-1]
X_test = df[7000:,:-1]
y_test = df[7000:,-1]

sklearn_dt = DecisionTreeClassifier(max_depth=9, min_samples_split=2, criterion='gini', random_state=42)
sklearn_dt.fit(X_train, y_train)

pred = sklearn_dt.predict(X_test)

acc = np.sum(pred == y_test) / len(y_test)

print(f'Accuracy: {acc*100:.2f}%')


Accuracy: 77.27%


# Compute the accuracy and std for sklearn model

In [184]:
df = np.load('../data/fashion_test.npy')

df = pre.preprocess(df)

model = sklearn_dt

df = pd.DataFrame(df)

acc_list = []

for x in range(0, 1000):
    sample = df.sample(1000, replace=True)
    X = sample.iloc[:,:-1].to_numpy()
    y = sample.iloc[:,-1].to_numpy()
    preds = model.predict(X)
    accuracy = np.sum(preds == y) / len(y)
    acc_list.append(accuracy)

print(f'accuracy: {round(np.mean(acc_list)*100, 2)}% ± {round(np.std(acc_list)*100, 2)}%')

accuracy: 76.05% ± 1.36%
