In [14]:
# Import the project modules, then force a reload so that edits made to the
# .py files on disk are picked up without restarting the kernel.
import importlib
import tree
import random_forest

# NOTE(review): tree is reloaded before random_forest, presumably because
# random_forest depends on tree; confirm before reordering.
importlib.reload(tree)
# Last expression of the cell: the reloaded module object is echoed as output.
importlib.reload(random_forest)

<module 'random_forest' from '/home/user/uma-random-forest/src/random_forest.py'>

In [18]:
from datasets.mushrooms import MushroomDataset
from random_forest import RandomForestClassifier, TournamentRandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, ConfusionMatrixDisplay,f1_score, precision_score, recall_score
import math, time
from tqdm import tqdm
import numpy as np

# Load dataset

In [7]:
# Load the mushroom dataset, run its clean() step and split it into
# train / validation parts with a fixed seed.
path = "../data/mushroom/agaricus-lepiota.data"
dataset = MushroomDataset(path=path)
dataset.clean()
X_train, X_val, y_train, y_val = dataset.split(test_size=0.2, random_state=42)

# Sanity checks: fail here, not later as puzzling metrics, if the split is
# misaligned or one of the partitions is empty.
assert len(X_train) == len(y_train), "train features/labels length mismatch"
assert len(X_val) == len(y_val), "validation features/labels length mismatch"
assert len(X_train) > 0 and len(X_val) > 0, "split produced an empty partition"

In [8]:
# Number of features offered to each tree: the square root of the number of
# columns in the training matrix, rounded to the nearest integer.
feature_count = X_train.shape[1]
n_features = round(math.sqrt(feature_count))

# Random forest classifier

In [5]:
# Hyper-parameter grid for the plain random forest; max_features is fixed to
# the single value computed from the training matrix.
params_matrix = dict(
    n_trees=[100, 200, 300],
    max_depth=[3, 5, 7],
    max_features=[n_features],
)

In [49]:
# Reload the experiments helper so the current on-disk grid_search is used.
from utils import experiments
importlib.reload(experiments)
from utils.experiments import grid_search

n_calls = 10

# Run the grid search for the plain random forest and report its outcome.
search_outcome = grid_search(
    params_matrix,
    RandomForestClassifier,
    X_train,
    X_val,
    y_train,
    y_val,
    n_calls,
)
best_params, score, all_results = search_outcome
print(
    "\n".join(
        [
            f"Best params: {best_params}",
            f"Best score: {score}",
            f"All rEsults: {all_results}",
        ]
    )
)

INFO:root:DecisionTreeClassifier(max_depth=3) created


INFO:root:Node(split_feature=None, split_val=None, depth=3) created
INFO:root:Node(split_feature=None, split_val=None, depth=3) created
INFO:root:Node(split_feature=21, split_val=1.0, depth=2) created
INFO:root:Node(split_feature=None, split_val=None, depth=3) created
INFO:root:Node(split_feature=None, split_val=None, depth=3) created
INFO:root:Node(split_feature=4, split_val=4.25, depth=2) created
INFO:root:Node(split_feature=8, split_val=7.0, depth=1) created
INFO:root:Node(split_feature=None, split_val=None, depth=3) created
INFO:root:Node(split_feature=None, split_val=None, depth=3) created
INFO:root:Node(split_feature=0, split_val=1.25, depth=2) created
INFO:root:Node(split_feature=None, split_val=None, depth=3) created
INFO:root:Node(split_feature=None, split_val=None, depth=3) created
INFO:root:Node(split_feature=20, split_val=1.25, depth=2) created
INFO:root:Node(split_feature=10, split_val=2.25, depth=1) created
INFO:root:Node(split_feature=11, split_val=1.0, depth=0) created


Best params: {'n_trees': 100, 'max_depth': 5, 'max_features': 5}
Accuracy: 0.8516923076923076


## Wyniki

params_matrix = {
    "n_trees": [100, 200, 300],
    "max_depth": [3, 5, 7],
    "max_features": [n_features],
}

wyniki:

Best params: {'n_trees': 100, 'max_depth': 5, 'max_features': 5}
Accuracy: 0.8516923076923076

Num of test samples: 1625
Accuracy: 0.7956923076923077
Precision: 0.7174468085106382
Recall: 1.0
F1: 0.8354806739345887


In [None]:
# Pin the hyper-parameters recorded in the results section above, so the
# evaluation below can be run without repeating the grid search.
best_params = dict(n_trees=100, max_depth=5, max_features=5)

In [54]:
# Train a fresh RandomForestClassifier n_calls times with best_params and
# average the validation metrics over the runs.
# NOTE(review): presumably repeated because training is stochastic; confirm.
accuracy_arr, precision_arr, recall_arr, f1_arr = [], [], [], []
for _ in tqdm(range(n_calls)):
    rf = RandomForestClassifier(**best_params)
    rf.fit(X_train, y_train)
    # predict() is called once per validation sample.
    y_preds = [rf.predict(x) for x in X_val]

    accuracy_arr.append(accuracy_score(y_val, y_preds))
    precision_arr.append(precision_score(y_val, y_preds))
    recall_arr.append(recall_score(y_val, y_preds))
    f1_arr.append(f1_score(y_val, y_preds))

# Mean of each metric across all runs.
accuracy = np.mean(accuracy_arr)
precission = np.mean(precision_arr)
recall = np.mean(recall_arr)
f1 = np.mean(f1_arr)


print("Random forest:\n")
print("Num of test samples: " + str(len(X_val)))
print(f"Accuracy: {accuracy}")
print(f"Precission: {precission}")
print(f"Recall: {recall}")
print(f"F1: {f1}")
# The display object was previously only constructed and never drawn; plot()
# renders it and returns the same ConfusionMatrixDisplay instance.
# Note: y_preds holds the predictions of the last run only, so the matrix
# shows that single run, not an average over all runs.
conf_matix = ConfusionMatrixDisplay(confusion_matrix(y_val, y_preds)).plot()

INFO:root:DecisionTreeClassifier(max_depth=5) created


INFO:root:Node(split_feature=None, split_val=None, depth=5) created
INFO:root:Node(split_feature=None, split_val=None, depth=5) created
INFO:root:Node(split_feature=21, split_val=1.0, depth=4) created
INFO:root:Node(split_feature=None, split_val=None, depth=4) created
INFO:root:Node(split_feature=1, split_val=1.0, depth=3) created
INFO:root:Node(split_feature=None, split_val=None, depth=5) created
INFO:root:Node(split_feature=None, split_val=None, depth=5) created
INFO:root:Node(split_feature=19, split_val=4.0, depth=4) created
INFO:root:Node(split_feature=None, split_val=None, depth=4) created
INFO:root:Node(split_feature=13, split_val=1.0, depth=3) created
INFO:root:Node(split_feature=10, split_val=1.0, depth=2) created
INFO:root:Node(split_feature=None, split_val=None, depth=2) created
INFO:root:Node(split_feature=4, split_val=3.0, depth=1) created
INFO:root:Node(split_feature=None, split_val=None, depth=3) created
INFO:root:Node(split_feature=None, split_val=None, depth=3) created


Random forest:

Num of test samples: 1625
Accuracy: 0.7956923076923077
Precission: 0.7174468085106382
Recall: 1.0
F1: 0.8354806739345887


# Tournament Random forest

In [12]:
# Hyper-parameter grid for the tournament random forest: the same tree
# settings as before plus the tournament size.
params_matrix = dict(
    n_trees=[100, 200, 300],
    max_depth=[3, 5, 7],
    tournament_size=[3, 5, 7],
    max_features=[n_features],
)

In [16]:
import importlib
from utils import experiments

# Reload the experiments helper so the current on-disk grid_search is used.
importlib.reload(experiments)
from utils.experiments import grid_search

n_calls = 10

# Run the grid search for the tournament variant and report its outcome.
grid_args = (
    params_matrix,
    TournamentRandomForestClassifier,
    X_train,
    X_val,
    y_train,
    y_val,
    n_calls,
)
best_params, score, all_results = grid_search(*grid_args)
report_lines = (
    f"Best params: {best_params}",
    f"Best score: {score}",
    f"All rEsults: {all_results}",
)
for report_line in report_lines:
    print(report_line)

100%|██████████| 1/1 [01:26<00:00, 86.28s/it]

Best params: {'n_trees': 10, 'max_depth': 5, 'tournament_size': 7, 'max_features': 5}
Best score: 0.6236307692307693
All rEsults: [{'n_trees': 10, 'max_depth': 5, 'tournament_size': 7, 'max_features': 5, 'accuracy': 0.6236307692307693, 'precision': 0.584507137486008, 'recall': 1.0, 'f1': 0.7362310462544407}]





## Wyniki

All results: [{'n_trees': 10, 'max_depth': 5, 'tournament_size': 7, 'max_features': 5, 'accuracy': 0.6236307692307693, 'precision': 0.584507137486008, 'recall': 1.0, 'f1': 0.7362310462544407}]


In [21]:
# Train a fresh TournamentRandomForestClassifier n_calls times with
# best_params and average the validation metrics over the runs.
accuracy_arr, precision_arr, recall_arr, f1_arr = [], [], [], []
# Each metric function is paired with the list that collects its per-run value.
metric_sinks = (
    (accuracy_score, accuracy_arr),
    (precision_score, precision_arr),
    (recall_score, recall_arr),
    (f1_score, f1_arr),
)
for i in tqdm(range(n_calls)):
    trf = TournamentRandomForestClassifier(**best_params)
    trf.fit(X_train, y_train)
    # predict() is called once per validation sample.
    y_preds = [trf.predict(x) for x in X_val]

    for metric_fn, sink in metric_sinks:
        sink.append(metric_fn(y_val, y_preds))

# Mean of each metric across all runs.
accuracy, precission, recall, f1 = (
    np.mean(run_scores)
    for run_scores in (accuracy_arr, precision_arr, recall_arr, f1_arr)
)


print("Num of test samples: " + str(len(X_val)))
for summary_line in (
    f"Accuracy: {accuracy}",
    f"Precission: {precission}",
    f"Recall: {recall}",
    f"F1: {f1}",
):
    print(summary_line)

100%|██████████| 10/10 [00:42<00:00,  4.28s/it]

Num of test samples: 1625
Accuracy: 0.5828923076923077
Precission: 0.5574699562731642
Recall: 1.0
F1: 0.7149059271888879



