In [1]:
import sys
sys.path.append('..\\models')

import pandas as pd
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from matplotlib import pyplot as plt
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
import numpy as np
# from tqdm import tqdm
from tqdm.notebook import tqdm
# from tqdm.contrib.itertools import product
from itertools import product

from ID3 import ID3
from NBC_Categorical import NBC_Categorical
from RandomForestClf import RandomForestClf
from datasets import get_airline_dataset, get_exams_dataset, get_ecommerce_dataset


In [2]:
def cross_validation_score(X, y, model, n_splits=5):
    """Return the mean k-fold cross-validation score of ``model`` on ``(X, y)``.

    The data is shuffled with a fixed seed (``random_state=42``), so the folds
    are identical on every call. For each fold the model is re-fitted on the
    training part and scored on the held-out part via ``model.score``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table, one row per sample.
    y : pandas.Series
        Target values, positionally aligned with the rows of ``X``.
    model : object
        Any object exposing ``fit(X, y)`` and ``score(X, y)``.
    n_splits : int, default 5
        Number of folds.

    Returns
    -------
    float
        Average of the per-fold scores, rounded to 4 decimal places.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    scores = []
    for train_index, test_index in kf.split(X, y):
        # KFold yields *positional* indices, so rows must be selected with
        # .iloc; label-based .loc only works by accident on a 0..n-1 index.
        train_X, train_y = X.iloc[train_index, :], y.iloc[train_index]
        test_X, test_y = X.iloc[test_index, :], y.iloc[test_index]
        model.fit(train_X, train_y)
        scores.append(model.score(test_X, test_y))

    return round(sum(scores) / len(scores), 4)

def test_accuracy(X, y, model):
    return model.score(X, y)


def get_conf_matrix(y_test, y_pred, labels):
    """Compute and display the confusion matrix of ``y_pred`` against ``y_test``.

    ``labels`` is used only as the tick labels of the plot; it is not passed to
    ``confusion_matrix``.
    NOTE(review): the caller must therefore supply ``labels`` in the same order
    ``confusion_matrix`` arranges the classes - confirm at the call sites.
    """
    matrix = confusion_matrix(y_test, y_pred)
    ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=labels).plot()
    plt.show()

# Plan eksperymentów

Dla każdego zbioru danych:

Budowa lasu losowego z parametrami:
- jaki procent liczby atrybutów jest losowany: 25%, 50%, 75%
- jaki procent przykładów jest losowany: 25%, 50%, 75%
- proporcje liczby ID3 do NBC: 25:75, 50:50, 75:25, 100:0
- liczba klasyfikatorów: 64, 96, 128

Czyli razem 3 (zbiory danych) \* 3 \* 3 \* 4 \* 3 = 324 doświadczenia, każde powtórzone 25 razy i uśrednione

Dla najlepszego znalezionego modelu rysujemy confusion matrix

# Experiment template

In [4]:
class RandomForrest:
    """Stub model with a constant score, usable wherever a real model is expected.

    It accepts the same ``fit(X, y)`` / ``score(X, y)`` calls that the
    experiment code makes on real models, but ignores the data entirely.
    """

    def __init__(self) -> None:
        """Create the stub; it holds no state."""

    def fit(self, X=None, y=None):
        """Do nothing and return ``self``; present so the stub can be 'trained'."""
        return self

    def score(self, X=None, y=None):
        """Return the fixed dummy score ``12.3``, ignoring any arguments."""
        return 12.3

In [11]:
# How many times each parameter combination is evaluated before averaging.
experiment_repetitions = 25

# (dataset name, loader function) pairs iterated by the experiment loop.
# For a quick single-dataset run use: [("exams", get_exams_dataset)]
dataset_loadDataset_valMethod = [
    ("exams", get_exams_dataset),
    ("e-commerce", get_ecommerce_dataset),
    ("airline", get_airline_dataset),
]

# Hyper-parameter grids; the experiment loop takes their Cartesian product.
model_param_attribute_part = [0.25, 0.5, 0.75]
model_param_instances_per_classifier = [0.25, 0.5, 0.75]
model_param_id3_to_NBC = [
    (25, 75),
    (50, 50),
    (75, 25),
    (100, 0),
]
model_param_num_of_classifiers = [64, 96, 128]

In [8]:
# Grid-search driver: for every dataset and every parameter combination, run
# the experiment `experiment_repetitions` times, average the scores and store
# one result row. Results are written to "result.csv" after each dataset.
result_columns = ["model", "accuracy", "dataset", "attribute part", "instances per classifier", "id3 to NBC", "num of classifiers"]
models_results_df = pd.DataFrame(columns=result_columns)

# Rows are collected in a plain list and turned into a DataFrame once per
# dataset, instead of growing the DataFrame one row at a time.
result_rows = []

# Defined before the loops so that it exists on every code path; it used to be
# assigned only in the cross-validation branch, which made an airline-only (or
# airline-first) run fail with a NameError.
model_name = "model"

# The parameter grid does not depend on the dataset, so build it only once.
params_prod = list(product(model_param_attribute_part,
                           model_param_instances_per_classifier,
                           model_param_id3_to_NBC,
                           model_param_num_of_classifiers))

for dataset_name, load_func in dataset_loadDataset_valMethod:
    print("DATASET:", dataset_name)
    if dataset_name == "airline":
        # The airline data is loaded as separate train and test parts, so it is
        # evaluated on the test part instead of being cross-validated.
        (X_train, y_train), (X_test, y_test) = get_airline_dataset("train"), get_airline_dataset("test")
    else:
        X, y = load_func()

    for m_p_attribute_part, m_p_instances_per_classifier, m_p_id3_to_NBC, m_p_num_of_classifiers in tqdm(params_prod, total=len(params_prod), desc="Parameters variations", position=0):

        scores = []
        # Experiment repetitions loop
        for _repetition in tqdm(range(experiment_repetitions), total=experiment_repetitions, desc="Experiment repetitions", position=1, leave=False):
            # NOTE(review): the m_p_* grid values are recorded in the results
            # but are not passed to the model - NBC_Categorical() looks like a
            # placeholder for the random forest. TODO: build the real model
            # from the current parameter combination.
            model = NBC_Categorical()
            if dataset_name == "airline":
                model.fit(X_train, y_train)
                score = model.score(X_test, y_test)
            else:
                score = cross_validation_score(X, y, model)

            scores.append(score)

        final_score = np.mean(scores)
        result_rows.append([model_name,
                            final_score,
                            dataset_name,
                            m_p_attribute_part,
                            m_p_instances_per_classifier,
                            m_p_id3_to_NBC,
                            m_p_num_of_classifiers])

    # Checkpoint after every dataset so partial results survive an interruption.
    models_results_df = pd.DataFrame(result_rows, columns=result_columns)
    models_results_df.to_csv("result.csv", index=False)
                        



DATASET: exams


 outer:   0%|          | 0/1 [00:00<?, ?it/s]

 inner loop:   0%|          | 0/25 [00:00<?, ?it/s]

DATASET: e-commerce


 outer:   0%|          | 0/1 [00:00<?, ?it/s]

 inner loop:   0%|          | 0/25 [00:00<?, ?it/s]

DATASET: airline


 outer:   0%|          | 0/1 [00:00<?, ?it/s]

 inner loop:   0%|          | 0/25 [00:00<?, ?it/s]

## Exams

In [11]:
# Baseline on the exams dataset: fit NBC and ID3 on one train/test split.
X, y = get_exams_dataset()
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) 

print("\nNaive Bayes Classifier:")
nbc_classifier = NBC_Categorical()
nbc_classifier.fit(X_train, y_train)
# NOTE(review): judging by the saved output, eval() prints "Accuracy: ..." and
# returns the value - confirm in the model classes.
nbc_classifier.eval(X_test, y_test)

print("\nID3:")
id3_tree = ID3()
id3_tree.fit(X_train, y_train)
# Last expression of the cell, so its return value is shown as the cell output.
id3_tree.eval(X_test, y_test)


Naive Bayes Classifier:
Accuracy: 0.2633333333333333

ID3:
Accuracy: 0.25


0.25

In [44]:
# Inspect the raw ID3 predictions on the exams test split.
# NOTE(review): X_train, y_train and X_test are not defined in this cell; they
# must already exist in the kernel from an earlier cell.
print("\nID3:")
id3_tree = ID3()
id3_tree.fit(X_train, y_train)
# NOTE(review): the saved output contains a None among the predictions -
# presumably predict() has no answer for some samples; verify in ID3.
np.array(id3_tree.predict(X_test))


ID3:


array(['2', '3.5', '2', '3.5', '3.5', '3.5', '3.5', '3.5', '2', '2', '2',
       '4', '2', '2', '3', '4', '4', '3.5', '3.5', '2', '3.5', '4.5', '2',
       '4', '4', '2', '2', '4', '3', '4.5', '2', '3.5', '2', '3', '3.5',
       '3', '4', '4', '3.5', '2', '4', '4', '2', '4', '2', '3', '3',
       '3.5', '2', '3.5', '2', '3.5', '4', '3.5', '4.5', '3.5', '2', '2',
       '3.5', '4', '4', '2', '2', '2', '3.5', '4', '2', '4.5', '4.5',
       '4.5', '3', '4', '4', '4', '4.5', '3.5', '2', '4', '3', '3.5',
       '3.5', '4', '4.5', '3.5', '5', '3', '4', '4.5', '2', '4', '3.5',
       '4', '3', '3.5', '3.5', '4.5', '4', '5', '2', '2', '3.5', '3.5',
       '3', '2', '2', '3.5', '4', '4', '3', '4.5', '2', '2', '3', '4',
       '2', '2', '4', '3.5', '3.5', '4', '3.5', '3.5', '3', '3', '3', '2',
       '2', '3', '2', '3.5', '3', '3.5', '2', '3.5', '3.5', '3', '2',
       '3.5', '3', '4', '2', '3.5', None, '3.5', '3.5', '4', '4', '3.5',
       '3.5', '4.5', '4', '2', '3.5', '2', '3', '4', '3.5', '4

In [3]:
# Random forest on the exams dataset, using the same 70/30 split as the baselines.
X, y = get_exams_dataset()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) 

print("\nRandom Forest:")
# NOTE(review): n_clf is presumably the number of base classifiers - confirm in
# RandomForestClf.
random_forest = RandomForestClf(n_clf=6)
random_forest.fit(X_train, y_train)
random_forest.eval(X_test, y_test)



Naive Bayes Classifier:
Accuracy: 0.13666666666666666


0.13666666666666666

## E-commerce

In [3]:
# Baseline on the e-commerce dataset: fit NBC and ID3 on one train/test split.
X, y = get_ecommerce_dataset()
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) 

print("\nNaive Bayes Classifier:")
nbc_classifier = NBC_Categorical()
nbc_classifier.fit(X_train, y_train)
nbc_classifier.eval(X_test, y_test)

# Same arguments and seed as the split above, so this yields the same partition.
# NOTE(review): presumably repeated to hand ID3 fresh frames in case fitting
# NBC modified them - confirm, otherwise this line is redundant.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) 

print("\nID3:")
id3_tree = ID3()
id3_tree.fit(X_train, y_train)
id3_tree.eval(X_test, y_test)


Naive Bayes Classifier:
Accuracy: 0.6915151515151515

ID3:


KeyboardInterrupt: 

## Airline passenger satisfaction

In [4]:
# Baseline on the airline dataset, which is loaded as separate "train" and
# "test" parts instead of being split here.
(X_train, y_train), (X_test, y_test) = get_airline_dataset("train"), get_airline_dataset("test")

print("\nNaive Bayes Classifier:")
nbc_classifier = NBC_Categorical()
nbc_classifier.fit(X_train, y_train)
nbc_classifier.eval(X_test, y_test)

# The dataset is loaded a second time before ID3.
# NOTE(review): presumably to hand ID3 fresh frames in case fitting NBC
# modified them - confirm, otherwise this reload is redundant.
(X_train, y_train), (X_test, y_test) = get_airline_dataset("train"), get_airline_dataset("test")

print("\nID3:")
id3_tree = ID3()
id3_tree.fit(X_train, y_train)
id3_tree.eval(X_test, y_test)


Naive Bayes Classifier:


KeyboardInterrupt: 