# TrainClassifier

+ Faz o treinamento e a avaliação da acurácia dos classificadores utilizando um arquivo `.csv` gerado pelo notebook `Main.ipynb`.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC, SVR
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
import emlearn
import os


In [None]:
# Load the extracted-features CSV into a DataFrame.
data = pd.read_csv("../data/extracted_features/features_like_artigo.csv")

# Features: every column except the last one, with the 'Mean' and 'Median'
# columns removed.
X = data.iloc[:, :-1].drop(columns=['Mean', 'Median'])

# Target: the last column of the table.
y = data.iloc[:, -1]

## Train Desktop Models
---

+ Treinamento indireto de modelos semelhantes aos do artigo, utilizando a função `cross_val_score` do sklearn.

### Bagged Trees Ensemble

In [None]:
# Base learner: a single decision tree
base_classifier = DecisionTreeClassifier()

# Bagged Trees Ensemble built from 10 copies of the base learner
bagged_classifier = BaggingClassifier(estimator=base_classifier, n_estimators=10, random_state=42)

# Stratified 5-fold cross-validation. The shuffle is seeded so the reported
# accuracies are reproducible on every "Restart Kernel -> Run All".
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
bgtree_scores = cross_val_score(bagged_classifier, X, y, cv=skf)

# Per-fold accuracies and their mean
print("Acurácia em cada fold :", bgtree_scores)
print("Acurácia média        :", bgtree_scores.mean())

### Quadratic SVM

In [None]:
# Quadratic SVM: polynomial kernel of degree 2
svm_classifier = SVC(kernel='poly',degree=2,C=2)

# Stratified 5-fold cross-validation. The shuffle is seeded so the reported
# accuracies are reproducible on every "Restart Kernel -> Run All".
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
svm_cv_scores = cross_val_score(svm_classifier, X, y, cv=skf)

# Per-fold accuracies and their mean
print("Acurácia em cada fold :", svm_cv_scores)
print("Acurácia média        :", svm_cv_scores.mean())

### Fine Decision Tree

In [None]:
# Fine Decision Tree. The tree is seeded because scikit-learn permutes the
# candidate features at each split, so an unseeded tree can differ between runs.
fine_tree_classifier = DecisionTreeClassifier(random_state=42)

# Stratified 5-fold cross-validation. The shuffle is seeded so the reported
# accuracies are reproducible on every "Restart Kernel -> Run All".
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fdtree_cv_scores = cross_val_score(fine_tree_classifier, X, y, cv=skf)

# Per-fold accuracies and their mean
print("Acurácia em cada fold :", fdtree_cv_scores)
print("Acurácia média        :", fdtree_cv_scores.mean())

### Naïve Bayes

In [None]:
# Gaussian Naïve Bayes classifier
naive_bayes_classifier = GaussianNB()

# Stratified 5-fold cross-validation. The shuffle is seeded so the reported
# accuracies are reproducible on every "Restart Kernel -> Run All".
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
nb_cv_scores = cross_val_score(naive_bayes_classifier, X, y, cv=skf)

# Per-fold accuracies and their mean
print("Acurácia em cada fold :", nb_cv_scores)
print("Acurácia média        :", nb_cv_scores.mean())

### KNN

In [None]:
# K-nearest-neighbours classifier; the number of neighbours (3) can be tuned
knn_classifier = KNeighborsClassifier(n_neighbors=3)

# Stratified 5-fold cross-validation. The shuffle is seeded so the reported
# accuracies are reproducible on every "Restart Kernel -> Run All".
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
knn_cv_scores = cross_val_score(knn_classifier, X, y, cv=skf)

# Per-fold accuracies and their mean
print("Acurácia em cada fold :", knn_cv_scores)
print("Acurácia média        :", knn_cv_scores.mean())

In [None]:
# Summary: mean cross-validation accuracy of every desktop classifier.
# Each label carries its own padding so the values line up in one column.
summary_rows = (
    ("Bagged Trees Ensemble: ", bgtree_scores),
    ("Quadratic SVM:         ", svm_cv_scores),
    ("Fine Decision Tree :   ", fdtree_cv_scores),
    ("Naïve Bayes :          ", nb_cv_scores),
    ("KNeighbors (KNN) :     ", knn_cv_scores),
)
for label, fold_scores in summary_rows:
    print(f"{label}{fold_scores.mean()}")

## Training MCU Classifiers
---

In [None]:
import emlearn
from os import path
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import get_scorer

In [None]:
# Feature columns fed to the classifiers.
feature_columns = [
    'RMS',
    'Variance',
    'Skewness',
    'Kurtosis',
    'CrestFactor',
    'ShapeFactor',
    'ImpulseFactor',
    'MarginFactor',
    'Peak1',
    'Peak2',
    'Peak3',
    'PeakLocs1',
    'PeakLocs2',
    'PeakLocs3',
]

# The two columns used for 2-D plotting.
feature_columns_plt = ['Mean', 'Median']

# Name of the label column. NOTE (original author): this name is "wrong"
# because the CSV was generated with the wrong column name.
target_column = "FaultID"

In [None]:
# Hold out 20% of the rows for testing, stratified on the target column so both
# splits keep the same class proportions.
train_set, test_set = train_test_split(
    data, test_size=0.2, random_state=42, stratify=data[target_column]
)

# Use every column except the target as a model input.
# The list is built in CSV column order rather than through set(): set
# iteration order for strings changes between interpreter runs, which made the
# training column order (and therefore the input layout expected by the
# exported C models) non-reproducible.
# NOTE(review): this replaces the `feature_columns` list defined earlier and
# therefore includes 'Mean' and 'Median' as well -- confirm that is intended.
feature_columns = [column for column in data.columns if column != target_column]

# Optional sanity check of the class distribution in each split:
# test_set[target_column].value_counts() / len(test_set[target_column])
# train_set[target_column].value_counts() / len(train_set[target_column])

In [None]:
# Classifiers to train and export to C through emlearn, keyed by the name used
# for the generated files. Every stochastic estimator is seeded so repeated
# runs train the same models.
classifiers_emlearn = {
    'decision_tree': DecisionTreeClassifier(random_state=42),
    'random_forest': RandomForestClassifier(n_estimators=10, random_state=42),
    'extra_trees': ExtraTreesClassifier(n_estimators=10, random_state=42),
    'gaussian_naive_bayes': GaussianNB(),
    'knn' : KNeighborsClassifier(n_neighbors=3)
    # Disabled (also needs `sklearn.neural_network` to be imported first):
    # 'sklearn_mlp': sklearn.neural_network.MLPClassifier(hidden_layer_sizes=(10,10,), max_iter=1000, random_state=1),
}

In [None]:
def check_correctness(out_dir, model_filename, test_data, test_predictions, feature_columns, name=None):
    """Generate, compile and run a C program that checks an exported model.

    The program embeds the test samples and the expected class of each sample,
    includes the generated model header, calls `model_predict` on every sample
    and prints one line per sample plus a final ``Errors: <count>`` line.

    Parameters
    ----------
    out_dir : str
        Directory where ``test_<name>.c`` is written and the executable is built.
    model_filename : str
        Path of the generated model header; it is `#include`d by the test program.
    test_data : array-like
        Flattened feature values, one sample after another
        (``len(feature_columns)`` values per sample).
    test_predictions : array-like
        Expected integer class for each sample.
    feature_columns : sequence
        Feature names; only its length is used (number of features per sample).
    name : str, optional
        Prefix for the generated C identifiers and file names. When omitted it
        is derived from `model_filename` (``'<name>_model.h'`` -> ``'<name>'``).
        Previously this value was read from a notebook global that only existed
        while the training loop was running.

    Returns
    -------
    int
        Number of mismatching samples reported by the test program. If the
        program printed no ``Errors:`` line, its exit status is returned instead.
    """
    import re
    import subprocess

    if name is None:
        stem = os.path.splitext(os.path.basename(model_filename))[0]
        name = stem[:-len('_model')] if stem.endswith('_model') else stem

    test_res = np.array(test_predictions).flatten()

    # C declarations holding the test inputs, expected outputs and their sizes.
    test_dataset = "\n".join([
        emlearn.cgen.array_declare(f"{name}_testset_data", dtype='float', values=test_data),
        emlearn.cgen.array_declare(f"{name}_testset_results", dtype='int', values=test_res),
        emlearn.cgen.constant_declare(f'{name}_testset_features', val=len(feature_columns)),
        emlearn.cgen.constant_declare(f'{name}_testset_samples', val=len(test_predictions)),
    ])

    test_code = test_dataset + \
    f'''
    #include "{model_filename}" // emlearn generated model

    #include <stdio.h> // printf

    int
    {name}_test() {{
        const int n_features = {name}_testset_features;
        const int n_testcases = {name}_testset_samples;

        int errors = 0;

        for (int i=0; i<n_testcases; i++) {{
            const float *features = {name}_testset_data + (i*n_features);
            const int expect_result = {name}_testset_results[i*1];

            const int32_t out = model_predict(features, n_features);

            if (out != expect_result) {{
                printf(\"test-fail sample=%d expect=%d got=%d \\n\", i, expect_result, out);
                errors += 1;
            }}
            printf(\"test sample=%d expect=%d got=%d \\n\", i, expect_result, out);

        }}
        return errors;
    }}

    int
    main(int argc, const char *argv[])
    {{
        const int errors = {name}_test();
        printf(\"Errors: %d \\n\", errors);
        return errors;
    }}'''

    test_source_file = os.path.join(out_dir, f'test_{name}.c')
    with open(test_source_file, 'w') as f:
        f.write(test_code)

    print('Generated', test_source_file)
    print(f"Outdir: {out_dir}")
    include_dirs = [ emlearn.common.get_include_dir() ]
    test_executable = emlearn.common.compile_executable(
            test_source_file,
            out_dir,
            name=f'test_{name}',
            include_dirs=include_dirs
    )

    # The program returns the error count as its exit status, but an exit status
    # keeps only the low 8 bits on POSIX (256 errors would look like success).
    # Read the count from the printed "Errors: N" line instead and use the exit
    # status only as a fallback.
    completed = subprocess.run(test_executable, stdout=subprocess.PIPE, text=True)
    reported = re.findall(r'^\s*Errors:\s*(\d+)', completed.stdout or '', flags=re.MULTILINE)
    errors = int(reported[-1]) if reported else completed.returncode

    if errors == 0:
        print(f"C test for {name}: all {len(test_predictions)} samples matched")
    else:
        print(f"C test for {name}: {errors} errors (exit status {completed.returncode})")

    return errors

In [None]:
def plot_results(ax, model, X, y):
    """Draw the decision regions of `model` on `ax` and overlay the samples.

    `X` is indexed with `.iloc`; its first two columns give the scatter
    coordinates and `y` gives the colour of each point.
    """
    from sklearn.inspection import DecisionBoundaryDisplay

    # Classification regions, drawn semi-transparent underneath the points.
    boundary_options = dict(alpha=0.4, ax=ax, response_method="auto")
    DecisionBoundaryDisplay.from_estimator(model, X, **boundary_options)

    # Data points, coloured by label.
    first_feature = X.iloc[:, 0]
    second_feature = X.iloc[:, 1]
    ax.scatter(first_feature, second_feature, c=y, s=20, edgecolor="k")

In [None]:
def build_run_classifier(model, name, train, test):
    """Cross-validate, train, evaluate and export one classifier to C.

    Steps:
      1. Stratified 5-fold cross-validation on the training split.
      2. Fit `model` on the training split and score accuracy on the test split.
      3. Convert the fitted model with emlearn and save it as
         ``../data/c_models/<name>_model.h`` (relative to the working directory).

    Uses the notebook-level `feature_columns` and `target_column` to select the
    input and label columns of `train` and `test`.

    Parameters
    ----------
    model : scikit-learn estimator
        Classifier to evaluate; it is fitted in place.
    name : str
        Label used in log messages and in the generated header file name.
    train, test : pandas.DataFrame
        Training and hold-out splits containing the feature and target columns.

    Returns
    -------
    float
        Accuracy of the fitted model on `test`.
    """
    X_train, y_train = train[feature_columns], train[target_column]
    X_test, y_test = test[feature_columns], test[target_column]

    # Cross-validate on the training split with the same feature columns the
    # exported model is trained on. This used to run on notebook globals with a
    # different feature set, which also included the held-out test rows. The
    # shuffle is seeded for reproducibility.
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = cross_val_score(model, X_train, y_train, cv=skf)
    print(f"{name} - CV_SCORE : {cv_scores.mean()}")

    print(f"Len test[feature_columns]: {len(X_test)}")
    print(f'Len test[target_column]: {len(y_test)}')

    # Train the model and measure accuracy on the hold-out split
    model.fit(X_train, y_train)
    accuracy = get_scorer('accuracy')(model, X_test, y_test)

    # Convert the fitted model to C and save the generated header
    out_dir = os.path.join(os.getcwd(), '../data/c_models')
    os.makedirs(out_dir, exist_ok=True)

    model_filename = os.path.join(out_dir, f'{name}_model.h')

    print(f"Converting {name} model...")
    cmodel = emlearn.convert(model, dtype='float')

    print(f"Saving {name} model to file {model_filename}....")
    cmodel.save(file=model_filename, name='model')
    print(f"Model {name} saved!")

    # NOTE: on-host verification of the generated C code (check_correctness) is
    # currently disabled, so only the scikit-learn accuracy is reported.
    print(f"Tested {name} - accuracy: {accuracy}")

    return accuracy

In [None]:
# Train, evaluate and export every classifier registered in `classifiers_emlearn`,
# printing a separator line after each one.
for name in classifiers_emlearn:
    cls = classifiers_emlearn[name]
    print(f'Training {name}')
    build_run_classifier(cls, name, train_set, test_set)
    print("----------------------------------------------------------------")