# `emlearn`

In [None]:
import os.path
import os
import emlearn
import numpy
import pandas
import matplotlib.pyplot as plt

# Work out the directory that generated files should be placed under, and
# publish the emlearn include directory through the environment.
try:
    # Plain .py execution: __file__ exists, so anchor on the script location.
    os.environ.update({
        "EMLEARN_INCLUDE_DIR": emlearn.includedir,
        "LIBRARY_PATH": emlearn.includedir,
    })
    print(os.environ.get("LIBRARY_PATH"))
    here = os.path.dirname(__file__)
except NameError:
    # Jupyter notebook / Sphinx Gallery: no __file__, use the working directory.
    here = os.getcwd()

## Create dataset

In [None]:
import pandas as pd

def load_dataset(path="../data/extracted_features/features_like_artigo.csv"):
    """Load the extracted-features table without the 'Mean' and 'Median' columns.

    Parameters
    ----------
    path : str, path-like or file-like, optional
        Location of the CSV file, forwarded to ``pandas.read_csv``.
        Defaults to the features file this example was written for; a
        relative path is resolved against the current working directory.

    Returns
    -------
    pandas.DataFrame
        Every column of the CSV except 'Mean' and 'Median'.

    Raises
    ------
    KeyError
        If the CSV has no 'Mean' or 'Median' column.
    """
    data = pd.read_csv(path)
    # Remove the mean and median columns
    return data.drop(columns=['Mean', 'Median'])

def get_data(path="../data/extracted_features/features_like_artigo.csv"):
    """Load the extracted-features CSV and split it into features and labels.

    Parameters
    ----------
    path : str, path-like or file-like, optional
        Location of the CSV file, forwarded to ``pandas.read_csv``.
        Defaults to the features file this example was written for; a
        relative path is resolved against the current working directory.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        ``X`` holds every column except the last one, with 'Mean' and
        'Median' removed; ``y`` is the last column of the file.

    Raises
    ------
    KeyError
        If 'Mean' or 'Median' is not among the feature columns.
    """
    data = pd.read_csv(path)
    # All columns except the last one, minus the mean and median columns
    X = data.iloc[:, 0:-1].drop(columns=['Mean', 'Median'])
    # Last column
    y = data.iloc[:, -1]

    return X, y

# Load the data once at module level: a features/labels pair and the combined
# table. build_run_classifier() reads the module-level `dataset`.
X_data, y_data = get_data()
dataset = load_dataset()

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
# Number of labelled samples in the file
print(len(y_data))
# 70/30 train/test split with a fixed seed so the partition is repeatable
train, test  = train_test_split(dataset, test_size=0.3, random_state=42)
# Sanity check: test rows + train rows, to compare with the count printed above.
# NOTE(review): 'FalutID' looks like a misspelling of 'FaultID'; presumably it
# matches the CSV header exactly - confirm before renaming.
print(len(test['FalutID']) + len(train['RMS']))

## Correctness checking

In [None]:
def check_correctness(out_dir, model_filename, test_data, test_predictions, feature_columns, name=None):
    """Generate, compile and run a C program that checks a converted model.

    A C source file is written to ``out_dir`` containing the test samples,
    the expected result for each sample, and a ``main()`` that calls
    ``model_predict()`` from the header at ``model_filename`` on every
    sample and counts mismatches.

    Parameters
    ----------
    out_dir : str
        Directory that receives ``test_<name>.c`` and the compiled executable.
    model_filename : str
        Path of the generated model header; it is ``#include``d by the test.
    test_data : sequence of float
        Flattened feature values, ``len(feature_columns)`` values per sample.
    test_predictions : sequence
        Expected result for each sample.
    feature_columns : sequence
        Feature names; only the length is used.
    name : str, optional
        Prefix for the generated C identifiers and file names. When omitted
        it is derived from ``model_filename`` by dropping the extension and
        a trailing ``_model``. Earlier versions read a global ``name`` here,
        which only existed while the driver loop was running.

    Returns
    -------
    int
        Number of mismatching samples reported by the test program. If the
        program prints no ``Errors:`` line, its exit status is returned.
    """
    import re
    import subprocess

    if name is None:
        stem = os.path.splitext(os.path.basename(model_filename))[0]
        suffix = '_model'
        has_suffix = stem.endswith(suffix) and len(stem) > len(suffix)
        name = stem[:-len(suffix)] if has_suffix else stem

    test_res = numpy.array(test_predictions).flatten()

    test_dataset = "\n".join([
        emlearn.cgen.array_declare(f"{name}_testset_data", dtype='float', values=test_data),
        emlearn.cgen.array_declare(f"{name}_testset_results", dtype='int', values=test_res),
        emlearn.cgen.constant_declare(f'{name}_testset_features', val=len(feature_columns)),
        emlearn.cgen.constant_declare(f'{name}_testset_samples', val=len(test_predictions)),
    ])

    test_code = test_dataset + f'''
    #include "{model_filename}" // emlearn generated model

    #include <stdio.h> // printf

    int
    {name}_test() {{
        const int n_features = {name}_testset_features;
        const int n_testcases = {name}_testset_samples;

        int errors = 0;

        for (int i=0; i<n_testcases; i++) {{
            const float *features = {name}_testset_data + (i*n_features);
            const int expect_result = {name}_testset_results[i*1];

            const int32_t out = model_predict(features, n_features);

            if (out != expect_result) {{
                printf(\"test-fail sample=%d expect=%d got=%d \\n\", i, expect_result, out);
                errors += 1;
            }}
            printf(\"test sample=%d expect=%d got=%d \\n\", i, expect_result, out);

        }}
        return errors;
    }}

    int
    main(int argc, const char *argv[])
    {{
        const int errors = {name}_test();
        printf(\"Errors: %d \\n\", errors);
        return errors;
    }}'''

    test_source_file = os.path.join(out_dir, f'test_{name}.c')
    with open(test_source_file, 'w') as f:
        f.write(test_code)

    print('Generated', test_source_file)
    print(f"Outdir: {out_dir}")
    include_dirs = [ emlearn.common.get_include_dir() ]
    test_executable = emlearn.common.compile_executable(
            test_source_file,
            out_dir,
            name=f'test_{name}',
            include_dirs=include_dirs
    )

    # A non-zero exit status makes check_output raise; the captured stdout is
    # still available on the exception.
    try:
        output = subprocess.check_output(test_executable)
        returncode = 0
    except subprocess.CalledProcessError as e:
        output = e.output
        returncode = e.returncode

    # The program returns its error count from main(), but a process exit
    # status keeps only the low 8 bits on POSIX, so 256 failures would look
    # like success. Prefer the exact count from the "Errors: N" line.
    reported = re.search(rb'Errors:\s*(\d+)', output or b'')
    errors = int(reported.group(1)) if reported else returncode
    print(f"Test program exit status {returncode}, errors {errors}")

    return errors

## Plotting tools

In [None]:
def plot_results(ax, model, X, y):
    """Draw the decision regions of ``model`` on ``ax`` and overlay the samples.

    The first two columns of ``X`` are used as the scatter coordinates and
    ``y`` supplies the point colours.
    """
    from sklearn.inspection import DecisionBoundaryDisplay

    # Classification boundaries, drawn semi-transparent underneath the points
    DecisionBoundaryDisplay.from_estimator(
        model,
        X,
        ax=ax,
        alpha=0.4,
        response_method="auto",
    )

    # Data points
    first_feature = X.iloc[:, 0]
    second_feature = X.iloc[:, 1]
    ax.scatter(first_feature, second_feature, c=y, s=20, edgecolor="k")

## Train, convert and run model

In [None]:
def build_run_classifier(model, name):
    """Train ``model``, convert it to C with emlearn and test the generated code.

    Reads the module-level ``dataset`` (feature table including the target
    column) and ``here`` (base directory for generated files). The model
    header is written to ``<here>/classifiers/<name>_model.h``, and results
    are printed rather than returned.

    Parameters
    ----------
    model : estimator
        Unfitted classifier exposing ``fit``; it is fitted in place.
    name : str
        Label used in log output and in the generated file names.

    Returns
    -------
    None
    """
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import get_scorer

    target_column = "FalutID"

    # Use every column except the target, in the DataFrame's own order.
    # Building this list through set() gave an order that depends on string
    # hash randomisation, so feature order (and with it the seeded ensembles
    # and the generated C test data) could change from one run to the next.
    feature_columns = [column for column in dataset.columns if column != target_column]

    # Train model
    train, test = train_test_split(dataset, test_size=0.3, random_state=42)

    # Perform cross-validation with k = 5
    # skf = StratifiedKFold(n_splits=5, shuffle=True)
    # cv_scores = cross_val_score(model, X_data, y_data, cv=skf)

    print(f"Len test[feature_columns]: {len(test['RMS'])}")
    print(f'Len test[target_column]: {len(test[target_column])}')

    model.fit(train[feature_columns], train[target_column])
    accuracy = get_scorer('accuracy')(model, test[feature_columns], test[target_column])

    # Convert model
    out_dir = os.path.join(here, 'classifiers')
    os.makedirs(out_dir, exist_ok=True)

    model_filename = os.path.join(out_dir, f'{name}_model.h')

    print(f"Converting {name} model...")
    cmodel = emlearn.convert(model)

    print(f"Saving {name} model to file {model_filename}....")
    cmodel.save(file=model_filename, name='model')
    print(f"Model {name} saved!")

    # Test converted model
    # The expected values given to the C test are the true labels shifted
    # down by one, not the Python model's own predictions, so the reported
    # error count measures misclassifications of the C model.
    # NOTE(review): presumably labels start at 1 and the generated model
    # returns zero-based class indices - confirm.
    # test_pred = cmodel.predict(test[feature_columns].values)
    test_pred = test[target_column] - 1
    print(type(test_pred))

    # Generate a test dataset: one flat row-major array of feature values
    test_data = numpy.array(test[feature_columns]).flatten()
    print(type(test_data))

    errors = check_correctness(out_dir, model_filename, test_data, test_pred, feature_columns)

    print(f"Tested {name}: {errors} errors  {1 - (errors/len(test[target_column]))}")
    print(f"Tested {name} - accuracy: {accuracy}")

    #plot_results(ax, model, test[feature_columns], test[target_column])

    return

## Classifiers to compare

In [None]:
# from sklearn.ensemble import BaggingClassifier
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.svm import SVC, SVR
# from sklearn.naive_bayes import GaussianNB
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.neural_network import MLPClassifier

# classifiers = {
#     'bagging_tree': BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=10, random_state=42),
#     'quadratic_svm': SVC(kernel='poly', degree=2, C=2),
#     'decision_tree': DecisionTreeClassifier(),
#     'gaussian_naive_bayes': GaussianNB(),
#     'knn_classifier': KNeighborsClassifier(n_neighbors=5),
#     'sklearn_mlp': MLPClassifier(hidden_layer_sizes=(10,10,), max_iter=1000, random_state=42),
# }

import sklearn.ensemble
import sklearn.tree
import sklearn.neural_network
import sklearn.naive_bayes

# Models to train and convert. Each key is passed to build_run_classifier()
# as the model name, where it appears in log output and generated file names.
# The ensemble models use a fixed random_state so repeated runs are comparable.
classifiers = {
    'decision_tree': sklearn.tree.DecisionTreeClassifier(),
    'random_forest': sklearn.ensemble.RandomForestClassifier(n_estimators=10, random_state=42),
    'extra_trees': sklearn.ensemble.ExtraTreesClassifier(n_estimators=10, random_state=42),
    'gaussian_naive_bayes': sklearn.naive_bayes.GaussianNB(),
    # 'sklearn_mlp': sklearn.neural_network.MLPClassifier(hidden_layer_sizes=(10,10,), max_iter=1000, random_state=1),
}

## Run all classifiers

In [None]:
# Train, convert and test every configured classifier in turn.
# The loop variables are module-level names; keep `name` spelled as is.
for (name, cls) in classifiers.items():
    print(f'Training {name}')
    build_run_classifier(cls, name)
    # Visual separator between the log output of consecutive models
    print("----------------------------------------------------------------")