### <ins>Imports/Installs</ins>


#### Required package installs

In [None]:
# !pip install pandas
# !pip install matplotlib
# !pip install scikit-learn
# !pip install xgboost
# !pip install m2cgen

#### Import required libs

In [None]:
import pandas as pd

import matplotlib.pyplot as pyplot

from sklearn import metrics
from sklearn.metrics import accuracy_score

from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

#### Defining paths

In [None]:
# Directory (relative to the notebook) that the CSV files below are read from and written to.
datasetsPath = '../datasets/gyro/'

#### Importing Dataset.csv

In [None]:
# Load the raw gyro recording from the datasets directory.
gyro = pd.read_csv(f'{datasetsPath}gyro_mobile.csv')

#### Inspecting The Dataset

In [None]:
def printSummaryStatistics():
    """Print the describe() summary and the dtype of every column of the global `gyro` frame."""
    for name, series in gyro.items():
        print(f"Column: {name} \n{series.describe()} \nData Type: {series.dtype}\n")


Insights:
- 31991 data points
- Every feature is continuous
- Activity is either 1 or 0 (binary classification)
- Dataset contains a timestamp that might be dropped

### <ins>Data Preprocessing and Training</ins>

#### Dropping timestamp and splitting data into Training, Testing and Eval

In [None]:
# errors='ignore' keeps this cell idempotent: on a re-run the column is already gone
# and a plain drop() would raise a KeyError.
gyro = gyro.drop(columns='timestamp', errors='ignore')

# assumes the first six remaining columns are the features and everything after
# them is the label — TODO confirm against the CSV layout
xtrain, xtest, ytrain, ytest = train_test_split(
    gyro.iloc[:,:6],
    gyro.iloc[:,6:],
    test_size=0.2,
    random_state=0                                # fixed seed for a reproducible split
)

evaldata=[(xtrain,ytrain),(xtest,ytest)]          # evaluation sets passed to fit(eval_set=...)

#### Training and Improving<br>
Um eine gute Anzahl an Estimatoren zu bestimmen, wird zuerst ein Modell mithilfe von Early Stopping sowie einer großen Anzahl an Estimatoren trainiert. Hiermit wird die beste Anzahl an Iterationen ermittelt und mit dieser Anzahl ein weiteres Modell trainiert.

In [None]:
preModel = XGBClassifier(           # "donor" model, only used to find a good number of trees
    objective='binary:logistic',
    n_estimators=10000,             # deliberately large upper bound that early stopping should cut short
    early_stopping_rounds=20,       # stop after this many rounds without improvement
    max_depth=2,
    learning_rate=0.1
)

# NOTE(review): XGBoost uses the last entry of eval_set (here the test split) for
# early stopping, so the test data influences the chosen number of trees.
preModel.fit(
    xtrain,
    ytrain,
    eval_set=evaldata,
    verbose=False
)

bIter = preModel.best_iteration     # 0-based index of the best boosting round
nTrees = bIter + 1                  # best_iteration is 0-based, so the best model has bIter + 1 trees

model = XGBClassifier(
    objective='binary:logistic',
    # tree_method = 'exact',
    n_estimators=nTrees,
    max_depth=2,
    learning_rate=0.1,
    base_score=0.5
)

model.fit(
    xtrain,
    ytrain,
    eval_set=evaldata,              # kept so evals_result() is populated for the loss-curve plots
    verbose=False
)



### <ins>Func Definitions</ins>

#### Performance Metrics and Evaluation

In [None]:
def printConfusionMatrix():
    """Draw the confusion matrix of the global `model` on the test split and show it."""
    metrics.ConfusionMatrixDisplay.from_estimator(
        estimator=model,
        X=xtest,
        y=ytest,
        cmap='Blues',
    )
    pyplot.show()

def plotLossCurves():       # Loss Curves
    """Plot the per-iteration evaluation metric of the global `model` for both eval sets.

    'validation_0' and 'validation_1' are the names XGBoost gives the first and
    second entry of the eval_set passed to fit(); in this notebook that list is
    [(xtrain, ytrain), (xtest, ytest)], i.e. train first and test second.
    """
    # save evaluation results
    results = model.evals_result()
    # name of the first metric recorded for the second eval set
    lossValue = list(results['validation_1'])[0]
    # plot curves
    pyplot.plot(results['validation_0'][lossValue], label='train')
    pyplot.plot(results['validation_1'][lossValue], label='test')   # was mislabelled 'train'
    # axis labels and legend
    pyplot.xlabel('Iterations')
    pyplot.ylabel('Log Loss')
    pyplot.legend()
    # show the plot
    pyplot.show()

def printClassReport():     # Classification Report
    """Print scikit-learn's classification report for the global `model` on the test split."""
    # Predictions are computed here so the function does not depend on a
    # notebook-level `yhat` having been created beforehand.
    yhat = model.predict(xtest)
    # Report
    print(metrics.classification_report(ytest, yhat, digits = 3))

def printMisc():            # Best Iter, Test Accuracy, Base Score, Probas,
    """Print the best iteration, test accuracy, base score and predicted probabilities.

    Reads the notebook-level names `bIter`, `model`, `xtest` and `ytest`.
    """
    # Predictions are computed here so the function does not depend on a
    # notebook-level `yhat` having been created beforehand.
    yhat = model.predict(xtest)
    # Misc
    print(f'# Trees / Best Iteration: \t{bIter}')
    print(f'Test Accuracy: \t{accuracy_score(ytest, yhat)}')
    print(f'Base_Score{model.base_score}')
    print(f'\nPredict_Proba Return: \n{model.predict_proba(xtest)}')

def dynLossCurves(model):
    """Plot one curve per evaluation set recorded by `model`.

    The metric plotted is the first one recorded for the first evaluation set;
    its name is also used as the y-axis label. Each curve is labelled with the
    name of its evaluation set.
    """
    history = model.evals_result()
    setNames = list(history)
    metricName = list(history[setNames[0]])[0]

    for setName in setNames:
        pyplot.plot(history[setName][metricName], label=setName)

    pyplot.xlabel('Iterations')
    pyplot.ylabel(metricName)
    pyplot.legend()
    pyplot.show()

# Plot the evaluation curves of the final model as soon as this cell runs.
dynLossCurves(model)

#### Porting the Model to C

In [None]:
def portToC(model):
    """Export `model` as C source code to '../exports/gyroExport.c' using m2cgen.

    Parameters:
        model: the trained estimator handed to m2cgen's export_to_c().
    """
    # Imported inside the function, so m2cgen is only needed when an export is run.
    import m2cgen as m2c

    # Generate the code before opening the file: opening with 'w' truncates it,
    # so a failing export would otherwise wipe the previous export.
    code = m2c.export_to_c(model)

    with open('../exports/gyroExport.c','w') as f:
        f.write(code)

    # NOTE(review): the path in this message is hard-coded and differs from the
    # relative path written above — presumably it is relative to the repo root.
    print('Model exported to: "arduino-xgboost/exports/gyroExport.c"')

#### genInfer(): Generating Code for Lazy People

In [None]:
def genInfer(start=0, size=500, csv=True, float=True):
    """Print the C source of an `infer(int time, int csv)` function to stdout.

    For every test sample in the positional range [start, start + size) the
    generated code declares the feature array, the label predicted by the
    global `model`, a call to `score(...)` and a call to a print helper.

    Parameters:
        start: position of the first row of the global `xtest` to emit.
        size:  number of consecutive rows to emit.
        csv:   if True the generated code calls printScoreCSV, otherwise
               printScoreCompare.
        float: if True the C arrays are declared as `float`, otherwise as
               `double`. (The name shadows the builtin inside this function.)

    Raises:
        IndexError: if start + size exceeds the number of rows in `xtest`.
    """
    length = 2                                          # size of the C result array
    ctype = 'float' if float == True else 'double'      # C type for features and result
    printer = 'printScoreCSV' if csv == True else 'printScoreCompare'

    # Predictions are computed here so the function does not depend on a
    # notebook-level `yhat` having been created beforehand.
    yhat = model.predict(xtest)
    samples = xtest.values

    # Declaring function
    print('void infer(int time, int csv) {')

    # Printing Header
    print('\tif(csv==1){')
    print('\t\tSerial.println("aScore0,aScore1");        // Printing header to name columns in csv')
    print('\t} else {')
    print(f'\t\tSerial.println("Start: {start} | End: {start+size}");    // Printing Range:')
    print('\t}')

    print('\t// Declarations:')
    print(f'\tint length = {length};')
    print(f'\t{ctype} result[length];')

    print('\t// Model Inference')
    for x in range(start, start + size):
        # str() matches what print() emitted for each element
        values = ", ".join(str(v) for v in samples[x])
        print(f'\t{ctype} x_{x}[] = {{{values}}};')
        print(f'\tint y_{x} = {yhat[x]};')
        print(f'\tscore(x_{x}, result);')
        print(f'\t{printer}(result, length, y_{x});')
        print('\tdelay(time);\n')
    print('}')

#### Generating Inference Data

In [None]:
def generateProbDF(localCapture=None, features_test=None):
    """Build a DataFrame with predicted labels and rounded class probabilities.

    Parameters:
        localCapture:  model to capture data from; must provide predict() and
                       predict_proba(). Defaults to the notebook-level `model`.
        features_test: features to run inference on. Defaults to the
                       notebook-level `xtest`.

    Returns:
        DataFrame with the columns 'Label', 'Prob0' and 'Prob1'; the two
        probability columns are rounded to four decimals.
    """
    # Defaults are resolved at call time, so a retrained `model` or a new split
    # is picked up without re-running this cell.
    if localCapture is None:
        localCapture = model
    if features_test is None:
        features_test = xtest

    probabilities = localCapture.predict_proba(features_test).tolist()

    probDF = pd.DataFrame({
        'Label': localCapture.predict(features_test),
        'Prob0': [round(row[0], 4) for row in probabilities],
        'Prob1': [round(row[1], 4) for row in probabilities]
    })
    return probDF

def exportProbDF(probDF=None):
    """Write `probDF` to 'baseCapture.csv' inside `datasetsPath`.

    Parameters:
        probDF: frame to export. When omitted, generateProbDF() is called at
                call time (not when this function is defined).
    """
    if probDF is None:
        probDF = generateProbDF()
    probDF.to_csv(datasetsPath + 'baseCapture.csv')

def importInoCapture():
    """Read 'inoCapture.csv' from `datasetsPath` and return it without its last row."""
    capture = pd.read_csv(datasetsPath + 'inoCapture.csv')
    # get rid of the trailing "##### REPEATING... #####" line
    lastKept = len(capture) - 2
    return capture.truncate(after=lastKept)

def generateComparison(probDF=None, inoCapture=None):
    """Join the model's probabilities with a captured run and write the result to CSV.

    Parameters:
        probDF:     frame of model predictions. When omitted, generateProbDF()
                    is called at call time.
        inoCapture: frame of captured output. When omitted, importInoCapture()
                    is called at call time.

    The joined frame is written to 'compareCaptures.csv' inside `datasetsPath`.
    """
    # Defaults are resolved lazily: as default-argument expressions they ran when
    # the function was defined, so the cell failed if 'inoCapture.csv' was missing.
    if probDF is None:
        probDF = generateProbDF()
    if inoCapture is None:
        inoCapture = importInoCapture()

    # Cut the predictions down to the number of captured rows before joining.
    trimmed = probDF.truncate(after=(len(inoCapture)-1))
    comparison = trimmed.join(inoCapture)
    comparison.to_csv(datasetsPath + 'compareCaptures.csv')

### <ins>Main</ins>

#### Inspecting Dataset

In [None]:
# print(f'{gyro.head()}\n')       # Looking into basic structure
# printSummaryStatistics()

#### Evaluation Metrics

In [None]:
# printConfusionMatrix()
# plotLossCurves()
# printClassReport()
# printMisc()

#### Code Generation

In [None]:
# genInfer(start=0, size=500, csv=True, float=False)

#### Generate C-Port

In [None]:
# Export the final model as C source (writes '../exports/gyroExport.c').
portToC(model)

#### Generate Inference Data and Comparison

In [None]:
# generateProbDF()      # params: localCapture (model to capture data from), features_test
# exportProbDF()        # params: probDF (probDF export)
# importInoCapture()    
# generateComparison()  # params: probDF, inoCapture