#### <ins>Imports/Installs/Global Vars</ins>



##### Installing required packages (if missing)

In [5]:
# !pip install pandas
# !pip install matplotlib
# !pip install scikit-learn
# !pip install xgboost

##### Import required libs

In [6]:
import pandas as pd

import matplotlib.pyplot as pyplot

from sklearn import metrics
from sklearn.metrics import accuracy_score

from xgboost import XGBClassifier

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

##### Defining paths

In [7]:
datasetsPath = '../datasets/harus/'

##### Importing Dataset.csv

In [40]:
# Get a list of the features names for the dataset
# Features und deren Werte sind in verschiedenen txt-Dateien gespeichert
# Es ist daher notwendig die txt-Dateien separat auszulesen, aufzuarbeiten und zu einem Dataframe zusammenzufuegen
# Die Daten folgen dabei keiner gaengingen Formatierung
# features.txt teilt die einzelnen Eintraege nach Spalten
# Jede Spalte besteht aus einem Index und dem Namen des Features, getrennt durch ein Leerzeichen

with open(datasetsPath + 'UCI HAR Dataset/features.txt') as f:
    xNames = [] # List of column/feature names

    for line in f:                      # Reading each line
        parts = line.strip().split(' ') # Splitting each line by space

        if len(parts) > 1:              # If the line has more than 1 element
            label = parts[1]            # The second element is the label
            xNames.append(label)

data = pd.DataFrame(columns=xNames)     # Create  dataframe with columns named after features.txt

# Die Trainingsdaten sind ebenfalls in einer txt-Datei gespeichert
# Jede Reihe in einem Dataframe wird durch eine Zeile in der txt-Datei repraesentiert
# Die einzelnen Werte werden durch (eine variierende Anzahl an ) Leerzeichen getrennt
# Beim Einlesen werden die Zahlenwerte zudem als Strings erkannt und muessen noch gecastet werden

# with open(datasetsPath + 'UCI HAR Dataset/train/X_train.txt') as f:
#     content = f.read()
#     new_content = content.replace('  ', ' ')        # Remove double spaces

# with open(datasetsPath + 'UCI HAR Dataset/train/X_train_new.txt', 'w') as f:
#     f.write(new_content)                            # Write to new file     

with open(datasetsPath + 'UCI HAR Dataset/train/X_train_new.txt', 'r') as f:
    for line in f:
        list = line.strip().split(' ')
        new_list = [float(i) for i in list]
        print(f'Value: {new_list[0]} | Type: {type(new_list[0])}')
        data.loc[len(data)] = new_list

print(data.head())

Value: 0.28858451 | Type: <class 'float'>
Value: 0.27841883 | Type: <class 'float'>
Value: 0.27965306 | Type: <class 'float'>
Value: 0.27917394 | Type: <class 'float'>
Value: 0.27662877 | Type: <class 'float'>
Value: 0.27719877 | Type: <class 'float'>
Value: 0.27945388 | Type: <class 'float'>
Value: 0.27743247 | Type: <class 'float'>
Value: 0.27729342 | Type: <class 'float'>
Value: 0.28058569 | Type: <class 'float'>
Value: 0.27688027 | Type: <class 'float'>
Value: 0.27622817 | Type: <class 'float'>
Value: 0.278457 | Type: <class 'float'>
Value: 0.27717497 | Type: <class 'float'>
Value: 0.29794572 | Type: <class 'float'>
Value: 0.27920345 | Type: <class 'float'>
Value: 0.27903836 | Type: <class 'float'>
Value: 0.2801349 | Type: <class 'float'>
Value: 0.27773106 | Type: <class 'float'>
Value: 0.27556818 | Type: <class 'float'>
Value: 0.27756171 | Type: <class 'float'>
Value: 0.27715238 | Type: <class 'float'>
Value: 0.2756763 | Type: <class 'float'>
Value: 0.2792002 | Type: <class 'float

##### Inspecting The Dataset

In [None]:
def printSummaryStatistics():   # Prints statistical for each column in the dataframe
    dataCols = data.columns.to_list()
    for col in dataCols:
        print(f"Column: {col} \n{data[col].describe()} \nData Type: {data[col].dtype}\n")

print(f'{data.head()}\n')       # Looking into basic structure
printSummaryStatistics()


#### <ins>Data Preprocessing and Training</ins>

##### Initializing Encoders

In [None]:
leGender = LabelEncoder()       # Female, Male
leHML = LabelEncoder()          # High, Moderate, Low
leChestPain = LabelEncoder()    # Non-anginal, Asymptomatic, Typical, Atypical
leThalassemia = LabelEncoder()  # Normal, Fixed Defect, Reversible Defect
leECG = LabelEncoder()          # Normal, ST-T abnormality, Left ventricular hypertrophy

##### Encoding Categorical Columns

In [None]:
data['Gender'] = leGender.fit_transform(data['Gender'])

data['Physical_Activity_Level'] = leHML.fit_transform(data['Physical_Activity_Level'])
data['Stress_Level'] = leHML.fit_transform(data['Stress_Level'])
data['Heart_Attack_Risk'] = leHML.fit_transform(data['Heart_Attack_Risk'])

data['Chest_Pain_Type'] = leChestPain.fit_transform(data['Chest_Pain_Type'])
data['Thalassemia'] = leThalassemia.fit_transform(data['Thalassemia'])
data['ECG_Results'] = leECG.fit_transform(data['ECG_Results'])

##### Forming Dataset into Training, Test, Eval, Features and Labels

In [None]:
x_data = data.iloc[:,:19]       # Features
y_data = data.iloc[:,19:]       # Labels

xtrain, xtest, ytrain, ytest = train_test_split(
    x_data, 
    y_data, 
    test_size=0.2,
    random_state=0
)

evaldata=[(xtrain,ytrain),(xtest,ytest)]          # Datensatz zur Evaluierung

##### Training and Improving<br>
Um eine gute Anzahl an Estimators zu bestimmen, wird zuerst ein Modell mithilfe von Early Stopping, sowie einer großen Menge an Estimatoren trainiert. Hiermit wird die beste Anzahl an Iterationen ermittelt und mit dieser Anzahl ein weiteres Modell trainiert.

In [None]:
donor = XGBClassifier(              # "Spendermodell"
    objective='multi:softmax',      # Multi-Klassifizierung
    num_class=3,
    learning_rate=0.1,
    n_estimators=10000,             # "Große Anzahl an Schaetzern, die nicht erreicht werden soll"
    early_stopping_rounds=50,       # Anzahl an Runden, bei denen sich das Modell nicht verbessern muss, bis abgebrochen wird
    # max_depth=3                   # Erstmal weglassen
)

donor.fit( # Donor model
    xtrain, 
    ytrain, 
    eval_set=evaldata, 
    verbose=False
)

bIter = donor.best_iteration        # Beste Anzahl an Estimatoren

model = XGBClassifier(
    objective='multi:softmax',  # Specify the multi-class classification task
    num_class=3,                # Number of classes (Low, Moderate, High)
    learning_rate=0.1,          # Learning rate for the model
    n_estimators=bIter,         # Number of boosting rounds (iterations)
    num_parallel_tree=1         # m2c workaround
    # max_depth=3,              # Maximum depth of the trees
)

model.fit( # Final model
    xtrain, 
    ytrain, 
    eval_set=evaldata, 
    verbose=False
)

print(model.classes_)

yhat = model.predict(xtest)

#### <ins>Func Definitions</ins>

##### Performance Metrics and Evaluation

In [None]:
def printConfusionMatrix(): # Confusion Matrix
    metrics.ConfusionMatrixDisplay.from_estimator(model, xtest, ytest, cmap='Blues')
    pyplot.show()

def plotLossCurves():       # Loss Curves
    # save evaluation results
    results = model.evals_result()
    # plot curves
    lossValue = list(results['validation_1'])[0]
    pyplot.plot(results['validation_0'][lossValue], label='train')
    pyplot.plot(results['validation_1'][lossValue], label='train')
    # show the legend
    pyplot.xlabel('Iterations')
    pyplot.ylabel('Log Loss')
    pyplot.legend()
    # show the plot
    pyplot.show()

def printClassReport():     # Classification Report
    # Report
    print(metrics.classification_report(ytest, yhat, digits = 3))

def printMisc():            # Best Iter, Test Accuracy, Base Score, Probas,
    # Misc
    print(f'# Trees / Best Iteration: \t{bIter}')
    print(f'Test Accuracy: \t{accuracy_score(ytest, yhat)}')
    print(f'Base_Score{model.base_score}')
    print(f'\nPredict_Proba Return: \n{model.predict_proba(xtest)}')

##### Porting this Bitch

In [None]:
def portToC(model):
    import m2cgen as m2c

    with open('../exported_models/currentExport.c','w') as f:
        code = m2c.export_to_c(model)
        f.write(code)
        

##### Generating Code for Lazy People

In [None]:
def genInfer(start=0, size=500, csv=True, float=True):
    start = start
    size = size
    length = 2
    
    # Declaring function
    print(f'void infer(int time, int csv) {{')

    # Printing Header
    print(f'\tif(csv==1){{')
    print(f'\t\tSerial.println("aScore0,aScore1");        // Printing header to name columns in csv')
    print(f'\t}} else {{')
    print(f'\t\tSerial.println("Start: {start} | End: {start+size}");    // Printing Range:')
    print(f'\t}}')

    print(f'\t// Declarations:')
    print(f'\tint length = {length};')
    if float == True:
        print(f'\tfloat result[length];')
    else:
        print(f'\tdouble result[length];')

    print(f'\t// Model Inference')
    for x in range(start,(start+size)):  
        
        if float == True:
            print(f'\tfloat x_{x}[] = {{' , end="")    
        else:
            print(f'\tdouble x_{x}[] = {{' , end="")

        features = xtest.values[x]

        for i in range(len(features)):
            if i < (len(features)-1):
                print(features[i], end=", ")
            else:
                print(features[i], end="};\n")

        if csv == False:
            print(f'\tint y_{x} = {yhat[x]};')

        print(f'\tscore(x_{x}, result);')
        
        if csv == True:
            print(f'\tprintScoreCSV(result, length);')
        else:
            print(f'\tprintScoreCompare(result, length, y_{x});')
        
        print(f'\tdelay(time);\n')
    print(f'}}')

##### <ins>Generating Inference Data</ins>

In [None]:
def generateProbDF(localCapture=model,features_test=xtest):
    xtestlist = localCapture.predict_proba(features_test).tolist()
    list1 = []
    list2 = []
    list3 = []

    for x in xtestlist:
        list1.append(round(x[0],4))
        list2.append(round(x[1],4))
        list3.append(round(x[2],4))

    probDF = pd.DataFrame({
        'Label': localCapture.predict(features_test),
        'Prob0': list1,
        'Prob1': list2,
        'Prob2': list3
    })
    return(probDF)

def exportProbDF(probDF = generateProbDF()):
    probDF.to_csv(datasetsPath + 'baseCapture.csv')

def importInoCapture():
    serial = pd.read_csv(datasetsPath + 'inoCapture.csv')
    serial = serial.truncate(after=(len(serial)-2)) # get rid of ##### REPEATING... #####
    return(serial)

def generateComparison(probDF=generateProbDF(),inoCapture=importInoCapture()):
    probDF = probDF.truncate(after=(len(inoCapture)-1))
    probDF = probDF.join(inoCapture)
    probDF = probDF[['Label', 'Prob0', 'aScore0', 'Prob1', 'aScore1', 'Prob2', 'aScore2']] # Rearranging columns
    probDF.to_csv(datasetsPath + 'compareCaptures.csv')

#### <ins>Main</ins>

##### Evaluation Metrics

In [None]:
printConfusionMatrix()
plotLossCurves()
printClassReport()
printMisc()

##### Code Generation

In [None]:
# genInfer(start=0, size=100, csv=True, float=False)

##### Generate C-Port

In [None]:
# portToC(model)

##### Generate Inference Data and Comparison

In [None]:
# generateProbDF()      # params: localCapture (model to capture data from), features_test
# exportProbDF()        # params: probDF (probDF export)
# importInoCapture()    
# generateComparison()  # params: probDF, inoCapture