# Import Libs

In [None]:
import pandas as pd
from glob import glob
import matplotlib as mlt
from IPython.display import display
import matplotlib.pyplot as plt
import numpy as np
import scipy
from scipy.signal import butter, lfilter, filtfilt,argrelextrema
import scipy.stats as stats
import re
import copy
import math
import itertools
import seaborn as sns
from sklearn.metrics import accuracy_score,confusion_matrix,mean_absolute_error
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC,LinearSVC
from sklearn.neighbors import KNeighborsClassifier,LocalOutlierFactor
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.cluster import KMeans


plt.style.use("fivethirtyeight")
plt.rcParams["figure.figsize"]=(20,5)
plt.rcParams["figure.dpi"]=100
plt.rcParams["lines.linewidth"]=2

# Custom Modules

In [None]:
def count(df,cutoff=0.4,order=10,column="Accelerometer_r"):
    """Count and plot the local maxima of the low-pass filtered ``column``.

    The signal is low-pass filtered, every strict local maximum of the
    filtered signal is treated as one repetition, and a figure with the
    filtered signal and the detected peaks is shown.

    NOTE(review): ``LowPass`` and ``fs`` are read from module scope and are
    not defined in the visible part of this file; presumably ``LowPass`` is a
    ``LowPassFilter`` instance and ``fs`` the sampling frequency. Confirm.

    Args:
        df (pd.DataFrame): Data of one set; must contain ``column`` plus the
            ``Label`` and ``Category`` columns used for the plot title.
        cutoff (float): Cutoff frequency passed to the low-pass filter.
        order (int): Filter order passed to the low-pass filter.
        column (str): Name of the column whose peaks are counted.

    Returns:
        int: Number of detected peaks.
    """
    filtered = LowPass.low_pass_filter(df,column,fs,cutoff,order)

    # argrelextrema returns a tuple of index arrays; used directly with iloc.
    peak_positions = argrelextrema(filtered[column+"_lowpass"].values,np.greater)
    peak_rows = filtered.iloc[peak_positions]
    rep_count = len(peak_rows)

    lowpass_name = f"{column}_lowpass"
    fig,ax = plt.subplots()
    # The curve is read from `df`, which relies on the filter having added
    # the low-pass column to the frame that was passed in.
    plt.plot(df[lowpass_name])
    plt.plot(peak_rows[lowpass_name],"o",color="red")
    ax.set_ylabel(lowpass_name)

    exercise = df["Label"].iloc[0].title()
    category = df["Category"].iloc[0].title()
    plt.title(f"{category} {exercise}: {rep_count} Reps")
    plt.show()
    return rep_count

# This class performs a Fourier transformation on the data to find frequencies that occur
# often and filter noise.
class FourierTransformation:
    """Rolling-window Fourier features for columns of a DataFrame.

    For every window the real part of the FFT is computed and stored together
    with three summary values: the frequency with the highest amplitude, the
    amplitude-weighted mean frequency and the power spectral entropy.
    """

    def __init__(self):
        # Scratch storage: pandas' rolling ``apply`` can only return one
        # scalar per window, so the per-window feature rows are collected here.
        self.temp_list = []
        # Frequencies (in Hz) of the FFT bins; set by ``abstract_frequency``.
        self.freqs = None

    def find_fft_transformation(self, data):
        """Compute the Fourier features of one window and store them.

        The row appended to ``self.temp_list`` is laid out as
        ``[max_freq, freq_weighted, pse, amplitude_0, amplitude_1, ...]``,
        which is the order of the column names built in
        ``abstract_frequency``.

        Args:
            data: The values of one window (array-like).

        Returns:
            int: Always 0; the real result is the row stored in
            ``self.temp_list``.
        """
        # Real FFT of the window; only the real part is used as amplitude.
        transformation = np.fft.rfft(data, len(data))
        real_ampl = transformation.real

        # Frequency of the bin with the highest amplitude.
        max_freq = self.freqs[np.argmax(real_ampl)]
        # Mean frequency weighted by amplitude.
        freq_weigthed = float(np.sum(self.freqs * real_ampl)) / np.sum(real_ampl)

        # Power spectral entropy: entropy of the normalised power spectrum.
        PSD = np.divide(np.square(real_ampl), float(len(real_ampl)))
        PSD_pdf = np.divide(PSD, np.sum(PSD))

        # log(0) is undefined, so the entropy is only computed when no bin is zero.
        if np.count_nonzero(PSD_pdf) == PSD_pdf.size:
            pse = -np.sum(np.log(PSD_pdf) * PSD_pdf)
        else:
            pse = 0

        # Summary values first, in the same order as the column names
        # (max_freq, freq_weighted, pse), followed by the amplitudes.
        row = np.concatenate(([max_freq, freq_weigthed, pse], real_ampl))
        self.temp_list.append(row)

        return 0

    def abstract_frequency(self, data_table, columns, window_size, sampling_rate):
        """Add rolling Fourier features for ``columns`` to ``data_table``.

        Args:
            data_table (pd.DataFrame): Table that is extended in place.
            columns (list): Names of the columns to transform.
            window_size (int): Window size; the rolling window actually spans
                ``window_size + 1`` rows (kept for backward compatibility).
            sampling_rate (float): Number of samples per second (Hz).

        Returns:
            pd.DataFrame: ``data_table`` with, per column, the new columns
            ``<col>_max_freq``, ``<col>_freq_weighted``, ``<col>_pse`` and one
            ``<col>_freq_<f>_Hz_ws_<window_size>`` column per frequency bin.
            The first ``window_size`` rows of the new columns are NaN.
        """
        self.freqs = (sampling_rate * np.fft.rfftfreq(int(window_size))).round(3)

        for col in columns:
            # Start from a clean scratch list, even if a previous call failed
            # half-way and left rows behind.
            self.temp_list = []

            # Column names, in the same order as the rows produced by
            # ``find_fft_transformation``.
            collist = [col + '_max_freq', col + '_freq_weighted', col + '_pse']
            collist = collist + [col + '_freq_' +
                    str(freq) + '_Hz_ws_' + str(window_size) for freq in self.freqs]

            # Rolling ``apply`` can only return a single value per window, so
            # the feature rows are collected in ``self.temp_list`` instead.
            # The window is window_size + 1 rows long (historical behaviour).
            # NOTE(review): the padding below assumes ``apply`` is called for
            # every full window; presumably windows containing NaN are
            # skipped by pandas, which would break the row count. Confirm.
            data_table[col].rolling(
                window_size + 1).apply(self.find_fft_transformation)

            # The first ``window_size`` rows have no full window: pad with NaN.
            frequencies = np.pad(np.array(self.temp_list), ((window_size, 0), (0, 0)),
                        'constant', constant_values=np.nan)

            # Add the new frequency columns to the table.
            data_table[collist] = pd.DataFrame(frequencies, index=data_table.index)

            # Reset the scratch storage.
            del self.temp_list[:]

        return data_table

class ClassificationAlgorithms:

    # Forward selection for classification, which selects a pre-defined number of features (max_features)
    # that show the best accuracy. We assume a decision tree learner for this purpose, but
    # this can easily be changed. It returns the best features.
    def forward_selection(self, max_features, X_train, y_train):
        # Start with no features.
        ordered_features = []
        ordered_scores = []
        selected_features = []
        ca = ClassificationAlgorithms()
        prev_best_perf = 0

        # Select the appropriate number of features.
        for i in range(0, max_features):
            print(i)

            # Determine the features left to select.
            features_left = list(set(X_train.columns) - set(selected_features))
            best_perf = 0
            best_attribute = ""

            # For all features we can still select...
            for f in features_left:
                temp_selected_features = copy.deepcopy(selected_features)
                temp_selected_features.append(f)

                # Determine the accuracy of a decision tree learner if we were to add
                # the feature.
                (
                    pred_y_train,
                    pred_y_test,
                    prob_training_y,
                    prob_test_y,
                ) = ca.decision_tree(
                    X_train[temp_selected_features],
                    y_train,
                    X_train[temp_selected_features],
                )
                perf = accuracy_score(y_train, pred_y_train)

                # If the performance is better than what we have seen so far (we aim for high accuracy)
                # we set the current feature to the best feature and the same for the best performance.
                if perf > best_perf:
                    best_perf = perf
                    best_feature = f
            # We select the feature with the best performance.
            selected_features.append(best_feature)
            prev_best_perf = best_perf
            ordered_features.append(best_feature)
            ordered_scores.append(best_perf)
        return selected_features, ordered_features, ordered_scores

    # Apply a neural network for classification upon the training data (with the specified composition of
    # hidden layers and number of iterations), and use the created network to predict the outcome for both the
    # test and training set. It returns the categorical predictions for the training and test set as well as the
    # probabilities associated with each class, each class being represented as a column in the data frame.
    def feedforward_neural_network(
        self,
        train_X,
        train_y,
        test_X,
        hidden_layer_sizes=(100,),
        max_iter=2000,
        activation="logistic",
        alpha=0.0001,
        learning_rate="adaptive",
        gridsearch=True,
        print_model_details=False,
    ):
        """Fit a multi-layer perceptron and predict for training and test data.

        With ``gridsearch=True`` a 5-fold grid search (scored on accuracy)
        over a fixed set of layer layouts and iteration counts picks the
        model; ``hidden_layer_sizes`` and ``max_iter`` are then ignored.
        Otherwise a single model is built from the given parameters.

        Returns:
            tuple: ``(pred_training_y, pred_test_y, frame_prob_training_y,
            frame_prob_test_y)``; the two frames hold one probability column
            per class.
        """
        if not gridsearch:
            model = MLPClassifier(
                hidden_layer_sizes=hidden_layer_sizes,
                activation=activation,
                max_iter=max_iter,
                learning_rate=learning_rate,
                alpha=alpha,
            )
        else:
            layer_layouts = [(5,), (10,), (25,), (100,), (100, 5), (100, 10)]
            search_space = [
                {
                    "hidden_layer_sizes": layer_layouts,
                    "activation": [activation],
                    "learning_rate": [learning_rate],
                    "max_iter": [1000, 2000],
                    "alpha": [alpha],
                }
            ]
            model = GridSearchCV(
                MLPClassifier(), search_space, cv=5, scoring="accuracy"
            )

        # Labels are flattened to a 1-D array before fitting.
        model.fit(train_X, train_y.values.ravel())

        if gridsearch:
            if print_model_details:
                print(model.best_params_)
            # Continue with the best model found by the search.
            model = model.best_estimator_

        # Class probabilities first, then the hard predictions.
        train_proba = model.predict_proba(train_X)
        test_proba = model.predict_proba(test_X)
        pred_training_y = model.predict(train_X)
        pred_test_y = model.predict(test_X)

        class_labels = model.classes_
        return (
            pred_training_y,
            pred_test_y,
            pd.DataFrame(train_proba, columns=class_labels),
            pd.DataFrame(test_proba, columns=class_labels),
        )

    # Apply a support vector machine for classification upon the training data (with the specified values for
    # C, gamma and the kernel function), and use the created model to predict the outcome for both the
    # test and training set. It returns the categorical predictions for the training and test set as well as the
    # probabilities associated with each class, each class being represented as a column in the data frame.
    def support_vector_machine_with_kernel(
        self,
        train_X,
        train_y,
        test_X,
        kernel="rbf",
        C=1,
        gamma=1e-3,
        gridsearch=True,
        print_model_details=False,
    ):
        """Fit a kernel SVM and predict for training and test data.

        With ``gridsearch=True`` a 5-fold grid search (scored on accuracy)
        over kernel, gamma and C selects the model and the ``kernel``, ``C``
        and ``gamma`` arguments are ignored. Probability estimates are
        enabled in both cases.

        Returns:
            tuple: ``(pred_training_y, pred_test_y, frame_prob_training_y,
            frame_prob_test_y)``; the two frames hold one probability column
            per class.
        """
        if not gridsearch:
            model = SVC(
                C=C, kernel=kernel, gamma=gamma, probability=True, cache_size=7000
            )
        else:
            search_space = [
                {"kernel": ["rbf", "poly"], "gamma": [1e-3, 1e-4], "C": [1, 10, 100]}
            ]
            model = GridSearchCV(
                SVC(probability=True), search_space, cv=5, scoring="accuracy"
            )

        # Labels are flattened to a 1-D array before fitting.
        model.fit(train_X, train_y.values.ravel())

        if gridsearch:
            if print_model_details:
                print(model.best_params_)
            # Continue with the best model found by the search.
            model = model.best_estimator_

        # Class probabilities first, then the hard predictions.
        train_proba = model.predict_proba(train_X)
        test_proba = model.predict_proba(test_X)
        pred_training_y = model.predict(train_X)
        pred_test_y = model.predict(test_X)

        class_labels = model.classes_
        return (
            pred_training_y,
            pred_test_y,
            pd.DataFrame(train_proba, columns=class_labels),
            pd.DataFrame(test_proba, columns=class_labels),
        )

    # Apply a linear support vector machine (no kernel) for classification upon the training data (with the
    # specified values for C, the tolerance and the maximum number of iterations), and use the created model to
    # predict the outcome for both the test and training set. It returns the categorical predictions for the
    # training and test set as well as the probabilities associated with each class, each class being
    # represented as a column in the data frame.
    def support_vector_machine_without_kernel(
        self,
        train_X,
        train_y,
        test_X,
        C=1,
        tol=1e-3,
        max_iter=1000,
        gridsearch=True,
        print_model_details=False,
    ):
        """Fit a linear SVM and predict for training and test data.

        With ``gridsearch=True`` a 5-fold grid search (scored on accuracy)
        over ``max_iter``, ``tol`` and ``C`` selects the model and the
        corresponding arguments are ignored.

        ``LinearSVC`` has no ``predict_proba``; pseudo probabilities are
        derived by squashing the decision scores with a sigmoid and
        normalising every row to sum to one. A higher decision score gives a
        higher probability, so the most probable class is the predicted one.

        Returns:
            tuple: ``(pred_training_y, pred_test_y, frame_prob_training_y,
            frame_prob_test_y)``; the two frames hold one pseudo-probability
            column per class.
        """

        def scores_to_probabilities(scores):
            # Turn decision scores into rows of pseudo probabilities.
            scores = np.asarray(scores, dtype=float)
            if scores.ndim == 1:
                # Binary problem: a single score per sample, positive for the
                # second class. Use its negation as the first class' score.
                scores = np.column_stack((-scores, scores))
            squashed = 1 / (1 + np.exp(-scores))
            return squashed / squashed.sum(axis=1)[:, None]

        # Create the model
        if gridsearch:
            tuned_parameters = [
                {"max_iter": [1000, 2000], "tol": [1e-3, 1e-4], "C": [1, 10, 100]}
            ]
            svm = GridSearchCV(LinearSVC(), tuned_parameters, cv=5, scoring="accuracy")
        else:
            svm = LinearSVC(C=C, tol=tol, max_iter=max_iter)

        # Fit the model on the flattened labels.
        svm.fit(train_X, train_y.values.ravel())

        if gridsearch and print_model_details:
            print(svm.best_params_)

        if gridsearch:
            # Continue with the best model found by the search.
            svm = svm.best_estimator_

        # Apply the model
        pred_prob_training_y = scores_to_probabilities(svm.decision_function(train_X))
        pred_prob_test_y = scores_to_probabilities(svm.decision_function(test_X))
        pred_training_y = svm.predict(train_X)
        pred_test_y = svm.predict(test_X)
        frame_prob_training_y = pd.DataFrame(pred_prob_training_y, columns=svm.classes_)
        frame_prob_test_y = pd.DataFrame(pred_prob_test_y, columns=svm.classes_)

        return pred_training_y, pred_test_y, frame_prob_training_y, frame_prob_test_y

    # Apply a nearest neighbor approach for classification upon the training data (with the specified value for
    # k), and use the created model to predict the outcome for both the
    # test and training set. It returns the categorical predictions for the training and test set as well as the
    # probabilities associated with each class, each class being represented as a column in the data frame.
    def k_nearest_neighbor(
        self,
        train_X,
        train_y,
        test_X,
        n_neighbors=5,
        gridsearch=True,
        print_model_details=False,
    ):
        """Fit a k-nearest-neighbour classifier and predict for both sets.

        With ``gridsearch=True`` a 5-fold grid search (scored on accuracy)
        over ``n_neighbors`` in ``[1, 2, 5, 10]`` selects the model and the
        ``n_neighbors`` argument is ignored.

        Returns:
            tuple: ``(pred_training_y, pred_test_y, frame_prob_training_y,
            frame_prob_test_y)``; the two frames hold one probability column
            per class.
        """
        if not gridsearch:
            model = KNeighborsClassifier(n_neighbors=n_neighbors)
        else:
            search_space = [{"n_neighbors": [1, 2, 5, 10]}]
            model = GridSearchCV(
                KNeighborsClassifier(), search_space, cv=5, scoring="accuracy"
            )

        # Labels are flattened to a 1-D array before fitting.
        model.fit(train_X, train_y.values.ravel())

        if gridsearch:
            if print_model_details:
                print(model.best_params_)
            # Continue with the best model found by the search.
            model = model.best_estimator_

        # Class probabilities first, then the hard predictions.
        train_proba = model.predict_proba(train_X)
        test_proba = model.predict_proba(test_X)
        pred_training_y = model.predict(train_X)
        pred_test_y = model.predict(test_X)

        class_labels = model.classes_
        return (
            pred_training_y,
            pred_test_y,
            pd.DataFrame(train_proba, columns=class_labels),
            pd.DataFrame(test_proba, columns=class_labels),
        )

    # Apply a decision tree approach for classification upon the training data (with the specified value for
    # the minimum samples in the leaf, and the export path and files if print_model_details=True)
    # and use the created model to predict the outcome for both the
    # test and training set. It returns the categorical predictions for the training and test set as well as the
    # probabilities associated with each class, each class being represented as a column in the data frame.
    def decision_tree(
        self,
        train_X,
        train_y,
        test_X,
        min_samples_leaf=50,
        criterion="gini",
        print_model_details=False,
        export_tree_path="Example_graphs/Chapter7/",
        export_tree_name="tree.dot",
        gridsearch=True,
    ):
        """Fit a decision tree and predict for training and test data.

        With ``gridsearch=True`` a 5-fold grid search (scored on accuracy)
        over ``min_samples_leaf`` and ``criterion`` selects the model and
        the corresponding arguments are ignored. With
        ``print_model_details=True`` the feature importances are printed in
        descending order and the tree is exported in graphviz format to
        ``export_tree_path + export_tree_name``.

        Returns:
            tuple: ``(pred_training_y, pred_test_y, frame_prob_training_y,
            frame_prob_test_y)``; the two frames hold one probability column
            per class.
        """
        if not gridsearch:
            model = DecisionTreeClassifier(
                min_samples_leaf=min_samples_leaf, criterion=criterion
            )
        else:
            search_space = [
                {
                    "min_samples_leaf": [2, 10, 50, 100, 200],
                    "criterion": ["gini", "entropy"],
                }
            ]
            model = GridSearchCV(
                DecisionTreeClassifier(), search_space, cv=5, scoring="accuracy"
            )

        # Labels are flattened to a 1-D array before fitting.
        model.fit(train_X, train_y.values.ravel())

        if gridsearch:
            if print_model_details:
                print(model.best_params_)
            # Continue with the best model found by the search.
            model = model.best_estimator_

        # Class probabilities first, then the hard predictions.
        train_proba = model.predict_proba(train_X)
        test_proba = model.predict_proba(test_X)
        pred_training_y = model.predict(train_X)
        pred_test_y = model.predict(test_X)
        frame_prob_training_y = pd.DataFrame(train_proba, columns=model.classes_)
        frame_prob_test_y = pd.DataFrame(test_proba, columns=model.classes_)

        if print_model_details:
            importances = model.feature_importances_
            # Feature positions, most important first (stable for ties).
            ranking = sorted(
                range(len(importances)),
                key=lambda position: importances[position],
                reverse=True,
            )
            print("Feature importance decision tree:")
            for position in ranking:
                print(train_X.columns[position])
                print(" & ")
                print(importances[position])
            tree.export_graphviz(
                model,
                out_file=export_tree_path + export_tree_name,
                feature_names=train_X.columns,
                class_names=model.classes_,
            )

        return pred_training_y, pred_test_y, frame_prob_training_y, frame_prob_test_y

    # Apply a naive bayes approach for classification upon the training data
    # and use the created model to predict the outcome for both the
    # test and training set. It returns the categorical predictions for the training and test set as well as the
    # probabilities associated with each class, each class being represented as a column in the data frame.
    def naive_bayes(self, train_X, train_y, test_X):
        """Fit a Gaussian naive Bayes model and predict for both sets.

        Args:
            train_X: Training features.
            train_y: Training labels; flattened to one dimension before
                fitting, like in the other classifiers of this class.
            test_X: Test features.

        Returns:
            tuple: ``(pred_training_y, pred_test_y, frame_prob_training_y,
            frame_prob_test_y)``; the two frames hold one probability column
            per class.
        """
        # Create the model
        nb = GaussianNB()

        # Fit the model. np.ravel accepts Series, single-column frames and
        # arrays alike and always hands a 1-D label array to the estimator.
        nb.fit(train_X, np.ravel(train_y))

        # Apply the model
        pred_prob_training_y = nb.predict_proba(train_X)
        pred_prob_test_y = nb.predict_proba(test_X)
        pred_training_y = nb.predict(train_X)
        pred_test_y = nb.predict(test_X)
        frame_prob_training_y = pd.DataFrame(pred_prob_training_y, columns=nb.classes_)
        frame_prob_test_y = pd.DataFrame(pred_prob_test_y, columns=nb.classes_)

        return pred_training_y, pred_test_y, frame_prob_training_y, frame_prob_test_y

    # Apply a random forest approach for classification upon the training data (with the specified value for
    # the minimum samples in the leaf, the number of trees, and if we should print some of the details of the
    # model print_model_details=True) and use the created model to predict the outcome for both the
    # test and training set. It returns the categorical predictions for the training and test set as well as the
    # probabilities associated with each class, each class being represented as a column in the data frame.
    def random_forest(
        self,
        train_X,
        train_y,
        test_X,
        n_estimators=10,
        min_samples_leaf=5,
        criterion="gini",
        print_model_details=False,
        gridsearch=True,
    ):
        """Fit a random forest and predict for training and test data.

        With ``gridsearch=True`` a 5-fold grid search (scored on accuracy)
        over ``min_samples_leaf``, ``n_estimators`` and ``criterion`` selects
        the model and the corresponding arguments are ignored. With
        ``print_model_details=True`` the feature importances are printed in
        descending order.

        Returns:
            tuple: ``(pred_training_y, pred_test_y, frame_prob_training_y,
            frame_prob_test_y)``; the two frames hold one probability column
            per class.
        """
        if not gridsearch:
            model = RandomForestClassifier(
                n_estimators=n_estimators,
                min_samples_leaf=min_samples_leaf,
                criterion=criterion,
            )
        else:
            search_space = [
                {
                    "min_samples_leaf": [2, 10, 50, 100, 200],
                    "n_estimators": [10, 50, 100],
                    "criterion": ["gini", "entropy"],
                }
            ]
            model = GridSearchCV(
                RandomForestClassifier(), search_space, cv=5, scoring="accuracy"
            )

        # Labels are flattened to a 1-D array before fitting.
        model.fit(train_X, train_y.values.ravel())

        if gridsearch:
            if print_model_details:
                print(model.best_params_)
            # Continue with the best model found by the search.
            model = model.best_estimator_

        # Class probabilities first, then the hard predictions.
        train_proba = model.predict_proba(train_X)
        test_proba = model.predict_proba(test_X)
        pred_training_y = model.predict(train_X)
        pred_test_y = model.predict(test_X)
        frame_prob_training_y = pd.DataFrame(train_proba, columns=model.classes_)
        frame_prob_test_y = pd.DataFrame(test_proba, columns=model.classes_)

        if print_model_details:
            importances = model.feature_importances_
            # Feature positions, most important first (stable for ties).
            ranking = sorted(
                range(len(importances)),
                key=lambda position: importances[position],
                reverse=True,
            )
            print("Feature importance random forest:")
            for position in ranking:
                print(train_X.columns[position])
                print(" & ")
                print(importances[position])

        return pred_training_y, pred_test_y, frame_prob_training_y, frame_prob_test_y

# Class to abstract a history of numerical values we can use as an attribute.
class NumericalAbstraction:
    """Summarise the recent history of numerical columns in rolling windows."""

    def get_slope(self, data):
        """Return the slope of a linear fit through the values of one window.

        Time is modelled as the positions 0, 1, 2, ... of the values, i.e.
        discrete steps with a fixed delta t. NaN values are left out of the
        fit; NaN is returned when the window holds nothing but NaN.
        """
        positions = np.array(range(0, len(data.index)))
        values = data.astype(np.float32)

        # Only positions with an actual value take part in the regression.
        has_value = ~np.isnan(values)
        if len(values[has_value]) == 0:
            return np.nan

        regression = stats.linregress(positions[has_value], values[has_value])
        return regression[0]

    #TODO Add your own aggregation function here:
    # def my_aggregation_function(self, data) 

    def aggregate_value(self,data, window_size, aggregation_function):
        """Aggregate ``data`` over a rolling window of ``window_size`` seconds.

        ``aggregation_function`` is one of 'mean', 'max', 'min', 'median',
        'std' or 'slope'; any other name yields ``np.nan``. A result is only
        produced for windows with at least ``window_size`` observations.
        """
        window = str(window_size) + 's'

        if aggregation_function in ('mean', 'max', 'min', 'median', 'std'):
            # These aggregations are methods of the rolling object itself.
            rolling_window = data.rolling(window, min_periods=window_size)
            return getattr(rolling_window, aggregation_function)()

        if aggregation_function == 'slope':
            return data.rolling(window, min_periods=window_size).apply(self.get_slope)

        #TODO: add your own aggregation function here
        return np.nan

    def abstract_numerical(self, data_table, cols, window_size, aggregation_function_name):
        """Add one aggregated column per entry of ``cols`` to ``data_table``.

        The new columns are named
        ``<col>_temp_<aggregation_function_name>_ws_<window_size>``; the
        (modified) ``data_table`` is returned.
        """
        for col in cols:
            new_column = "".join(
                (col, '_temp_', aggregation_function_name, '_ws_', str(window_size))
            )
            data_table[new_column] = self.aggregate_value(
                data_table[col], window_size, aggregation_function_name
            )

        return data_table

# Not a class, just a bunch of useful functions.

def get_chapter(module_path):
    """Extract the chapter tag (e.g. ``'ch3'``) from a module path.

    The tag is the first occurrence of ``_ch<character>_`` in
    ``module_path``, returned without the surrounding underscores.

    Args:
        module_path (str): Path or file name such as
            ``'crowdsignals_ch3_outliers.py'``.

    Returns:
        str: The chapter tag, e.g. ``'ch3'``.

    Raises:
        ValueError: If ``module_path`` contains no chapter tag.
    """
    match = re.search(r'_ch._', module_path)
    if match is None:
        raise ValueError("No chapter tag found in '" + str(module_path) + "'")
    return match.group(0).strip('_')

def normalize_dataset(data_table, columns):
    """Return a copy of ``data_table`` with ``columns`` mean-normalised.

    Every listed column is transformed to ``(x - mean) / (max - min)``. A
    column without any spread (``max == min``) is only centred, which gives
    zeros instead of the NaN values a division by zero would produce.

    Args:
        data_table (pd.DataFrame): Input table; it is not modified.
        columns (list): Names of the columns to normalise.

    Returns:
        pd.DataFrame: Copy of ``data_table`` with the normalised columns.
    """
    dt_norm = data_table.copy()
    for col in columns:
        values = data_table[col]
        centered = values - values.mean()
        value_range = values.max() - values.min()
        if value_range == 0:
            # Constant column: avoid 0 / 0 and keep the centred zeros.
            dt_norm[col] = centered
        else:
            dt_norm[col] = centered / value_range
    return dt_norm

# Calculate the distance between rows.
def distance(rows, d_function='euclidean'):
    """Compute the pairwise distances between the rows of ``rows``.

    Args:
        rows: m rows with n columns (attributes) each.
        d_function (str): Name of the distance function; only
            ``'euclidean'`` is supported.

    Returns:
        numpy.ndarray: Condensed distance vector as produced by
        ``scipy.spatial.distance.pdist`` (one entry per pair of rows).

    Raises:
        ValueError: If ``d_function`` is not a supported distance.
    """
    if d_function != 'euclidean':
        raise ValueError("Unknown distance value '" + str(d_function) + "'")

    # Imported explicitly: ``import scipy`` alone does not guarantee that the
    # ``scipy.spatial`` sub-package has been loaded.
    from scipy.spatial.distance import pdist

    return pdist(rows, 'euclidean')

def print_statistics(dataset, describe=True):
    """Print summary statistics of every column of ``dataset``.

    With ``describe=True`` the rounded output of ``DataFrame.describe`` is
    printed. Otherwise one tab-separated line per column is printed with the
    percentage of missing values, mean, standard deviation, minimum and
    maximum.
    """
    if describe:
        # describe() reports count, mean, std, min, quartiles and max per column.
        summary = dataset.describe().round(3)
        print(summary.to_string())
        return

    print('\ncolumn \t\t % missing \t\t mean \t\t standard deviation \t\t min \t\t max')
    total_rows = len(dataset.index)
    for col in dataset.columns:
        values = dataset[col]
        # count() only counts the non-missing entries.
        pct_missing = (total_rows - values.count()) / total_rows * 100
        cells = [
            f'{col}',
            f'{pct_missing:3.1f}%',
            f'{values.mean():6.3f}',
            f'{values.std():6.3f}',
            f'{values.min():6.3f}',
            f'{values.max():6.3f}',
        ]
        print('\t\t'.join(cells))

def print_table_cell(value1, value2):
    """Print ``value1  /  value2`` with two decimals and no trailing newline."""
    cell_parts = (f"{value1:.2f}", ' / ', f"{value2:.2f}")
    print(' '.join(cell_parts), end='')

def print_latex_table_statistics_two_datasets(dataset1, dataset2):
    """Print LaTeX table rows comparing the columns of two datasets.

    For every column of ``dataset1`` one row is printed with the percentage
    of missing values, mean, standard deviation, minimum and maximum; each
    cell shows the value for ``dataset1`` and for ``dataset2``.
    """
    print('attribute, fraction missing values, mean, standard deviation, min, max')
    rows_first = len(dataset1.index)
    rows_second = len(dataset2.index)

    def percentage_missing(values, total_rows):
        # count() only counts the non-missing entries.
        return (float((total_rows - values.count())) / total_rows) * 100

    for col in dataset1.columns:
        print(col, '& ', end='')
        first = dataset1[col]
        second = dataset2[col]
        print_table_cell(
            percentage_missing(first, rows_first),
            percentage_missing(second, rows_second),
        )
        # The remaining cells are plain Series statistics, in table order.
        for statistic in ('mean', 'std', 'min', 'max'):
            print(' & ', end='')
            print_table_cell(getattr(first, statistic)(), getattr(second, statistic)())
        print('\\\\')

def print_latex_statistics_clusters(dataset, cluster_col, input_cols, label_col):
    """Print LaTeX table rows with per-cluster statistics.

    For every column in ``input_cols`` two rows are printed (mean and
    standard deviation per cluster). For every column whose name starts with
    ``label_col`` one row is printed with, per cluster, the percentage of all
    rows carrying that label (value 1) that fall into the cluster.

    Args:
        dataset (pd.DataFrame): Data including the cluster assignment.
        cluster_col (str): Column holding the cluster of every row.
        input_cols (list): Numerical columns to summarise.
        label_col (str): Prefix of the label columns.
    """
    label_cols = [c for c in dataset.columns if label_col == c[0:len(label_col)]]

    clusters = dataset[cluster_col].unique()

    for c in input_cols:
        # Backslashes are escaped explicitly; '\m' is an invalid escape
        # sequence and triggers a SyntaxWarning on current Python versions.
        print('\\multirow{2}{*}{', c, '} & mean ', end='')
        for cluster in clusters:
            print(' & ', "{0:.2f}".format(dataset.loc[dataset[cluster_col] == cluster, c].mean()), end='')
        print('\\\\')
        print(' & std ', end='')
        for cluster in clusters:
            print(' & ', "{0:.2f}".format(dataset.loc[dataset[cluster_col] == cluster, c].std()), end='')
        print('\\\\')

    for l in label_cols:
        print(l, ' & percentage ', end='')
        # Number of rows that carry this label at all.
        labelled_rows = len(dataset[dataset[l] == 1].index)
        for cluster in clusters:
            in_cluster = float(dataset.loc[dataset[cluster_col] == cluster, l].sum())
            print(' & ', "{0:.2f}".format(in_cluster / labelled_rows * 100), '\\%', end='')
        print('\\\\')

def print_table_row_performances(row_name, training_len, test_len, values):
    """Print one LaTeX table row with accuracies and their confidence bands.

    Each entry of ``values`` holds a training score at position 0 and a test
    score at position 1. For both, the score is printed followed by the
    interval ``score -/+ 2 * sd`` with ``sd = sqrt(score * (1 - score) / n)``,
    where ``n`` is ``training_len`` or ``test_len``.

    Returns:
        list: One ``[train_score, sd_train, test_score, sd_test]`` list per
        entry of ``values``.
    """

    def print_interval(score, sd, *trailing):
        # Prints "\emph{( low - high )}" followed by any trailing pieces.
        low = "{0:.4f}".format(score - 2 * sd)
        high = "{0:.4f}".format(score + 2 * sd)
        print('\\emph{(', low, '-', high, ')}', *trailing, end='')

    scores_over_sd = []
    print(row_name, end='')

    for val in values:
        print(' & ', end='')

        # Training score with its band, followed by the column separator.
        sd_train = math.sqrt((val[0] * (1 - val[0])) / training_len)
        print("{0:.4f}".format(val[0]), end='')
        print_interval(val[0], sd_train, ' & ')

        # Test score with its band.
        sd_test = math.sqrt((val[1] * (1 - val[1])) / test_len)
        print("{0:.4f}".format(val[1]), end='')
        print_interval(val[1], sd_test)

        scores_over_sd.append([val[0], sd_train, val[1], sd_test])

    print('\\\\\\hline')
    return scores_over_sd

def print_table_row_performances_regression(row_name, training_len, test_len, values):
    """Print one LaTeX table row with regression scores.

    Each entry of ``values`` holds four numbers: the training score and the
    value printed in brackets behind it (positions 0 and 1), and the same
    pair for the test set (positions 2 and 3). The whole row is printed on a
    single line, like ``print_table_row_performances`` does.

    Args:
        row_name (str): Text of the first cell.
        training_len: Unused; kept for a signature parallel to
            ``print_table_row_performances``.
        test_len: Unused; kept for the same reason.
        values (list): Score entries as described above.
    """
    # No newline after the row name: the cells must follow on the same line.
    print(row_name, end='')

    for val in values:
        print(' & ', end='')
        print("{0:.4f}".format(val[0]), end='')
        print('\\emph{(', "{0:.4f}".format(val[1]), ')}', ' & ', end='')
        print("{0:.4f}".format(val[2]), end='')
        print('\\emph{(', "{0:.4f}".format(val[3]), ')}', end='')
    print('\\\\\\hline')

def print_pearson_correlations(correlations):
    """Print one LaTeX table row per entry that has a finite correlation.

    Each entry of ``correlations`` holds a name at position 0 and the
    correlation value at position 1; entries whose value is NaN or infinite
    are skipped.
    """
    for position in range(len(correlations)):
        entry = correlations[position]
        if not np.isfinite(entry[1]):
            continue
        print(entry[0], ' & ', f"{entry[1]:.4f}", '\\\\\\hline')

# This class removes the high frequency data (that might be considered noise) from the data.
class LowPassFilter:
    """Butterworth low-pass filter for a single column of a DataFrame."""

    def low_pass_filter(self, data_table, col, sampling_frequency, cutoff_frequency, order=5, phase_shift=True):
        """Filter ``col`` and store the result in the column ``col + '_lowpass'``.

        Args:
            data_table (pd.DataFrame): Table that receives the new column.
            col (str): Name of the column to filter.
            sampling_frequency (float): Sampling frequency of the data.
            cutoff_frequency (float): Frequencies above this value are damped.
            order (int): Order of the Butterworth filter.
            phase_shift (bool): If True the filter is applied with
                ``filtfilt``, otherwise with ``lfilter``.

        Returns:
            pd.DataFrame: ``data_table`` including the filtered column.
        """
        # http://stackoverflow.com/questions/12093594/how-to-implement-band-pass-butterworth-filter-with-scipy-signal-butter
        # The cutoff is expressed as a fraction of the Nyquist frequency,
        # which is half the sampling frequency.
        nyquist = 0.5 * sampling_frequency
        relative_cutoff = cutoff_frequency / nyquist

        numerator, denominator = butter(order, relative_cutoff, btype='low', output='ba', analog=False)

        apply_filter = filtfilt if phase_shift else lfilter
        data_table[col + '_lowpass'] = apply_filter(numerator, denominator, data_table[col])
        return data_table

# Class for Principal Component Analysis. We can only apply this when we do not have missing values (i.e. NaN).
# For this we have to impute these first, be aware of this.
class PrincipalComponentAnalysis:
    """Principal component analysis on selected columns of a DataFrame.

    The data must not contain missing values; impute them beforehand.
    """

    # Class-level default; every instance replaces it in __init__ and again
    # with the fitted model whenever one of the methods below runs.
    pca = []

    def __init__(self):
        self.pca = []

    def determine_pc_explained_variance(self, data_table, cols):
        """Fit a PCA with ``len(cols)`` components and return the explained variance ratios.

        The columns are normalised with ``normalize_dataset`` first.
        """
        normalized = normalize_dataset(data_table, cols)

        self.pca = PCA(n_components = len(cols))
        self.pca.fit(normalized[cols])
        return self.pca.explained_variance_ratio_

    def apply_pca(self, data_table, cols, number_comp):
        """Add the first ``number_comp`` principal components to ``data_table``.

        The columns are normalised with ``normalize_dataset``, a PCA is
        fitted on them and the transformed values are stored in the new
        columns ``pca_1`` ... ``pca_<number_comp>``.

        Returns:
            pd.DataFrame: ``data_table`` including the new columns.
        """
        normalized = normalize_dataset(data_table, cols)

        self.pca = PCA(n_components = number_comp)
        self.pca.fit(normalized[cols])

        # Project the normalised values onto the components.
        projected = self.pca.transform(normalized[cols])

        for component in range(0, number_comp):
            data_table[f'pca_{component + 1}'] = projected[:, component]

        return data_table




def plot_binary_outliers(dataset, col, outlier_col, reset_index):
    """ Plot outliers in case of a binary outlier score. Here, the col specifies the real data
    column and outlier_col the columns with a binary value (outlier or not).

    Rows with a missing value in either column are left out. Regular points
    are drawn in the default colour, outliers in red.

    Args:
        dataset (pd.DataFrame): The dataset
        col (string): Column that you want to plot
        outlier_col (string): Outlier column marked with true/false
        reset_index (bool): whether to reset the index for plotting
    """

    # Taken from: https://github.com/mhoogen/ML4QS/blob/master/Python3Code/util/VisualizeDataset.py

    # Work on an explicit copy so the caller's frame is never touched.
    dataset = dataset.dropna(axis=0, subset=[col, outlier_col]).copy()
    dataset[outlier_col] = dataset[outlier_col].astype("bool")

    if reset_index:
        dataset = dataset.reset_index()

    fig, ax = plt.subplots()

    plt.xlabel("samples")
    plt.ylabel("value")

    # Plot non outliers in default color
    ax.plot(
        dataset.index[~dataset[outlier_col]],
        dataset[col][~dataset[outlier_col]],
        "+",
    )
    # Plot data points that are outliers in red
    ax.plot(
        dataset.index[dataset[outlier_col]],
        dataset[col][dataset[outlier_col]],
        "r+",
    )

    # Legend entries follow the order of the plot calls above:
    # first the regular points, then the outliers.
    plt.legend(
        ["no outlier " + col, "outlier " + col],
        loc="upper center",
        ncol=2,
        fancybox=True,
        shadow=True,
    )
    plt.show()


def mark_outliers_iqr(dataset, col):
    """Flag values outside the 1.5 * IQR whiskers of a column.

    Args:
        dataset (pd.DataFrame): The dataset
        col (string): The column to run the outlier detection on

    Returns:
        pd.DataFrame: A copy of ``dataset`` with an extra boolean column
        ``<col>_outlier`` that is True where the value lies below
        ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR``.
    """

    marked = dataset.copy()
    values = marked[col]

    first_quartile, third_quartile = values.quantile(0.25), values.quantile(0.75)
    whisker = 1.5 * (third_quartile - first_quartile)

    too_low = values < first_quartile - whisker
    too_high = values > third_quartile + whisker
    marked[col + "_outlier"] = too_low | too_high

    return marked


def mark_outliers_chauvenet(dataset, col, C=2):
    """Finds outliers in the specified column of datatable and adds a binary column with
    the same name extended with '_outlier' that expresses the result per data point.

    Taken from: https://github.com/mhoogen/ML4QS/blob/master/Python3Code/Chapter3/OutlierDetection.py

    The probability of each point is computed for the whole column at once,
    so the result does not depend on how the frame is indexed (datetime
    index, filtered integer index, ...).

    Args:
        dataset (pd.DataFrame): The dataset
        col (string): The column you want apply outlier detection to
        C (int, optional): Degree of certainty for the identification of outliers given the assumption
                           of a normal distribution, typically between 1 - 10. Defaults to 2.

    Returns:
        pd.DataFrame: A copy of the dataframe with an extra boolean column
        indicating whether the value is an outlier or not.
    """
    # Imported here so the function does not depend on ``scipy.special``
    # having been loaded as a side effect of another scipy import.
    from scipy.special import erf

    dataset = dataset.copy()
    # Compute the mean and standard deviation.
    mean = dataset[col].mean()
    std = dataset[col].std()
    N = len(dataset.index)
    criterion = 1.0 / (C * N)

    # Consider the deviation for the data points.
    deviation = (dataset[col] - mean).abs() / std

    # Express the upper and lower bounds as plain arrays: positional data,
    # no label lookups involved.
    high = (deviation / math.sqrt(C)).to_numpy()
    low = -high

    # Probability of observing each point; missing values yield NaN here and
    # therefore compare False below (never flagged).
    prob = 1.0 - 0.5 * (erf(high) - erf(low))

    # Mark as an outlier when the probability is below our criterion.
    dataset[col + "_outlier"] = prob < criterion
    return dataset


def mark_outliers_lof(dataset, columns, n=20):
    """Mark rows as outliers using the Local Outlier Factor (LOF).

    Args:
        dataset (pd.DataFrame): The dataset
        columns (list): The columns the LOF model is fitted on
        n (int, optional): n_neighbors. Defaults to 20.

    Returns:
        tuple: ``(dataset, outliers, X_scores)`` where ``dataset`` is a copy
        of the input with an extra boolean column ``outlier_lof``,
        ``outliers`` is the raw ``fit_predict`` output (rows equal to -1 are
        the ones flagged) and ``X_scores`` is the fitted model's
        ``negative_outlier_factor_``.
    """

    marked = dataset.copy()

    detector = LocalOutlierFactor(n_neighbors=n)
    predictions = detector.fit_predict(marked[columns])

    marked["outlier_lof"] = predictions == -1
    return marked, predictions, detector.negative_outlier_factor_


# Load Data

In [None]:
df=pd.read_pickle("../data/interim/01_Data_Processed.pkl")
df.info()

# EDA

In [None]:
set_df=df[df["Set"]==1]
plt.plot(set_df["Accelerometer_y"])

In [None]:
plt.plot(df["Accelerometer_y"])

In [None]:
plt.plot(set_df["Accelerometer_y"].reset_index(drop=True))

In [None]:
df_label_unique=df["Label"].unique()
df_label_unique

In [None]:
for label in df_label_unique:
    subset = df[df["Label"]==label]
    plt.plot(subset["Accelerometer_y"].reset_index(drop=True),label=label)
    plt.title(label=label)

    plt.show()

In [None]:
for label in df_label_unique:
    subset = df[df["Label"]==label]
    plt.title(label=label)
    plt.plot(subset[:100]["Accelerometer_y"].reset_index(drop=True),label=label)
    plt.show()

In [None]:
catogory_df=df.query("Label=='squat'").query("Participants=='A'").reset_index()

fig,ax=plt.subplots()
catogory_df.groupby(["Category"])["Accelerometer_y"].plot()
ax.set_ylabel("Accelerometer")
ax.set_xlabel("samples")
plt.legend()


In [None]:
Participants_df=df.query("Label=='bench'").sort_values("Participants").reset_index()

fig,ax=plt.subplots()
Participants_df.groupby(["Participants"])["Accelerometer_y"].plot()
ax.set_ylabel("Accelerometer")
ax.set_xlabel("samples")
plt.legend()


In [None]:
labels=df["Label"].unique()
participants=df["Participants"].unique()

for label in labels:
    for participant in participants:
        all_axis_df=df.query(f"Label=='{label}'").query(f"Participants=='{participant}'").reset_index()


        if len(all_axis_df)!=0:
            fig,ax=plt.subplots()
            all_axis_df[["Accelerometer_x","Accelerometer_y","Accelerometer_z"]].plot(ax=ax)
            ax.set_ylabel("Accelerometer")
            ax.set_xlabel("samples")
            plt.title(f"{label}({participant})".title())
            plt.legend()

In [None]:
labels=df["Label"].unique()
participants=df["Participants"].unique()

for label in labels:
    for participant in participants:
        all_axis_df=df.query(f"Label=='{label}'").query(f"Participants=='{participant}'").reset_index()


        if len(all_axis_df)!=0:
            fig,ax=plt.subplots()
            all_axis_df[["Gyroscope_x","Gyroscope_y","Gyroscope_z"]].plot(ax=ax)
            ax.set_ylabel("Gyroscope")
            ax.set_xlabel("samples")
            plt.title(f"{label}({participant})".title())
            plt.legend()

In [None]:
# Accelerometer (top panel) and gyroscope (bottom panel) traces for every
# exercise label / participant combination.
labels=df["Label"].unique()
participants=df["Participants"].unique()

for label in labels:
    for participant in participants:
        # Boolean masks instead of a string-built query: no quoting issues
        # if a label or participant name contains a quote character.
        com_plot_df=df[(df["Label"]==label)&(df["Participants"]==participant)].reset_index()

        # Skip combinations without any rows.
        if com_plot_df.empty:
            continue

        fig,ax=plt.subplots(nrows=2,sharex=True)
        com_plot_df[["Accelerometer_x","Accelerometer_y","Accelerometer_z"]].plot(ax=ax[0])
        com_plot_df[["Gyroscope_x","Gyroscope_y","Gyroscope_z"]].plot(ax=ax[1])
        ax[1].set_xlabel("samples")
        ax[0].legend()
        ax[1].legend()
        # Title the whole figure; plt.title() only titles the current axes,
        # i.e. a single one of the two panels.
        fig.suptitle(f"{label.title()}({participant})")
        plt.show()

# Remove Outliers

In [None]:
outlier_col=list(df.columns[:6])
df[outlier_col[:3]+["Label"]].boxplot(by="Label",figsize=(20,10),layout=(1,3))
df[outlier_col[3:6]+["Label"]].boxplot(by="Label",figsize=(20,10),layout=(1,3))

In [None]:
for col in outlier_col:
    dataset=mark_outliers_iqr(df,col)
    plot_binary_outliers(dataset,col,col+"_outlier",True)

In [None]:
df[outlier_col[:3]+["Label"]].plot.hist(by="Label",figsize=(20,10),layout=(3,3))
df[outlier_col[3:6]+["Label"]].plot.hist(by="Label",figsize=(20,10),layout=(3,3))

In [None]:
for col in outlier_col:
    dataset=mark_outliers_chauvenet(df,col)
    plot_binary_outliers(dataset,col,col+"_outlier",True)

In [None]:
dataset,out,x=mark_outliers_lof(df,outlier_col)

for col in outlier_col:
    plot_binary_outliers(dataset,col,"outlier_lof",True)

In [None]:
label="bench"
for col in outlier_col:
    dataset=mark_outliers_iqr(df[df["Label"]==label],col)
    plot_binary_outliers(dataset,col,col+"_outlier",True)

In [None]:
label="bench"
for col in outlier_col:
    dataset=mark_outliers_chauvenet(df[df["Label"]==label],col)
    plot_binary_outliers(dataset,col,col+"_outlier",True)

In [None]:
dataset,out,x=mark_outliers_lof(df[df["Label"]==label],outlier_col)

for col in outlier_col:
    plot_binary_outliers(dataset,col,"outlier_lof",True)

In [None]:
# Replace Chauvenet outliers with NaN, column by column and label by label,
# so that every exercise is judged against its own distribution.
outlier_removed_df=df.copy()
for col in outlier_col:
    for label in df["Label"].unique():
        dataset=mark_outliers_chauvenet(df[df["Label"]==label],col)
        flagged=dataset[col + "_outlier"]
        dataset.loc[flagged,col]= np.nan
        # Write the cleaned values back for the rows of this label only.
        outlier_removed_df.loc[(outlier_removed_df["Label"]==label),col]=dataset[col]
        # Count what was flagged for THIS label; comparing against the whole
        # frame reported a running total over all labels processed so far.
        n_outliers=int(flagged.sum())
        print(f"Removed {n_outliers} from {col} for {label}")

In [None]:
outlier_removed_df.info()

In [None]:
df=outlier_removed_df
df.info()

# Feature Engineering

In [None]:
pridictor_col=list(df.columns[:6])
for col in pridictor_col:
    df[col] = df[col].interpolate()

df.info()

In [None]:
duration=df[df["Set"]==1].index[-1] - df[df["Set"]==1].index[0]
duration.seconds

In [None]:
for s in df["Set"].unique():
    duration=df[df["Set"]==s].index[-1] - df[df["Set"]==s].index[0]
    df.loc[(df["Set"]==s),"Duration"]=duration.seconds

df

In [None]:
duration_df=df.groupby(["Category"])["Duration"].mean()
duration_df

In [None]:
df_lowpass=df.copy()

LowPass=LowPassFilter()
fs=1000/200
cutoff=1.2


df_lowpass=LowPass.low_pass_filter(df_lowpass,"Accelerometer_y",fs,cutoff,order=5)
subset=df_lowpass[df_lowpass["Set"]==45]
fig,ax=plt.subplots(nrows=2,sharex=True,figsize=(20,10))
ax[0].plot(subset["Accelerometer_y"].reset_index(drop=True),label="raw data")
ax[1].plot(subset["Accelerometer_y_lowpass"].reset_index(drop=True),label="butterworth data") 
ax[0].legend(loc="upper center",bbox_to_anchor=(0.5,1.15),fancybox=True,shadow=True)
ax[1].legend(loc="upper center",bbox_to_anchor=(0.5,1.15),fancybox=True,shadow=True)  


# Make smooth graph at all col
for col in pridictor_col:
    df_lowpass=LowPass.low_pass_filter(df_lowpass,col,fs,cutoff)
    df_lowpass[col]=df_lowpass[col+"_lowpass"]
    del df_lowpass[col+"_lowpass"]

df_lowpass

In [None]:
df_pca=df_lowpass.copy()
PCA1=PrincipalComponentAnalysis()
pc_values1=PCA1.determine_pc_explained_variance(df_pca,pridictor_col)
df_pca=PCA1.apply_pca(df_pca,pridictor_col,3)
df_pca

In [None]:
subset=df_pca[df_pca["Set"]==35]
subset[["pca_1","pca_2","pca_3"]].plot()

In [None]:
df_squared=df_pca.copy()
acc_r=df_squared["Accelerometer_x"]**2+df_squared["Accelerometer_y"]**2+df_squared["Accelerometer_z"]**2
gyro_r=df_squared["Gyroscope_x"]**2+df_squared["Gyroscope_y"]**2+df_squared["Gyroscope_z"]**2
df_squared["Accelerometer_r"]=np.sqrt(acc_r)
df_squared["Gyroscope_r"]=np.sqrt(gyro_r)
df_squared=df_squared.drop(columns=["Duration"])
df_squared

In [None]:
subset=df_squared[df_squared["Set"]==18]
subset

In [None]:
subset=df_squared[df_squared["Set"]==14]
subset[["Accelerometer_r","Gyroscope_r"]].plot()
subset[["Accelerometer_r","Gyroscope_r"]].plot(subplots=True)

In [None]:
df_temporal=df_squared.copy()
df_temporal

In [None]:
# Rolling mean / std features, computed separately for every set.
df_temporal=df_squared.copy()
NumAbs=NumericalAbstraction()

# Extend the predictor list with the magnitude columns; guarded so that
# re-running the cell does not append them a second time.
pridictor_col=pridictor_col+[
    c for c in ["Accelerometer_r","Gyroscope_r"] if c not in pridictor_col
]

ws=int(1000/200)

# The window is applied per set so that it never mixes samples of two
# different sets. (A whole-frame pass beforehand would be recomputed here
# anyway, so it is not performed.)
df_temporal_list=[]
for s in df_temporal["Set"].unique():
    subset=df_temporal[df_temporal["Set"]==s].copy()
    for col in pridictor_col:
        subset=NumAbs.abstract_numerical(subset,[col],ws,"mean")
        subset=NumAbs.abstract_numerical(subset,[col],ws,"std")
    df_temporal_list.append(subset)

df_temporal=pd.concat(df_temporal_list)

df_temporal.info()


In [None]:
subset[["Gyroscope_y","Gyroscope_y_temp_mean_ws_5","Gyroscope_y_temp_std_ws_5"]].plot()
subset[["Accelerometer_y","Accelerometer_y_temp_mean_ws_5","Accelerometer_y_temp_std_ws_5"]].plot()

In [None]:
df_frq=df_temporal.copy().reset_index()
df_frq

In [None]:
FrqAbs=FourierTransformation()

fs= int(1000/200)
ws=int(2800/200)


df_frq_list=[]
for s in df_frq["Set"].unique():
    subset=df_frq[df_frq["Set"]==s].reset_index(drop=True).copy()
    subset=FrqAbs.abstract_frequency(subset,pridictor_col,ws,fs)
    df_frq_list.append(subset)

df_frq=pd.concat(df_frq_list).set_index("epoch (ms)",drop=True)
df_frq

In [None]:
df_frq=df_frq.dropna()
df_frq=df_frq.iloc[::2]

In [None]:
df_cluster=df_frq.copy()
cluster_col=["Accelerometer_x","Accelerometer_y","Accelerometer_z"]
k_values=range(2,10)
inertias=[]

for k in k_values:
    subset=df_cluster[cluster_col]
    kmeans=KMeans(n_clusters=k,n_init=20,random_state=0)
    cluster_label=kmeans.fit_predict(subset)
    inertias.append(kmeans.inertia_)



In [None]:
plt.figure(figsize=(10,10))
plt.plot(k_values,inertias)

In [None]:
kmeans=KMeans(n_clusters=5,n_init=20,random_state=0)
subset=df_cluster[cluster_col]
df_cluster["Cluster"]=kmeans.fit_predict(subset)

In [None]:
fig=plt.figure(figsize=(15,15))
ax=fig.add_subplot(projection="3d")
for c in df_cluster["Cluster"].unique():
    subset=df_cluster[df_cluster["Cluster"]==c]
    ax.scatter(subset["Accelerometer_x"],subset["Accelerometer_y"],subset["Accelerometer_z"],label=c)
plt.legend()
plt.show()

In [None]:
# Same 3D accelerometer scatter as the cluster plot, but colored by the true
# exercise label so both views can be compared.
fig=plt.figure(figsize=(15,15))
ax=fig.add_subplot(projection="3d")
for exercise in df_cluster["Label"].unique():
    subset=df_cluster[df_cluster["Label"]==exercise]
    # The legend entry must be the label of this group, not a variable left
    # over from the cluster loop.
    ax.scatter(subset["Accelerometer_x"],subset["Accelerometer_y"],subset["Accelerometer_z"],label=exercise)
plt.legend()
plt.show()

In [None]:
df=df_cluster
df.info()

In [None]:
df_train=df.drop(["Participants","Category","Set"],axis=1)
df_train

In [None]:
X=df_train.drop("Label",axis=1)
y=df_train["Label"]

X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=42,test_size=0.25,stratify=y)
X_test


In [None]:
basic_col=["Accelerometer_x","Accelerometer_y","Accelerometer_z","Gyroscope_x","Gyroscope_y","Gyroscope_z"]
square_col=["Accelerometer_r","Gyroscope_r"]
pca_col=["pca_1","pca_2","pca_3"]
time_col=[f for f in df_train.columns if "_temp_" in f]
freq_col=[f for f in df_train.columns if ("_freq" in f) or ("_pse" in f)]
cluster_col=["Cluster"]
print(len(basic_col))
print(len(square_col))
print(len(pca_col))
print(len(time_col))
print(len(freq_col))
print(len(cluster_col))

In [None]:
df_train.columns[30:]

In [None]:
f_set_1=list(set(basic_col))
f_set_2=list(set(basic_col+square_col+pca_col))
f_set_3=list(set(f_set_2+time_col))
f_set_4=list(set(f_set_3+freq_col+cluster_col))

In [None]:
learner=ClassificationAlgorithms()
selected_col,ordered_col,orederd_scores=learner.forward_selection(10,X_train,y_train)

In [None]:
orederd_scores

In [None]:
selected_col

In [None]:

# Train every classifier on every feature set and record its test accuracy.
# ``iterations`` is the number of repetitions whose accuracy is averaged for
# the non-deterministic learners (neural network, random forest).
iterations=1
possible_feature_sets=[
    f_set_1,f_set_2,f_set_3,f_set_4,selected_col
]
feature_names=[
    "f_col_1",
    "f_col_2",
    "f_col_3",
    "f_col_4",
    "selected_col",
]

# One result frame per feature set, concatenated once after the loop. This
# avoids growing a frame inside the loop and concatenating onto an empty
# DataFrame.
score_frames=[]
for i, (feature_set, f) in enumerate(zip(possible_feature_sets, feature_names)):
    print("Feature set:", i)
    selected_train_X = X_train[feature_set]
    selected_test_X = X_test[feature_set]

    # First run non deterministic classifiers to average their score.
    performance_test_nn = 0
    performance_test_rf = 0

    for it in range(0, iterations):
        print("\tTraining neural network,", it)
        (
            class_train_y,
            class_test_y,
            class_train_prob_y,
            class_test_prob_y,
        ) = learner.feedforward_neural_network(
            selected_train_X,
            y_train,
            selected_test_X,
            gridsearch=False,
        )
        performance_test_nn += accuracy_score(y_test, class_test_y)

        print("\tTraining random forest,", it)
        (
            class_train_y,
            class_test_y,
            class_train_prob_y,
            class_test_prob_y,
        ) = learner.random_forest(
            selected_train_X, y_train, selected_test_X, gridsearch=True
        )
        performance_test_rf += accuracy_score(y_test, class_test_y)

    performance_test_nn = performance_test_nn / iterations
    performance_test_rf = performance_test_rf / iterations

    # And we run our deterministic classifiers:
    print("\tTraining KNN")
    (
        class_train_y,
        class_test_y,
        class_train_prob_y,
        class_test_prob_y,
    ) = learner.k_nearest_neighbor(
        selected_train_X, y_train, selected_test_X, gridsearch=True
    )
    performance_test_knn = accuracy_score(y_test, class_test_y)

    print("\tTraining decision tree")
    (
        class_train_y,
        class_test_y,
        class_train_prob_y,
        class_test_prob_y,
    ) = learner.decision_tree(
        selected_train_X, y_train, selected_test_X, gridsearch=True
    )
    performance_test_dt = accuracy_score(y_test, class_test_y)

    print("\tTraining naive bayes")
    (
        class_train_y,
        class_test_y,
        class_train_prob_y,
        class_test_prob_y,
    ) = learner.naive_bayes(selected_train_X, y_train, selected_test_X)

    performance_test_nb = accuracy_score(y_test, class_test_y)

    # Save results for this feature set
    models = ["NN", "RF", "KNN", "DT", "NB"]
    new_scores = pd.DataFrame(
        {
            "model": models,
            "feature_set": f,
            "accuracy": [
                performance_test_nn,
                performance_test_rf,
                performance_test_knn,
                performance_test_dt,
                performance_test_nb,
            ],
        }
    )
    score_frames.append(new_scores)

score_df = pd.concat(score_frames)


In [None]:
score_list=score_df.sort_values(by="accuracy",ascending=False).head()
print(score_list)

In [None]:
class_train_y,class_test_y,class_train_prob_y,class_test_prob_y=learner.random_forest(
    X_train[f_set_4],y_train,X_test[f_set_4],gridsearch=True
)

In [None]:
accuracy=accuracy_score(y_test,class_test_y)
accuracy

In [None]:
classes=class_test_prob_y.columns
cm=confusion_matrix(y_test,class_test_y,labels=classes)

In [None]:
plt.figure(figsize=(10, 10))
plt.imshow(cm, interpolation="nearest", cmap=plt.cm.Blues)
plt.title("Confusion matrix")
plt.colorbar()
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)

thresh = cm.max() / 2.0
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
    plt.text(
        j,
        i,
        format(cm[i, j]),
        horizontalalignment="center",
        color="white" if cm[i, j] > thresh else "black",
    )
plt.ylabel("True label")
plt.xlabel("Predicted label")
plt.grid(False)
plt.show()


In [None]:
# Leave-one-participant-out split: train on everyone except participant "A"
# and evaluate on participant "A" only, so the test data comes from a person
# the model has never seen.
participant_df=df.drop(["Set","Category"],axis=1)
is_held_out=participant_df["Participants"]=="A"

X_train=participant_df[~is_held_out].drop(["Label"],axis=1)
y_train=participant_df[~is_held_out]["Label"]

# The held-out participant forms the test set (previously the training rows
# were selected again here, so the model was scored on its own training data).
X_test=participant_df[is_held_out].drop(["Label"],axis=1)
y_test=participant_df[is_held_out]["Label"]

X_train=X_train.drop(["Participants"],axis=1)
X_test=X_test.drop(["Participants"],axis=1)


class_train_y,class_test_y,class_train_prob_y,class_test_prob_y=learner.random_forest(
    X_train[f_set_4],y_train,X_test[f_set_4],gridsearch=True
)

classes=class_test_prob_y.columns
cm=confusion_matrix(y_test,class_test_y,labels=classes)

plt.figure(figsize=(10, 10))
plt.imshow(cm, interpolation="nearest", cmap=plt.cm.Blues)
plt.title("Confusion matrix")
plt.colorbar()
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)

thresh = cm.max() / 2.0
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
    plt.text(
        j,
        i,
        format(cm[i, j]),
        horizontalalignment="center",
        color="white" if cm[i, j] > thresh else "black",
    )
plt.ylabel("True label")
plt.xlabel("Predicted label")
plt.grid(False)
plt.show()

In [None]:
accuracy=accuracy_score(y_test,class_test_y)
accuracy

In [None]:

class_train_y,class_test_y,class_train_prob_y,class_test_prob_y=learner.feedforward_neural_network(
    X_train[selected_col],y_train,X_test[selected_col],gridsearch=False
)

classes=class_test_prob_y.columns
cm=confusion_matrix(y_test,class_test_y,labels=classes)

plt.figure(figsize=(10, 10))
plt.imshow(cm, interpolation="nearest", cmap=plt.cm.Blues)
plt.title("Confusion matrix")
plt.colorbar()
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)

thresh = cm.max() / 2.0
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
    plt.text(
        j,
        i,
        format(cm[i, j]),
        horizontalalignment="center",
        color="white" if cm[i, j] > thresh else "black",
    )
plt.ylabel("True label")
plt.xlabel("Predicted label")
plt.grid(False)
plt.show()

In [None]:
accuracy=accuracy_score(y_test,class_test_y)
accuracy

# Counting

In [None]:
df=pd.read_pickle("../data/interim/01_Data_Processed.pkl")
df

In [None]:
# Rest periods contain no repetitions. Take an explicit copy so the new
# columns are added to an independent frame rather than to a filtered slice
# (avoids pandas' SettingWithCopyWarning).
df=df[df["Label"]!="rest"].copy()

# Magnitude of the three axes per sensor: sqrt(x^2 + y^2 + z^2).
acc_r=df["Accelerometer_x"]**2+df["Accelerometer_y"]**2+df["Accelerometer_z"]**2
gyro_r=df["Gyroscope_x"]**2+df["Gyroscope_y"]**2+df["Gyroscope_z"]**2
df["Accelerometer_r"]=np.sqrt(acc_r)
df["Gyroscope_r"]=np.sqrt(gyro_r)


In [None]:
bench_df=df[df["Label"]=="bench"]
squat_df=df[df["Label"]=="squat"]
row_df=df[df["Label"]=="row"]
ohp_df=df[df["Label"]=="ohp"]
dead_df=df[df["Label"]=="dead"]


In [None]:
fs=1000/200
LowPass=LowPassFilter()

bench_set=bench_df[bench_df["Set"]==bench_df["Set"].unique()[0]]
squat_set=squat_df[squat_df["Set"]==squat_df["Set"].unique()[0]]
row_set=row_df[row_df["Set"]==row_df["Set"].unique()[0]]
ohp_set=ohp_df[ohp_df["Set"]==ohp_df["Set"].unique()[0]]
dead_set=dead_df[dead_df["Set"]==dead_df["Set"].unique()[0]]

In [None]:
column="Accelerometer_y"
LowPass.low_pass_filter(bench_set,column,fs,0.4,10)[column+"_lowpass"].plot()

In [None]:
count(bench_set,cutoff=0.4)
count(squat_set,cutoff=0.35)
count(row_set,cutoff=0.65,column="Gyroscope_x")
count(ohp_set,cutoff=0.35)
count(dead_set,cutoff=0.4)

In [None]:
# Reference repetition count per set: 5 for "heavy" sets, 10 for all others.
df["Reps"]=np.where(df["Category"]=="heavy",5,10)
rep_df=df.groupby(["Label","Category","Set"])["Reps"].max().reset_index()
rep_df["Reps_pred"]=0

# Per-exercise (cutoff, column) used for peak counting; exercises not listed
# fall back to the default. The row setting uses Gyroscope_x because that is
# the signal its 0.65 cutoff is used with in the exploratory count() calls.
default_count_setting=(0.4,"Accelerometer_r")
count_settings={
    "squat":(0.35,"Accelerometer_r"),
    "row":(0.65,"Gyroscope_x"),
    "ohp":(0.35,"Accelerometer_r"),
}

for s in df["Set"].unique():
    # Independent copy so count() works on its own frame instead of on a
    # slice of df.
    subset=df[df["Set"]==s].copy()
    cutoff,column=count_settings.get(subset["Label"].iloc[0],default_count_setting)

    reps=count(subset,cutoff,10,column)
    rep_df.loc[rep_df["Set"]==s,"Reps_pred"]=reps

In [None]:
rep_df

In [None]:
# Mean absolute difference between the reference and the predicted rep count.
# Built-in round() works whether the metric comes back as a NumPy scalar or
# as a plain Python float (which has no .round() method).
error=round(mean_absolute_error(rep_df["Reps"],rep_df["Reps_pred"]),2)
print(error)
# Average reference vs. predicted reps per exercise and weight category.
rep_df.groupby(["Label","Category"])[["Reps","Reps_pred"]].mean().plot.bar()