# Retrieve all the relevant data from the table athletic-data in the database.
# Prerequisites:
  1. The activity must be longer than 2 hours.
  2. The watts and heartrate columns must not be null.

In [2]:
from src.repositories.PowerAndHRRepository import *

# Load the activity records to analyse.
# NOTE(review): get_athletic_data is presumably supplied by the star import
# above, and the result is used like a pandas DataFrame by the next cell
# (it calls .drop(..., axis=1)) — confirm against the repository module.
athletic_data_db = get_athletic_data()

You are successfully connected to your Database!


# DATA PROFILING : Removing columns that are not required for the analysis.

In [3]:
# The "ignore-columns" setting is read as one comma-separated string of
# column names.
cols_to_ignore = configs.get("ignore-columns").data
# Keyword form of drop(labels, axis=1): remove those columns and keep the
# untouched original in athletic_data_db.
athletic_data = athletic_data_db.drop(columns=cols_to_ignore.split(","))

# NA/NONE CHECK : Checking if watts or heartrate column contains null values

In [10]:
import numpy as np

def _has_missing(values):
    """Return True when a cell is missing or holds a missing reading.

    A cell counts as missing when it is None or a float NaN itself, or when
    any element inside it is None or a float NaN.
    """
    # Test the cell itself first: iterating a None/NaN cell would raise
    # TypeError before the element check could run.
    if values is None or (isinstance(values, float) and np.isnan(values)):
        return True
    # `np.nan in values` only matches by identity/equality, so it misses NaN
    # values that are not the np.nan object itself; test each element instead.
    return any(
        item is None or (isinstance(item, float) and np.isnan(item))
        for item in values
    )


hr_contains_none = athletic_data['heartrate'].apply(_has_missing).sum()
print(f"{hr_contains_none}/{len(athletic_data['heartrate'])} entries contains NA/NONE/null in heartrate")
watts_contains_none = athletic_data['watts'].apply(_has_missing).sum()
print(f"{watts_contains_none}/{len(athletic_data['watts'])}  entries contains NA/NONE/null in watts")

0/164 entries contains NA/NONE/null in heartrate
147/164  entries contains NA/NONE/null in watts


# NOISE REDUCTION : Using Kalman Filtering Technique

In [None]:
from pykalman import KalmanFilter

def reduce_noise_kalman(data):
    """Smooth one series of readings with a scalar Kalman filter.

    The filter uses a one-dimensional state with identity transition and
    observation matrices, observation covariance 1 and transition
    covariance 0.1.

    Missing readings (None or NaN) are masked before filtering so they are
    handled as missing observations instead of being fed to the filter as
    values. The initial state mean is the first non-missing reading.

    Args:
        data: Sequence of numeric readings; entries may be None or NaN.

    Returns:
        The filtered state means, i.e. the first element of the tuple
        returned by ``KalmanFilter.filter``.
    """
    import numpy as np  # local import so the function does not rely on an earlier cell

    # dtype=float converts None to NaN; masked_invalid then masks every NaN
    # so the filter can skip those time steps.
    observations = np.ma.masked_invalid(np.asarray(data, dtype=float))
    valid_readings = observations.compressed()
    # The original seeded the filter with data[0], which may itself be
    # missing. Fall back to 0.0 when the series has no usable reading.
    initial_mean = valid_readings[0] if valid_readings.size else 0.0

    kf = KalmanFilter(transition_matrices=[1],
                      observation_matrices=[1],
                      initial_state_mean=initial_mean,
                      initial_state_covariance=1,
                      observation_covariance=1,
                      transition_covariance=0.1)

    # Only the state means are needed; the covariances are discarded.
    filtered_state_means, _ = kf.filter(observations)

    return filtered_state_means

# Smooth each activity's readings in both sensor columns, heartrate first
# and then watts, replacing the stored series with the filtered one.
for _column in ('heartrate', 'watts'):
    athletic_data[_column] = athletic_data[_column].apply(reduce_noise_kalman)


# DATA DOWNSAMPLING : A data aggregation procedure in which we decrease the time-frequency of the data, so that it is easier to plot and visualise with less variation, and so that there are fewer values to process, making machine learning algorithms run faster.

In [3]:

# Function to downsample a column with None values
def downsample_column(arr, factor):
    """Downsample a sequence by averaging consecutive segments.

    The input is cut into consecutive segments of ``factor`` items (the
    last segment may be shorter). Each segment is replaced by the mean of
    its non-missing values, where None and float NaN both count as missing.

    Args:
        arr: Sequence of readings; entries may be None or NaN.
        factor: Number of consecutive readings merged into one output value.

    Returns:
        A list with one entry per segment: the segment mean, or None when
        every value in the segment is missing.

    Raises:
        ValueError: If ``factor`` is less than 1.
    """
    # A negative step would make range() empty and silently drop all data,
    # and a zero step fails with an unrelated-looking range() error.
    if factor < 1:
        raise ValueError(f"factor must be a positive integer, got {factor!r}")

    downsampled_arr = []
    for start in range(0, len(arr), factor):
        # NaN is the only float unequal to itself; the isinstance guard keeps
        # non-float values out of that comparison.
        present = [
            value for value in arr[start:start + factor]
            if value is not None
            and not (isinstance(value, float) and value != value)
        ]
        downsampled_arr.append(sum(present) / len(present) if present else None)
    return downsampled_arr


# Replace each activity's series with its downsampled version, merging every
# 200 consecutive readings into one value; heartrate first, then watts.
for _column in ('heartrate', 'watts'):
    athletic_data[_column] = athletic_data[_column].apply(lambda arr: downsample_column(arr, 200))