# Retrieve all the relevant data from the table athletic-data in the database.
# Pre-requisite :
  1. Activity has to be of more than 2hrs.
  2. Watts and HeartRate column should not be null.

In [1]:
from src.repositories.PowerAndHRRepository import *

athletic_data_db = get_athletic_data()

You are successfully connected to your Database!


# DATA PROFILING : Removing irrelevant columns not required for analysis.

In [2]:
cols_to_ignore = configs.get("ignore-columns").data
athletic_data = athletic_data_db.drop(cols_to_ignore.split(","), axis=1)

# NA/NONE CHECK : Checking if watts or heartrate column contains null values

In [3]:
import numpy as np

hr_contains_none = athletic_data['heartrate'].apply(lambda x: None in x or np.nan in x or x is None).sum()
print(f"{hr_contains_none}/{len(athletic_data['heartrate'])} entries contains NA/NONE/null in heartrate")
watts_contains_none = athletic_data['watts'].apply(lambda x: None in x or np.nan in x or x is None).sum()
print(f"{watts_contains_none}/{len(athletic_data['watts'])}  entries contains NA/NONE/null in watts")

0/164 entries contains NA/NONE/null in heartrate
147/164  entries contains NA/NONE/null in watts


# NON-LINEAR/POLYNOMIAL REGRESSION For Missing Data :
1. Power output/watts has similar variations as heartrate and heartrate has no missing entries(so model can rely on heartrate as the attribute to understand variations for watts and fill missing values.
2. Power Output changes gradually on change of heart rate, not instantaneously.

In [4]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

def flatten_list(lst):
    flattened = []
    for item in lst:
        if isinstance(item, (list, tuple)):
            flattened.extend(item)
        else:
            flattened.append(item)
    return flattened

def convert_arrays_to_lists(arr):
    converted = []
    for item in arr:
        if isinstance(item, np.ndarray):
            converted.append(item.tolist())
        elif isinstance(item, (list, tuple)):
            converted.append(convert_arrays_to_lists(item))
        else:
            converted.append(item)
    return converted

def fill_none_with_regression(df, degree=2):
    df_copy = df.copy()

    X_train = []
    y_train = []

    # Prepare the training data
    for idx, row in df_copy.iterrows():
        heartrate = row['heartrate']
        watts = row['watts']

        mask = [w is not None for w in watts]  # Create a mask for non-None values
        watts_valid = [w for w, m in zip(watts, mask) if m]
        heartrate_valid = [h for h, m in zip(heartrate, mask) if m]

        X_train.extend([[x] for x in heartrate_valid])
        y_train.extend([[y] for y in watts_valid])

    # Perform polynomial regression
    poly_features = PolynomialFeatures(degree=degree)
    X_poly = poly_features.fit_transform(X_train)

    regressor = LinearRegression()
    regressor.fit(X_poly, y_train)

    # Fill the missing values
    for idx, row in df.iterrows():
        heartrate = row['heartrate']
        watts = row['watts']

        mask = [w is None or w == 0 for w in watts]  # Create a mask for None and 0 values

        if any(mask):
            heartrate_fill = [h for h, m in zip(heartrate, mask) if m]
            X_test = poly_features.transform([[x] for x in heartrate_fill])
            watts_fill = regressor.predict(X_test)

            # Replace the None and 0 values with the predicted values
            for i, value in enumerate(watts_fill):
                if watts[i] is None or watts[i] == 0:
                    watts[i] = value

            watts_flat = flatten_list(watts)
            watts_flat = convert_arrays_to_lists(watts_flat)
            df.at[idx, 'watts'] = watts_flat

    return df


ad_filled = fill_none_with_regression(athletic_data, degree=2)
print(ad_filled)


     activity_id                                          heartrate  \
0     8681129847  [83, 85, 86, 85, 85, 85, 85, 84, 83, 82, 80, 7...   
1     9039302926  [103, 104, 104, 104, 103, 103, 104, 104, 104, ...   
2     8807954766  [109, 109, 108, 108, 108, 107, 106, 107, 107, ...   
3     8702957363  [83, 82, 82, 82, 78, 76, 74, 72, 74, 76, 77, 8...   
4     8847871051  [111, 110, 109, 108, 108, 108, 108, 108, 106, ...   
..           ...                                                ...   
159   9095632721  [58, 61, 62, 65, 66, 68, 69, 72, 73, 74, 75, 7...   
160   9145237366  [68, 70, 73, 74, 75, 76, 76, 76, 76, 76, 76, 7...   
161   8718709606  [83, 84, 85, 87, 87, 90, 92, 93, 95, 96, 98, 9...   
162   8816718510  [97, 99, 101, 100, 99, 98, 98, 98, 98, 97, 98,...   
163   8757238016  [88, 89, 91, 92, 94, 95, 92, 91, 93, 94, 94, 9...   

                                                 watts  
0    [21, 21, 21, 36, 36, 6, [117.03046332262377], ...  
1    [229, 232, 226, 223, 223, 20

# NOISE REDUCTION : Using Kalman Filtering Technique

In [None]:
from pykalman import KalmanFilter

def reduce_noise_kalman(data):
    kf = KalmanFilter(transition_matrices=[1],
                      observation_matrices=[1],
                      initial_state_mean=data[0],
                      initial_state_covariance=1,
                      observation_covariance=1,
                      transition_covariance=0.1)

    filtered_state_means, filtered_state_covariances = kf.filter(data)

    return filtered_state_means

athletic_data['heartrate'] = athletic_data['heartrate'].apply(reduce_noise_kalman)
athletic_data['watts'] = athletic_data['watts'].apply(reduce_noise_kalman)


# DATA DOWNSAMPLING : A data aggregation procedure where we decreasing the time-frequency of the data to plot on graph and visualise easily with less variations and minimizing abundance of values to process, making machine learning algorithms work faster.

In [None]:

# Function to downsample a column with None values
def downsample_column(arr, factor):
    downsampled_arr = []
    for i in range(0, len(arr), factor):
        segment = arr[i:i+factor]  # Extract a segment of the array based on the downsampling factor
        non_none_values = [value for value in segment if value is not None]  # Filter out None values
        if non_none_values:
            downsampled_arr.append(sum(non_none_values) / len(non_none_values))  # Calculate the average of non-None values
        else:
            downsampled_arr.append(None)  # Assign None if all values in the segment are None
    return downsampled_arr


athletic_data['heartrate'] = athletic_data['heartrate'].apply(lambda arr: downsample_column(arr,200))
athletic_data['watts'] = athletic_data['watts'].apply(lambda arr: downsample_column(arr,200))