In [1]:
# -- IMPORTS START --
import pandas as pd
import glob
import re
import os
import sys
import pickle
import datetime
import numpy as np
import matplotlib.pyplot as plt

from datetime import datetime
from sklearn import tree, metrics
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, ConfusionMatrixDisplay
from scipy.signal import butter, filtfilt, find_peaks
from sklearn.tree import DecisionTreeClassifier,export_graphviz
from sklearn.model_selection import train_test_split
# -- IMPORTS END --

# Render figures as static images inside the notebook. The inline backend does
# not support zooming/panning; switch to `%matplotlib widget` (ipympl) if
# interactive figures are needed.
%matplotlib inline
plt.rcParams['figure.figsize'] = [9, 6] # default figure size: width, height in inches

In [2]:
#Do not modify
def calc_magnitude(data):
    """Add a detrended magnitude column ``accel_mag`` to ``data``.

    The per-row Euclidean norm of the ``x``, ``y`` and ``z`` columns is
    computed, then the mean of that norm over the whole frame is subtracted.

    Note: ``data`` is modified in place (``accel_mag`` is added or overwritten)
    and the same object is returned.
    """

    # Calculate magnitude
    data['accel_mag'] = np.sqrt(data['x']**2 + data['y']**2 + data['z']**2) # absolute accel magnitude
    data['accel_mag'] = data['accel_mag'] - data['accel_mag'].mean() # detrend: "remove gravity"

    return data

In [3]:
#Do not modify
def remove_noise(data,sampling_rate):
    """Low-pass filter ``data['accel_mag']`` into ``data['filtered_accel_mag']``.

    A 2nd-order Butterworth low-pass filter with a 5 Hz cutoff is designed for
    the given ``sampling_rate`` (the cutoff is normalised by the Nyquist
    frequency ``sampling_rate / 2``) and applied with ``filtfilt``, i.e. once
    forwards and once backwards.

    Note: ``data`` is modified in place and the same object is returned.
    """
    # Local import; find_peaks is imported here but not used in this function.
    from scipy.signal import butter, filtfilt, find_peaks

    # Low pass filter
    cutoff = 5 # Hz
    order = 2
    b, a = butter(order, cutoff/(sampling_rate/2), btype='lowpass')
    data['filtered_accel_mag'] = filtfilt(b, a, data['accel_mag'])

    return data

In [4]:
def detect_peaks(data,sampling_rate,ht,pm,dist):
    """Mark the peaks of ``data['accel_mag']`` in a new ``peaks`` column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing an ``accel_mag`` column. Modified in place.
    sampling_rate : number
        Samples per second; used to convert ``dist`` into a sample count.
    ht, pm : number
        ``height`` and ``prominence`` thresholds forwarded to
        ``scipy.signal.find_peaks``.
    dist : number
        Minimum spacing between peaks in seconds (``dist * sampling_rate``
        samples are passed as ``distance``).

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object; ``peaks`` holds the signal value at each
        detected peak and NaN everywhere else.
    """
    signal = data['accel_mag'].to_numpy(dtype=float)
    peak_indices, _ = find_peaks(signal, height=ht, prominence=pm, distance=dist * sampling_rate)

    # find_peaks returns *positions*, not index labels. Writing the column by
    # position keeps this correct for any index (time index, sliced frame, ...),
    # whereas label lookup would raise KeyError or pick the wrong rows.
    peak_column = np.full(len(signal), np.nan)
    peak_column[peak_indices] = signal[peak_indices]
    data['peaks'] = peak_column

    return data

In [5]:
def add_features(window):
    """Summarise ``window['filtered_accel_mag']`` as a dict of scalar statistics.

    Returns a dict with the keys ``avg``, ``max``, ``med``, ``min``, ``q25``,
    ``q75`` and ``std`` (in that order), each computed with the corresponding
    pandas Series reduction.
    """
    signal = window['filtered_accel_mag']
    return {
        'avg': signal.mean(),
        'max': signal.max(),
        'med': signal.median(),
        'min': signal.min(),
        'q25': signal.quantile(0.25),
        'q75': signal.quantile(0.75),
        'std': signal.std(),
    }

def find_peaks_in_data(data, sensor_type, sampling_rate, ht=0.2, pm=0.1, dist=0.1):
    """Locate peaks in the filtered magnitude signal of one sensor.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``filtered_accel_mag`` (for ``'accelerometer'``) or
        ``filtered_gyro_mag`` (for ``'gyroscope'``).
    sensor_type : str
        ``'accelerometer'`` or ``'gyroscope'``.
    sampling_rate : number
        Samples per second; converts ``dist`` into a sample count.
    ht, pm : number
        ``height`` and ``prominence`` thresholds for ``scipy.signal.find_peaks``.
    dist : number
        Minimum spacing between peaks, in seconds.

    Returns
    -------
    (numpy.ndarray, pandas.Series)
        Positional indices of the peaks and the signal values at those positions.

    Raises
    ------
    ValueError
        If ``sensor_type`` is not one of the supported values.
    KeyError
        If the column for the requested sensor is missing from ``data``.
    """
    signal_columns = {
        'accelerometer': 'filtered_accel_mag',
        'gyroscope': 'filtered_gyro_mag',
    }
    if sensor_type not in signal_columns:
        # Previously this fell through and failed later with an UnboundLocalError.
        raise ValueError(
            f"Unknown sensor_type {sensor_type!r}; expected one of {sorted(signal_columns)}"
        )
    signal = data[signal_columns[sensor_type]]

    # find_peaks rejects distance < 1 sample, which happens for low sampling
    # rates with the default dist (e.g. 0.1 s at 5 Hz); clamp to one sample.
    min_distance = max(1, dist * sampling_rate)
    peak_indices, _ = find_peaks(signal, height=ht, prominence=pm, distance=min_distance)
    peaks = signal.iloc[peak_indices]

    return peak_indices, peaks

def add_peak_features(data, sensor_type, sampling_rate):
    """Detect peaks and summarise them as a flat feature dict.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the filtered magnitude column for ``sensor_type``.
    sensor_type : str
        Forwarded to ``find_peaks_in_data``.
    sampling_rate : number
        Samples per second.

    Returns
    -------
    dict
        ``peak_count`` (int), ``avg_peak_height``, ``max_peak_height``,
        ``avg_time_between_peaks`` and ``std_time_between_peaks`` (seconds).
        Values that cannot be computed (no peaks, fewer than two peaks for the
        mean interval, fewer than three for its standard deviation) are NaN.
    """
    peak_indices, peaks = find_peaks_in_data(data, sensor_type, sampling_rate)

    peak_features = {
        'peak_count': len(peaks),
        'avg_peak_height': np.nan,
        'max_peak_height': np.nan,
        'avg_time_between_peaks': np.nan,
        'std_time_between_peaks': np.nan,
    }
    if len(peaks) == 0:
        return peak_features

    peak_features['avg_peak_height'] = peaks.mean()
    peak_features['max_peak_height'] = peaks.max()

    # Intervals between consecutive peaks, in seconds. Computed with numpy so
    # it does not depend on Index.diff() (pandas >= 2.1 only) and so it also
    # works when the frame has no datetime index: in that case the spacing is
    # derived from sample positions and the sampling rate.
    if isinstance(data.index, pd.DatetimeIndex):
        peak_times = data.index[peak_indices]
        peak_intervals = np.diff(peak_times.values) / np.timedelta64(1, 's')
    else:
        peak_intervals = np.diff(peak_indices) / sampling_rate

    if len(peak_intervals) > 0:
        peak_features['avg_time_between_peaks'] = float(np.mean(peak_intervals))
    if len(peak_intervals) > 1:
        # ddof=1 matches the sample standard deviation pandas reports by default.
        peak_features['std_time_between_peaks'] = float(np.std(peak_intervals, ddof=1))

    return peak_features

In [6]:
def extract_features(data, window_sec, sample_rate):
    """Cut ``data`` into fixed-length time windows and compute features per window.

    Parameters
    ----------
    data : pandas.DataFrame
        Sensor samples with a ``filtered_accel_mag`` column and either a
        ``time`` column (parsed with ``pd.to_datetime``) or an index that
        ``resample`` accepts. The caller's frame is not modified.
    window_sec : number
        Window length in seconds.
    sample_rate : number
        Samples per second; windows with fewer than
        ``int(window_sec * sample_rate)`` rows are skipped.

    Returns
    -------
    pandas.DataFrame
        One row per complete window with the statistics from ``add_features``,
        the accelerometer peak features from ``add_peak_features`` and the
        window start in ``time``. When ``data`` also has a
        ``filtered_gyro_mag`` column, gyroscope peak features are added under
        ``gyro_``-prefixed names.
    """
    if 'time' in data.columns:
        # assign() builds a new frame, so the caller's 'time' column is left as-is.
        data = data.assign(time=pd.to_datetime(data['time'])).set_index('time')

    sample_count = int(window_sec * sample_rate)
    # Gyroscope features are only possible if the filtered gyro column exists.
    has_gyro = 'filtered_gyro_mag' in data.columns

    frame_list = []
    # A Timedelta rule avoids the deprecated "S" offset alias and also allows
    # fractional window lengths.
    for t, window in data.resample(pd.Timedelta(seconds=window_sec)):
        if len(window) < sample_count:
            continue  # incomplete window

        frame = add_features(window)
        frame.update(add_peak_features(window, 'accelerometer', sample_rate))

        if has_gyro:
            gyro_peak_features = add_peak_features(window, 'gyroscope', sample_rate)
            # Both sensors produce identical keys; prefix the gyro ones so they
            # no longer overwrite the accelerometer values.
            frame.update({f'gyro_{key}': value for key, value in gyro_peak_features.items()})

        frame['time'] = t
        frame_list.append(frame)

    return pd.DataFrame(frame_list)



In [7]:
def all_data_to_combined_csv(root, output_filename='pilates_data2.csv', sampling_rate=100, window_sec=2):
    """Build one feature table from every sensor CSV under ``root`` and save it.

    Every ``*.csv`` found recursively under ``root`` whose path contains
    ``accelerometer`` or ``gyroscope`` (case-insensitive) is read, passed
    through ``calc_magnitude`` / ``remove_noise`` / ``extract_features`` and
    labelled with metadata taken from its path:
    ``.../<exercise>/<pace>-.../<file>.csv``.

    Parameters
    ----------
    root : str
        Directory to search; the combined CSV is also written here.
    output_filename : str
        Name of the combined CSV.
    sampling_rate : number
        Sampling rate of the recordings in Hz.
    window_sec : number
        Window length in seconds used for feature extraction.

    The output has one row per window with plain numeric feature columns plus
    ``exercise``, ``pace`` and ``sensor_type``.
    """
    output_path = os.path.join(root, output_filename)
    # sorted() makes the row order independent of filesystem listing order.
    files = sorted(glob.glob(os.path.join(root, '**', '*.csv'), recursive=True))

    per_file_frames = []
    for file in files:
        # The combined CSV lives under root too; never feed it back in.
        if os.path.abspath(file) == os.path.abspath(output_path):
            continue

        # Determine sensor type
        lowered = file.lower()
        if 'accelerometer' in lowered:
            sensor_type = 'accelerometer'
        elif 'gyroscope' in lowered:
            sensor_type = 'gyroscope'
        else:
            print(f"Skipping unknown file type: {file}")
            continue

        # Extract metadata from file path
        parts = os.path.normpath(file).split(os.sep)
        if len(parts) < 3:
            print(f"Skipping file outside the exercise/pace layout: {file}")
            continue
        exercise = parts[-3]
        pace = parts[-2].split('-')[0]

        df = pd.read_csv(file)

        # calc_magnitude / remove_noise write 'accel_mag' / 'filtered_accel_mag'
        # for whatever x, y, z they are given; the sensor_type column below
        # records which sensor each row came from.
        df = calc_magnitude(df)
        df = remove_noise(df, sampling_rate)

        # One row per window; peak features are already included per window.
        features = extract_features(df, window_sec=window_sec, sample_rate=sampling_rate)

        # Keep `features` a DataFrame: unpacking it into a dict turned every
        # column into a Series object that was stringified in the CSV.
        features['exercise'] = exercise
        features['pace'] = pace
        features['sensor_type'] = sensor_type

        per_file_frames.append(features)

    # Concatenate once instead of growing a frame inside the loop.
    if per_file_frames:
        all_data = pd.concat(per_file_frames, ignore_index=True)
    else:
        all_data = pd.DataFrame()

    # Save the combined data to a CSV
    all_data.to_csv(output_path, index=False)
    print(f"Combined data saved to {output_path}")


In [8]:
def train_decision_tree(frames, test_size=0.3, random_state=42):
    """Train and evaluate a decision tree that predicts ``pace`` from window features.

    Parameters
    ----------
    frames : pandas.DataFrame
        Must contain the statistic columns (``avg`` ... ``std``), the peak
        feature columns and the target column ``pace``.
    test_size : float
        Fraction of rows held out for evaluation.
    random_state : int
        Seed used for both the train/test split and the classifier, so repeated
        runs give the same tree and the same score.

    Returns
    -------
    (DecisionTreeClassifier, float)
        The fitted model and its accuracy on the held-out rows. A
        classification report and the accuracy are also printed.
    """
    feature_columns = [
        'avg', 'max', 'med', 'min', 'q25', 'q75', 'std',
        'peak_count', 'avg_peak_height', 'max_peak_height',
        'avg_time_between_peaks', 'std_time_between_peaks',
    ]
    X = frames[feature_columns]
    y = frames['pace']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Without random_state the tree can break ties between equally good splits
    # differently on every run, even though the split itself is seeded.
    dt_model = DecisionTreeClassifier(criterion='entropy', max_depth=5, random_state=random_state)
    dt_model.fit(X_train, y_train)

    # Predict once and reuse the predictions for both the report and the accuracy.
    dt_pred = dt_model.predict(X_test)
    acc = accuracy_score(y_test, dt_pred)

    print(classification_report(y_test, dt_pred))
    print("Accuracy on test set:", acc)

    return dt_model, acc


In [None]:
data = pd.read_csv('./data/pilates_data2.csv')

# Labels other than 'correct' / 'incorrect' become NaN here.
data['pace'] = data['pace'].map({'correct': 0, 'incorrect': 1})
feature_columns = ['avg', 'max', 'med', 'min', 'q25', 'q75', 'std']


def _feature_column_to_float(column):
    """Return ``column`` as floats, undoing a stringified one-row Series if needed.

    Cells that look like ``"0    1.23\\nName: avg, dtype: float64"`` have the
    leading row label and the trailing ``Name: ...`` line stripped. Columns
    that are already numeric are passed through, so the cell works on a clean
    CSV as well (where ``.str`` would otherwise raise AttributeError).
    """
    if pd.api.types.is_numeric_dtype(column):
        return column.astype(float)
    return (
        column.astype(str)
        .str.replace(r'^\d+\s+', '', regex=True)
        .str.replace(r'\nName:.*', '', regex=True)
        .astype(float)
    )


data[feature_columns] = data[feature_columns].apply(_feature_column_to_float)

print(data[feature_columns].head())