In [None]:
import pandas as pd
import numpy as np
import os
import time
import datetime
from sklearn.metrics import f1_score, roc_auc_score, precision_score

In [None]:
# Paths to data
# wrist_dir: root folder with one sub-folder per participant containing 'combined_ax6_df.csv'
# mapped_dir: root folder with one sub-folder per participant containing 'wrist_lower_back_df.csv'
wrist_dir = "dir"
mapped_dir = "Outputs/Lower Back Predictions Mapped To Wrist"

# List of participants: one sub-directory of mapped_dir per participant.
# Each entry is later joined with a CSV file name, so stray files in mapped_dir
# (and the notebook checkpoint folder) are skipped. The list is sorted so the
# processing order, and therefore the row order of the outputs, is reproducible.
excluded_entries = {'.ipynb_checkpoints'}
subjects = sorted(
    entry
    for entry in os.listdir(mapped_dir)
    if entry not in excluded_entries and os.path.isdir(os.path.join(mapped_dir, entry))
)

# Columns to load
wrist_columns_to_load = ['accel_x', 'accel_y', 'accel_z']
mapped_columns_to_load = ['index', 'accel_x', 'accel_y', 'accel_z', 'lower_back_mapped_value']

# Maximum rows to process per participant
max_rows_per_participant = 70_000_000

# Subject-specific evaluation metrics, one dict per participant
subject_eval_scores = []

# Running totals of true positives, false positives, and false negatives
# across all participants, kept separately for ENMO and MAD
tp_enmo = 0
fp_enmo = 0
fn_enmo = 0

tp_mad = 0
fp_mad = 0
fn_mad = 0

# Timestamp shared by every output file written during this run
timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# Accelerometer axis columns shared by the mapped and the raw wrist data
accel_columns = ['accel_x', 'accel_y', 'accel_z']

# Number of wrist rows read from disk per chunk
chunk_size = 1_000_000

# Rolling window length (in rows) and the activity thresholds applied to the
# windowed metrics - specify thresholds here
rolling_window = 500
enmo_threshold = 0.07
mad_threshold = 0.05


def _confusion_counts(truth, predicted):
    """Count true positives, false positives and false negatives.

    Parameters
    ----------
    truth : pandas.Series
        Reference labels; 1 is the positive class and 0 the negative class.
    predicted : pandas.Series
        Predicted labels (0 or 1) aligned with ``truth``.

    Returns
    -------
    tuple of int
        ``(true_positives, false_positives, false_negatives)``.
    """
    # Every comparison is parenthesised on purpose: `&` binds tighter than
    # `==`, so `a & b == 0` would be evaluated as `(a & b) == 0`.
    true_positives = int(((truth == 1) & (predicted == 1)).sum())
    false_positives = int(((truth == 0) & (predicted == 1)).sum())
    false_negatives = int(((truth == 1) & (predicted == 0)).sum())
    return true_positives, false_positives, false_negatives


# Loop through each participant
for subject in subjects:
    print(f"Processing subject: {subject}")

    # Start timer for the subject
    subject_start_time = time.time()

    # Load mapped signal data and rename 'lower_back_mapped_value' to 'label'
    mapped_signal_path = os.path.join(mapped_dir, subject, 'wrist_lower_back_df.csv')
    mapped_pd = pd.read_csv(mapped_signal_path, usecols=mapped_columns_to_load)
    mapped_pd = mapped_pd.rename(columns={'lower_back_mapped_value': 'label'})

    # Convert accelerometer values to float32
    mapped_pd = mapped_pd.astype({column: 'float32' for column in accel_columns})

    # Store mapped indices in a set for fast lookups
    mapped_indices_set = set(mapped_pd['index'])

    # Stream the entire wrist signal in chunks
    whole_signal_path = os.path.join(wrist_dir, subject, 'combined_ax6_df.csv')
    rows_read = 0  # Number of non-mapped rows kept so far
    processed_chunks = []

    wrist_reader = pd.read_csv(whole_signal_path, usecols=wrist_columns_to_load, chunksize=chunk_size)
    try:
        for chunk in wrist_reader:
            # Stop once the per-participant row budget is used up
            if rows_read >= max_rows_per_participant:
                print(f"Reached max rows ({max_rows_per_participant}) for participant {subject}.")
                break

            # Expose the running row number of the file as an `index` column
            chunk = chunk.reset_index()

            # Drop rows where accelerometer values are NaN
            chunk = chunk.dropna(subset=accel_columns)

            # Filter out rows in chunk that have an index present in mapped_pd
            chunk = chunk[~chunk['index'].isin(mapped_indices_set)]

            # Convert to float32 with astype (assigning through .loc can keep
            # the original float64 dtype) and label non-mapped data as 0
            chunk = chunk.astype({column: 'float32' for column in accel_columns}).assign(label=0)

            # Reorder columns to match mapped_pd
            chunk = chunk[['index', 'accel_x', 'accel_y', 'accel_z', 'label']]

            processed_chunks.append(chunk)
            rows_read += len(chunk)
    finally:
        # Release the file handle even when the loop exits early via `break`
        wrist_reader.close()

    # Combine all chunks and mapped data into a single DataFrame
    df_loop = pd.concat([mapped_pd] + processed_chunks, ignore_index=True)

    # Restore the original sample order. The concat above places every mapped
    # row before the non-mapped rows, and the rolling windows below are only
    # meaningful over rows that are adjacent in the recording.
    df_loop = df_loop.sort_values('index', kind='mergesort', ignore_index=True)

    # Euclidean norm of the three axes, shared by ENMO and MAD
    accel_norm = np.sqrt(df_loop['accel_x']**2 + df_loop['accel_y']**2 + df_loop['accel_z']**2)

    # Calculate ENMO: norm minus 1, negative values clipped to 0, then windowed mean
    enmo = (accel_norm - 1).clip(lower=0)
    df_loop['ENMO_window'] = enmo.rolling(window=rolling_window).mean()

    # Calculate MAD: windowed mean of the absolute deviation of the norm from
    # its own windowed mean. The unclipped norm is used here; rebuilding it as
    # clipped ENMO + 1 would flatten every magnitude below 1 to exactly 1.
    norm_window_mean = accel_norm.rolling(window=rolling_window).mean()
    mad_abs = (accel_norm - norm_window_mean).abs()
    df_loop['MAD_window'] = mad_abs.rolling(window=rolling_window).mean()

    # Filter for rows with valid rolling calculations
    df_loop = df_loop.dropna(subset=['ENMO_window', 'MAD_window'])

    # Activity classification against the thresholds defined above
    df_loop = df_loop.assign(
        enmo_label=(df_loop['ENMO_window'] > enmo_threshold).astype(int),
        mad_label=(df_loop['MAD_window'] > mad_threshold).astype(int),
    )

    # Compute true positives, false positives, and false negatives
    subject_tp_enmo, subject_fp_enmo, subject_fn_enmo = _confusion_counts(df_loop['label'], df_loop['enmo_label'])
    subject_tp_mad, subject_fp_mad, subject_fn_mad = _confusion_counts(df_loop['label'], df_loop['mad_label'])

    # Calculate precision scores (0 when nothing was predicted positive)
    predicted_pos_enmo = subject_tp_enmo + subject_fp_enmo
    predicted_pos_mad = subject_tp_mad + subject_fp_mad
    precision_enmo = subject_tp_enmo / predicted_pos_enmo if predicted_pos_enmo > 0 else 0
    precision_mad = subject_tp_mad / predicted_pos_mad if predicted_pos_mad > 0 else 0

    # Calculate F1 scores
    try:
        f1_enmo = f1_score(df_loop['label'], df_loop['enmo_label'])
        f1_mad = f1_score(df_loop['label'], df_loop['mad_label'])
    except ValueError:
        f1_enmo, f1_mad = None, None

    # Calculate AUROC if there are enough positive and negative samples.
    # NOTE(review): the scores passed in are the thresholded 0/1 labels, not
    # the continuous ENMO_window / MAD_window values - confirm this is intended.
    try:
        auroc_enmo = roc_auc_score(df_loop['label'], df_loop['enmo_label'])
        auroc_mad = roc_auc_score(df_loop['label'], df_loop['mad_label'])
    except ValueError:
        auroc_enmo, auroc_mad = None, None

    # Store subject-level scores
    subject_eval_scores.append({
        'subject': subject,
        'f1_score_enmo': f1_enmo,
        'f1_score_mad': f1_mad,
        'precision_enmo': precision_enmo,
        'precision_mad': precision_mad,
        'auroc_enmo': auroc_enmo,
        'auroc_mad': auroc_mad
    })

    # Accumulate the counts used for the combined scores
    tp_enmo += subject_tp_enmo
    fp_enmo += subject_fp_enmo
    fn_enmo += subject_fn_enmo

    tp_mad += subject_tp_mad
    fp_mad += subject_fp_mad
    fn_mad += subject_fn_mad

    # End timer for the subject
    subject_total_time = time.time() - subject_start_time
    print(f"Finished processing subject {subject} in {subject_total_time:.2f} seconds")

def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0 when the denominator is not positive.

    Mirrors the per-subject precision convention, which reports 0 instead of
    raising ZeroDivisionError when there is nothing to divide by.
    """
    return numerator / denominator if denominator > 0 else 0


# Make sure the output folder exists before writing into it
output_dir = 'Outputs/ENMO & MAD/Timestamped Outputs'
os.makedirs(output_dir, exist_ok=True)

# Save subject-level eval scores to a timestamped CSV
subject_eval_scores_df = pd.DataFrame(subject_eval_scores)
output_filename = f'{output_dir}/subject_eval_scores_{timestamp}.csv'
subject_eval_scores_df.to_csv(output_filename, index=False)
display(subject_eval_scores_df)

# Compute eval scores across all subjects from the pooled counts
precision_combined_enmo = _safe_ratio(tp_enmo, tp_enmo + fp_enmo)
recall_combined_enmo = _safe_ratio(tp_enmo, tp_enmo + fn_enmo)
precision_combined_mad = _safe_ratio(tp_mad, tp_mad + fp_mad)
recall_combined_mad = _safe_ratio(tp_mad, tp_mad + fn_mad)

# Print combined Precision scores
print(f"Combined Precision Score ENMO: {precision_combined_enmo}")
print(f"Combined Precision Score MAD: {precision_combined_mad}\n")

# Derive overall F1 as the harmonic mean of precision and recall
# (0 when both precision and recall are 0)
f1_combined_enmo = _safe_ratio(
    2 * (precision_combined_enmo * recall_combined_enmo),
    precision_combined_enmo + recall_combined_enmo,
)
f1_combined_mad = _safe_ratio(
    2 * (precision_combined_mad * recall_combined_mad),
    precision_combined_mad + recall_combined_mad,
)

# Print combined F1 scores
print(f"Combined F1 Score ENMO: {f1_combined_enmo}")
print(f"Combined F1 Score MAD: {f1_combined_mad}\n")

overall_eval_scores_df = pd.DataFrame([{
    'f1_score_enmo': f1_combined_enmo,
    'f1_score_mad': f1_combined_mad,
    'precision_enmo': precision_combined_enmo,
    'precision_mad': precision_combined_mad
}])

# Save overall eval scores to a timestamped CSV
overall_output_filename = f'{output_dir}/overall_eval_scores_{timestamp}.csv'
overall_eval_scores_df.to_csv(overall_output_filename, index=False)
display(overall_eval_scores_df)