In [1]:
import os 
import pandas as pd
import re
from tqdm import tqdm
import numpy as np
from sklearn.metrics import f1_score, balanced_accuracy_score,precision_score, recall_score
import matplotlib.pyplot as plt

In [2]:
results_path = '/mnt/Jake/results_mobilise_ionescu_full_redo/'

In [3]:
subject_ids = [p for p in os.listdir(results_path) if p[0] != '.']

In [4]:
data_dir = "/domino/datasets/local/dataset/idea_fast/for_s3/"

In [5]:
def plot_lists(truth, predictions):
    indices = list(range(len(truth)))
    plt.figure(figsize=(10, 5))
    plt.plot(indices, truth, label='Truth', color='blue', marker='o', linestyle='-', markersize=6)
    plt.plot(indices, predictions, label='Predictions', color='red', marker='x', linestyle='--', markersize=6)
    plt.title('Truth vs Predictions')
    plt.xlabel('Index')
    plt.ylabel('0 or 1')
    plt.grid(True)

    plt.show()

In [8]:
len_guess = 119790
examples_seen = 1
f1s = {}
import os
import re
import pandas as pd

import os
import re
import pandas as pd

def process_single_subject(subjects_dir, subject_folder):
    """
    Process CSV files for a single subject folder and return the concatenated data.

    Parameters:
    - subjects_dir (str): Path to the directory containing subject folders.
    - subject_folder (str): The name of the specific subject folder to process.

    Returns:
    - pd.DataFrame: Concatenated DataFrame of all relevant CSV data for the subject, 
      or None if there are no relevant files or an error occurs.
    """
    subject_path = os.path.join(subjects_dir, subject_folder)
    
    # Ensure it's a directory
    if not os.path.isdir(subject_path):
        print(f"{subject_folder} is not a valid directory.")
        return None
    
    print(f"\nProcessing subject folder: {subject_folder}")
    
    # Find relevant CSV files in the folder based on the pattern
    csv_files = [
        f for f in os.listdir(subject_path)
        if re.match(rf"{subject_folder}-\w{{9}}-\d{{8}}-\d{{8}}_omx_mapped_mapped\.csv", f)
    ]
    
    # Check if any CSV files were found
    if not csv_files:
        print(f"No relevant CSV files found in {subject_folder}.")
        return None
    
    print(f"Found {len(csv_files)} relevant CSV file(s): {csv_files}")
    
    # Initialize an empty list to store data from each file
    subject_data = []

    
    # Process each relevant CSV file
    for csv_file in csv_files:
        file_path = os.path.join(subject_path, csv_file)
        print(f"Processing file: {csv_file}")
        df = pd.read_csv(file_path)
        # Preprocess the file and append the data
        
        subject_data.append(df)
    
    # Concatenate subject_data into one DataFrame if there is data
    if subject_data:
        all_data = pd.concat(subject_data, ignore_index=True)
        print(f"Concatenated data for {subject_folder} with {len(all_data)} rows.")
        return all_data
    else:
        print(f"No data to concatenate for {subject_folder}.")
        return None

                
num_subjects = 0
for subject in subject_ids:
    print(f"Loading {subject} labels...")
    label_list = []
    try:
        label_list = process_single_subject(data_dir, subject)['mapped_value'].values
        num_subjects +=1
    except:
        continue
    label_list = process_single_subject(data_dir, subject)['mapped_value'].values
    """with pd.read_csv(data_dir + subject + '/mapped_df.csv', chunksize=1000) as label_reader:
        chunk_counter = 0
        for chunk in tqdm(label_reader, total=len_guess):
            chunk_counter += 1
            label_list.extend(pd.DataFrame(chunk)['mapped_value'].values)"""
    #examples_seen += 1
    #len_guess = (len_guess + chunk_counter) / examples_seen

    label_series = pd.Series(label_list)
    #print("Outputting label distribution...")
    #print(label_series.value_counts())
    activity_mapping = {
        'not_worn':0,
        'lying':0,
        'standing':0,        
        'sitting':0,
        'walking':1,
        'running':1,
        'shuffling':1,
        'stair_walking':1
    }
    
    truth_list = label_series.map(activity_mapping).fillna(0).astype(int).tolist()
    
    pred_idx_df = pd.read_csv(results_path + subject + '/gs_list.csv')
    pred_list = [0] * (len(label_list))
    for ix, row in pred_idx_df.iterrows():
        pred_list[row['start']:row['end']+1] = [1] * (row['end']-row['start']+1)
    diff = 0
    truth_array = np.array(truth_list)
    pred_array = np.array(pred_list)
    matching_indices = np.where(truth_array == pred_array)[0]
    print("accuracy: ", len(matching_indices)/len(pred_array), "num matches: " ,len(matching_indices))
    print("matches at: ", matching_indices)
    if len(truth_list) == len(pred_list):
        f1 = f1_score(truth_list, pred_list)
        precision = precision_score(truth_list, pred_list)
        recall = recall_score(truth_list, pred_list)
        accuracy = balanced_accuracy_score(truth_list, pred_list)
        
        #plot_lists(truth_list, pred_list)
    else:
        diff = len(truth_list) - len(pred_list)
        print(f"Misalignment of {diff} between truth and prediction detected!")
        if diff < 0:
            diff = abs(diff)
            f1_trim_front = f1_score(truth_list, pred_list[diff:])
            f1_trim_back = f1_score(truth_list, pred_list[:-diff])
            
            accuracy_trim_front = balanced_accuracy_score(truth_list, pred_list[diff:])
            accuracy_trim_back = balanced_accuracy_score(truth_list, pred_list[:-diff])
            
            #plot_lists(truth_list, pred_list[diff:])
        else:
            f1_trim_front = f1_score(truth_list[diff:], pred_list)            
            f1_trim_back = f1_score(truth_list[:-diff], pred_list)
            
            accuracy_trim_front = balanced_accuracy_score(truth_list[diff:], pred_list)
            accuracy_trim_back = balanced_accuracy_score(truth_list[:-diff], pred_list)
            
            #plot_lists(truth_list[diff:], pred_list)
        
        f1 = max(f1_trim_front, f1_trim_back)
        accuracy = max(accuracy_trim_front, accuracy_trim_back)
    

    f1s[subject] = f1
    #print(f"The truth looks like: {pd.Series(truth_list).value_counts()}")
    #print(f"The predictions looks like: {pd.Series(pred_list).value_counts()}")
    print(f"Balanced Accuracy for {subject}: {f1:.2f}")
    print(f"F1 for {subject}: {f1:.2f}")
    print(f"Precision for {subject}: {precision:.2f}")
    print(f"Recall for {subject}: {recall:.2f}")
print(num_subjects)

Loading NNF2HPY labels...

Processing subject folder: NNF2HPY
Found 2 relevant CSV file(s): ['NNF2HPY-MMMZA3GVX-20210118-20210124_omx_mapped_mapped.csv', 'NNF2HPY-MMMXHG3EE-20210125-20210131_omx_mapped_mapped.csv']
Processing file: NNF2HPY-MMMZA3GVX-20210118-20210124_omx_mapped_mapped.csv
Processing file: NNF2HPY-MMMXHG3EE-20210125-20210131_omx_mapped_mapped.csv
Concatenated data for NNF2HPY with 103680000 rows.

Processing subject folder: NNF2HPY
Found 2 relevant CSV file(s): ['NNF2HPY-MMMZA3GVX-20210118-20210124_omx_mapped_mapped.csv', 'NNF2HPY-MMMXHG3EE-20210125-20210131_omx_mapped_mapped.csv']
Processing file: NNF2HPY-MMMZA3GVX-20210118-20210124_omx_mapped_mapped.csv
Processing file: NNF2HPY-MMMXHG3EE-20210125-20210131_omx_mapped_mapped.csv
Concatenated data for NNF2HPY with 103680000 rows.
accuracy:  0.9702412422839506 num matches:  100594612
matches at:  [        0         1         2 ... 103679997 103679998 103679999]
Balanced Accuracy for NNF2HPY: 0.77
F1 for NNF2HPY: 0.77
Prec

  matching_indices = np.where(truth_array == pred_array)[0]


accuracy:  0.0 num matches:  0
matches at:  []
Misalignment of -1178457 between truth and prediction detected!
Balanced Accuracy for NDRPH9Q: 0.82
F1 for NDRPH9Q: 0.82
Precision for NDRPH9Q: 0.68
Recall for NDRPH9Q: 0.81
Loading NEP7AP2 labels...

Processing subject folder: NEP7AP2
No relevant CSV files found in NEP7AP2.
Loading NXQEXRT labels...

Processing subject folder: NXQEXRT
Found 1 relevant CSV file(s): ['NXQEXRT-MMMXHXEE6-20210127-20210204_omx_mapped_mapped.csv']
Processing file: NXQEXRT-MMMXHXEE6-20210127-20210204_omx_mapped_mapped.csv
Concatenated data for NXQEXRT with 69120000 rows.

Processing subject folder: NXQEXRT
Found 1 relevant CSV file(s): ['NXQEXRT-MMMXHXEE6-20210127-20210204_omx_mapped_mapped.csv']
Processing file: NXQEXRT-MMMXHXEE6-20210127-20210204_omx_mapped_mapped.csv
Concatenated data for NXQEXRT with 69120000 rows.


  matching_indices = np.where(truth_array == pred_array)[0]


accuracy:  0.0 num matches:  0
matches at:  []
Misalignment of -3963190 between truth and prediction detected!
Balanced Accuracy for NXQEXRT: 0.28
F1 for NXQEXRT: 0.28
Precision for NXQEXRT: 0.68
Recall for NXQEXRT: 0.81
Loading KXRRZ65 labels...

Processing subject folder: KXRRZ65
No relevant CSV files found in KXRRZ65.
Loading NQA4Q9E labels...

Processing subject folder: NQA4Q9E
Found 2 relevant CSV file(s): ['NQA4Q9E-MMMSDPQCX-20210224-20210302_omx_mapped_mapped.csv', 'NQA4Q9E-MMM7H2TA5-20210303-20210309_omx_mapped_mapped.csv']
Processing file: NQA4Q9E-MMMSDPQCX-20210224-20210302_omx_mapped_mapped.csv
Processing file: NQA4Q9E-MMM7H2TA5-20210303-20210309_omx_mapped_mapped.csv
Concatenated data for NQA4Q9E with 103678000 rows.

Processing subject folder: NQA4Q9E
Found 2 relevant CSV file(s): ['NQA4Q9E-MMMSDPQCX-20210224-20210302_omx_mapped_mapped.csv', 'NQA4Q9E-MMM7H2TA5-20210303-20210309_omx_mapped_mapped.csv']
Processing file: NQA4Q9E-MMMSDPQCX-20210224-20210302_omx_mapped_mapped.c

  df = pd.read_csv(file_path)


Concatenated data for N5DP9SQ with 103680000 rows.

Processing subject folder: N5DP9SQ
Found 2 relevant CSV file(s): ['N5DP9SQ-MMMGGSD5V-20210308-20210314_omx_mapped_mapped.csv', 'N5DP9SQ-MMM266KGE-20210301-20210307_omx_mapped_mapped.csv']
Processing file: N5DP9SQ-MMMGGSD5V-20210308-20210314_omx_mapped_mapped.csv
Processing file: N5DP9SQ-MMM266KGE-20210301-20210307_omx_mapped_mapped.csv


  df = pd.read_csv(file_path)


Concatenated data for N5DP9SQ with 103680000 rows.
accuracy:  0.94385078125 num matches:  97858449
matches at:  [        0         1         2 ... 103679997 103679998 103679999]
Balanced Accuracy for N5DP9SQ: 0.07
F1 for N5DP9SQ: 0.07
Precision for N5DP9SQ: 0.06
Recall for N5DP9SQ: 0.08
Loading NTDGTR9 labels...

Processing subject folder: NTDGTR9
Found 2 relevant CSV file(s): ['NTDGTR9-MMMSTF39P-20210301-20210307_omx_mapped_mapped.csv', 'NTDGTR9-MMM4APZZR-20210222-20210228_omx_mapped_mapped.csv']
Processing file: NTDGTR9-MMMSTF39P-20210301-20210307_omx_mapped_mapped.csv
Processing file: NTDGTR9-MMM4APZZR-20210222-20210228_omx_mapped_mapped.csv
Concatenated data for NTDGTR9 with 103680000 rows.

Processing subject folder: NTDGTR9
Found 2 relevant CSV file(s): ['NTDGTR9-MMMSTF39P-20210301-20210307_omx_mapped_mapped.csv', 'NTDGTR9-MMM4APZZR-20210222-20210228_omx_mapped_mapped.csv']
Processing file: NTDGTR9-MMMSTF39P-20210301-20210307_omx_mapped_mapped.csv
Processing file: NTDGTR9-MMM4APZ