In [16]:
import os
import pandas as pd
import csv
import numpy as np
import time

from datetime import datetime
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

predicted_data_path = "/home/aayush/accelerometer/accprocess/results"
predicted_data_path_custom_model = "/home/aayush/accelerometer/accProcess-custom/results/time-series"
annotated_data_path = "/home/yacine/accel/capture24/participants/"

In [17]:
def get_files(data_path, predicted_files=False, annotated_files=False) -> None:
    total_csv_zipped = []
    for path, dirnames, filenames in os.walk(data_path):
        for file in filenames:
            if file.endswith(".csv.gz") and predicted_files:
                # paths contains the base directory for that file.
                # dirnames contains other directories within this folder.
                # filenames contains the list of filenames within path.
                total_csv_zipped.append(os.path.join(path, file))
                
            if file.endswith(".csv") and annotated_files and file[0]!='c': #ignore the capture24 file
                # paths contains the base directory for that file.
                # dirnames contains other directories within this folder.
                # filenames contains the list of filenames within path.
                total_csv_zipped.append(os.path.join(path, file))
                
    return sorted(total_csv_zipped)

In [18]:
def parse_datetime(dt_string):
    clean_datetime_str = dt_string.split('[')[0].strip()
    dt_object = pd.to_datetime(clean_datetime_str)
    return dt_object


def parse_datetime_df_time(dt_string):
    clean_datetime_str = dt_string.split('[')[0].strip()
    clean_datetime_str = clean_datetime_str.split('+')[0].strip()
    dt_object = pd.to_datetime(clean_datetime_str)
    return dt_object

In [19]:
predicted_data_files = get_files(data_path=predicted_data_path_custom_model, predicted_files=True)
annotated_data_files = get_files(data_path=annotated_data_path, annotated_files=True)
assert len(predicted_data_files) == len(annotated_data_files), "Number of predicted and annotated data files should be the same."

In [20]:
labels_dict_location = "/home/aayush/accelerometer/accprocess/anno-label.csv"

def create_labels_dict():
    labels_dict = {}
    with open(labels_dict_location, "r") as annotation_dict:
        reader = csv.DictReader(annotation_dict)
        for row in reader:
            if labels_dict.get(row['annotation']) is None:
                labels_dict[row['annotation']] = [row['label:Walmsley2020']]
            else:
                labels_dict[row['annotation']].append(row['label:Walmsley2020'])
    return labels_dict

labels_dict = create_labels_dict()

In [21]:
def process_annotated_data(annotated_data_file):
    annotated_data = pd.read_csv(annotated_data_file)
    # Take the timestamp after every thirty seconds
    actual_labels = annotated_data[["annotation", "time"]][0::3000]
    
    # Convert to datetime object
    actual_time = actual_labels["time"].apply(parse_datetime)
    actual_labels["time"] = actual_time

    # replace the annotated labels with the same format of strings as predicted labels using mapping from labels_dict
    flat_dict = {k: v[0] for k, v in labels_dict.items()}
    
    actual_labels['annotation'].replace(flat_dict, inplace=True)    
    
    return actual_labels

In [22]:
all_epoch_dfs_predicted = []
all_epoch_dfs_annotated = []
for predicted_data_file, annotated_data_file in zip(predicted_data_files, annotated_data_files):
    annotated_df = process_annotated_data(annotated_data_file)
    predicted_df = pd.read_csv(predicted_data_file)
    
    # Convert to datetime object
    df_cleaned_time = predicted_df['time'].apply(parse_datetime_df_time)
    predicted_df['time'] = df_cleaned_time
    
    # retrieve only the activities headers
    activities = predicted_df[predicted_df.columns.values.tolist()[2:6]]
    
    # whichever activity is depicted by 1, use it as the predicted activity
    out = activities[activities==1].idxmax(axis=1)
    predicted_df["activity_predicted"] = out
    
    annotated_df["participant"] = float(annotated_data_file.split("/")[-1].split(".")[0][1:])
    predicted_df_subset = predicted_df[['time', 'MET', 'activity_predicted', 'CpSB', 'CpLPA', 'CpMPA', 'CpVPA', 'CpMVPA']]
    predicted_df_subset["participant"] = float(predicted_data_file.split("/")[-1].split("-")[0][1:])
    
    # annotated_df = pd.merge_asof(annotated_df, predicted_df_subset, on="time")
    # annotated_df["MET"] = predicted_df["MET"]
    # annotated_df["activity_predicted"] = predicted_df["activity_predicted"]
    print(f"Done for: {annotated_data_file}, {predicted_data_file}.")
    all_epoch_dfs_annotated.append(annotated_df)
    all_epoch_dfs_predicted.append(predicted_df_subset)

  annotated_df = process_annotated_data(annotated_data_file)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  predicted_df_subset["participant"] = float(predicted_data_file.split("/")[-1].split("-")[0][1:])


Done for: /home/yacine/accel/capture24/participants/P001.csv, /home/aayush/accelerometer/accProcess-custom/results/time-series/P001-timeSeries.csv.gz.
Done for: /home/yacine/accel/capture24/participants/P002.csv, /home/aayush/accelerometer/accProcess-custom/results/time-series/P002-timeSeries.csv.gz.
Done for: /home/yacine/accel/capture24/participants/P003.csv, /home/aayush/accelerometer/accProcess-custom/results/time-series/P003-timeSeries.csv.gz.
Done for: /home/yacine/accel/capture24/participants/P004.csv, /home/aayush/accelerometer/accProcess-custom/results/time-series/P004-timeSeries.csv.gz.
Done for: /home/yacine/accel/capture24/participants/P005.csv, /home/aayush/accelerometer/accProcess-custom/results/time-series/P005-timeSeries.csv.gz.
Done for: /home/yacine/accel/capture24/participants/P006.csv, /home/aayush/accelerometer/accProcess-custom/results/time-series/P006-timeSeries.csv.gz.
Done for: /home/yacine/accel/capture24/participants/P007.csv, /home/aayush/accelerometer/accPr

In [23]:
epochs_predicted = pd.concat(all_epoch_dfs_predicted)
epochs_annotated = pd.concat(all_epoch_dfs_annotated)

In [24]:
epochs_predicted.to_csv("/home/aayush/accelerometer/predicted_custom_all.csv", index=False)
epochs_annotated.to_csv("/home/aayush/accelerometer/annotated_custom_all.csv", index=False)