In [40]:
import os
import pandas as pd
import csv
import numpy as np
import math
from scipy import signal
import resampy


annotated_data_path = "/home/yacine/accel/capture24/participants/"
labels_dict_location = "/home/aayush/accelerometer/accprocess/anno-label.csv"

In [41]:
def create_labels_dict():
    labels_dict = {}
    with open(labels_dict_location, "r") as annotation_dict:
        reader = csv.DictReader(annotation_dict)
        for row in reader:
            if labels_dict.get(row['annotation']) is None:
                labels_dict[row['annotation']] = [row['label:Walmsley2020']]
            else:
                labels_dict[row['annotation']].append(row['label:Walmsley2020'])
    return labels_dict

labels_dict = create_labels_dict()

In [42]:
def get_files(data_path, predicted_files=False, annotated_files=False) -> None:
    total_csv_zipped = []
    for path, dirnames, filenames in os.walk(data_path):

        for file in filenames:
            if file.endswith(".csv.gz") and predicted_files:
                # paths contains the base directory for that file.
                # dirnames contains other directories within this folder.
                # filenames contains the list of filenames within path.
                total_csv_zipped.append(os.path.join(path, file))
                
            if file.endswith(".csv") and annotated_files and file[0]!='c': #ignore the capture24 file
                # paths contains the base directory for that file.
                # dirnames contains other directories within this folder.
                # filenames contains the list of filenames within path.
                total_csv_zipped.append(os.path.join(path, file))

    return sorted(total_csv_zipped)

In [43]:
def parse_datetime(dt_string):
    clean_datetime_str = dt_string.split('[')[0].strip()
    dt_object = pd.to_datetime(clean_datetime_str)
    return dt_object

In [44]:
##predefined filter coefficients, as found by Jan Brond
A_coeff = np.array(
    [1, -4.1637, 7.5712,-7.9805, 5.385, -2.4636, 0.89238, 0.06361, -1.3481, 2.4734, -2.9257, 2.9298, -2.7816, 2.4777,
     -1.6847, 0.46483, 0.46565, -0.67312, 0.4162, -0.13832, 0.019852])
B_coeff = np.array(
    [0.049109, -0.12284, 0.14356, -0.11269, 0.053804, -0.02023, 0.0063778, 0.018513, -0.038154, 0.048727, -0.052577,
     0.047847, -0.046015, 0.036283, -0.012977, -0.0046262, 0.012835, -0.0093762, 0.0034485, -0.00080972, -0.00019623])

def pptrunc(data, max_value):
    '''
    Saturate a vector such that no element's absolute value exceeds max_abs_value.
    Current name: absolute_saturate().
      :param data: a vector of any dimension containing numerical data
      :param max_value: a float value of the absolute value to not exceed
      :return: the saturated vector
    '''
    outd = np.where(data > max_value, max_value, data)
    return np.where(outd < -max_value, -max_value, outd)

def trunc(data, min_value):
  
    '''
    Truncate a vector such that any value lower than min_value is set to 0.
    Current name zero_truncate().
    :param data: a vector of any dimension containing numerical data
    :param min_value: a float value the elements of data should not fall below
    :return: the truncated vector
    '''

    return np.where(data < min_value, 0, data)

def runsum(data, length, threshold):
    '''
    Compute the running sum of values in a vector exceeding some threshold within a range of indices.
    Divides the data into len(data)/length chunks and sums the values in excess of the threshold for each chunk.
    Current name run_sum().
    :param data: a 1D numerical vector to calculate the sum of
    :param len: the length of each chunk to compute a sum along, as a positive integer
    :param threshold: a numerical value used to find values exceeding some threshold
    :return: a vector of length len(data)/length containing the excess value sum for each chunk of data
    '''
    
    N = len(data)
    cnt = int(math.ceil(N/length))

    rs = np.zeros(cnt)

    for n in range(cnt):
        for p in range(length*n, length*(n+1)):
            if p<N and data[p]>=threshold:
                rs[n] = rs[n] + data[p] - threshold

    return rs

def count_by_average(data, filesf):
    # Number of samples per 30 seconds
    samples_per_30s = 30 * filesf

    # Resample by averaging each 30-second window
    resampled_data = [np.mean(data[i:i + samples_per_30s]) for i in range(0, len(data), samples_per_30s)]
    
    return resampled_data    
    
def counts(data, filesf, B=B_coeff, A=A_coeff):
    '''
    Get activity counts for a set of accelerometer observations.
    First resamples the data frequency to 30Hz, then applies a Butterworth filter to the signal, then filters by the
    coefficient matrices, saturates and truncates the result, and applies a running sum to get the final counts.
    Current name get_actigraph_counts()
    :param data: the vertical axis of accelerometer readings, as a vector
    :param filesf: the number of observations per second in the file
    :param a: coefficient matrix for filtering the signal, as found by Jan Brond
    :param b: coefficient matrix for filtering the signal, as found by Jan Brond
    :return: a vector containing the final counts
    '''
    
    deadband = 0.068
    sf = 30
    peakThreshold = 2.13
    adcResolution = 0.0164
    integN = 10
    gain = 0.965

    if filesf>sf:
        data = resampy.resample(np.asarray(data), filesf, sf)

    B2, A2 = signal.butter(4, np.array([0.01, 7])/(sf/2), btype='bandpass')
    dataf = signal.filtfilt(B2, A2, data)

    B = B * gain

    #NB: no need for a loop here as we only have one axis in array
    fx8up = signal.lfilter(B, A, dataf)

    fx8 = pptrunc(fx8up[::3], peakThreshold) #downsampling is replaced by slicing with step parameter

    return runsum(np.floor(trunc(np.abs(fx8), deadband)/adcResolution), integN, 0)

def aggregate(src_files):

    csv_files = next(os.walk(src_files))[2]

    freq_per_sec = 1 # Run sum combined the values exceeding the threshold using window size of 10hz
    # making the output 1 record per 1 second.
    # combining using 30 seconds window
    lines_sum = freq_per_sec * 30
    # /home/aayush/accelerometer/cwa-csv/count_sec_python/30hz/30hz1049632_90001_0_0.cwa.csv

    dest_path = os.path.sep.join(src_files.split(os.path.sep)[:-1]+["aggregated"])
    os.makedirs(dest_path, exist_ok=True)

    for data_file in csv_files:
        with open(os.path.join(src_files, data_file), "r") as csv_data:
            with open(os.path.join(dest_path, data_file), "w", newline="") as csv_out:
                fieldnames = ["axis1","axis2","axis3"]
                writer = csv.DictWriter(csv_out, fieldnames=fieldnames)
                writer.writeheader()
                        
                reader = csv.DictReader(csv_data)
                sum_axes = [0, 0, 0]
                count = 0
                
                for row in reader:                
                    if count == lines_sum:
                        writer.writerow({"axis1": sum_axes[0], "axis2": sum_axes[1], "axis3": sum_axes[2]})
                        sum_axes = [0, 0, 0]
                        count = 0

                    sum_axes[0] += float(row["axis1"])
                    sum_axes[1] += float(row["axis2"])
                    sum_axes[2] += float(row["axis3"])
                    count += 1
                
                # write the last remaining values
                writer.writerow({"axis1": sum_axes[0], "axis2": sum_axes[1], "axis3": sum_axes[2]})
            

                

def main(file, folderInn, actual_path, filesf, per_sec_csv):
    
    '''
    Creates activity counts per second from raw acceleromter data (g-units) 
    This function:
      - reads in data into a pandas dataFrame
      - Calculates activity counts per axis
      - combines the axis in a pandas dataFrame
    :param file: file name of both input and output file
    :param folderInn: directory with input files, containing raw accelerometer data
    :param folderOut: directory with out files, containing activity counts.
    :param filesf: sampling frequency of raw accelerometer data
    :return: none (writes .csv file instead)
    '''

    dt = pd.read_csv(file)
    # Take the timestamp after every thirty seconds
    actual_labels = dt[["annotation", "time"]][0::3000]
    actual_labels["time"] = actual_labels["time"].apply(parse_datetime)    
    flat_dict = {k: v[0] for k, v in labels_dict.items()}
    actual_labels['annotation'].replace(flat_dict, inplace=True)
    

    # calculate counts per axis
    c1_1s = counts(dt["x"], filesf)
    c2_1s = counts(dt["y"], filesf)
    c3_1s = counts(dt["z"], filesf)
    
    
    # combine counts in pandas dataFrame
    c_1s = pd.DataFrame(data = {'axis1' : c1_1s, 'axis2' : c2_1s, 'axis3' : c3_1s})
    # c_1s = c_1s.astype(int)

    filename = file.split("/")[-1]
    # write to output folder
    c_1s.to_csv(os.path.join(per_sec_csv, filename), sep=',', index = False)
    
    actual_labels.to_csv(os.path.join(actual_path, f"actual_{filename}"), sep=',', index = False) 

In [45]:
folderInn = annotated_data_path
folderOut = "/home/aayush/accelerometer/accprocess/data_saved/PA3"
filesf = 100
per_sec_csv = os.path.join(folderOut, "per_sec")
os.makedirs(per_sec_csv)

actual_path = os.path.join(folderOut, "actual")
os.makedirs(actual_path)
    
annotated_data_files = get_files(data_path=annotated_data_path, annotated_files=True)

# Loop over .csv files
[main(file, folderInn, actual_path, filesf, per_sec_csv) for file in annotated_data_files]
aggregate(src_files=per_sec_csv)

  dt = pd.read_csv(file)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  actual_labels['annotation'].replace(flat_dict, inplace=True)
  dt = pd.read_csv(file)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  actual_labels['annotation'].replace(flat_dict, inplace=True)
  dt = pd.read_csv(file)
The behavior will change in pandas 

In [46]:
aggregated_files = get_files("/home/aayush/accelerometer/accprocess/data_saved/PA3/aggregated", annotated_files=True)
actual_files = get_files("/home/aayush/accelerometer/accprocess/data_saved/PA3/actual", annotated_files=True)

for actual, aggregated in zip(actual_files, aggregated_files):
    actual_id = int(actual.split("/")[-1].split(".")[0].split("_")[-1][1:])
    aggregated_id = int(aggregated.split("/")[-1].split(".")[0][1:])
    assert actual_id==aggregated_id, "ids do not match"
    
    actual_df = pd.read_csv(actual)
    aggregated_df = pd.read_csv(aggregated)
    
    # Perform the left join based on the index of actual
    joined_df = actual_df.merge(aggregated_df, left_index=True, right_index=True, how='left')
    joined_df = joined_df.dropna(subset=["axis1"])
    
    joined_df.to_csv(os.path.join("/home/aayush/accelerometer/accprocess/data_saved/PA3/combined", f"{actual_id}.csv"), index=False)
    
    