In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, confusion_matrix, ConfusionMatrixDisplay
from datetime import datetime
import pytz

In [2]:
def create_windowed_data(df, sensor_cols, window_size, step_size, variables_first, all_windows=False, label_list=None, constant_label=None):
    """
    Cut a time-indexed dataframe into fixed-length sliding windows.

    df - pandas dataframe with index corresponding to epoch time in milliseconds
    sensor_cols - columns that should be included in the output sliding windows
    window_size - number of samples (df rows) to use in each window
    step_size - number of rows to step forward before creating the next window
    variables_first - If True, return windows as a 3D array with shape (# samples, # variables, window_size)
                      If False, return windows as a 3D array with shape (# samples, window_size, # variables)
    all_windows - If False then only return windows (and labels and intervals) for labeled intervals (i.e., determined by 
                      label_list or constant_label). If True, then you will also get windows with a corresponding 
                      label of "" for unlabeled data and data that overlapped label boundaries
    label_list - List of format [[StartTimestamp, EndTimestamp, LabelString, Subject], ] that will be used to assign labels 
                      to windows (e.g., used when loading from a file that has multiple intervals of activities with 
                      different classes). The Subject entry is optional. The list is not modified.
    constant_label - A single label string that should be applied to all windows (e.g., used when loading from a file 
                      that only contains a single class). Takes precedence over label_list.

    Returns (windowed_data, labels, intervals, subjects), four parallel lists with one entry per window:
        windowed_data - 2D numpy array of the window's sensor values
        labels - label string of the window ("" when the window mixes labels or is unlabeled)
        intervals - (first_timestamp, last_timestamp) of the rows in the window
        subjects - subject taken from label_list for the window's last row, or None
    """
    assert step_size <= window_size
    assert step_size > 0

    max_gap = 100 #max gap allowed in sensor data in milliseconds

    windowed_data = []
    intervals = []
    labels = []
    subjects = []

    current_window = []
    current_interval = []
    current_labels = []
    last_timestamp = None

    # Sort a copy: the caller's label_list must not be reordered as a side effect.
    sorted_labels = sorted(label_list) if label_list is not None else None
    label_idx = 0

    for row in df.itertuples():
        row_values = [getattr(row, col) for col in sensor_cols]
        timestamp = row.Index

        if last_timestamp is None:
            last_timestamp = timestamp

        label = ""
        subject = None
        if constant_label is not None:
            label = constant_label
        elif sorted_labels is not None:
            # Rows arrive in index order, so intervals that ended before this row
            # can be skipped permanently by advancing the cursor.
            while label_idx < len(sorted_labels) and timestamp > sorted_labels[label_idx][1]:
                label_idx += 1
            if label_idx < len(sorted_labels):
                entry = sorted_labels[label_idx]
                if entry[0] <= timestamp <= entry[1]:
                    label = entry[2]
                    if len(entry) > 3:
                        subject = entry[3]

        # A window never spans a label change (unless all_windows) or a gap in the data.
        label_changed = len(current_labels) > 0 and label != current_labels[-1]
        gap_too_large = timestamp - last_timestamp > max_gap
        if (not all_windows and label_changed) or gap_too_large:
            current_window = []
            current_interval = []
            current_labels = []
        last_timestamp = timestamp

        current_window.append(row_values)
        current_interval.append(timestamp)
        current_labels.append(label)

        if len(current_window) == window_size:
            if all_windows or label != "":
                window = np.array(current_window)
                windowed_data.append(window.T if variables_first else window)

                intervals.append((current_interval[0], current_interval[-1]))

                # Windows that straddle a label boundary are reported as unlabeled.
                labels.append(label if len(set(current_labels)) == 1 else "")

                subjects.append(subject)

            # Slide forward by step_size rows.
            del current_window[:step_size]
            del current_interval[:step_size]
            del current_labels[:step_size]

    return windowed_data, labels, intervals, subjects

In [3]:
def datetime_merging(primary_df, secondary_df_list):
    """
    Left-join several sensor dataframes onto the timestamps of a primary one.

    primary_df - this df will be used to decide the timestamps used for the combined_df
    secondary_df_list - list of dfs that should be joined into primary_df.
                        Any row with a timestamp that matches a timestamp in primary_df will have its values matched to that primary_df row
                        Any rows in primary_df that don't have a match in the secondary df will have values backfilled to fill the gap.

    Returns a new dataframe; primary_df itself is not modified. Rows that still contain
    missing values after backfilling (nothing later to fill from) are dropped.
    """
    combined_df = primary_df.copy(deep=True)

    for df2 in secondary_df_list:
        combined_df = combined_df.join(df2, how="left")

    # DataFrame.bfill() replaces fillna(method='bfill'), whose `method` keyword is deprecated.
    return combined_df.bfill().dropna()

In [4]:
def _time_string_to_epoch_seconds(time_str):
    """Convert a '%Y%m%d_%H:%M:%S[.%f]' Asia/Tokyo time string to seconds since the UTC epoch, rounded to 10 ms."""
    if "." not in time_str:
        time_str = time_str + ".000"

    #https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#from-timestamps-to-epoch
    stamp = pd.to_datetime(time_str, format="%Y%m%d_%H:%M:%S.%f").tz_localize('Asia/Tokyo').tz_convert("UTC")
    seconds = (stamp - pd.Timestamp("1970-01-01", tz="UTC")) / pd.Timedelta("1s")
    return round(seconds, 2) #we're only dealing with up to 100 Hz, this should be updated if higher samping rates are used


def load_csv_file_list(csv_file_path_list, cols, prefix=None, sample_rate=None):
    """
    Load one or more sensor csv files into a single dataframe indexed by epoch milliseconds.

    csv_file_path_list - list of csv paths that should all be combined into a single df
    cols - list of columns that should be used when creating the df (must include "time")
    prefix - adds a prefix to sensor column names, useful when multiple sensors have a column with the same name (e.g., X)
    sample_rate - Sample rate (in Hz) to use for the data. If None, it will be estimated from the data
                  (separately for each file, from its first and last time value and its row count)

    Only the first time value of each file is used as an anchor; the remaining timestamps are
    regenerated at a fixed spacing of 1/sample_rate. Empty files are reported and skipped.

    Returns the combined dataframe with the "time" column as index (for duplicated timestamps the
    last row is kept), or None when no file contributed any rows.
    """

    frames = []

    for csv_file_path in csv_file_path_list:
        try:
            df = pd.read_csv(csv_file_path, usecols=cols)
        except ValueError:
            # usecols did not match the header row: re-read treating the file as header-less
            df = pd.read_csv(csv_file_path, names=cols)

        if len(df) == 0:
            print("Empty csv file found: " + str(csv_file_path), flush=True)
            continue

        if prefix is not None:
            rename_cols = {c: prefix + "_" + c for c in cols if c != "time"}
            df = df.rename(columns=rename_cols)

        n_rows = len(df)
        stt_time = _time_string_to_epoch_seconds(df["time"].iloc[0])

        # Keep the estimate local to this file. Assigning it back to `sample_rate` would make
        # every later file silently reuse the first file's estimate.
        if sample_rate is None:
            end_time = _time_string_to_epoch_seconds(df["time"].iloc[-1])
            file_sample_rate = round(n_rows / (end_time - stt_time))
        else:
            file_sample_rate = sample_rate

        #set time values as milliseconds since epoch
        # Round to the nearest millisecond: a truncating cast turns float noise such as
        # ...129.9998 into ...129, which breaks exact-timestamp joins between sensors.
        raw_ms = [1000 * (stt_time + (i / file_sample_rate)) for i in range(n_rows)]
        df["time"] = np.rint(raw_ms).astype(np.int64)

        frames.append(df)

    if not frames:
        return None

    # Single concat instead of growing the frame inside the loop.
    combined_df = pd.concat(frames).set_index("time")
    return combined_df[~combined_df.index.duplicated(keep='last')]

In [5]:
def _load_trimmed_clip(sensor_root, sensor_dir, subject, scene, session, name_variants, cols, prefix, sample_rate):
    """Load <sensor_root>/<sensor_dir>/<subject>/<scene>/<session>/<name>.csv for each existing name variant; the last one found wins, None if there is none."""
    clip_df = None
    for name in name_variants:
        clip_path = os.path.join(sensor_root, sensor_dir, subject, scene, session, name + ".csv")
        if os.path.exists(clip_path):
            clip_df = load_csv_file_list(csv_file_path_list=[clip_path], cols=cols, prefix=prefix, sample_rate=sample_rate)
    return clip_df


def trimmed_data_example(trimmed_dir, use_sensors):
    """
    Build windowed train / validation / test arrays from the trimmed per-activity csv clips.

    trimmed_dir - directory that contains a "sensors" sub-directory laid out as
                  sensors/<sensor>_clip/subject<N>/scene<N>/session<N>/<activity>.csv
    use_sensors - collection with any of "acc_phone", "acc_watch", "gyro", "orientation"; a clip is only
                  used when every requested sensor has a non-empty file for it

    scene1-3 are used for training and scene4 for testing; the last 10% of the training windows
    (in load order, not shuffled) are split off as validation data. The function prints the
    resulting shapes and returns nothing.
    """
    print("trimmed cross scene example")

    sensor_root = os.path.join(trimmed_dir, "sensors")

    sensor_dir_list = ["acc_phone_clip", "acc_watch_clip", "gyro_clip", "orientation_clip",]
    subject_list = ["subject" + str(i) for i in range(1, 21)]
    scene_list = ["scene" + str(i) for i in range(1, 5)]
    session_list = ["session" + str(i) for i in range(1, 6)]

    # (use_sensors key, directory, csv columns, column prefix, sample rate in Hz, divisor applied to the values)
    # The order matters: the first requested sensor provides the timestamps for the merged frame.
    sensor_specs = [
        ("acc_phone", "acc_phone_clip", ["time", "x", "y", "z"], "s_acc", 100, 9.8), #converting values from m/s^2 to g's
        ("acc_watch", "acc_watch_clip", ["time", "x", "y", "z"], "w_acc", 100, 9.8), #converting values from m/s^2 to g's
        ("gyro", "gyro_clip", ["time", "x", "y", "z"], "gyro", 50, None),
        ("orientation", "orientation_clip", ["time", "azimuth", "pitch", "roll"], None, None, None),
    ]

    filename_case_map = {} #allows us to catch filenames that have capitalized letters

    # Discover the activity classes from the csv file names that are present on disk.
    class_names = set()
    for sensor_dir in sensor_dir_list:
        for subject in subject_list:
            for scene in scene_list:
                for session in session_list:
                    d = os.path.join(sensor_root, sensor_dir, subject, scene, session)
                    if not os.path.exists(d):
                        continue
                    for filename in os.listdir(d):
                        if not filename.endswith(".csv"):
                            continue
                        c = filename.split(".")[0].strip()
                        c_lower = c.lower()
                        variants = filename_case_map.setdefault(c_lower, [c_lower])
                        if c not in variants:
                            variants.append(c)
                        class_names.add(c_lower)
    class_list = sorted(class_names)

    train_scene_list = ["scene1", "scene2","scene3"] #training scene from the challenge
    test_scene_list = ["scene4"] #val scene from the challenge
    assert not set(train_scene_list) & set(test_scene_list)

    # Windowing settings (identical for every clip)
    all_windows = False
    variables_first = True
    window_size = 100 #1 second window of 100 Hz data
    step_size = 25 #75% overlap between windows

    train_X = []
    train_y = []
    train_intervals = []
    test_X = []
    test_y = []
    test_intervals = []

    for subject in subject_list:
        print(subject, flush=True)
        for scene in scene_list:
            for session in session_list:
                for label in class_list:
                    # Load the requested sensors in order; give up on this clip at the first missing one.
                    loaded = []
                    clip_missing = False
                    for key, sensor_dir, cols, prefix, rate, divisor in sensor_specs:
                        if key not in use_sensors:
                            continue
                        clip_df = _load_trimmed_clip(sensor_root, sensor_dir, subject, scene, session,
                                                     filename_case_map[label], cols, prefix, rate)
                        if clip_df is None:
                            clip_missing = True
                            break
                        if divisor is not None:
                            clip_df = clip_df / divisor
                        loaded.append(clip_df)

                    if clip_missing:
                        continue
                    if not loaded:
                        raise ValueError("use_sensors must contain at least one of: "
                                         + ", ".join(spec[0] for spec in sensor_specs))

                    combined_df = datetime_merging(primary_df=loaded[0], secondary_df_list=loaded[1:])

                    sensor_cols = [col for col in combined_df.columns if col != "time"]

                    # class_list entries are already lower-case
                    X, y, intervals, _ = create_windowed_data(df=combined_df, sensor_cols=sensor_cols, window_size=window_size, step_size=step_size, variables_first=variables_first, all_windows=all_windows, label_list=None, constant_label=label)

                    if scene in train_scene_list:
                        train_X.extend(X)
                        train_y.extend(y)
                        train_intervals.extend(intervals)

                    elif scene in test_scene_list:
                        test_X.extend(X)
                        test_y.extend(y)
                        test_intervals.extend(intervals)

    # Hold out the trailing 10% of the training windows for validation.
    train_val_split = int(0.9*len(train_X))
    val_X = train_X[train_val_split:]
    val_y = train_y[train_val_split:]
    val_intervals = train_intervals[train_val_split:]
    train_X = train_X[:train_val_split]
    train_y = train_y[:train_val_split]
    train_intervals = train_intervals[:train_val_split]

    train_X = np.array(train_X)
    train_y = np.array(train_y)
    val_X = np.array(val_X)
    val_y = np.array(val_y)
    test_X = np.array(test_X)
    test_y = np.array(test_y)

    print(train_X.shape)
    print(train_y.shape)
    print(len(train_intervals))
    print(val_X.shape)
    print(val_y.shape)
    print(len(val_intervals))
    print(test_X.shape)
    print(test_y.shape)
    print(len(test_intervals))

    # Downstream usage sketch (left disabled):
    # X, y, splits = combine_split_data([train_X, val_X], [train_y, val_y])
    # tfms  = [None, [Categorize()]]
    # dsets = TSDatasets(X, y, tfms=tfms, splits=splits, inplace=True)
    # dls = TSDataLoaders.from_dsets(dsets.train, dsets.valid, bs=[1024, 1024], shuffle=True, batch_tfms=[], num_workers=0)


In [6]:
# Sensors to load; the full set is ["acc_phone", "acc_watch", "gyro", "orientation",]
use_sensors = ["acc_phone", "gyro",]

# Root of the trimmed dataset (must contain a "sensors" sub-directory).
# Set the TRIMMED_SENSOR_DATA_DIR environment variable to run against a different copy of the data.
trimmed_dir = os.environ.get("TRIMMED_SENSOR_DATA_DIR", "/data3/cj/CS6784-Data/original_sensors")

trimmed_data_example(trimmed_dir=trimmed_dir, use_sensors=use_sensors)

trimmed cross scene example
subject1
subject2
Empty csv file found: /data3/cj/CS6784-Data/original_sensors/sensors/acc_phone_clip/subject2/scene4/session5/pushing.csv
subject3
subject4
subject5
subject6
subject7
subject8
subject9
subject10
subject11
subject12
subject13
subject14
subject15
subject16
subject17
subject18
subject19
subject20
(139290, 6, 100)
(139290,)
139290
(15477, 6, 100)
(15477,)
15477
(66395, 6, 100)
(66395,)
66395
