In [1]:
import numpy as np
import pandas as pd

import glob
import os

DATA_NAME = "50salads"
DATA_RATIO = 1

In [2]:
def generate_boundary_labels(label_list, mapping_dict):
    boundary_list = []
    segment_len_list = []
    label_seg_list = []

    for video_label in label_list:
        for class_label, class_name in mapping_dict.items():
            video_label[video_label == class_name] = int(class_label) # change class name into class integer

        label_seg_list.append(np.zeros(len(video_label)))
        boundaries = []
        segment_len = []
        length = 0
        for ind, (prev_label, curr_label) in enumerate(zip(video_label, video_label[1:])):
            length += 1
            if prev_label != curr_label:
                boundaries.append(ind)
                segment_len.append(length)
                length = 0
        if length != 0:
            segment_len.append(length)  # put last segment(no boundary at the last of file)
        if len(boundaries) != len(segment_len)-1:
            segment_len.append(1)
        boundary_list.append(boundaries)
        segment_len_list.append(segment_len)

    for i in range(len(boundary_list)):
        for j in range(len(boundary_list[i])):
            label_seg_list[i][boundary_list[i][j]] = 1
    return label_seg_list

In [3]:
def patchwork(feature_list, label_list, label_seg_list):
        num_file = len(feature_list)
        permuted_file_indices = np.arange(num_file)
        length = 0
        X_long = []
        y_long = []
        y_seg_long = []
        file_boundaries = []
        for i in permuted_file_indices:
            length += len(feature_list[i])
            X_long.append(feature_list[i])
            y_long.append(label_list[i])
            y_seg_long.append(label_seg_list[i])
            file_boundaries.append(length)
        return np.concatenate(X_long, axis=0), np.concatenate(y_long, axis=0), np.concatenate(y_seg_long, axis=0), np.array(file_boundaries, dtype=np.int64)

In [4]:
data_path = 'features'
label_path = 'groundTruth'
label_map_file_name = 'mapping.txt'

feature_file_names = sorted(glob.glob(os.path.join(data_path, "*.npy")))
label_file_names = sorted(glob.glob(os.path.join(label_path, "*.txt")))
mapping_dict = pd.read_csv(label_map_file_name, sep=" ", index_col=None, header=None)[1].to_dict()
feature_list = [np.load(f).transpose() for f in feature_file_names]
label_list = [np.array(pd.read_csv(f, sep=" ", index_col=None, header=None)[0].to_numpy()) for f in
                label_file_names]

label_seg_list = generate_boundary_labels(label_list, mapping_dict)
X_long, y_long, y_seg_long, file_boundaries_indice = patchwork(feature_list, label_list, label_seg_list)
y_seg_long = np.array(generate_boundary_labels([y_long], {})).flatten()

In [5]:
y_long = y_long.astype(np.int32)

In [6]:
np.unique(y_long, return_counts=True)

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18], dtype=int32),
 array([61363, 14293, 47458, 10495, 47137, 13668, 11309, 22871, 27383,
        13500, 20784, 61567, 48413, 15777, 21230, 24853, 34149, 30001,
        51344]))

Change action start/ action end label as unlabeled region (c=0).

In [7]:
y_long += 1
y_long[y_long==18] = 0
y_long[y_long==19] = 0
print(np.unique(y_long, return_counts=True))

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17], dtype=int32), array([81345, 61363, 14293, 47458, 10495, 47137, 13668, 11309, 22871,
       27383, 13500, 20784, 61567, 48413, 15777, 21230, 24853, 34149]))


In [8]:
def stitch_with_change_points(X, y_long):
    # y_long: an class label array with same length of data array
    # generates class segments' start end pair list
    # there could be no class change before and after a un-annotated segment
    # we only treat class change as a change point
    # if two labeled segments surrounding an unlabeled segment have same class then those segments are merged as one segment
    boundary_indice = np.where(y_long[1:]!=y_long[:-1])[0]+1
    boundary_indice = np.concatenate([[0],boundary_indice,[len(y_long)]])
    class_segment_list = [[j-i,int(y_long[i])] for i,j in zip(boundary_indice[:-1], boundary_indice[1:])]
    print(class_segment_list)    
    
    X_annot_stitched = X[y_long!=0]
    y_annot_stitched = y_long[y_long!=0]
    y_annot_stitched -= 1
    boundary_labels_per_ts = np.zeros_like(y_annot_stitched)

    total_length = 0
    prev_c = 0
    for i, (length, c) in enumerate(class_segment_list):
        if c==0 and (0 < i < len(class_segment_list) - 1) and class_segment_list[i-1][1] != class_segment_list[i+1][1]: # if first/last segment is not annotated skip the segment
            boundary_labels_per_ts[total_length-1] = 2
            # print(total_length-1)
        elif c!=0: # class must change comparing to the class of previous segment
            if prev_c != c and boundary_labels_per_ts[total_length-1]==0: # if already labeled as abrupt, then do not label
                boundary_labels_per_ts[total_length-1] = 1
            total_length += length
            prev_c = c
        else:
            # print(i)
            continue
    boundary_labels_per_ts[0]=0 # no change point at the start timestamp
    boundary_labels_per_ts[-1]=0 # no change point at the last timestamp
    
    print(f"number of gradual change points: {np.sum(boundary_labels_per_ts==1)}\nnumber of abrupt change points: {np.sum(boundary_labels_per_ts==2)}")
    return X_annot_stitched, y_annot_stitched, boundary_labels_per_ts
X_annot_stitched, y_annot_stitched, boundary_labels_per_ts = stitch_with_change_points(X_long, y_long)

[[604, 0], [1595, 1], [350, 2], [698, 1], [244, 2], [1837, 3], [218, 4], [978, 5], [834, 6], [289, 7], [767, 8], [507, 9], [304, 10], [1118, 11], [2095, 0], [471, 7], [306, 10], [446, 8], [472, 9], [316, 11], [1558, 3], [1051, 1], [2543, 12], [1034, 13], [484, 14], [99, 2], [97, 14], [697, 5], [268, 6], [478, 15], [422, 16], [592, 17], [1871, 0], [1512, 3], [241, 4], [995, 1], [126, 2], [38, 1], [75, 2], [67, 16], [90, 2], [662, 5], [178, 6], [341, 5], [302, 6], [1624, 12], [681, 13], [81, 14], [239, 13], [310, 14], [648, 8], [310, 9], [169, 7], [303, 10], [831, 16], [587, 17], [2199, 0], [860, 1], [396, 2], [1156, 12], [811, 13], [305, 14], [1019, 3], [415, 4], [839, 5], [299, 6], [419, 16], [212, 8], [266, 9], [181, 7], [265, 10], [441, 11], [460, 17], [965, 0], [3095, 12], [378, 14], [1128, 5], [180, 6], [582, 3], [142, 4], [924, 1], [132, 2], [685, 16], [567, 9], [490, 8], [172, 10], [240, 7], [613, 11], [701, 17], [402, 15], [1036, 0], [570, 9], [404, 8], [181, 10], [225, 7], [553

In [9]:
from sklearn.preprocessing import StandardScaler, RobustScaler
scaler = StandardScaler()
X_annot_stitched = scaler.fit_transform(X_annot_stitched)

In [10]:
print(np.sum(boundary_labels_per_ts==2))
print(np.unique(y_annot_stitched, return_counts=True))

49
(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16],
      dtype=int32), array([61363, 14293, 47458, 10495, 47137, 13668, 11309, 22871, 27383,
       13500, 20784, 61567, 48413, 15777, 21230, 24853, 34149]))


In [11]:
data_length = int(DATA_RATIO*len(X_annot_stitched))
np.save(f"{DATA_NAME}_X_long.npy", X_annot_stitched[:data_length])
np.save(f"{DATA_NAME}_y_long.npy", y_annot_stitched[:data_length])
np.save(f"{DATA_NAME}_cp_long.npy", boundary_labels_per_ts[:data_length]) # gradual change points: 1 / abrupt change points: 2

In [12]:
X_load = np.load(f"{DATA_NAME}_X_long.npy")
y_load = np.load(f"{DATA_NAME}_y_long.npy")
boundary_load = np.load(f"{DATA_NAME}_cp_long.npy")
print(X_load.shape, y_load.shape,boundary_load.shape)
print(np.unique(y_load, return_counts=True))

(496250, 2048) (496250,) (496250,)
(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16],
      dtype=int32), array([61363, 14293, 47458, 10495, 47137, 13668, 11309, 22871, 27383,
       13500, 20784, 61567, 48413, 15777, 21230, 24853, 34149]))
