## Connect
Connect to Google Drive and ensure the dataset folder is present.

In [None]:
# Mount Google Drive at /content/gdrive so the shared dataset is reachable.
from google.colab import drive
drive.mount("/content/gdrive")
# Switch the working directory to the dataset folder; the printed path
# confirms that the folder is present.
%cd "/content/gdrive/Shareddrives/ECE 199 2s2223/Dataset/"

Mounted at /content/gdrive
/content/gdrive/Shareddrives/ECE 199 2s2223/Dataset


## Imports


In [None]:
from json import dump, load, loads
from collections import Counter
from itertools import chain
from numpy.random import choice, seed
from math import log10
from itertools import product
from os import listdir
from os.path import join

## Inputs
This is the only section that needs to be changed between runs. The values shown here are the defaults.

In [None]:
# --- Dataset layout ---------------------------------------------------------
# Participants are enumerated as P1..P<PARTICIPANT_COUNT> and, for each one,
# videos as V1..V<VIDEO_PER_PARTICIPANT> when the dataset folders are walked.
PARTICIPANT_COUNT = 20
VIDEO_PER_PARTICIPANT = 50

# --- Sampling ---------------------------------------------------------------
# Seed given to numpy.random.seed before the training set is drawn; any fixed
# value keeps the split repeatable.
seed_number = 1234

# Number of samples drawn into the training set (the rest form the test set).
train_size = 800

# --- Paths ------------------------------------------------------------------
# Root of the dataset on the mounted Drive. The code relies on the exact
# folder and file naming found under this root.
dataset_path = "/content/gdrive/Shareddrives/ECE 199 2s2223/Dataset"

# Per participant/video sub-folders: depth frames and video annotations.
depth_dataset_path = f"{dataset_path}/Final/DEPTHS"
video_dataset_path = f"{dataset_path}/Final/VIDEOS"

# JSON file holding the video -> gaze sequence labels.
label_filepath = f"{dataset_path}/intent_ann_new.json"

# Destination of the generated split (overwritten on every run).
save_filepath = f"{dataset_path}/Splits/gaze_dataset.json"

## Process
Prerequisite functions that describe the entire splitting process.

### Filepath Functions

In [None]:
def participant_directories(format="", root=""):
    """Yield a path for every participant/video pair in the dataset.

    Participants run from 1 to PARTICIPANT_COUNT and videos from 1 to
    VIDEO_PER_PARTICIPANT; all videos of one participant are produced before
    moving on to the next participant.

    Args:
        format: File extension (without the dot). When given, the path points
            at ``P<p>_V<v>.<format>`` inside the video folder; when empty, the
            path is the video folder itself (with a trailing separator).
        root: Directory prepended to every path.

    Yields:
        ``(path, participant, video)`` tuples.
    """
    participants = range(1, PARTICIPANT_COUNT + 1)
    videos = range(1, VIDEO_PER_PARTICIPANT + 1)

    for participant in participants:
        for video in videos:
            folder = f"P{participant}/V{video}"
            # An empty last component makes join() end the path with a separator.
            leaf = f"P{participant}_V{video}.{format}" if format else ""
            yield join(root, folder, leaf), participant, video

In [None]:
def filename(root, participant, video, format=""):
    """Build the dataset-relative path of a participant's video file.

    Args:
        root: Leading path component (e.g. ``"VIDEOS"``).
        participant: Participant number.
        video: Video number.
        format: File extension without the dot. When empty, no extension
            (and no dangling ``.``) is appended.

    Returns:
        ``"<root>/P<p>/V<v>/P<p>_V<v>.<format>"``, or the same path without
        the extension when ``format`` is empty.
    """
    stem = f"{root}/P{participant}/V{video}/P{participant}_V{video}"
    # Only add the dot when there is an extension to follow it.
    return f"{stem}.{format}" if format else stem

In [None]:
def format_filename(root, filepath):
    """Place a bare ``P<p>_V<v>...`` file name inside its participant/video folder.

    The participant and video numbers are parsed from the first two
    underscore-separated fields of the name (text before the first dot), each
    with its one-character prefix stripped and the remainder read as an int.

    Args:
        root: Leading path component of the result.
        filepath: File name such as ``"P1_V2.mp4"``.

    Returns:
        ``"<root>/P<p>/V<v>/<filepath>"``.

    Raises:
        ValueError: If the name has fewer than two underscore-separated
            fields or a field is not numeric after its prefix.
    """
    stem = filepath.split('.')[0]
    participant_tag, video_tag = stem.split('_')[:2]

    # Drop the leading letter and normalise the number (e.g. "P03" -> 3).
    participant_id = int(participant_tag[1:])
    video_id = int(video_tag[1:])

    return f"{root}/P{participant_id}/V{video_id}/{filepath}"

### Depth Inclusion

In [None]:
def depth_files(directory):
    """List the files of a depth folder as ``DEPTHS/P<p>/V<v>/<name>`` paths.

    Args:
        directory: Folder whose entries are listed.

    Returns:
        One formatted path per directory entry, in sorted (lexicographic)
        name order.
    """
    # os.listdir returns entries in arbitrary order; sorting keeps the output
    # reproducible between runs.
    # NOTE(review): lexicographic order equals frame order only if frame
    # numbers are zero-padded - confirm against the actual file names.
    return [format_filename("DEPTHS", path) for path in sorted(listdir(directory))]

In [None]:
def video_depth_mapping():
    """Map every video path to the list of its depth file paths.

    Walks each participant/video folder under ``depth_dataset_path``.

    Returns:
        Dict keyed by ``"VIDEOS/P<p>/V<v>/P<p>_V<v>.mp4"`` whose values are
        the lists produced by ``depth_files`` for the matching depth folder.
    """
    mapping = {}
    for directory, participant, video in participant_directories(
            root=depth_dataset_path):
        video_key = filename("VIDEOS", participant, video, 'mp4')
        mapping[video_key] = depth_files(directory)

    return mapping

### Bounding Box Inclusion

In [None]:
def box_format(item):
    """Flatten one annotated object into a five-element list.

    Args:
        item: Mapping with a ``"classifications"`` entry and a ``"bbox"``
            mapping holding ``"top"``, ``"left"``, ``"width"`` and ``"height"``.

    Returns:
        ``[flag, top, left, width, height]`` where ``flag`` is 0 when
        ``"classifications"`` is an empty list and 1 otherwise.
    """
    has_classification = int(item["classifications"] != [])
    box = item["bbox"]

    return [
        has_classification,
        *(box[side] for side in ("top", "left", "width", "height")),
    ]

In [None]:
def bounding_boxes(label_filepath):
    """Read per-frame bounding boxes from a newline-delimited JSON file.

    Every non-blank line is parsed as one JSON document (a frame) that must
    contain an ``"objects"`` list; each object is keyed by its ``"title"``.

    Args:
        label_filepath: Path of the NDJSON annotation file.

    Returns:
        One dict per frame mapping object title to its ``box_format`` list.
        If a frame repeats a title, the last occurrence wins.
    """
    # JSON text is UTF-8; be explicit so the platform default cannot interfere.
    with open(label_filepath, encoding="utf-8") as labels:
        # Blank lines (e.g. a stray empty line at the end of the file) are not
        # valid JSON documents and are skipped instead of aborting the load.
        frames = [loads(label) for label in labels if label.strip()]

    return [
        {item["title"]: box_format(item) for item in frame["objects"]}
        for frame in frames
    ]

In [None]:
def video_label_mapping():
    """Map every video path to its per-frame bounding boxes.

    Reads the ``P<p>_V<v>.ndjson`` file of each participant/video folder
    under ``video_dataset_path``.

    Returns:
        Dict keyed by ``"VIDEOS/P<p>/V<v>/P<p>_V<v>.mp4"`` whose values are
        the lists produced by ``bounding_boxes``.
    """
    mapping = {}
    for label_file, participant, video in participant_directories(
            root=video_dataset_path, format="ndjson"):
        video_key = filename("VIDEOS", participant, video, 'mp4')
        mapping[video_key] = bounding_boxes(label_file)

    return mapping

### Splitting Mechanics

In [None]:
def weighted_object_dataset(label_filepath):
    """Assemble the list of gaze samples from the label file and the dataset.

    Despite its name, no weights are computed here; each sample only gathers
    the data belonging to one labelled video.

    Args:
        label_filepath: JSON file whose ``["hiphop"]["videos"]`` entry maps a
            video file name to a label containing ``"gaze_seq"``.

    Returns:
        List of dicts with the keys ``"video"`` (formatted video path),
        ``"gaze_seq"``, ``"bbox"`` (per-frame boxes) and ``"depth"`` (depth
        file paths).

    Raises:
        KeyError: If a labelled video has no bounding-box or depth entry.
    """
    with open(label_filepath) as label_file:
        annotations = load(label_file)

    print("Loading Gaze Dataset")

    # Video file name -> label
    labels = annotations["hiphop"]["videos"]

    # Lookups keyed by the formatted video path
    video_depth_map = video_depth_mapping()
    video_label_map = video_label_mapping()

    samples = []
    for video, label in labels.items():
        # The same key identifies the video in both lookups.
        video_key = format_filename("VIDEOS", video)
        samples.append({
            "video": video_key,
            "gaze_seq": label["gaze_seq"],
            "bbox": video_label_map[video_key],
            "depth": video_depth_map[video_key],
        })

    return samples

In [None]:
def low_frequency_weighting(samples):
    """Compute a sampling weight per sample that favours rare gaze objects.

    Each object gets an inverse-log score ``1 / log10(count)``, where
    ``count`` is its number of occurrences over all gaze sequences. The score
    is normalised by the sum of all scores and spread evenly over the object's
    occurrences; a sample's weight is the sum over the entries of its gaze
    sequence. The weights of all samples therefore add up to 1 (up to
    floating-point error) whenever at least one gaze entry exists.

    Args:
        samples: Iterable-backed list of dicts with a ``"gaze_seq"`` sequence
            of hashable object labels.

    Returns:
        List of floats, one weight per sample, in the order of ``samples``.
        Empty when ``samples`` is empty.
    """
    print("Calculating weights")

    gaze_sequences = [sample["gaze_seq"] for sample in samples]
    object_counter = Counter(chain.from_iterable(gaze_sequences))

    # log10(1) == 0, so an object seen exactly once would divide by zero.
    # Clamping the count to 2 inside the logarithm keeps such objects the
    # most heavily weighted while leaving every count >= 2 untouched.
    inverse_log = {
        key: 1 / log10(max(count, 2))
        for key, count in object_counter.items()
    }
    total = sum(inverse_log.values())

    # Per-occurrence weight: normalised score shared among all occurrences.
    weight = {
        key: inverse_log[key] / (total * count)
        for key, count in object_counter.items()
    }
    weights = [sum(weight[x] for x in gaze) for gaze in gaze_sequences]

    return weights

In [None]:
def split(samples, seed_number, train_size):
    """Draw a weighted train/test split and print the per-object distribution.

    The training set is sampled without replacement using the weights from
    ``low_frequency_weighting``; every sample not drawn goes to the test set.

    Args:
        samples: List of sample dicts, each with a ``"gaze_seq"`` sequence.
        seed_number: Seed for NumPy's global random state (``numpy.random.seed``).
        train_size: Number of samples to draw for training.

    Returns:
        ``(training_set, testing_set)`` as lists. The training set is in draw
        order; the test set keeps the order of ``samples``.

    Raises:
        ValueError: Propagated from ``numpy.random.choice``, e.g. when
            ``train_size`` exceeds the number of samples with non-zero weight.
    """
    seed(seed_number)

    print(f"Splitting Label Dataset (seed={seed_number})")

    weights = low_frequency_weighting(samples)

    # Draw positions rather than the dicts themselves: test-set membership is
    # then decided by index, which is linear-time and does not drop samples
    # that merely compare equal to a training sample.
    drawn = choice(len(samples), size=train_size, p=weights, replace=False)
    train_indices = [int(index) for index in drawn]
    train_index_set = set(train_indices)

    training_set = [samples[index] for index in train_indices]
    testing_set = [
        sample for index, sample in enumerate(samples)
        if index not in train_index_set
    ]

    train_gazes = [sample["gaze_seq"] for sample in training_set]
    test_gazes = [sample["gaze_seq"] for sample in testing_set]

    train_counter = Counter(chain.from_iterable(train_gazes))
    test_counter = Counter(chain.from_iterable(test_gazes))

    total = sum(train_counter.values()) + sum(test_counter.values())

    print(f"Train | Test Split ({len(training_set)} | {len(testing_set)})")

    # Report training objects first, then any object seen only in the test
    # set, so nothing is silently missing from the summary.
    reported = list(train_counter)
    reported += [label for label in test_counter if label not in train_counter]

    for label in reported:
        # Shares are relative to all gaze entries of both sets combined.
        train_percent = train_counter[label]/total
        test_percent = test_counter[label]/total
        print(
            f"\t{train_percent:.5%}\t|  {test_percent:.5%}\t | {label}"
        )

    return training_set, testing_set

In [None]:
def save_gaze_dataset(training_set, testing_set, save_path):
    """Write the train/test split to ``save_path`` as JSON.

    The file content is
    ``{"hiphop": {"gaze": {"train": [...], "test": [...]}}}``.
    An existing file at ``save_path`` is overwritten.

    Args:
        training_set: JSON-serialisable training samples.
        testing_set: JSON-serialisable test samples.
        save_path: Destination file path.
    """
    # HIPHOP gaze dataset layout, built from the inside out
    gaze_split = {"train": training_set, "test": testing_set}
    dataset = {"hiphop": {"gaze": gaze_split}}

    with open(save_path, "w") as gaze_dataset:
        dump(dataset, gaze_dataset)

## Generate

Run this cell to generate a gaze split in JSON format. Note that any existing gaze split at the output path will be overwritten.

In [None]:
# 1. Collect one sample per labelled video.
samples = weighted_object_dataset(label_filepath)

# 2. Draw the weighted train/test split.
training_set, testing_set = split(
    samples,
    seed_number=seed_number,
    train_size=train_size,
)

# 3. Persist the split (replaces any previous file).
save_gaze_dataset(training_set, testing_set, save_path=save_filepath)

Loading Gaze Dataset
Splitting Label Dataset (seed=1234)
Calculating weights
Train | Test Split (800 | 200)
	6.12351%	|  1.58252%	 | Broom
	6.90408%	|  2.41266%	 | Pillow
	6.84624%	|  1.08579%	 | Book
	3.78132%	|  0.40146%	 | Cup
	4.79373%	|  0.58081%	 | Laptop
	5.88049%	|  0.47485%	 | Fruits
	6.69994%	|  1.34388%	 | Bottle
	12.41567%	|  8.68198%	 | none
	3.01145%	|  0.12248%	 | Sandwich
	4.32179%	|  0.66246%	 | Rug
	5.53833%	|  0.53366%	 | Racket
	4.52252%	|  1.02796%	 | Umbrella
	2.84960%	|  0.31057%	 | Bag
	2.81995%	|  0.14727%	 | Bowl
	1.85956%	|  0.03791%	 | Utensils
	2.14534%	|  0.08020%	 | Chair
