# Intention Dataset Split


## Imports


In [1]:
from collections import defaultdict
from json import load, dump
from pathlib import Path
from random import shuffle, seed

## Inputs

This is the only cell that needs to be edited between runs. The values shown here are the defaults.

In [2]:
# Seed handed to random.seed so every run shuffles identically
seed_number = 1234

# Training samples kept per intention; with 8 intentions and the default of
# 100 this gives 800 training samples
train_size = 100

# Root of the annotated data; the code is tightly coupled to its layout
dataset_path = "../dataset"
video_dataset_path = dataset_path + "/VIDEOS"

# Input label annotations and output location of the generated split
label_filepath = "./intent_ann_new.json"
save_filepath = "./data/intent_dataset.json"

## Process
Prerequisite functions that describe the entire splitting process.

In [3]:
def intention_sorted_gazeseq(label_filepath):
    """Load the label file and group its gaze sequences by intention.

    The JSON at ``label_filepath`` is read at ``["hiphop"]["videos"]``; for
    every entry found there the ``"gaze_seq"`` value is appended to the list
    stored under that entry's ``"intent"``. A per-intention summary is
    printed before returning.

    Args:
        label_filepath: Path to the JSON label file.

    Returns:
        ``defaultdict(list)`` mapping each intention to its gaze sequences,
        in order of first appearance in the label file.
    """
    with open(label_filepath) as label_file:
        videos = load(label_file)["hiphop"]["videos"]

    # One bucket per intention, filled in label-file order
    by_intent = defaultdict(list)
    for annotation in videos.values():
        by_intent[annotation["intent"]].append(annotation["gaze_seq"])

    print("Grouping Label Dataset by Intention")

    total = sum(map(len, by_intent.values()))

    print(f"Label Set ({total} items)")
    for intent, sequences in by_intent.items():
        count = len(sequences)
        print(f"\t{count} items ({count/total:2.2%}):\t{intent}")

    return by_intent

In [4]:
def split(samples, seed_number, train_size):
    """Shuffle each intention's samples and cut them into train/test partitions.

    A summary of both partitions is printed before returning.

    Args:
        samples: Mapping of intention -> list of samples. It is left
            untouched; every list is copied before being shuffled.
        seed_number: Value passed to ``random.seed`` before shuffling, so the
            same inputs always yield the same split.
        train_size: Number of samples per intention that go to the training
            partition. An intention with fewer samples contributes all of
            them to training and none to testing.

    Returns:
        Tuple ``(training_set, testing_set)`` of ``defaultdict(list)``
        objects keyed by intention.
    """
    seed(seed_number)

    training_set = defaultdict(list)
    testing_set = defaultdict(list)

    for intent, sequences in samples.items():
        # Shuffle a copy: shuffling the caller's list in place would make a
        # second call on the same `samples` start from a different order and
        # therefore produce a different split despite the identical seed.
        shuffled = list(sequences)
        shuffle(shuffled)
        training_set[intent] = shuffled[:train_size]
        testing_set[intent] = shuffled[train_size:]

    print(f"Splitting Label Dataset (seed={seed_number})")

    train_items = sum(len(sequence) for sequence in training_set.values())
    test_items = sum(len(sequence) for sequence in testing_set.values())

    def share(count, total):
        # A partition can be empty (e.g. train_size >= every class size);
        # report 0% instead of dividing by zero.
        return count / total if total else 0.0

    print(f"Training Set ({train_items} items)")
    for intent, sequences in training_set.items():
        items = len(sequences)
        print(f"\t{items} items ({share(items, train_items):2.2%}):\t{intent}")

    print(f"Testing Set ({test_items} items)")
    for intent, sequences in testing_set.items():
        items = len(sequences)
        print(f"\t{items} items  ({share(items, test_items):2.2%}):\t{intent}")

    return training_set, testing_set

In [5]:
def save_intention_dataset(training_set, testing_set, save_filepath):
    """Write the train/test partitions to ``save_filepath`` as JSON.

    The file has the layout
    ``{"hiphop": {"intentions": {"train": ..., "test": ...}}}``.
    An existing file at ``save_filepath`` is overwritten, and a missing
    parent directory is created.

    Args:
        training_set: JSON-serialisable training partition.
        testing_set: JSON-serialisable testing partition.
        save_filepath: Destination path of the JSON file.
    """
    # HIPHOP Intention Dataset Format
    dataset = {
        "hiphop": {
            "intentions": {
                "train": training_set,
                "test": testing_set,
            }
        }
    }

    # open(..., "w") does not create directories, so make sure the target
    # folder exists before writing.
    Path(save_filepath).parent.mkdir(parents=True, exist_ok=True)

    # Save dataset
    with open(save_filepath, "w") as intention_dataset:
        dump(dataset, intention_dataset)

## Generate
Run the cell below to generate an intention split in JSON format. Note that any existing intention split will be overwritten.

In [None]:
# Load the label file and group its gaze sequences by intention.
samples = intention_sorted_gazeseq(label_filepath)
# Shuffle each intention's samples with the fixed seed and cut the train/test partitions.
training_set, testing_set = split(samples, seed_number, train_size)
# Write both partitions to save_filepath (an existing file is overwritten).
save_intention_dataset(training_set, testing_set, save_filepath)

print("Finished")

Grouping Label Dataset by Intention
Label Set (1000 items)
	127 items (12.70%):	Spontaneous
	120 items (12.00%):	Drink
	127 items (12.70%):	Study
	115 items (11.50%):	Go Outside
	132 items (13.20%):	Eat
	135 items (13.50%):	Indeterminate
	124 items (12.40%):	Clean the Area
	120 items (12.00%):	Rest
Splitting Label Dataset (seed=1234)
Training Set (800 items)
	100 items (12.50%):	Spontaneous
	100 items (12.50%):	Drink
	100 items (12.50%):	Study
	100 items (12.50%):	Go Outside
	100 items (12.50%):	Eat
	100 items (12.50%):	Indeterminate
	100 items (12.50%):	Clean the Area
	100 items (12.50%):	Rest
Testing Set (200 items)
	27 items  (13.50%):	Spontaneous
	20 items  (10.00%):	Drink
	27 items  (13.50%):	Study
	15 items  (7.50%):	Go Outside
	32 items  (16.00%):	Eat
	35 items  (17.50%):	Indeterminate
	24 items  (12.00%):	Clean the Area
	20 items  (10.00%):	Rest
Finished


## Uniform Split

Once the intent and gaze JSON files for the train-test splits have been created, the code below can rebuild the intent dataset using the same split as the gaze dataset.

### Inputs


In [6]:
# Existing gaze train/test split whose video membership is reused for the intent split.
gaze_dataset_filepath = "./data/gaze_dataset.json"
# Destination of the intent dataset rebuilt on that same split.
save_filepath_uniform = "./data/intent_dataset_uniform.json"

### Generate


In [None]:
def extract(sample):
    """Return the part of the sample's ``"video"`` path after the last ``/``."""
    _, _, filename = sample["video"].rpartition("/")
    return filename


def main():
    """Rebuild the intent dataset on the gaze dataset's train/test split.

    Reads the module-level paths ``save_filepath``, ``gaze_dataset_filepath``
    and ``label_filepath`` and writes the result to ``save_filepath_uniform``
    in the ``{"hiphop": {"intentions": {"train": ..., "test": ...}}}`` layout.
    """
    # The previously generated intent split supplies the intention names
    with open(save_filepath) as split_file:
        intentions = load(split_file)["hiphop"]["intentions"]["train"].keys()

    # The gaze dataset decides which videos are train and which are test
    with open(gaze_dataset_filepath) as gaze_file:
        gaze_split = load(gaze_file)["hiphop"]["gaze"]

    train_filenames = list(map(extract, gaze_split["train"]))
    test_filenames = list(map(extract, gaze_split["test"]))

    # The label file supplies each video's intent and gaze sequence
    with open(label_filepath) as label_file:
        video_labels = load(label_file)["hiphop"]["videos"]

    def group_by_intention(filenames):
        # Start with an empty list for every known intention so each one
        # appears in the output even when no video falls into it.
        grouped = {intention: [] for intention in intentions}
        for filename in filenames:
            annotation = video_labels[filename]
            gaze = annotation["gaze_seq"]
            grouped[annotation["intent"]].append(gaze)
        return grouped

    # Train partition is built first, then test
    unified_dataset = {
        "hiphop": {
            "intentions": {
                "train": group_by_intention(train_filenames),
                "test": group_by_intention(test_filenames),
            }
        }
    }

    with open(save_filepath_uniform, "w") as uniform_file:
        dump(unified_dataset, uniform_file)

# Build the uniform intent split and write it to save_filepath_uniform.
main()