In [2]:
# NOTE(review): machine-specific absolute path. Later cells use relative
# "./data/..." and "data/..." paths, which resolve against this working directory,
# so this line must be adapted when running on another machine.
%cd /home/asehgal/neurosym-lib
from pathlib import Path
import numpy as np

/home/asehgal/neurosym-lib


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


### Fly-vs.-Fly dataset
The Fly-vs.-Fly benchmark [14] tracks a pair of fruit flies and the actions they perform while interacting across multiple contexts.
Each timestep is encoded as a 53-dimensional feature vector that combines 17 pose/orientation attributes with 36 position-invariant descriptors such as linear and angular velocities.
For the NEAR experiments we focus on bout-level classification over the Aggression and Boy-meets-Boy subsets, predicting one of seven aggressive, threatening, or neutral behaviors per trajectory.
Trajectories are capped at 300 timesteps, and any sequence longer than this limit is split into multiple segments sharing the same action label to balance the data.
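The resulting splits contain 5339 training trajectories, 594 validation trajectories, and 1048 test trajectories with an average length of 42.06 timesteps.
Note that the collation cell below reports slightly different segment counts (5341 / 629 / 1050), so the figures quoted here should not be expected to match the exported arrays exactly.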


In [3]:
# Fly-vs.-Fly preprocessing setup: tensor dimensions and input/output locations
# used to collate sequences into fixed-length tensors and export train/val/test splits.
FEATURE_DIM = 53  # features per timestep
MAX_LEN = 300  # timesteps per exported segment

fruitflies_folder = Path("./data/fruitflies_classification/fly_process/")
# Exports go to a "flyvfly" directory that sits next to the raw-data folder.
output_folder = fruitflies_folder.with_name("flyvfly")
output_folder.mkdir(exist_ok=True, parents=True)

def _ensure_2d(sequence):
    """Convert one trajectory to a float32 array and validate its layout.

    Args:
        sequence: Array-like that ``np.asarray`` can convert to float32.

    Returns:
        The converted ``np.ndarray`` with shape ``(T, FEATURE_DIM)``.

    Raises:
        ValueError: If the converted array is not 2-D or its second axis is
            not ``FEATURE_DIM`` wide.
    """
    sequence = np.asarray(sequence, dtype=np.float32)
    has_expected_layout = sequence.ndim == 2 and sequence.shape[1] == FEATURE_DIM
    if has_expected_layout:
        return sequence
    raise ValueError(f"Expected sequence with shape (T, {FEATURE_DIM}), got {sequence.shape}")

def split_pad_sequence(sequence):
    """Cut one trajectory into segments of exactly ``MAX_LEN`` timesteps.

    The trajectory is split into consecutive, non-overlapping chunks of at
    most ``MAX_LEN`` frames. A chunk shorter than ``MAX_LEN`` (only the last
    one can be) is padded by repeating its final frame.

    Args:
        sequence: Array-like of shape ``(T, FEATURE_DIM)``; validated and
            converted to float32 by ``_ensure_2d``.

    Returns:
        A ``(segments, lengths)`` pair of equally long lists. Each segment is
        a float32 array of shape ``(MAX_LEN, FEATURE_DIM)``; the matching
        length is the number of real (unpadded) frames in it. A trajectory
        with zero frames yields a single all-zero segment of length 0.

    Raises:
        ValueError: Propagated from ``_ensure_2d`` for a malformed input.
    """
    sequence = _ensure_2d(sequence)
    total_len = sequence.shape[0]
    if total_len == 0:
        # An empty trajectory still produces one (all-zero) placeholder segment.
        return [np.zeros((MAX_LEN, FEATURE_DIM), dtype=np.float32)], [0]

    segments = []
    lengths = []
    # range() guarantees start < total_len, so every chunk holds at least one
    # frame and no emptiness check is needed inside the loop.
    for start in range(0, total_len, MAX_LEN):
        chunk = sequence[start : start + MAX_LEN]
        actual_len = chunk.shape[0]
        if actual_len < MAX_LEN:
            # mode="edge" repeats the last real frame along the time axis
            # (rather than zero-filling); the feature axis is left untouched.
            chunk = np.pad(chunk, ((0, MAX_LEN - actual_len), (0, 0)), mode="edge")
        # _ensure_2d already produced float32, and slicing/padding keep the dtype.
        segments.append(chunk)
        lengths.append(actual_len)
    return segments, lengths

# Convert every raw split into fixed-length segments and write it out as .npz.
for split in ("train", "val", "test"):
    # NOTE(review): allow_pickle=True unpickles the file contents, which can run
    # arbitrary code; only load files from a trusted source.
    raw_data = np.load(fruitflies_folder / f"{split}_data.npy", allow_pickle=True)
    raw_labels = np.load(fruitflies_folder / f"{split}_label.npy", allow_pickle=True)
    assert len(raw_data) == len(raw_labels)

    collated_data, collated_labels, collated_lengths = [], [], []
    for seq, label in zip(raw_data, raw_labels):
        segments, lengths = split_pad_sequence(seq)
        collated_data += segments
        # Every segment cut from a trajectory inherits that trajectory's label.
        label_value = int(np.asarray(label).item())
        collated_labels += [label_value] * len(segments)
        collated_lengths += lengths

    stacked = np.stack(collated_data)
    labels = np.array(collated_labels, dtype=np.int64)[:, np.newaxis]
    lengths = np.array(collated_lengths, dtype=np.int32)

    # Arrays are passed positionally, so each archive stores them under numpy's
    # default key.
    np.savez_compressed(output_folder / f"{split}_flyvfly_data.npz", stacked)
    np.savez_compressed(output_folder / f"{split}_flyvfly_labels.npz", labels)
    # Per-segment true lengths are computed above but their export is disabled:
    # np.savez_compressed(output_folder / f"{split}_flyvfly_lengths.npz", lengths)

    print(f"{split}: data {stacked.shape} (dtype={stacked.dtype}), labels {labels.shape}, n-unique labels {len(np.unique(labels))}")


train: data (5341, 300, 53) (dtype=float32), labels (5341, 1), n-unique labels 7
val: data (629, 300, 53) (dtype=float32), labels (629, 1), n-unique labels 7
test: data (1050, 300, 53) (dtype=float32), labels (1050, 1), n-unique labels 7


In [4]:
# Convert the raw basketball .npy files into compressed .npz archives with
# normalised names ("<split>_bball_data.npz" / "<split>_bball_labels.npz").
bball_folder = Path("./data/basketball_classification/bball_process/")
# Maps the type suffix found in a raw file name to the suffix used on export.
map_types = {
    "data" : "data",
    "label" : "labels",
    "labels" : "labels",
}

# The output directory is the same for every file, so create it once up front.
bball_output_folder = bball_folder.parent / "bball"
bball_output_folder.mkdir(parents=True, exist_ok=True)

for file_path in sorted(bball_folder.glob("*.npy")):
    # After dropping "_basket" the stem must be exactly "<split>_<type>";
    # anything else fails loudly here (ValueError) or on the lookup (KeyError).
    split, typ = file_path.stem.replace("_basket", "").split("_")
    typ = map_types[typ]
    save_path = bball_output_folder / f"{split}_bball_{typ}.npz"
    # NOTE(review): allow_pickle=True unpickles the file contents, which can run
    # arbitrary code; only load files from a trusted source.
    if typ == "data":
        data = np.load(file_path, allow_pickle=True).astype(np.float32)
        print(save_path.stem, data.shape, data.dtype)
        np.savez_compressed(save_path, data)
    # Equality, not `in`: `typ in "labels"` is a substring test and would also
    # accept values such as "label", "l" or "".
    elif typ == "labels":
        labels = np.load(file_path, allow_pickle=True).astype(np.int64)
        print(save_path.stem, labels.shape, len(np.unique(labels)), labels.dtype)
        np.savez_compressed(save_path, labels)


test_bball_data (2693, 25, 22) float32
test_bball_labels (2693, 25, 1) 6 int64
train_bball_data (18000, 25, 22) float32
train_bball_labels (18000, 25, 1) 6 int64
valid_bball_data (2801, 25, 22) float32
valid_bball_labels (2801, 25, 1) 6 int64


In [5]:
calms21 = Path("data") / "mice_classification" / "calms21_task1"

# Show the shapes of the CalMS21 training data and investigation labels as a (data, labels) pair.
tuple(
    np.load(calms21 / file_name, allow_pickle=True).shape
    for file_name in ("train_data.npy", "train_investigation_labels.npy")
)


((5000, 13, 18), (5000, 1))

In [7]:
crim13 = Path("data") / "mice_classification" / "crim13"

# Show the shapes of the CRIM13 training data and labels as a (data, labels) pair.
tuple(
    np.load(crim13 / file_name, allow_pickle=True).shape
    for file_name in ("train_crim13_data.npy", "train_crim13_labels.npy")
)

((12404, 100, 19), (12404, 100))

In [13]:
# Shape check of the CRIM13 training data against the "labels_other" file.
# Relies on `crim13` having been defined by an earlier cell.
tuple(
    np.load(crim13 / file_name, allow_pickle=True).shape
    for file_name in ("train_crim13_data.npy", "train_crim13_labels_other.npy")
)

((12404, 100, 19), (12404, 100))