# VELOCITI: Data Exploration Notebook

The goal of this notebook is to provide an interface for interactively exploring the data behind the various tests in the benchmark.

### Pre-Requisites
To interact with this notebook, you need the dataset (including the videos) and the provided `.json` files.

Ensure that the following directories exist:

- `videos/` containing 
  - Folder `velociti_videos_10s` (containing $900$  `.mp4` files).
  - Folder `velociti_videos_4s` (containing $900$ sub-folders with 2 `.mp4` files in each sub-folder)

- `data/` containing

```
  ├── action_adv.json
  ├── action_bind.json
  ├── action_mod.json
  ├── agent_bind.json
  ├── agent_iden.json
  ├── control.json
  ├── coref.json
  ├── pos_caps.json
  ├── sequence.json
  └── vidsitu_dict.json
```
  

### Imports

In [1]:
import html
import json
from os.path import join

from easydict import EasyDict as edict
from IPython.display import HTML, display
from torch.utils.data import Dataset

In [2]:
# Locations of the benchmark assets, given as relative paths.
# Every entry except "videos_path" names a JSON annotation file under data/.
# Note that the 'sequence' test reads its annotations through the "seq" key.
data_dict = dict(
    vidsitu_dict_path='data/vidsitu_dict.json',
    videos_path='videos/',
    agent_iden='data/agent_iden.json',
    agent_bind='data/agent_bind.json',
    action_adv='data/action_adv.json',
    action_mod='data/action_mod.json',
    coref='data/coref.json',
    seq='data/sequence.json',
    action_bind='data/action_bind.json',
)

# 1. IVAT: Intra-Video Association Test Samples

#### Helper Code

In [3]:
class ivatDataset(Dataset):
    """
    Dataset class for event-caption-matching task (IVAT samples).

    Each item pairs the two clips of one video, located at
    `<videos_path>/velociti_videos_4s/<vid>/<vid>_p1.mp4` and `..._p2.mp4`,
    with the positive captions stored under the 'Ev1' and 'Ev5' entries of
    the VidSitu dictionary.
    """
    def __init__(self, data_dict):
        """
        data_dict -> dict : containing paths of all data. Only the
                            `vidsitu_dict_path` and `videos_path` entries
                            are read by this class.
        """
        self.data_dict = edict(data_dict)

        # A context manager closes the file deterministically instead of
        # leaving the handle to the garbage collector.
        with open(self.data_dict.vidsitu_dict_path, 'r', encoding='utf-8') as vidsitu_file:
            self.vidsitu_dict = json.load(vidsitu_file)

        self.videos_path = join(self.data_dict.videos_path, 'velociti_videos_4s')
        # Index -> video name; the order follows the key order of the JSON file.
        self.vid_list = list(self.vidsitu_dict.keys())

    def __getitem__(self, idx):
        """
        idx -> int : position in `self.vid_list`.

        Returns a dict with the video name, a fixed '(Ev1, Ev5)' event label,
        the paths of both clips and the positive caption of each event.
        """
        vid_name = self.vid_list[idx]
        events = self.vidsitu_dict[vid_name]

        # Each video has its own sub-folder holding the two clip files.
        vid_dir = join(self.videos_path, vid_name)

        return {
            'vid_name': vid_name,
            'event': '(Ev1, Ev5)',
            "vid_part_1_path": join(vid_dir, vid_name + "_p1.mp4"),
            "vid_part_2_path": join(vid_dir, vid_name + "_p2.mp4"),
            "pos_cap_ev1": events['Ev1']['pos'],
            "pos_cap_ev5": events['Ev5']['pos']
        }

    def __len__(self):
        """Number of videos listed in the VidSitu dictionary."""
        return len(self.vid_list)
    

class negDataset(Dataset):
    """
    Dataset for video-to-text tasks.

    Each item is one (video, event) pair from the annotation file selected by
    `neg_sampling`, exposing the video path inside `velociti_videos_10s`
    together with the event's positive ('pos') and negative ('neg') caption.

    TODO (note kept from the original author): make this return all
    possibilities of captions.
    """

    # Maps every supported `neg_sampling` name to the `data_dict` entry that
    # holds the path of its annotation file. Only 'sequence' differs ('seq').
    _SAMPLING_TO_KEY = {
        'agent_iden': 'agent_iden',
        'agent_bind': 'agent_bind',
        'action_adv': 'action_adv',
        'action_mod': 'action_mod',
        'action_bind': 'action_bind',
        'coref': 'coref',
        'sequence': 'seq',
    }

    def __init__(self, data_dict, neg_sampling):
        """
        data_dict    -> dict : containing paths of all data.
        neg_sampling -> str  : name of the test to load; one of the keys of
                               `_SAMPLING_TO_KEY`.

        Raises ValueError when `neg_sampling` is not a supported test name.
        """
        self.data_dict = edict(data_dict)
        self.neg_sampling = neg_sampling
        self.videos_path = join(self.data_dict.videos_path, 'velociti_videos_10s')

        # Fail early with a readable message instead of crashing later on an
        # unbound local variable.
        if neg_sampling not in self._SAMPLING_TO_KEY:
            raise ValueError(
                f"Unknown neg_sampling {neg_sampling!r}; "
                f"expected one of {sorted(self._SAMPLING_TO_KEY)}"
            )

        annotation_path = getattr(self.data_dict, self._SAMPLING_TO_KEY[neg_sampling])
        # A context manager closes the file deterministically instead of
        # leaving the handle to the garbage collector.
        with open(annotation_path, 'r', encoding='utf-8') as annotation_file:
            self.ev_data = json.load(annotation_file)

        # Flatten {video: {event: ...}} into an indexable list of (video, event).
        self.vid_ev_list = [
            (vid, ev)
            for vid, events in self.ev_data.items()
            for ev in events
        ]

    def __getitem__(self, idx):
        """
        idx -> int : position in `self.vid_ev_list`.

        Returns a dict with the video name, its path, the event key and the
        positive / negative caption of that event.
        """
        vid_name, ev = self.vid_ev_list[idx]
        captions = self.ev_data[vid_name][ev]

        return {
            'vid_name': vid_name,
            'vid_path': join(self.videos_path, vid_name + ".mp4"),
            'event': ev,
            "pos_cap": captions['pos'],
            "neg_cap": captions['neg']}

    def __len__(self):
        """Total number of (video, event) pairs."""
        return len(self.vid_ev_list)

def display_videos_with_captions_2t2v(data):
    """
    Show the two clips of a sample, each followed by its caption.

    data -> dict : must provide 'vid_part_1_path', 'vid_part_2_path',
                   'pos_cap_ev1' and 'pos_cap_ev5'.

    Paths and captions are HTML-escaped before being inserted into the
    markup, so characters such as `<`, `&` or quotes are shown literally
    instead of breaking the rendered output.
    """
    # HTML template for displaying video and caption
    template = '''
    <div style="margin-bottom: 20px; padding: 10px;">
        <video width="640" height="480" controls>
          <source src="{video_path}" type="video/mp4">
        Your browser does not support the video tag.
        </video>
        <div style="background-color: #FFF44F; color: black; font-size: 18px; padding: 5px; margin-top: 5px;">
        <p style="font-size: 24px;">{caption}</p>
        </div>
    </div>
    '''

    # (clip path, caption) pairs in display order.
    clips = (
        (data['vid_part_1_path'], data['pos_cap_ev1']),
        (data['vid_part_2_path'], data['pos_cap_ev5']),
    )

    # str() keeps non-string values renderable, as str.format() did before.
    markup = "".join(
        template.format(
            video_path=html.escape(str(path)),
            caption=html.escape(str(caption)),
        )
        for path, caption in clips
    )

    # Display the aggregated HTML content in the Jupyter notebook
    display(HTML(markup))
    

def display_video_with_captions_2t1v(data):
    """
    Show one video followed by its positive and its negative caption.

    data -> dict : must provide 'vid_path', 'pos_cap' and 'neg_cap'.

    The positive caption is drawn on a green background and the negative one
    on a red background. Path and captions are HTML-escaped before being
    inserted into the markup, so characters such as `<`, `&` or quotes are
    shown literally instead of breaking the rendered output.
    """
    # HTML template for displaying video and two captions with different background colors
    template = '''
    <div style="margin-bottom: 20px; padding: 10px;">
        <video width="640" height="480" controls>
          <source src="{video_path}" type="video/mp4">
        Your browser does not support the video tag.
        </video>
        <div style="background-color: #90EE90; color: black; font-size: 24px; padding: 10px; margin-top: 10px;">
            {pos_caption}
        </div>
        <div style="background-color: #FF6347; color: black; font-size: 24px; padding: 10px; margin-top: 10px;">
            {neg_caption}
        </div>
    </div>
    '''

    # str() keeps non-string values renderable, as str.format() did before.
    markup = template.format(
        video_path=html.escape(str(data['vid_path'])),
        pos_caption=html.escape(str(data['pos_cap'])),
        neg_caption=html.escape(str(data['neg_cap'])),
    )

    # Display the HTML in the Jupyter notebook
    display(HTML(markup))

# Build every dataset once; the browse helpers defined below look them up by name.
ivat_data = ivatDataset(data_dict)
(
    action_adv_data,
    action_mod_data,
    agent_bind_data,
    agent_ident_data,
    action_bind_data,
    agent_coref_data,
    seq_data,
) = (
    negDataset(data_dict, sampling_name)
    for sampling_name in (
        'action_adv',
        'action_mod',
        'agent_bind',
        'agent_iden',
        'action_bind',
        'coref',
        'sequence',
    )
)

def browse_ivat_sample(sample_idx, dataset=ivat_data):
    """
    Print a header for one IVAT sample and display its two clips with captions.

    sample_idx -> int : index into `dataset`.
    dataset           : indexable collection whose items carry 'vid_name' and
                        the keys read by `display_videos_with_captions_2t2v`;
                        defaults to `ivat_data`.
    """
    sample = dataset[sample_idx]
    # Include the video name, matching the header printed by browse_neg_sample.
    print(f"Video Name: {sample['vid_name']} | SAMPLE INDEX {sample_idx}")
    display_videos_with_captions_2t2v(sample)

def browse_neg_sample(sampling, sample_idx, dataset=action_adv_data):
    """
    Print a header for one positive/negative caption sample and display it.

    sampling   -> str : test name; a recognised name selects the matching
                        pre-built dataset and overrides `dataset`.
    sample_idx -> int : index into the selected dataset.
    dataset           : used only when `sampling` is not a recognised name.
    """
    datasets_by_sampling = {
        'action_adv': action_adv_data,
        'action_mod': action_mod_data,
        'agent_bind': agent_bind_data,
        'agent_iden': agent_ident_data,
        'action_bind': action_bind_data,
        'coref': agent_coref_data,
        'sequence': seq_data,
    }
    # Unrecognised names fall back to the `dataset` argument.
    selected = datasets_by_sampling.get(sampling, dataset)

    sample = selected[sample_idx]
    print(f"Video Name: {sample['vid_name']} | SAMPLE INDEX {sample_idx}")
    display_video_with_captions_2t1v(sample)


# Start Browsing
### Any Index from `[0, 719)` is Valid.

In [4]:
browse_ivat_sample(30)

Video Name: SAMPLE INDEX 30


In [4]:
browse_ivat_sample(50)

Video Name: SAMPLE INDEX 50


# 2. Action Adversarial Test Samples
### Any Index from `[0, 500)` is Valid.

In [5]:
browse_neg_sample('action_adv', sample_idx=20)

Video Name: v_1cCEE8-jhus_seg_60_70 | SAMPLE INDEX 20


# 3. Action Modifier Test Samples
### Anything from `[0, 514)` is a valid index.

In [6]:
browse_neg_sample('action_mod', sample_idx=22)

Video Name: v_3WCcFVnEKh0_seg_60_70 | SAMPLE INDEX 22


In [7]:
browse_neg_sample('action_mod', sample_idx=513)

Video Name: v_zz0w0KDwhWg_seg_65_75 | SAMPLE INDEX 513


# 4. Agent Binding Test Samples
### Anything from `[0, 1676)` is a valid index.

In [8]:
browse_neg_sample('agent_bind', sample_idx=11)

Video Name: v_-vE1JNGKvxQ_seg_15_25 | SAMPLE INDEX 11


# 5. Agent Identification Test
### Anything in range `[0, 1000)` is a valid index.

In [9]:
browse_neg_sample('agent_iden', sample_idx=10)

Video Name: v_05-e-YTw4r8_seg_110_120 | SAMPLE INDEX 10


In [16]:
browse_neg_sample('agent_iden', sample_idx=999)

Video Name: v_zz0w0KDwhWg_seg_85_95 | SAMPLE INDEX 999


# 6. Action Binding Test
### Anything from `[0, 1625)` is a valid index.

In [10]:
browse_neg_sample('action_bind', sample_idx=11)

Video Name: v_Vc_4eoSHwK8_seg_25_35 | SAMPLE INDEX 11


In [18]:
browse_neg_sample('action_bind', sample_idx=1624)

Video Name: v_S-2cloMm4Lk_seg_95_105 | SAMPLE INDEX 1624


# 7. Agent Co-Reference Test Samples
### Anything from `[0, 418)` is a valid index.

In [11]:
browse_neg_sample('coref', sample_idx=11)

Video Name: v_bZq6Gv7rP0w_seg_65_75 | SAMPLE INDEX 11


In [20]:
browse_neg_sample('coref', sample_idx=417)

Video Name: v_MnosttqGIfw_seg_145_155 | SAMPLE INDEX 417


# 8. Chronology Test 
### Anything from `[0, 1908)` is a valid index.

In [14]:
browse_neg_sample('sequence', sample_idx=32)

Video Name: v_t1TC-pegncQ_seg_60_70 | SAMPLE INDEX 32


In [22]:
browse_neg_sample('sequence', sample_idx=1907)

Video Name: v_S-2cloMm4Lk_seg_95_105 | SAMPLE INDEX 1907
