# <u>T16 HOI HUB</u>

## Import Libraries

In [1]:
from ipywidgets import widgets
from IPython.display import display, HTML
import os
import io
import base64
import json
from tqdm.notebook import trange, tqdm ###

## Data Exploration

In [13]:
video_folder = os.listdir('data/video')

list_of_video = {}
for video in video_folder:
    # remove hidden file especially for macOS
    if not video.startswith('.'):
        list_of_video[video] = video

## Create Caption files

In [14]:
# read action and frame object from file
def readCaptionFile(filename,videoName):
    # Opening JSON file
    f = open(filename)

    # returns JSON object as 
    # a dictionary
    data = json.load(f)

    # Iterating through the json
    # Closing file
    f.close()
    return data[videoName]["actions"]

# convert frame to time
def convertFrameToTime(frame):
    seconds = int(frame/25)
    minutes = "00"
    if seconds >= 60:
        minutes = str(seconds // 60)
        seconds = seconds % 60
    if len(minutes) == 1:
        minutes = "0" + minutes
    seconds = str(seconds)
    #may need handle hour
    if len(seconds) == 1:
        seconds = "0" + seconds 
    return (minutes + ":" + seconds + ".000")

# read reference text from txt file
def readReferenceFile(refFile):
    referenceDict = {}
    with open(refFile) as f:
        lines = f.readlines()
    for i in lines:
        x = i.split()
        referenceDict[str(x[0])] = x[1]
    return referenceDict

# create caption file
def formatCaptionFile(captionList, reference, captionPath):
    start = "WEBVTT\n\n"
    captions = []
    for i in captionList:
        text = reference[str(i[0])]
        lines = convertFrameToTime(i[1]) + " --> " + convertFrameToTime(i[2]) + "\n" + text + "\n\n"
        captions.append(lines)
    f = open(captionPath, "w")
    f.write(start)
    f.writelines(captions)
    f.close()

In [15]:
video_dropdown = widgets.Dropdown(
    options = list_of_video,
    description = 'Videos',
)

video_src = 'data/video/' + video_dropdown.value

def play_video(video_src,caption_src):
    video = io.open(video_src, 'r+b').read()
    encoded = base64.b64encode(video)
    return(HTML(data='''<video width="650" height="360" controls>
        <source src="data:video/mp4;base64,{0}" type="video/mp4" />
        <track kind="captions" src={1} srclang="en" label="English" default>
        </video>'''.format(encoded.decode('ascii'),caption_src)))


# video dropdown onchange function
def video_on_change(change):
    if change['type'] == 'change' and change['name'] == 'value':
        change['new']
        global video_src 
        video_src = 'data/video/'+ video_dropdown.value
        
          
# display video dropdown
video_dropdown.observe(video_on_change)
display(video_dropdown) 


Dropdown(description='Videos', options={'P02T01C07.vtt': 'P02T01C07.vtt', 'P02T01C06.vtt': 'P02T01C06.vtt', 'P…

In [8]:
# caption
videoName = video_dropdown.value.split(".")
#location of reference are place at root
ref = readReferenceFile('all_labels.txt')
# may need change the caption path to dynamic
captionPath = "data/video/" + videoName[0] + ".vtt"
# model result file should be some directory, here using root 
captionList = readCaptionFile('smarthome_CS_51.json',videoName[0])
formatCaptionFile(captionList,ref,captionPath)


video = video_src.split('/')[-1]
print("Currently playing : " + video)
play_video(video_src, captionPath)

## Extract i3D features from video

In [2]:
import models

In [3]:
from pathlib import Path
from omegaconf import OmegaConf

In [4]:
# after the desired videos are selected
video_paths: list[str] = ["../data/RGB_Video_MP4/P02T01C06.mp4"]

In [5]:
i3d_defaults = OmegaConf.load(Path("feature_extractor/configs/i3d.yml"))
i3d_config = OmegaConf.merge(i3d_defaults, OmegaConf.create({
    "feature_type": "i3d",
    "streams": "rgb",
    "output_path": "/media/starlight/2c72c05a-ec96-4c96-ba3c-50ae4bc6730b/home/starlight/TSU/data/RGB_i3d_test",
    "video_paths": video_paths,
    "on_extraction": "save_numpy",
    "stack_size": 16,
    "step_size": 16
}))
extractor = models.ExtractI3D(i3d_config)

In [7]:
for video in tqdm(video_paths, desc="videos extracted"):
    extractor._extract(video)

  0%|          | 0/1 [00:00<?, ?it/s]

## Inference

In [12]:
import torch
# Example: to be modified
# lets say TSU has been selected
from TSU_PDAN import HOI_PDAN
from HOI.smarthome_i3d_per_video import TSU as Dataset
from HOI.smarthome_i3d_per_video import TSU_collate_fn as collate_fn

Random_SEED!!!: 0


In [3]:
# TODO: v-iashin
# lets say TSU smarthome is then selected (in practice, a custom json is generated depending on what specific videos are selected)
val_dataset = Dataset("HOI/data/smarthome_CS_51.json", 'testing', "../data/RGB_i3d_16frames_64000_SSD", 1, 51)
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=1, shuffle=True, num_workers=4,
                                                pin_memory=True, collate_fn=collate_fn)
# val_dataloader.root = args.rgb_root

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 536/536 [00:05<00:00, 95.44it/s]


In [4]:
modelrunner = HOI_PDAN()
modelrunner.PDAN_training_parameters()
modelrunner.model.load_state_dict(torch.load("HOI/PDAN/weight_epoch_0.0002_35"))

  init.kaiming_normal(self.key_conv.weight, mode='fan_out')
  init.kaiming_normal(self.value_conv.weight, mode='fan_out')
  init.kaiming_normal(self.query_conv.weight, mode='fan_out')
  init.normal(self.rel_t, 0, 1)


<All keys matched successfully>

In [5]:
result = None
# note: this doesn't appear to show up properly in vscode
with tqdm(val_dataloader, unit='batch') as progressive_loader:
    result = modelrunner.evaluate(progressive_loader)
result

  0%|          | 0/185 [00:00<?, ?batch/s]

  rg = torch.range(1, self.scores.size(0)).float()
  ap[k] = precision[truth.byte()].sum() / max(truth.sum(), 1)


val-map: tensor(34.1454)
tensor([46.0981, 53.4290, 44.6538, 51.1127, 43.1801, 46.4834, 37.1899, 31.5516,
        42.9660, 21.3682, 17.6236, 13.5470,  1.1666, 41.1923, 49.6549,  3.8188,
        19.1954, 59.6317,  5.5071, 71.6044, 36.5219, 74.4247, 54.4266,  3.0952,
         0.8215, 62.6654, 35.2815, 45.3404, 59.1798, 16.6004,  7.4304,  0.3018,
        33.3252,  3.7282, 43.4654, 88.7000, 37.3753, 18.4837, 25.2225,  5.6187,
        71.5390, 33.2785, 13.1370, 62.3055,  3.1802, 80.0095, 72.6115, 29.4839,
        18.2610,  3.3478,  1.2794])


tensor(34.1454)

## Training HOI ML Model

In [8]:
train_dataset = Dataset("HOI/data/smarthome_CS_51.json", 'training', "../data/RGB_i3d_16frames_64000_SSD", 1, 51)
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=1, shuffle=True, num_workers=4,
                                                pin_memory=True, collate_fn=collate_fn)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 536/536 [00:26<00:00, 19.90it/s]


In [14]:
with trange(0,5, unit='epoch', desc='epochs') as epoch_range:
    with tqdm(train_dataloader, unit='batch', desc='training') as train_loader:
        with tqdm(val_dataloader, unit='batch', desc='validating') as val_loader:
            [model for model in \
             modelrunner.train(train_dataloader=train_loader, val_dataloader=val_loader, epoch_range=epoch_range)]

  0%|          | 0/5 [00:00<?, ?epoch/s]

training:   0%|          | 0/351 [00:00<?, ?batch/s]

validating:   0%|          | 0/185 [00:00<?, ?batch/s]

KeyboardInterrupt: 

## Evaluate