In [23]:
from IPython.display import Video
import json
import os
import pandas as pd
import numpy as np
import h5py
import torch

pd.set_option("display.max_colwidth",1000)

def load_json(path):
    with open(path) as f:
        r = json.load(f)
    return r

def read_csv(path):
    data = pd.read_csv(path)
    data = data.loc[:, ['video','question','answer','qid','type','a0','a1','a2','a3','a4', 'width', 'height']]
    
    return data

def get_wrong_ans(file):
    wrong_qns = []
    result_file = load_json(file)
    for key in result_file.keys():
        if result_file[key]['prediction'] != result_file[key]['answer']:
            vid, qid = key.split('_')
            predicted_ans = result_file[key]['prediction']
            wrong_qns.append({'vid':int(vid), 
                              'qid':int(qid), 
                              'predicted_ans':predicted_ans})
    return wrong_qns

def get_video_qns(csv_data, map_vid, vid, qid):
    video_root = "../../data/raw_data/video"
    qns_data = csv_data.loc[(csv_data['video']== vid) & (csv_data['qid']== qid)]
    v = Video(os.path.join(video_root,map_vid.get(str(vid))+'.mp4'), width=320, height=240)
    return v, qns_data

def transform_bb(roi_bbox, width, height):
    dshape = list(roi_bbox.shape)
    tmp_bbox = roi_bbox.reshape([-1, 4])
    relative_bbox = tmp_bbox / np.asarray([width, height, width, height])
    relative_area = (tmp_bbox[:, 2] - tmp_bbox[:, 0] + 1) * \
                    (tmp_bbox[:, 3] - tmp_bbox[:, 1] + 1)/ (width*height)
    relative_area = relative_area.reshape(-1, 1)
    bbox_feat = np.hstack((relative_bbox, relative_area))
    dshape[-1] += 1
    bbox_feat = bbox_feat.reshape(dshape)

    return bbox_feat

def get_video_feats(video_name, bbox_feats, frame_feats, mot_feats, width, height):
    video_feature = []
    
    roi_feat = bbox_feats[video_name][0]
    roi_feat = torch.from_numpy(roi_feat).type(torch.float32)
    roi_bbox = bbox_feats[video_name][1]

    bbox_feat = transform_bb(roi_bbox, width, height)
    bbox_feat = torch.from_numpy(bbox_feat).type(torch.float32)

    region_feat = torch.cat((roi_feat, bbox_feat), dim=-1)
    video_feature.append(region_feat)


    temp_feat = frame_feats[video_name]
    app_feat = torch.from_numpy(temp_feat).type(torch.float32)
    video_feature.append(app_feat)

    temp_feat = mot_feats[video_name]
    mot_feat = torch.from_numpy(temp_feat).type(torch.float32)
    video_feature.append(mot_feat)

    return video_feature

In [24]:
vid2path = load_json('../dataset/nextqa/map_vid_vidorID.json')
csv_data = read_csv('../dataset/nextqa/val.csv')

# load video feats
with h5py.File('../../data/nextqa/region_feat_n/region_16c20b_val.h5', 'r') as fp:
    bbox_feats = {}
    vids = fp['ids']
    feats = fp['feat']
    print(f'Loading region feats: {feats.shape}')  # v_num, clip_num, frame_per_clip, region_per_frame, feat_dim
    bboxes = fp['bbox']
    for id, (vid, feat, bbox) in enumerate(zip(vids, feats, bboxes)):
        # (clip, frame, bbox, feat), (clip, frame, bbox, coord)     
        bbox_feats[str(vid)] = (feat[:, :, :20, :], bbox[:, :, :20,:])  

with h5py.File('../../data/nextqa/frame_feat/app_feat_val.h5', 'r') as fp:
    frame_feats = {}
    vids = fp['ids']
    feats = fp['resnet_features']
    print(f'Loading frame feats: {feats.shape}')  # v_num, clip_num, frame_per_clip, feat_dim
    for id, (vid, feat) in enumerate(zip(vids, feats)):
        # self.frame_feats[str(vid)] = feat[::2]
        frame_feats[str(vid)] = feat

with h5py.File('../../data/nextqa/mot_feat/mot_feat_val.h5', 'r') as fp:
    mot_feats = {}
    vids = fp['ids']
    feats = fp['resnext_features']
    print(f'Loading mot feats: {feats.shape}')  # v_num, clip_num, feat_dim
    for id, (vid, feat) in enumerate(zip(vids, feats)):
        mot_feats[str(vid)] = feat

Loading region feats: (570, 16, 4, 20, 2048)
Loading frame feats: (570, 16, 4, 2048)
Loading mot feats: (570, 16, 2048)


In [47]:
data = get_wrong_ans("../results/nextqa/HQGA-bert-16c20b-2L05GCN-FCV-AC-ZJ-7c8s-nope&splited-aXQGA-val.json")

In [58]:
csv_data_video = csv_data[csv_data['video']==data[1000]['vid']]
csv_data_video

Unnamed: 0,video,question,answer,qid,type,a0,a1,a2,a3,a4,width,height
974,4970148391,what activity is the boy doing,0,7,TC,baseball,talking,jump,showing us an experiment,watching television,640,480
1072,4970148391,what is the boy hitting that it splashes water on his face,0,1,DO,water balloon,snow ball,milk pack,a cup of drink,beer bottle,640,480
1713,4970148391,what does the boy do after hitting the second balloon,2,10,TN,move his hand,start dancing,drop bat and take off cap,take off goggles,shake the man s hand,640,480
2069,4970148391,what does the boy do after hitting the first balloon,1,9,TN,stop crying,remove spectacles,look at mirror,hold railing of bed,play with toys,640,480
2323,4970148391,why does the boy removes goggles after hitting for first time,3,2,CW,catch his balance,to show man,take water out of nose,wipe water off,to get up,640,480
2662,4970148391,where is the boy hanging out,1,6,DL,indoor,field,on a platform,restaurant,beach,640,480
3792,4970148391,why is the boy dancing with the base bat in the beginning,1,0,CW,dropped his toy,playing it,talk to the person,practicing swing,doing a routine,640,480
4003,4970148391,why does the boy removes his cap after hitting for second time,4,3,CW,rub bat,touch the bed,give to man,kick ball,shake off water spill,640,480
4242,4970148391,what does the boy do when the balloon approaches,1,4,TC,run to trampoline,hit it,lean and look forward,throw bat,move baby up down,640,480
4374,4970148391,what happens to the water ball after being hit,0,5,TN,burst,pick up and push boys away,pass it around,straighten up,turn back,640,480


In [45]:
vid = 4970148391
qid = 1
v, qns = get_video_qns(csv_data, vid2path, vid, qid)
bbox_feat, frame_feat, mot_feat = get_video_feats(str(vid), bbox_feats, frame_feats, mot_feats, qns['width'].item(), qns['height'].item())

In [46]:
bbox_feat.shape


torch.Size([16, 4, 20, 2053])

In [59]:
tn_list = []
qs_list = []
out_dict = {'video':[],
        'question':[],
        'answer':[],
        'qid':[],
        'type':[],
        'a0':[],
        'a1':[],
        'a2':[],
        'a3':[],
        'a4':[]
       }
  
df = pd.DataFrame(out_dict)
for i in data:
    qns_data = csv_data_video.loc[(csv_data['video']== i['vid']) & (csv_data_video['qid']== i['qid'])]
    if not qns_data.empty:
        df = pd.concat([df, qns_data])
        tn_list.append(i)
#         assert 1==0 

In [60]:
vid, qid, pred_ans = tn_list[np.random.choice(range(len(tn_list)))].values()
v, qns = get_video_qns(csv_data, vid2path, vid, qid)
v

In [61]:
df

Unnamed: 0,video,question,answer,qid,type,a0,a1,a2,a3,a4,width,height
1072,4970148000.0,what is the boy hitting that it splashes water on his face,0.0,1.0,DO,water balloon,snow ball,milk pack,a cup of drink,beer bottle,640.0,480.0
2069,4970148000.0,what does the boy do after hitting the first balloon,1.0,9.0,TN,stop crying,remove spectacles,look at mirror,hold railing of bed,play with toys,640.0,480.0
2323,4970148000.0,why does the boy removes goggles after hitting for first time,3.0,2.0,CW,catch his balance,to show man,take water out of nose,wipe water off,to get up,640.0,480.0
3792,4970148000.0,why is the boy dancing with the base bat in the beginning,1.0,0.0,CW,dropped his toy,playing it,talk to the person,practicing swing,doing a routine,640.0,480.0
4003,4970148000.0,why does the boy removes his cap after hitting for second time,4.0,3.0,CW,rub bat,touch the bed,give to man,kick ball,shake off water spill,640.0,480.0
4242,4970148000.0,what does the boy do when the balloon approaches,1.0,4.0,TC,run to trampoline,hit it,lean and look forward,throw bat,move baby up down,640.0,480.0
4597,4970148000.0,how does the boy keep the bat stable on the ground,0.0,8.0,CH,lean against his body,sit on cushion,put on stone,swing chair is protected,wear shoes,640.0,480.0


In [62]:
tn_list

[{'vid': 4970148391, 'qid': 1, 'predicted_ans': 1},
 {'vid': 4970148391, 'qid': 9, 'predicted_ans': 0},
 {'vid': 4970148391, 'qid': 2, 'predicted_ans': 0},
 {'vid': 4970148391, 'qid': 0, 'predicted_ans': 4},
 {'vid': 4970148391, 'qid': 3, 'predicted_ans': 0},
 {'vid': 4970148391, 'qid': 4, 'predicted_ans': 3},
 {'vid': 4970148391, 'qid': 8, 'predicted_ans': 2}]

In [33]:
sents = []
count = 0
for qsent in csv_data['question']:
    if "before" in qsent:
        count += 1
    elif "after" in qsent:
        count += 1
    sents.append(qsent)
print(len(sents))
print(count)

4996
1172


In [34]:
count_wrong = 0
for i in data:
    qns_data = csv_data.loc[(csv_data['video']== i['vid']) & (csv_data['qid']== i['qid'])]
    if not qns_data.empty:
        for item in qns_data['question']:
            if "before" in item or "after" in item:
                count_wrong += 1
print(count_wrong)

621


In [35]:
len(data)

2445