# Generate language (question + answer) features with CLIP

In [1]:
import clip
import torch
import pandas as pd
import numpy as np
import json
import h5py
import os
from tqdm.notebook import tqdm

In [2]:
def read_csv(path):
    """Load a QA annotation CSV and keep only the columns used by this notebook.

    Args:
        path: location of the CSV file, handed straight to ``pd.read_csv``.

    Returns:
        A DataFrame holding, in this order, the ``video``, ``question``,
        ``answer``, ``qid`` and ``type`` columns followed by the five
        candidate-answer columns ``a0`` .. ``a4``.
    """
    wanted_columns = ['video','question','answer','qid','type','a0','a1','a2','a3','a4']
    frame = pd.read_csv(path)
    return frame.loc[:, wanted_columns]

def vaild_text_length(token):
    """Count the leading non-zero ids in the first sequence of ``token``.

    Args:
        token: indexable batch of token-id sequences; only ``token[0]`` is read.

    Returns:
        The position of the first id equal to 0 in ``token[0]``, or the full
        length of that sequence when it contains no 0.
    """
    first_sequence = token[0]
    for position, token_id in enumerate(first_sequence):
        if token_id == 0:
            # First padding id found: everything before it is real text.
            return position
    return len(first_sequence)

def encode_text(model, text):
    """Run ``text`` through the model's text tower and return per-token features.

    The pipeline is: tokenize -> token embedding -> add positional embedding
    -> transformer -> ``ln_final``. Unlike the pooled variant kept in the
    commented-out line below, no single token is selected and no text
    projection is applied, so one feature vector per token position is returned.

    Args:
        model: object exposing ``token_embedding``, ``positional_embedding``,
            ``transformer``, ``ln_final`` and ``dtype`` (e.g. the model
            returned by ``clip.load``).
        text: input accepted by ``clip.tokenize``.

    Returns:
        ``(x, text_length)`` where ``x`` is the output of ``ln_final`` cast to
        ``model.dtype`` and ``text_length`` is the value computed by
        ``vaild_text_length`` on the tokenized input.
    """
    text_token = clip.tokenize(text)
    # Count the tokens before moving them to the model's device: the count
    # walks the ids one by one, which is needlessly slow on a GPU tensor.
    text_length = vaild_text_length(text_token)
    # The ids are fed to the embedding layer, so they must live on the same
    # device as its weights; follow the model instead of hard-coding 'cuda'.
    text_token = text_token.to(model.token_embedding.weight.device)
    x = model.token_embedding(text_token).type(model.dtype)
    x = x + model.positional_embedding.type(model.dtype)
    # Swap the first two axes for the transformer, then swap them back.
    x = x.permute(1,0,2)
    x = model.transformer(x)
    x = x.permute(1,0,2)
    x = model.ln_final(x).type(model.dtype)
    # x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ self.text_projection
    return x, text_length

In [3]:
# Load the 'ViT-L/14@336px' CLIP checkpoint on the GPU. `clip_model` is the model
# handed to encode_text in the encoding loop; `preprocess` is the second value
# returned by clip.load.
clip_model, preprocess = clip.load('ViT-L/14@336px', device='cuda')

In [12]:
mode = 'train'  # dataset split to process; one of [train, val, test]
# Annotation table for the selected split (read_csv keeps only the QA columns).
csv_data = read_csv(f'../dataset/nextqa/{mode}.csv')
# HDF5 file that will receive the encoded question+answer features for this split.
h5_file = f'../../data/nextqa/qas_bert/clip_ft_{mode}.h5'

In [13]:
# Encode every (question, candidate answer) pair of the split and store the
# per-token features together with the token counts in `h5_file`.

# Fail fast if the destination directory cannot be created, rather than after
# the whole (long) encoding pass.
out_dir = os.path.dirname(h5_file)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)

# Marker inserted between the question and the candidate answer.
eot_token = '<|endoftext|>'

qas_feats = []
qas_length = []
# Only the forward pass is needed, so do not record an autograd graph.
with torch.no_grad():
    for idx, data in tqdm(csv_data.iterrows(), desc='encoding qas', total=csv_data.shape[0]):
        candidate_qas = []
        candidate_qas_length = []
        # One encoding per candidate answer column a0 .. a4.
        for i in range(5):
            qas = data['question'] + eot_token + data[f'a{i}']
            encode_qas, encode_qas_length = encode_text(clip_model, qas)
            candidate_qas.append(encode_qas.detach().cpu().numpy())
            candidate_qas_length.append(encode_qas_length)

        qas_feats.append(np.array(candidate_qas))
        qas_length.append(np.array(candidate_qas_length))

# Open the output only once everything has been encoded, so a failure midway
# does not truncate an existing file; the context manager guarantees the handle
# is closed even if writing fails.
with h5py.File(h5_file, 'w') as f:
    f.create_dataset('feat', data=qas_feats)
    f.create_dataset('qas_length', data=qas_length)

encoding qas:   0%|          | 0/34132 [00:00<?, ?it/s]

In [11]:
# Check data
# Sanity check: reopen the file read-only and print, for the sample at index 100,
# the shape of its stored features and its stored token lengths.
with h5py.File(h5_file, 'r') as fp:
    print(fp['feat'][100].shape, fp['qas_length'][100])

(5, 1, 77, 768) [13 13 12 12 13]


In [27]:
# Inspect the token ids produced for a one-word prompt; the output shows the
# remaining positions filled with zeros, which vaild_text_length relies on.
clip.tokenize('I')

tensor([[49406,   328, 49407,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0]])

# Generate video (per-frame) features with CLIP

In [1]:
import clip
import torch
import pandas as pd
import numpy as np
import json
import os
import skvideo.io
from PIL import Image
import h5py
from tqdm.notebook import tqdm

In [2]:
def load_json(path):
    """Read the JSON document stored at ``path`` and return the decoded object."""
    with open(path) as handle:
        return json.load(handle)

def get_video_path(map_vid, vid, video_root="../../data/raw_data/video"):
    """Build the path of the ``.mp4`` file that belongs to a video id.

    Args:
        map_vid: mapping (anything with ``.get``) from the video id, as a
            string, to the file's path below ``video_root`` without extension.
        vid: video id; converted with ``str`` before the lookup.
        video_root: directory containing the videos. Defaults to the location
            this notebook has always used.

    Returns:
        ``video_root`` joined with the mapped path plus the ``.mp4`` suffix.

    Raises:
        KeyError: if ``vid`` has no entry in ``map_vid``. Previously this
            surfaced as an unrelated ``TypeError`` from ``None + '.mp4'``.
    """
    key = str(vid)
    relative_stem = map_vid.get(key)
    if relative_stem is None:
        raise KeyError(f'video id {key} not found in the id-to-path mapping')
    return os.path.join(video_root, relative_stem + '.mp4')

def read_csv(path):
    """Read a QA annotation CSV and return only the columns needed here.

    Args:
        path: CSV file location, forwarded unchanged to ``pd.read_csv``.

    Returns:
        A DataFrame with the columns ``video``, ``question``, ``answer``,
        ``qid``, ``type`` and ``a0`` .. ``a4``, in that order.
    """
    kept = ['video','question','answer','qid','type','a0','a1','a2','a3','a4']
    table = pd.read_csv(path)
    return table.loc[:, kept]

def extract_clips_with_consecutive_frames(path, num_clips, num_frames_per_clip, preprocess, clip_model):
    """
    Sample evenly spaced clips from a video and encode every frame with the
    given image encoder (adapted from the HCRN model preprocessing).

    Args:
        path: path of a video
        num_clips: number of clips to sample; clip centres are spread evenly
            over the video, excluding its first and last positions
        num_frames_per_clip: number of consecutive frames in a single clip
        preprocess: callable turning a PIL image into a tensor accepted by
            ``clip_model.encode_image`` (e.g. the second value of ``clip.load``)
        clip_model: model whose ``encode_image`` is applied to each frame
    Returns:
        ``(clips, valid)``. ``clips`` is a list with one entry per clip, each
        a list of ``num_frames_per_clip`` numpy feature arrays. ``valid`` is
        False when the video could not be read or has no frames; in that case
        ``clips`` is an all-zero placeholder shaped
        ``(num_clips, num_frames_per_clip, 3, 224, 224)``, which does NOT
        match the feature layout, so callers must check ``valid``.
    """
    def _placeholder():
        # Kept exactly as before for backward compatibility with callers.
        return list(np.zeros(shape=(num_clips, num_frames_per_clip, 3, 224, 224)))

    try:
        video_data = skvideo.io.vread(path)
    except Exception:
        # `Exception` instead of a bare except, so Ctrl-C still interrupts a long run.
        print('file {} error'.format(path))
        return _placeholder(), False

    total_frames = video_data.shape[0]
    if total_frames == 0:
        # Nothing to sample or pad with; report it like an unreadable file.
        print('file {} error'.format(path))
        return _placeholder(), False

    # Run on whatever device the supplied model lives on.
    device = next(clip_model.parameters()).device
    half_window = num_frames_per_clip // 2

    clips = list()
    centers = np.linspace(0, total_frames, num_clips + 2, dtype=np.int32)[1:num_clips + 1]
    for center in centers:
        clip_start = int(center) - half_window
        # Derive the end from the start so the window always spans exactly
        # num_frames_per_clip frames; identical to `center + half_window` for
        # even sizes, and fixes the one-frame shortfall for odd sizes.
        clip_end = clip_start + num_frames_per_clip
        if clip_start < 0:
            clip_start = 0
        if clip_end > total_frames:
            clip_end = total_frames - 1
        clip_frames = video_data[clip_start:clip_end]

        if clip_start == 0:
            # Window ran off the beginning: prepend copies of the first frame.
            shortage = num_frames_per_clip - (clip_end - clip_start)
            if shortage > 0:
                padding = np.repeat(video_data[clip_start][np.newaxis], shortage, axis=0)
                clip_frames = np.concatenate((padding, clip_frames), axis=0)
        if clip_end == (total_frames - 1):
            # Window reached the end: append copies of the last frame.
            shortage = num_frames_per_clip - (clip_end - clip_start)
            if shortage > 0:
                padding = np.repeat(video_data[clip_end][np.newaxis], shortage, axis=0)
                clip_frames = np.concatenate((clip_frames, padding), axis=0)

        new_clip = []
        # Only the first num_frames_per_clip frames are used, even when both
        # padding branches above made the buffer longer.
        for j in range(num_frames_per_clip):
            img = Image.fromarray(clip_frames[j])
            image = preprocess(img).unsqueeze(0).to(device)
            with torch.no_grad():
                # Use the model passed in, not a notebook-level global.
                frame_feat = clip_model.encode_image(image).squeeze().cpu().numpy()
            new_clip.append(frame_feat)

        clips.append(new_clip)
    return clips, True

In [3]:
# Load the 'ViT-L/14@336px' CLIP checkpoint on the first GPU; `model` and
# `preprocess` are the values passed to extract_clips_with_consecutive_frames.
model, preprocess = clip.load('ViT-L/14@336px', device='cuda:0')

In [3]:
# Lookup table used by get_video_path to turn a video id into a file path.
vid2path = load_json('../dataset/nextqa/map_vid_vidorID.json')

mode = 'train'  # dataset split to process; one of [train, val, test]
# Annotation table for the selected split; only its 'video' column is used below.
csv_data = read_csv(f'../dataset/nextqa/{mode}.csv')
# HDF5 file that will receive the per-frame CLIP features for this split.
h5_file = f'../../data/nextqa/frame_feat/clip_app_ft_{mode}.h5'

In [6]:
# Distinct video ids in this split (several questions can share one video).
csv_data['video'].unique()

array([ 3238737531,  8968804598, 13884124143, ..., 10036075863,
        7975580325, 10294585855])

In [None]:
# Accumulators filled by the encoding loop in the next cell:
# `vids` collects video ids, `feats` the matching clip features.
# Re-running this cell discards everything collected so far.
vids = []
feats = []

In [None]:
# Encode every video of the split (16 clips x 4 frames each) and store the ids
# and features in `h5_file`.

# Fail fast if the destination directory cannot be created, rather than after
# the whole (long) encoding pass.
out_dir = os.path.dirname(h5_file)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)

video_ids = csv_data['video'].unique()
# Resume support: videos already collected in `vids` by an earlier, interrupted
# run of this cell are skipped. This replaces a hard-coded "skip the first N"
# counter, which silently dropped videos when run on a fresh kernel.
already_encoded = set(vids)
for vid in tqdm(video_ids, desc='encoding video', total=len(video_ids)):
    if vid in already_encoded:
        continue
    video_path = get_video_path(vid2path, vid)
    raw_clip, valid = extract_clips_with_consecutive_frames(video_path, 16, 4, preprocess, model)
    if not valid:
        # Stop at the first unreadable video; what has been encoded so far is
        # still written below and the cell can be re-run to resume.
        break
    vids.append(vid)
    feats.append(raw_clip)

# The context manager closes the file even if writing a dataset fails.
with h5py.File(h5_file, 'w') as f:
    f.create_dataset('ids', data=vids)
    f.create_dataset('features', data=feats)

encoding video:   0%|          | 0/3870 [00:00<?, ?it/s]

In [None]:
# Check data
# Sanity check: reopen the file read-only and print the id dataset and the
# shape of the feature dataset. The context manager closes the file even if a
# key is missing, and the shape is read from the dataset's metadata instead of
# loading every feature into memory.
with h5py.File(h5_file, 'r') as fp:
    load_vids = fp['ids']
    load_feats = fp['features']
    print(load_vids, load_feats.shape)

In [9]:
# Smoke test on one hand-picked video id: request 16 clips of 2 frames each.
video_path = get_video_path(vid2path,2414793083)
raw_clip, vaild = extract_clips_with_consecutive_frames(video_path, 16, 2, preprocess, model)

In [13]:
# Resulting layout is (num_clips, num_frames_per_clip, feature_dim).
np.array(raw_clip).shape

(16, 2, 768)

# CLIP test

In [None]:
import clip
import torch

In [2]:
# Names of the checkpoints this clip installation reports as loadable.
model_list = clip.available_models()
model_list

['RN50',
 'RN101',
 'RN50x4',
 'RN50x16',
 'RN50x64',
 'ViT-B/32',
 'ViT-B/16',
 'ViT-L/14',
 'ViT-L/14@336px']

In [20]:
# Record the text-embedding shape of every available CLIP checkpoint.
# Iterating over `model_list` itself (instead of a fixed range(9)) keeps this
# correct if the installed clip version lists more or fewer models.
embed_list = {}
# Only the output shapes are needed, so skip autograd bookkeeping.
with torch.no_grad():
    for model_name in model_list:
        model, preprocess = clip.load(model_name, device='cpu')
        embed_list[model_name] = model.encode_text(clip.tokenize('I you')).shape

In [21]:
# Show the text-embedding shape recorded for each checkpoint name.
embed_list

{'RN50': torch.Size([1, 1024]),
 'RN101': torch.Size([1, 512]),
 'RN50x4': torch.Size([1, 640]),
 'RN50x16': torch.Size([1, 768]),
 'RN50x64': torch.Size([1, 1024]),
 'ViT-B/32': torch.Size([1, 512]),
 'ViT-B/16': torch.Size([1, 512]),
 'ViT-L/14': torch.Size([1, 768]),
 'ViT-L/14@336px': torch.Size([1, 768])}

In [3]:
# Load the last checkpoint of `model_list` on the CPU and check the shape of
# the embedding it produces for a short sentence.
model, preprocess = clip.load(model_list[-1], device='cpu')
model.encode_text(clip.tokenize('I save your life')).shape

torch.Size([1, 768])

In [6]:
# Inspect the token ids of a short sentence that ends with punctuation.
clip.tokenize('I save your life so.')

tensor([[49406,   328,  2673,   695,   970,   706,   269, 49407,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0]])

In [16]:
# Display the full text embedding of a one-word prompt.
model.encode_text(clip.tokenize('I'))

tensor([[ 1.5095e-01, -6.5090e-03,  5.4056e-01, -3.7528e-01, -6.8354e-02,
          9.7832e-02, -3.5222e-01,  5.4487e-02,  2.1441e-01, -6.7708e-01,
          1.7157e-01,  1.8295e-03, -8.2062e-01,  2.3109e-01, -4.7107e-01,
          1.3864e-01,  4.9570e-02,  4.5823e-01, -2.9067e-01, -3.0055e-01,
          6.6539e-02, -2.7940e-01,  8.0342e-02, -9.4442e-03,  9.9375e-02,
         -5.1597e-01,  4.8310e-01, -8.9906e-02,  4.9464e-01,  1.8817e-01,
         -3.9430e-01, -2.4976e-01, -1.4593e-01, -3.2245e-03, -3.8681e-01,
          3.8197e-01,  2.2733e-01, -3.4141e-01, -4.3623e-01, -3.0771e-01,
          5.8945e-02, -2.2727e-01,  1.0736e-01, -2.0524e-01,  2.6954e-02,
         -2.8702e-01, -1.1606e-01, -1.4593e-02, -3.3899e-02, -5.6178e-01,
          2.6186e-02, -1.1908e-01,  1.6539e-01, -1.8123e-01,  4.1063e-01,
          3.2647e-02, -1.9460e-02, -1.7700e-02,  1.0868e-01,  2.7433e-01,
         -3.3449e-01,  1.3509e-01,  1.7533e-01,  1.7811e-01, -2.4851e-01,
          6.0435e-01,  4.0570e-01,  5.

In [15]:
# Concatenate the embeddings of two one-word prompts along the first axis,
# giving one row per prompt.
torch.cat([model.encode_text(clip.tokenize('I')),
          model.encode_text(clip.tokenize('not'))])

tensor([[ 0.1510, -0.0065,  0.5406,  ..., -0.5446, -0.4986, -0.0284],
        [ 0.1440, -0.0338,  0.3838,  ..., -0.2692, -0.1916, -0.2707]],
       grad_fn=<CatBackward>)