In [None]:
# Install the third-party packages this notebook imports (run once per fresh runtime).
# CLIP is installed directly from the OpenAI GitHub repository.
!pip install git+https://github.com/openai/CLIP.git
!pip install transformers
# NOTE(review): sacremoses is presumably needed by the FSMT translation tokenizer — confirm.
!pip install -U sacremoses
!pip install nltk

In [11]:
import torch
from torch import nn
import transformers
from torch.utils.data import Dataset
from transformers import CLIPProcessor, CLIPModel
from transformers import GPT2Tokenizer, GPT2LMHeadModel

import torchvision.transforms as T
import torchvision.transforms.functional as TF
import torchvision

import os
import pandas as pd
import cv2

import PIL
from PIL import Image

from dataclasses import dataclass, field
from tqdm import tqdm

import clip
import pickle

from tqdm.contrib import tzip
from tqdm.notebook import tqdm

import argparse

from transformers import FSMTForConditionalGeneration, FSMTTokenizer
from transformers import ViltProcessor, ViltForQuestionAnswering

import nltk

In [12]:
# Pick the compute device: CUDA when a GPU is usable, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [13]:
# Central place for checkpoint identifiers and file locations.
config = {
    # Visual question answering checkpoint.
    'model': 'dandelin/vilt-b32-finetuned-vqa',

    # Translation checkpoints, one per direction.
    'en-ru': 'facebook/wmt19-en-ru',
    'ru-en': 'facebook/wmt19-ru-en',

    # Annotation file and data / artifact directories.
    'labels': 'updatedtrain.csv',
    'video_path': 'videos/',
    'output': 'output/',
    'checkpoint': 'checkpoint/',
}

In [14]:
def read_video(path, transform=None, frames_num=16, window=30):
    """Sample up to ``frames_num`` evenly spaced frames from a video file.

    The video is decoded sequentially. Starting at frame index 1, one frame
    is kept every ``length // frames_num`` frames (at least every frame for
    very short clips). Each kept frame is converted from BGR to RGB and
    shrunk in place with ``Image.thumbnail`` to fit inside a 64x64 box.

    Args:
        path: Location of the video, passed to ``cv2.VideoCapture``.
        transform: Unused; kept so existing callers keep working.
        frames_num: Maximum number of frames to return.
        window: Unused; kept so existing callers keep working.

    Returns:
        A list of ``PIL.Image.Image`` objects. It can hold fewer than
        ``frames_num`` items (or be empty) when the video reports too few
        frames or frames cannot be decoded.
    """
    frames = []
    if frames_num <= 0:
        return frames

    size = (64, 64)
    cap = cv2.VideoCapture(path)
    try:
        length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        # Clamp to 1: with a step of 0 the target index would never advance
        # and a clip shorter than frames_num would yield a single frame.
        step = max(1, length // frames_num)

        next_index = 1
        for i in range(length):
            # read() takes an optional output buffer, not a frame index, so
            # it is called without arguments and frames are consumed in order.
            ret, frame = cap.read()
            if not ret:
                continue
            # ">=" lets the next decodable frame stand in for a target frame
            # that failed to decode.
            if i >= next_index:
                image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the
                # same filter under its current name.
                image.thumbnail(size, Image.LANCZOS)
                frames.append(image)

                if len(frames) == frames_num:
                    # Enough frames collected: no need to decode the rest.
                    break
                next_index += step
    finally:
        # Always free the decoder, even if conversion raised.
        cap.release()
    return frames

def image_grid(imgs, rows, cols):
    """Paste a list of images into one ``rows`` x ``cols`` RGB contact sheet.

    The size of the first image is used as the cell size. Images are placed
    left to right, top to bottom, in list order.

    Args:
        imgs: Sequence of PIL images; its length must equal ``rows * cols``.
        rows: Number of grid rows.
        cols: Number of grid columns.

    Returns:
        A new RGB ``PIL.Image.Image`` of size ``(cols * w, rows * h)``.
    """
    assert len(imgs) == rows*cols

    cell_w, cell_h = imgs[0].size
    canvas = Image.new('RGB', size=(cols * cell_w, rows * cell_h))

    for position, tile in enumerate(imgs):
        row, col = divmod(position, cols)
        canvas.paste(tile, box=(col * cell_w, row * cell_h))
    return canvas

In [43]:
class Translater:
    """Pairs an FSMT tokenizer with its seq2seq model for text translation."""

    def __init__(self, name):
        """Load the tokenizer and the model for the checkpoint ``name``."""
        self.tokenizer = FSMTTokenizer.from_pretrained(name)
        self.model = FSMTForConditionalGeneration.from_pretrained(name)

    def translate(self, text):
        """Translate ``text`` and return the decoded output string.

        The text is encoded to PyTorch tensors, passed through
        ``model.generate`` and the first generated sequence is decoded with
        special tokens stripped.
        """
        token_ids = self.tokenizer.encode(text, return_tensors="pt")
        generated = self.model.generate(token_ids)
        first_sequence = generated[0]
        return self.tokenizer.decode(first_sequence, skip_special_tokens=True)

In [44]:
# One translator per direction, built from the checkpoints named in `config`.
en_ru = Translater(name=config["en-ru"])  # English -> Russian
ru_en = Translater(name=config["ru-en"])  # Russian -> English

In [17]:
# Load the ViLT input processor and the question-answering model from the
# same checkpoint so that preprocessing matches the weights.
processor, model = (
    ViltProcessor.from_pretrained(config["model"]),
    ViltForQuestionAnswering.from_pretrained(config["model"]),
)

Downloading (…)rocessor_config.json:   0%|          | 0.00/251 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/320 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/136k [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/470M [00:00<?, ?B/s]

In [18]:
# Load the question/answer annotations. The file name is taken from `config`
# so it is defined in one place; the resulting path is unchanged
# ("/content/updatedtrain.csv").
labels = pd.read_csv(os.path.join("/content", config["labels"]))

In [None]:
# Run the VQA model over every labelled (video, question, answer) row and
# print a bigram BLEU score for each prediction.
for video_name, question, answer in tzip(labels.video_name, labels.question, labels.answer):

    name = f'{video_name}/{video_name}.mp4'

    video = read_video(name, frames_num=16)
    print(len(video))

    # 16 frames tiled as a 4x4 contact sheet form the single image ViLT sees.
    img = image_grid(video, 4, 4)

    # The annotations are Russian while the ViLT checkpoint works in English:
    # translate the question in, and the answer back out, as predict() does.
    question_en = ru_en.translate(question)

    encoding = processor(img, question_en, return_tensors="pt")
    # Inference only: do not build an autograd graph for every sample.
    with torch.no_grad():
        outputs = model(**encoding)
    idx = outputs.logits.argmax(-1).item()

    # Named `prediction` rather than `predict` so that running this cell
    # cannot overwrite the predict() function in the kernel namespace.
    prediction = en_ru.translate(model.config.id2label[idx])

    # sentence_bleu(references, hypothesis): the ground-truth answer is the
    # reference and the model output is the hypothesis.
    reference = answer.lower().replace('<|endoftext|>', '').split()
    hypothesis = prediction.lower().split()
    print(nltk.translate.bleu_score.sentence_bleu([reference], hypothesis, weights=(0.5, 0.5)))
    

In [47]:
def predict(video_name, question, answer):
    """Answer a Russian question about one video and print the BLEU score.

    Sixteen frames are sampled from the video and tiled into a 4x4 grid. The
    question is translated to English, fed to the ViLT model together with
    the grid, and the top-scoring label is translated back to Russian.

    Args:
        video_name: Either a bare clip name (resolved to
            ``/content/<video_name>.mp4``) or a path ending in ``.mp4``,
            which is used as given.
        question: Question text in Russian.
        answer: Reference answer in Russian, used only for the BLEU score.

    Returns:
        The predicted answer translated to Russian.
    """
    # Accept a full path as well as a bare name so existing calls that pass
    # "/content/<name>.mp4" keep working.
    if video_name.lower().endswith('.mp4'):
        name = video_name
    else:
        name = f'/content/{video_name}.mp4'

    # Read the requested video (previously a fixed file was always loaded).
    video = read_video(name, frames_num=16)
    img = image_grid(video, 4, 4)

    question = ru_en.translate(question)
    print(question)

    encoding = processor(img, question, return_tensors="pt")
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        outputs = model(**encoding)
    idx = outputs.logits.argmax(-1).item()

    prediction = en_ru.translate(model.config.id2label[idx])

    # sentence_bleu(references, hypothesis): the ground truth is the
    # reference, the model output is the hypothesis.
    reference = answer.lower().replace('<|endoftext|>', '').split()
    hypothesis = prediction.lower().split()
    print("BLEU: ", nltk.translate.bleu_score.sentence_bleu([reference], hypothesis, weights=(0.5, 0.5)))

    return prediction

In [50]:
# The reference answer must belong to the question being asked: in the labels
# file the jacket-colour question for this clip is annotated "черный".
predict("/content/__c8enCfzqw.mp4", "какого цвета куртка у сидящего на видео", "черный")

What color is the jacket in the person sitting in the video?
красный
0


In [28]:
# Show every annotated question/answer pair for the example clip.
labels.loc[labels.video_name.eq("__c8enCfzqw")]

Unnamed: 0.1,Unnamed: 0,video_name,question,answer
5022,5022,__c8enCfzqw,"что делает человек, сидящий на видео",сделать прическу
12520,12520,__c8enCfzqw,"какого пола человек, сидящий на видео",женский
13725,13725,__c8enCfzqw,у кого на видео кольцо на руке,верно
14557,14557,__c8enCfzqw,почему они используют кондиционер для волос,защитить волосы
15739,15739,__c8enCfzqw,"что находится позади человека, сидящего на видео",занавес
16077,16077,__c8enCfzqw,какого цвета куртка у сидящего на видео,черный
20399,20399,__c8enCfzqw,"что случилось с человеком, сидящим на видео, д...",вытирающий кондиционер
21464,21464,__c8enCfzqw,человек на видео сидит в помещении,да
21466,21466,__c8enCfzqw,"это человек, сидящий снаружи на видео",нет
