# Egunean Behin Visual Question Answering Dataset

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
!pip3 install transformers==4.15.0 timm==0.4.12 fairscale==0.4.4
!git clone https://github.com/salesforce/BLIP
%cd BLIP

Collecting transformers==4.15.0
  Downloading transformers-4.15.0-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 9.7 MB/s 
[?25hCollecting timm==0.4.12
  Downloading timm-0.4.12-py3-none-any.whl (376 kB)
[K     |████████████████████████████████| 376 kB 70.2 MB/s 
[?25hCollecting fairscale==0.4.4
  Downloading fairscale-0.4.4.tar.gz (235 kB)
[K     |████████████████████████████████| 235 kB 65.9 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 6.5 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |███████████████████

## Create Dataset

### Figures

In [1]:
path_figures = "/content/drive/MyDrive/LAP/Subjects/DL/project/egunean-behin-vqa/data/figures/"

In [67]:
%cd $path_figures

/content/drive/MyDrive/LAP/Subjects/DL/project/egunean-behin-vqa/data/figures


In [None]:
!python create_images.py

In [None]:
!python create_questions.py

### Cubes

In [15]:
path_cubes = "/content/drive/MyDrive/LAP/Subjects/DL/project/egunean-behin-vqa/data/cubes/"

In [63]:
%cd $path_cubes

/content/drive/MyDrive/LAP/Subjects/DL/project/egunean-behin-vqa/data/cubes


In [None]:
!python create_images.py

In [None]:
!python create_questions.py

### Maze

In [16]:
path_maze = "/content/drive/MyDrive/LAP/Subjects/DL/project/egunean-behin-vqa/data/maze/"

In [65]:
%cd $path_maze

/content/drive/MyDrive/LAP/Subjects/DL/project/egunean-behin-vqa/data/maze


In [None]:
!python create_images.py

In [None]:
!python create_questions.py

## Test BLIP

In [None]:
%cd /content

In [6]:
import os

import pandas as pd
import requests
import torch
from PIL import Image
from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode
from tqdm import tqdm

from models.blip_vqa import blip_vqa


def load_image(image, image_size, device):
    """Preprocess one image into a normalized, batched tensor on ``device``.

    The image is resized to ``image_size`` x ``image_size`` using bicubic
    interpolation, converted to a tensor, normalized per channel, given a
    leading batch dimension of size 1, and finally moved to ``device``.

    Args:
        image: Input image (callers in this notebook pass an RGB PIL image).
        image_size: Target side length, in pixels, for both height and width.
        device: Device the resulting tensor is moved to.

    Returns:
        The preprocessed tensor with an extra leading dimension.
    """
    channel_mean = (0.48145466, 0.4578275, 0.40821073)
    channel_std = (0.26862954, 0.26130258, 0.27577711)
    steps = [
        transforms.Resize((image_size, image_size), interpolation=InterpolationMode.BICUBIC),
        transforms.ToTensor(),
        transforms.Normalize(channel_mean, channel_std),
    ]
    preprocess = transforms.Compose(steps)
    batch = preprocess(image).unsqueeze(0)
    return batch.to(device)


def load_model(model_url, image_size):
    """Build the BLIP VQA model from a checkpoint and prepare it for inference.

    Note: this reads the module-level ``device``, which must be defined
    before the function is called.

    Args:
        model_url: Checkpoint location, forwarded as ``pretrained``.
        image_size: Input image side length, forwarded to the model factory.

    Returns:
        The model in eval mode, moved to ``device``.
    """
    vqa_model = blip_vqa(pretrained=model_url, image_size=image_size, vit='base')
    vqa_model.eval()
    return vqa_model.to(device)


def inference(image, question):
    """Ask the loaded model one question about one image.

    Note: this reads the module-level ``model``, ``image_size`` and
    ``device``; all three must be defined before the function is called.

    Args:
        image: Image to query; it is preprocessed with ``load_image``.
        question: Question passed to the model alongside the image.

    Returns:
        The first element of the model's generated output.
    """
    pixel_batch = load_image(image, image_size, device)
    # Gradients are not needed for generation.
    with torch.no_grad():
        generated = model(pixel_batch, question, train=False, inference='generate')
        return generated[0]


def test(path, df):
    """Answer every question in ``df`` with the model and store the answers.

    Args:
        path: Dataset directory that contains an ``images`` subdirectory.
            A trailing path separator is optional.
        df: DataFrame with an ``image`` column (file name inside
            ``images``) and a ``question`` column. It is modified in place:
            an ``answer`` column is added (or overwritten), holding one
            model answer per row in row order.

    Returns:
        None. Results are written to ``df['answer']``.
    """
    answers = []
    prev_path = None
    image = None
    for _, row in tqdm(df.iterrows(), total=df.shape[0]):
        # os.path.join works whether or not ``path`` ends with a separator.
        image_path = os.path.join(path, "images", row['image'])
        # Consecutive rows usually refer to the same image; only reopen the
        # file when the path changes.
        if image_path != prev_path:
            # convert() returns an independent copy, so the file handle can
            # be closed right away.
            with Image.open(image_path) as source:
                image = source.convert('RGB')
            prev_path = image_path
        answers.append(inference(image, row['question']))
    df['answer'] = answers

In [7]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Side length (pixels) images are resized to before being fed to the model.
image_size = 480
# Checkpoint location handed to the model factory as `pretrained`.
model_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model*_vqa.pth'
# `load_model` reads the global `device` defined above, so this must come last.
model = load_model(model_url, image_size)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

  0%|          | 0.00/1.35G [00:00<?, ?B/s]

load checkpoint from https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model*_vqa.pth


### Figures

In [8]:
df_figures = pd.read_csv(path_figures + "questions.csv")
df_figures

Unnamed: 0,type,question,correct,wrong1,wrong2,image
0,Figures,How many figures?,24,29,26,figures_6_4_417148_466526_041585_724774.png
1,Figures,How many colums?,6,4,8,figures_6_4_417148_466526_041585_724774.png
2,Figures,How many rows?,4,3,2,figures_6_4_417148_466526_041585_724774.png
3,Figures,How many triangles?,6,5,8,figures_6_4_417148_466526_041585_724774.png
4,Figures,How many squares?,9,7,11,figures_6_4_417148_466526_041585_724774.png
5,Figures,How many circles?,9,11,8,figures_6_4_417148_466526_041585_724774.png
6,Figures,How many red figures?,4,5,3,figures_6_4_417148_466526_041585_724774.png
7,Figures,How many green figures?,13,15,17,figures_6_4_417148_466526_041585_724774.png
8,Figures,How many blue figures?,7,9,10,figures_6_4_417148_466526_041585_724774.png
9,Figures,How many red triangles?,1,3,0,figures_6_4_417148_466526_041585_724774.png


In [9]:
test(path_figures, df_figures)

100%|██████████| 18/18 [00:07<00:00,  2.32it/s]


In [10]:
df_figures

Unnamed: 0,type,question,correct,wrong1,wrong2,image,answer
0,Figures,How many figures?,24,29,26,figures_6_4_417148_466526_041585_724774.png,4
1,Figures,How many colums?,6,4,8,figures_6_4_417148_466526_041585_724774.png,3
2,Figures,How many rows?,4,3,2,figures_6_4_417148_466526_041585_724774.png,3
3,Figures,How many triangles?,6,5,8,figures_6_4_417148_466526_041585_724774.png,5
4,Figures,How many squares?,9,7,11,figures_6_4_417148_466526_041585_724774.png,9
5,Figures,How many circles?,9,11,8,figures_6_4_417148_466526_041585_724774.png,9
6,Figures,How many red figures?,4,5,3,figures_6_4_417148_466526_041585_724774.png,one
7,Figures,How many green figures?,13,15,17,figures_6_4_417148_466526_041585_724774.png,3
8,Figures,How many blue figures?,7,9,10,figures_6_4_417148_466526_041585_724774.png,one
9,Figures,How many red triangles?,1,3,0,figures_6_4_417148_466526_041585_724774.png,2


### Cubes

In [17]:
df_cubes = pd.read_csv(path_cubes + "questions.csv")
df_cubes

Unnamed: 0,type,question,correct,wrong1,wrong2,image
0,Cubes,How many cubes in total?,26,22,21,cubes_4_4_3_0002_0013_1133_3333.png
1,Cubes,How many visible cubes?,17,16,11,cubes_4_4_3_0002_0013_1133_3333.png
2,Cubes,How many non visible cubes?,9,13,10,cubes_4_4_3_0002_0013_1133_3333.png
3,Cubes,How many cubes in layer x 1?,2,0,1,cubes_4_4_3_0002_0013_1133_3333.png
4,Cubes,How many cubes in layer x 2?,4,3,6,cubes_4_4_3_0002_0013_1133_3333.png
5,Cubes,How many cubes in layer x 3?,8,11,10,cubes_4_4_3_0002_0013_1133_3333.png
6,Cubes,How many cubes in layer x 4?,12,15,13,cubes_4_4_3_0002_0013_1133_3333.png
7,Cubes,How many cubes in layer y 1?,4,6,7,cubes_4_4_3_0002_0013_1133_3333.png
8,Cubes,How many cubes in layer y 2?,4,3,1,cubes_4_4_3_0002_0013_1133_3333.png
9,Cubes,How many cubes in layer y 3?,7,9,8,cubes_4_4_3_0002_0013_1133_3333.png


In [18]:
test(path_cubes, df_cubes)

100%|██████████| 14/14 [00:04<00:00,  2.90it/s]


In [19]:
df_cubes

Unnamed: 0,type,question,correct,wrong1,wrong2,image,answer
0,Cubes,How many cubes in total?,26,22,21,cubes_4_4_3_0002_0013_1133_3333.png,12
1,Cubes,How many visible cubes?,17,16,11,cubes_4_4_3_0002_0013_1133_3333.png,6
2,Cubes,How many non visible cubes?,9,13,10,cubes_4_4_3_0002_0013_1133_3333.png,1
3,Cubes,How many cubes in layer x 1?,2,0,1,cubes_4_4_3_0002_0013_1133_3333.png,6
4,Cubes,How many cubes in layer x 2?,4,3,6,cubes_4_4_3_0002_0013_1133_3333.png,6
5,Cubes,How many cubes in layer x 3?,8,11,10,cubes_4_4_3_0002_0013_1133_3333.png,6
6,Cubes,How many cubes in layer x 4?,12,15,13,cubes_4_4_3_0002_0013_1133_3333.png,6
7,Cubes,How many cubes in layer y 1?,4,6,7,cubes_4_4_3_0002_0013_1133_3333.png,4
8,Cubes,How many cubes in layer y 2?,4,3,1,cubes_4_4_3_0002_0013_1133_3333.png,3
9,Cubes,How many cubes in layer y 3?,7,9,8,cubes_4_4_3_0002_0013_1133_3333.png,3


### Maze

In [21]:
df_maze = pd.read_csv(path_maze + "questions.csv")
df_maze

Unnamed: 0,type,question,correct,wrong1,wrong2,image
0,Maze,How many cells?,96,98,87,maze_0_12_8_0_2.png
1,Maze,How many colums?,12,11,9,maze_0_12_8_0_2.png
2,Maze,How many rows?,8,7,5,maze_0_12_8_0_2.png
3,Maze,Which is the exit starting from green?,blue,red,yellow,maze_0_12_8_0_2.png


In [22]:
test(path_maze, df_maze)

100%|██████████| 4/4 [00:01<00:00,  2.84it/s]


In [23]:
df_maze

Unnamed: 0,type,question,correct,wrong1,wrong2,image,answer
0,Maze,How many cells?,96,98,87,maze_0_12_8_0_2.png,8
1,Maze,How many colums?,12,11,9,maze_0_12_8_0_2.png,0
2,Maze,How many rows?,8,7,5,maze_0_12_8_0_2.png,3
3,Maze,Which is the exit starting from green?,blue,red,yellow,maze_0_12_8_0_2.png,left
