In [3]:
!pip install -q transformers sentencepiece

Setup: importing required libraries

In [4]:
import os
from pathlib import Path
import json
from collections import Counter
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Device:', device)

Device: cuda


Mount Google Drive & paths

In [5]:
from google.colab import drive

# Attach Google Drive; an already-mounted drive is left as is.
drive.mount('/content/drive', force_remount=False)

# Root folder on Drive that holds every input of this notebook.
BASE = Path('/content/drive/MyDrive/Data')
print('BASE:', BASE)

# Input locations, all resolved relative to BASE.
DATA_CSV = BASE.joinpath('data.csv')
ANSWER_SPACE = BASE.joinpath('answer_space.txt')
FEATURES_DIR = BASE.joinpath('features')  # used later for fusion

# Everything this notebook writes goes under OUT_DIR (created on demand).
OUT_DIR = BASE.joinpath('text_features_bert')
OUT_DIR.mkdir(parents=True, exist_ok=True)
print('Output dir:', OUT_DIR)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
BASE: /content/drive/MyDrive/Data
Output dir: /content/drive/MyDrive/Data/text_features_bert


Load CSV

In [6]:
# Fail fast when the CSV is missing so the rest of the cell stays flat.
if not DATA_CSV.exists():
    raise FileNotFoundError(f'{DATA_CSV} not found!')

expected_cols = ['question', 'answer', 'image_id']
df = pd.read_csv(DATA_CSV)
if set(expected_cols) <= set(df.columns):
    # Header row present: keep only the three columns used downstream.
    df = df[expected_cols]
else:
    # No usable header: re-read treating every line as data and name the
    # columns positionally.
    df = pd.read_csv(DATA_CSV, header=None)
    df.columns = expected_cols

In [7]:
# Drop rows whose image_id is missing BEFORE the string cast: astype(str) turns
# NaN into the literal text 'nan', which a later notna() / != '' filter can no
# longer recognise as a missing value.
df = df.dropna(subset=['image_id']).copy()

# Normalise the three columns to plain strings; image ids are also stripped of
# surrounding whitespace so they can be compared / used as keys reliably.
df['question'] = df['question'].astype(str)
df['answer']   = df['answer'].astype(str)
df['image_id'] = df['image_id'].astype(str).str.strip()


In [8]:
# Keep only rows that carry a usable image id.
has_image_id = df['image_id'].notna() & df['image_id'].ne('')
df = df.loc[has_image_id]

# A header line read as data shows up as the literal value 'image_id'
# (in any letter case); discard such rows and renumber from zero.
not_header_row = df['image_id'].str.lower().ne('image_id')
df = df.loc[not_header_row].reset_index(drop=True)

print('Rows loaded:', len(df))
df.head(6)

Rows loaded: 12468


Unnamed: 0,question,answer,image_id
0,what is on the left side of the white oven on ...,garbage_bin,image1
1,what is on the left side of the fire extinguis...,table,image1
2,what is between the the two white and black ga...,chair,image1
3,how many objects are between the fire extingui...,3,image1
4,what is the largest object in this picture,washing_machine,image1
5,how many plastic bottles are between the white...,5,image2


BERT settings

In [9]:
# Model identifier handed to AutoTokenizer / AutoModel.from_pretrained.
BERT_MODEL = 'bert-base-uncased'
MAX_LEN = 20           # max tokens per question (truncation/padding)
# Number of questions per forward pass when extracting pooled features.
BATCH_SIZE = 64
# True  -> BERT parameters are frozen and pooled vectors are saved to disk;
# False -> BERT is left trainable and the pooled-feature extraction is skipped
#          (the tokenized inputs are saved either way).
extract_bert_features = True

Load tokenizer + model

In [10]:
print('Loading tokenizer & model:', BERT_MODEL)
tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL)

# Load the encoder, move it to the selected device and switch to eval mode.
bert_model = AutoModel.from_pretrained(BERT_MODEL).to(device).eval()

if not extract_bert_features:
    print('BERT left trainable (you intend to fine-tune later).')
else:
    # Feature extraction only: turn off gradients for every parameter.
    bert_model.requires_grad_(False)
    print('BERT frozen for feature extraction (pooled outputs will be saved).')


Loading tokenizer & model: bert-base-uncased


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]



tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertModel LOAD REPORT from: bert-base-uncased
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
cls.predictions.bias                       | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED |  | 
cls.predictions.transform.dense.bias       | UNEXPECTED |  | 
cls.seq_relationship.bias                  | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED |  | 
cls.predictions.transform.dense.weight     | UNEXPECTED |  | 
cls.seq_relationship.weight                | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


BERT frozen for feature extraction (pooled outputs will be saved).


Save tokenizer to OUT_DIR

In [11]:
# Persist the tokenizer files next to the extracted features.
tokenizer_dir = OUT_DIR / 'tokenizer'
tokenizer.save_pretrained(str(tokenizer_dir))
print('Tokenizer saved to', tokenizer_dir)

Tokenizer saved to /content/drive/MyDrive/Data/text_features_bert/tokenizer


Tokenize all questions

In [12]:
questions = df['question'].tolist()
print('Total questions:', len(questions))

# Every question is padded / truncated to exactly MAX_LEN tokens, so both
# returned tensors have shape (N, MAX_LEN).
encodings = tokenizer(
    questions,
    padding='max_length',
    truncation=True,
    max_length=MAX_LEN,
    return_tensors='pt',
)
input_ids, attention_mask = encodings['input_ids'], encodings['attention_mask']

Total questions: 12468


Save tokenized inputs

In [13]:
# Write each tokenized tensor to OUT_DIR as a NumPy array.
for fname, tensor in (('input_ids.npy', input_ids),
                      ('attention_mask.npy', attention_mask)):
    np.save(OUT_DIR / fname, tensor.numpy())
print('Saved input_ids.npy and attention_mask.npy to', OUT_DIR)

Saved input_ids.npy and attention_mask.npy to /content/drive/MyDrive/Data/text_features_bert


Extract pooled features using frozen BERT

In [14]:
if not extract_bert_features:
    print('Skipped BERT feature extraction (set extract_bert_features=True to extract).')
else:
    from torch.utils.data import TensorDataset, DataLoader

    # Iterate in the original row order (shuffle=False) so row i of the saved
    # matrix belongs to question i.
    ds = TensorDataset(input_ids, attention_mask)
    loader = DataLoader(ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)
    bert_model.eval()

    pooled_chunks = []
    with torch.no_grad():
        for batch_ids, batch_mask in loader:
            out = bert_model(input_ids=batch_ids.to(device),
                             attention_mask=batch_mask.to(device))
            # Prefer the model's pooler output; when it is absent or None,
            # fall back to the hidden state of the first ([CLS]) token.
            pooled = getattr(out, 'pooler_output', None)
            if pooled is None:
                pooled = out.last_hidden_state[:, 0, :]
            pooled_chunks.append(pooled.cpu())

    all_pooled = torch.cat(pooled_chunks, dim=0)    # (N, 768)
    np.save(OUT_DIR / 'q_bert_pooled.npy', all_pooled.numpy())
    print('Saved q_bert_pooled.npy (shape {}) to {}'.format(all_pooled.shape, OUT_DIR))

Saved q_bert_pooled.npy (shape torch.Size([12468, 768])) to /content/drive/MyDrive/Data/text_features_bert


Answer processing

In [15]:
if not ANSWER_SPACE.exists():
    # No predefined vocabulary: keep the TOP_K most frequent answers in the data.
    ans_counter = Counter(df['answer'])
    TOP_K = 1000   # you can reduce/increase this depending on dataset
    most_common = [pair[0] for pair in ans_counter.most_common(TOP_K)]
    answer2idx = dict(zip(most_common, range(len(most_common))))
    print('Derived answer space from data, size =', len(answer2idx))
else:
    # One answer per line; blank / whitespace-only lines are ignored.
    with open(ANSWER_SPACE, 'r') as f:
        stripped_lines = (line.strip() for line in f)
        answers = [text for text in stripped_lines if text]
    answer2idx = dict(zip(answers, range(len(answers))))
    print('Loaded answer space from file, size =', len(answer2idx))

def map_answer(a):
    """Return the class index of answer `a`.

    Answers that are not keys of `answer2idx` all share one extra index equal
    to `len(answer2idx)`, i.e. one past the last known answer.
    """
    unknown_idx = len(answer2idx)   # unknown -> new index (last)
    try:
        return answer2idx[a]
    except KeyError:
        return unknown_idx

# Encode every answer as its class index and persist both the encoded labels
# and the answer -> index mapping used to produce them.
answer_idx = df['answer'].apply(map_answer).to_numpy()
np.save(OUT_DIR / 'answer_idx.npy', answer_idx)

mapping_path = OUT_DIR / 'answer2idx.json'
with open(mapping_path, 'w') as f:
    f.write(json.dumps(answer2idx))
print('Saved answer_idx.npy and answer2idx.json to', OUT_DIR)

Loaded answer space from file, size = 582
Saved answer_idx.npy and answer2idx.json to /content/drive/MyDrive/Data/text_features_bert


Build Dataset for fusion/training

use_pooled=True -> returns (q_feat_tensor (768,), image_id, ans_idx)

use_pooled=False -> returns (input_ids, attention_mask, image_id, ans_idx) for fine-tuning

In [16]:
class VQABERTQuestionDataset(Dataset):
    """Question-side dataset backed by the arrays saved in `out_dir`.

    Items are looked up purely by position, so `image_ids`, `answer_idx` and
    the arrays loaded from disk must all describe the same rows in the same
    order. This is verified at construction time.

    Args:
        image_ids: iterable of image identifiers, one per question.
        answer_idx: sequence of integer answer indices, one per question.
        out_dir: directory containing `q_bert_pooled.npy` (when
            `use_pooled=True`) or `input_ids.npy` and `attention_mask.npy`
            (when `use_pooled=False`).
        use_pooled: if True, items are `(q_feat, image_id, ans_idx)`;
            otherwise `(input_ids, attention_mask, image_id, ans_idx)`.

    Raises:
        ValueError: if `answer_idx` or a loaded array does not have exactly
            one row per entry of `image_ids`.
    """

    def __init__(self, image_ids, answer_idx, out_dir, use_pooled=True):
        self.image_ids = list(image_ids)
        self.answer_idx = np.array(answer_idx)
        self.use_pooled = use_pooled
        self.out_dir = Path(out_dir)

        n_rows = len(self.image_ids)
        self._check_rows('answer_idx', self.answer_idx, n_rows)

        if use_pooled:
            # load pooled vectors (N, 768)
            self.q_feats = np.load(self.out_dir / 'q_bert_pooled.npy')
            self._check_rows('q_bert_pooled.npy', self.q_feats, n_rows)
        else:
            self.input_ids = np.load(self.out_dir / 'input_ids.npy')
            self.attention_mask = np.load(self.out_dir / 'attention_mask.npy')
            self._check_rows('input_ids.npy', self.input_ids, n_rows)
            self._check_rows('attention_mask.npy', self.attention_mask, n_rows)

    @staticmethod
    def _check_rows(name, array, expected_rows):
        """Raise ValueError unless `array` has `expected_rows` entries on axis 0."""
        # shape[:1] is () for a 0-d array, so scalars are rejected as well.
        if array.shape[:1] != (expected_rows,):
            raise ValueError(
                '{} has shape {} but {} image ids were given; the arrays must '
                'be row-aligned'.format(name, array.shape, expected_rows)
            )

    def __len__(self):
        """Number of questions in the dataset."""
        return len(self.image_ids)

    def __getitem__(self, idx):
        """Return the sample at position `idx` (layout depends on `use_pooled`)."""
        img_id = self.image_ids[idx]
        ans = int(self.answer_idx[idx])
        if self.use_pooled:
            qf = torch.tensor(self.q_feats[idx], dtype=torch.float)
            return qf, img_id, ans
        else:
            ids = torch.tensor(self.input_ids[idx], dtype=torch.long)
            mask = torch.tensor(self.attention_mask[idx], dtype=torch.long)
            return ids, mask, img_id, ans

Instantiate dataset

In [17]:
# Pooled vectors exist on disk only when extraction was enabled, so the
# dataset mode simply follows that flag.
use_pooled = extract_bert_features
image_ids = df['image_id'].tolist()
dataset = VQABERTQuestionDataset(
    image_ids=image_ids,
    answer_idx=answer_idx,
    out_dir=OUT_DIR,
    use_pooled=use_pooled,
)
print('Dataset size:', len(dataset), 'use_pooled=', use_pooled)

Dataset size: 12468 use_pooled= True


In [18]:
# Smoke test: pull one batch and report the tensor shapes.
loader = DataLoader(dataset, batch_size=16, shuffle=False, num_workers=0)
batch = next(iter(loader))
if not use_pooled:
    ids_batch, mask_batch, img_ids_batch, ans_batch = batch
    print('input_ids shape:', ids_batch.shape, 'attention_mask shape:', mask_batch.shape)
else:
    q_feats_batch, img_ids_batch, ans_batch = batch
    print('q_feats_batch shape:', q_feats_batch.shape)

q_feats_batch shape: torch.Size([16, 768])
