In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found in it.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings

# Silence every warning category for the rest of the session (keeps cell output short,
# but also hides deprecation notices from the libraries used below).
warnings.simplefilter("ignore")

In [None]:
from transformers import BertModel, BertTokenizerFast

# One checkpoint name shared by the tokenizer and the model so the two cannot drift apart.
BERT_CHECKPOINT = "bert-base-uncased"

tokenizer = BertTokenizerFast.from_pretrained(BERT_CHECKPOINT)
bert = BertModel.from_pretrained(BERT_CHECKPOINT)

# Word-embedding table of the pretrained model; indexing it with token ids
# returns one embedding row per id.
embedding_matrix = bert.embeddings.word_embeddings.weight

In [None]:
import torch

# Special-token ids used throughout the notebook (vocabulary of "bert-base-uncased").
PAD_ID = 0    # padding id: used to pad question batches and to build padding masks
CLS_ID = 101  # [CLS] id: used as the single decoder query token

# Prefer the first GPU, but fall back to the CPU so that tensor creation with
# `.to(device)` does not crash in a session without CUDA.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [None]:
input_text = "Here is some text to encode"
input_ids = tokenizer.encode(input_text, add_special_tokens=True)
# Indexing the embedding matrix with the token ids looks up one embedding row per token.
example_embeddings = embedding_matrix[input_ids]
(example_embeddings.shape, input_ids)

In [None]:
#Let's begin !
from torch.utils.data import Dataset
import pickle
import json
import csv 
import torch

class VQADataset(Dataset):
    """One split (train or test) of the IUST-VQA data.

    Each item is addressed by position in the split CSV and resolves to a
    question id, the features of the image that question belongs to, the
    tokenized question and, when the split provides one, an integer label.

    Attributes:
        image_features: object unpickled from ``image_features.pickle``; indexed by image id.
        question2img: maps question id -> image id.
        questions: maps question id -> {"text": str, "tokenized": list of token ids}.
        possible_answers: whitespace-separated tokens read from ``answer_list.txt``.
        data: question ids of this split, in CSV order.
        labels: integer labels aligned with ``data``, or ``None`` when the split is unlabeled.
    """

    def __init__(self, split_path):
        """Load the shared resources and the rows of the split at ``split_path``.

        Args:
            split_path: path to a CSV file with a ``question_id`` column and,
                for labeled splits, a ``label`` column.
        """
        image_features_path = "/kaggle/input/iust-vqa/image_features.pickle"
        answers_list_path = "/kaggle/input/iust-vqa/answer_list.txt"
        image2questions_path = "/kaggle/input/iust-vqa/image_question.json"

        # NOTE(security): unpickling can execute arbitrary code; only load this
        # file from a trusted source.
        with open(image_features_path, 'rb') as f:
            self.image_features = pickle.load(f)

        ## sample: self.question2img[q_id] = img_id
        self.question2img = {}
        ## sample: self.questions[q_id] = {"text" : q_text, "tokenized" : tokenized_question}
        self.questions = {}

        with open(image2questions_path, 'r') as f:
            data = json.load(f)

        # The JSON maps each image id to a list of questions, where each
        # question is indexed as [question_id, question_text, ...].
        for img_id in data:
            for question in data[img_id]:
                q_id = question[0]
                q_text = question[1]
                self.questions[q_id] = {"text": q_text, "tokenized": tokenizer.encode(q_text)}
                self.question2img[q_id] = img_id

        with open(answers_list_path, 'r') as f:
            # Answers are split on any whitespace, so each answer is assumed to be a single token.
            self.possible_answers = f.read().split()

        ## sample: self.data[idx] = q_id
        self.data = []
        ## sample: self.labels[idx] = 4
        self.labels = []

        # https://docs.python.org/3/library/csv.html#csv.DictReader
        with open(split_path, newline='') as csvfile:
            for row in csv.DictReader(csvfile):
                self.data.append(int(row['question_id']))
                label = self._parse_label(row)
                if label is None:
                    # A single unlabeled row makes the whole split unlabeled.
                    self.labels = None
                elif self.labels is not None:
                    self.labels.append(label)

    @staticmethod
    def _parse_label(row):
        """Return the integer label of a CSV row, or ``None`` if the row has no label.

        A row has no label when the ``label`` column is absent from the file,
        missing from a short row, or empty.
        """
        raw_label = row.get('label')
        if raw_label is None or not raw_label.strip():
            return None
        return int(raw_label)

    def __getitem__(self, idx):
        """Return ``(question_id, image_features, tokenized_question, label)`` for row ``idx``.

        ``image_features`` and ``tokenized_question`` are tensors placed on the
        global ``device``. ``label`` is an ``int``, or ``None`` for an unlabeled split.
        """
        q_id = self.data[idx]
        q_tokenized = self.questions[q_id]['tokenized']
        img_id = self.question2img[q_id]
        label = self.labels[idx] if self.labels is not None else None
        return (
            q_id,
            torch.tensor(self.image_features[img_id]).to(device),
            torch.tensor(q_tokenized).to(device),
            label,
        )

    def __len__(self):
        """Number of questions in this split."""
        return len(self.data)

In [None]:
from torch.nn.utils.rnn import pad_sequence


def collate_batch(batch):
    """Merge a list of dataset items into one padded batch.

    Args:
        batch: iterable of ``(q_id, image_features, question_tokens, label)``
            tuples, where ``label`` may be ``None``.

    Returns:
        Tuple ``(q_ids, images, questions, labels)``:
            q_ids: list of question ids, in batch order.
            images: image feature tensors stacked along a new first dimension.
            questions: token-id tensors right-padded with 0 to the longest
                question, shape ``(batch_size, longest_sentence)``.
            labels: 1-D tensor of labels, or ``None`` if any item is unlabeled.
    """
    q_ids = []
    images = []
    questions = []
    labels = []
    has_labels = True

    for q_id, img, q_tokens, label in batch:
        q_ids.append(q_id)
        images.append(img)
        questions.append(q_tokens)
        if label is None:
            # Remember that the batch is unlabeled instead of replacing the
            # list, so a later labeled item cannot crash on `None.append`.
            has_labels = False
        else:
            labels.append(label)

    ## shape: (batch_size, img_features)
    images = torch.stack(images, dim=0)

    # Labels are created on the CPU; move them to the model's device where the
    # loss is computed if that is required.
    labels = torch.tensor(labels) if has_labels else None

    ## https://pytorch.org/docs/stable/generated/torch.nn.utils.rnn.pad_sequence.html
    questions = pad_sequence(questions, padding_value=0, batch_first=True)

    return q_ids, images, questions, labels

In [None]:
from torch.utils.data import DataLoader
dset = VQADataset("/kaggle/input/iust-vqa/train.csv")
# Reshuffle the training examples at the start of every epoch; without it every
# epoch iterates over identical batches in CSV order.
data_loader_train = DataLoader(dset, collate_fn=collate_batch, batch_size=32, shuffle=True)

In [None]:
from torch import nn

## Nothing, just look =)))

class PositionalEncoder(nn.Module):
    """Sinusoidal positional encoding followed by dropout.

    Adapted from the PyTorch tutorial on Transformers for seq2seq models:
    https://pytorch.org/tutorials/beginner/transformer_tutorial.html

    The encoding table is stored as the buffer ``pe`` with shape
    ``(max_len, 1, d_model)`` so it broadcasts over the batch dimension of
    sequence-first inputs.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        """Precompute the encoding table.

        Args:
            d_model: size of the last (feature) dimension of the inputs.
            dropout: dropout probability applied after adding the encoding.
            max_len: number of positions to precompute.
        """
        super().__init__()
        # Imported locally so the class does not rely on a later cell having
        # imported `math` before the first instance is created.
        import math

        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float()
                             * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # For odd d_model there is one fewer odd-indexed column than
        # even-indexed ones, so trim the frequencies to match.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add the positional encoding of the first ``x.size(0)`` positions to ``x``.

        Args:
            x: tensor whose first dimension is the sequence position and whose
                last dimension has size ``d_model``.

        Returns:
            ``x`` plus the encoding, with dropout applied.
        """
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [None]:
from torch import nn
import math

#The most interesting part!

class VQA_Simple(nn.Module):
    """Visual question answering classifier over precomputed image features.

    The question is embedded with the pretrained BERT word-embedding table and
    passed through a Transformer encoder. A Transformer decoder then attends to
    the encoded question from a single [CLS] query token. The resulting text
    vector is concatenated with the image features and fed to a linear layer
    that scores the possible answers.
    """

    def __init__(self, dropout, text_hidden_size, n_layers, n_heads, image_hidden_size, n_outputs):
        """Build the encoder, decoder and output layer.

        Args:
            dropout: dropout probability for the encoder, decoder and positional encoder.
            text_hidden_size: transformer model width; must equal the width of
                the BERT word embeddings because they are fed in unprojected.
            n_layers: number of layers in both the encoder and the decoder.
            n_heads: number of attention heads per layer.
            image_hidden_size: size of the image feature vector.
            n_outputs: number of answer classes.

        Raises:
            ValueError: if ``text_hidden_size`` differs from the embedding width.
        """
        super().__init__()
        self.dropout = dropout
        self.d_model = text_hidden_size
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.image_hidden_size = image_hidden_size
        self.PAD = PAD_ID

        # NOTE: the weight is an nn.Parameter, so assigning it as an attribute
        # registers it as a parameter of this module: it follows .to()/.cuda()
        # and is returned by parameters() (i.e. it is updated by the optimizer).
        self.embedding_matrix = bert.embeddings.word_embeddings.weight

        embedding_dim = self.embedding_matrix.shape[1]
        if embedding_dim != self.d_model:
            raise ValueError(
                f"text_hidden_size ({self.d_model}) must match the embedding width ({embedding_dim})"
            )

        ## https://pytorch.org/docs/stable/generated/torch.nn.TransformerEncoderLayer.html
        encoder_layer = nn.TransformerEncoderLayer(d_model=self.d_model,
                                                   nhead=self.n_heads,
                                                   dropout=self.dropout)
        ## https://pytorch.org/docs/stable/generated/torch.nn.TransformerEncoder.html
        self.t_encoder = nn.TransformerEncoder(encoder_layer, num_layers=self.n_layers)

        self.pe = PositionalEncoder(d_model=self.d_model, dropout=self.dropout)

        ## https://pytorch.org/docs/stable/generated/torch.nn.TransformerDecoderLayer.html
        # The decoder uses the same dropout as the encoder instead of silently
        # falling back to the library default.
        decoder_layer = nn.TransformerDecoderLayer(d_model=self.d_model,
                                                   nhead=self.n_heads,
                                                   dropout=self.dropout)
        ## https://pytorch.org/docs/stable/generated/torch.nn.TransformerDecoder.html
        self.t_decoder = nn.TransformerDecoder(decoder_layer, num_layers=self.n_layers)

        ## Linear output: receives the concatenation of image and text features.
        self.linear = nn.Linear(self.d_model + self.image_hidden_size, n_outputs)

    def forward(self, images, input_ids):
        """Score every answer class for each (image, question) pair.

        Args:
            images: image features, shape ``(batch_size, image_hidden_size)``.
            input_ids: padded question token ids, shape ``(batch_size, sequence_len)``.

        Returns:
            Unnormalized class scores, shape ``(batch_size, n_outputs)``.
        """
        b_size = images.shape[0]
        # Build helper tensors on the inputs' device rather than on a global one.
        target_device = input_ids.device

        ## True where the token is padding, shape: (batch_size, sequence_len)
        src_key_mask = (input_ids == self.PAD)

        ## (sequence_len, batch_size, d_model)
        embeddings = self.embedding_matrix[input_ids].permute(1, 0, 2)
        positional_embeddings = self.pe(embeddings)

        ## (sequence_len, batch_size, d_model)
        encoder_output = self.t_encoder(positional_embeddings, src_key_padding_mask=src_key_mask)

        ## A single [CLS] token per example is the decoder query, shape: (batch_size, 1)
        tgt = torch.full((b_size, 1), CLS_ID, dtype=torch.long, device=target_device)
        tgt_key_padding_mask = (tgt == self.PAD)

        ## (1, batch_size, d_model)
        tgt_embeddings = self.embedding_matrix[tgt].permute(1, 0, 2)

        # Causal mask for a target of length 1.
        tgt_mask = nn.Transformer.generate_square_subsequent_mask(1).to(target_device)

        tgt_positions = self.pe(tgt_embeddings)

        ## (1, batch_size, d_model)
        output = self.t_decoder(tgt=tgt_positions,
                                memory=encoder_output,
                                tgt_mask=tgt_mask,
                                tgt_key_padding_mask=tgt_key_padding_mask,
                                memory_key_padding_mask=src_key_mask)

        ## Drop the length-1 sequence dimension -> (batch_size, d_model)
        output_text = output.permute(1, 0, 2).squeeze(1)

        ## https://pytorch.org/docs/stable/generated/torch.cat.html
        concatenated = torch.cat([images, output_text], dim=1)

        return self.linear(concatenated)


In [None]:
from tqdm import tqdm

# Model hyper-parameters: 768 is the width of the BERT word embeddings fed to
# the model; image_hidden_size / n_outputs are fixed here to 512 / 10.
model = VQA_Simple(dropout=0.1,
                   text_hidden_size=768,
                   n_layers=2,
                   n_heads=6,
                   image_hidden_size=512,
                   n_outputs=10).to(device)
lr = 1e-4
epochs = 25
criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(list(model.parameters()), lr=lr)

for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    for i, (q_ids, images, questions, labels) in enumerate(pbar := tqdm(data_loader_train, total=len(data_loader_train))):
        pbar.set_description(f"Epoch {epoch}")

        optimizer.zero_grad()
        output = model(images, questions)

        # Compute the loss where the logits already live instead of copying the
        # logits to the CPU on every step.
        loss = criterion(output, labels.to(output.device))

        loss.backward()
        optimizer.step()

        # .item() detaches the value; accumulating the tensor itself would keep
        # every step's autograd graph alive for the whole epoch.
        running_loss += loss.item()
        pbar.set_postfix(loss=running_loss / (i + 1))
        

In [None]:
def predict(data_loader, net):
    """Run ``net`` over ``data_loader`` and collect the predicted class per question.

    Args:
        data_loader: iterable of ``(q_ids, images, questions, labels)`` batches;
            the labels are ignored.
        net: model called as ``net(images, questions)`` that returns per-class
            scores of shape ``(batch_size, n_classes)``.

    Returns:
        Tuple ``(predicts, ids)``: the predicted class indices and the matching
        question ids, both as flat lists in iteration order.
    """
    predicts = []
    ids = []
    net.eval()
    # Inference only: disabling autograd avoids building and keeping a
    # computation graph for every batch.
    with torch.no_grad():
        for q_ids, images, questions, _ in tqdm(data_loader, total=len(data_loader)):
            outputs = net(images, questions)
            predicted_classes = torch.argmax(outputs, dim=1)
            predicts.extend(predicted_classes.cpu().tolist())
            ids.extend(q_ids)
    return predicts, ids


# Build the test split, batch it with the shared collate function and predict
# an answer class for every test question.
test_dset = VQADataset("/kaggle/input/iust-vqa/test.csv")
data_loader_test = DataLoader(dataset=test_dset, batch_size=8, collate_fn=collate_batch)
preds, ids = predict(data_loader=data_loader_test, net=model)

# with open("output.txt")

In [None]:
import pandas as pd

# Submission table: question ids (as strings) next to the predicted class indices.
output_data = {"question_id": list(map(str, ids)), "label": preds}
df = pd.DataFrame(data=output_data)
df.to_csv(path_or_buf="/kaggle/working/output.csv", index=False)