## Installation

In [None]:
#INSTALLING PYTORCH
#For cpu only
# !pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu

In [None]:
!pip install pyyaml==5.1
!python -m pip install 'git+https://github.com/facebookresearch/detectron2.git@v0.5'
!pip install pandas
!pip install transformers
!pip install -U scikit-learn scipy matplotlib
!pip install datasets 
# if got error: 
# ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.21.0`: Please run `pip install transformers[torch]` or `pip install accelerate -U`
#Rememeber to restart the kernel after updagrade
# !pip install accelerate --upgrade 

## Model
Reference: 
- https://colab.research.google.com/drive/1bLGxKdldwqnMVA5x4neY7-l_8fKGWQYI?usp=sharing#scrollTo=7-5rqN-vtlkq
- https://github.com/Ikea-179/Hateful-Meme-Detection/blob/main/VisualBERT.ipynb

In [23]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [24]:
import torch, torchvision
import matplotlib.pyplot as plt
import json
import cv2
import numpy as np
from copy import deepcopy
from visual_embedding.visual_embeding_detectron2 import VisualEmbedder
from detectron2.modeling import build_model
from detectron2.checkpoint import DetectionCheckpointer
from detectron2.structures.image_list import ImageList
from detectron2.data import transforms as T
from detectron2.modeling.box_regression import Box2BoxTransform
from detectron2.modeling.roi_heads.fast_rcnn import FastRCNNOutputs
from detectron2.structures.boxes import Boxes
from detectron2.layers import nms
from detectron2 import model_zoo
from detectron2.config import get_cfg

### Dataset

In [26]:
data_path='../data/hateful_memes/train_df_wQuery_.jsonl'
import pandas as pd
img_data = pd.read_json(path_or_buf=data_path, lines=True).to_dict(orient='records')
print(len(img_data))
print(img_data[0].keys())

8500
dict_keys(['id', 'img', 'label', 'text', 'query_1'])


In [29]:
from transformers import ViTFeatureExtractor, ViTModel
from PIL import Image
import os
import pandas as pd
from torchvision.io import read_image
import torch
from torch.utils.data import Dataset

class HatefulMemesData(Dataset):
    def __init__(self, df,img_dir, tokenizer, sequence_length,caption_sequence_length=512, visual_embed_model='vit', print_text=False, visual_embeder_detecron2=None):         

        self.sequence_length = sequence_length
        self.caption_sequence_length= caption_sequence_length
        self.tokenizer = tokenizer
        self.print_text = print_text
        self.dataset = pd.read_json(path_or_buf=df, lines=True).to_dict(orient='records')
        self.img_dir = img_dir
        self.visual_embed_model = visual_embed_model
        self.feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')
        self.feature_model = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k').to('cuda')
        if self.visual_embed_model=='detectron2' and visual_embeder_detecron2 is not None:
            self.visualembedder=visual_embeder_detecron2

    def __len__(self):
        return len(self.dataset)


    def tokenize_data(self, example):
   
        idx = example['id']
        # idx = [idx] if isinstance(idx, str) else idx
        
        encoded_dict = self.tokenizer(example['text'], padding='max_length', max_length=self.sequence_length,padding=True, truncation=True, return_tensors='pt')
        tokens = encoded_dict['input_ids']
        token_type_ids = encoded_dict['token_type_ids']
        attn_mask = encoded_dict['attention_mask']
        
        captioning_encode_dict=self.tokenizer(example['query_1'], padding='max_length', max_length=self.caption_sequence_length,padding=True, truncation=True, return_tensors='pt')
        caption_token=captioning_encode_dict['input_ids']
        caption_token_type_ids=captioning_encode_dict['token_type_ids']
        caption_attn_mask=captioning_encode_dict['attention_mask']

        targets = torch.tensor(example['label']).type(torch.int64)

        ## Get Visual Embeddings
        try:
            if self.visual_embed_model=='vit':
                #TODO: make it work
                img = example['img'].split('/')[-1]
                img = Image.open(os.path.join(self.img_dir , img))
                img = np.array(img)
                img = img[...,:3]
                inputs = self.feature_extractor(images=img, return_tensors="pt")
                outputs = self.feature_model(**inputs.to('cuda'))
                visual_embeds = outputs.last_hidden_state
                visual_embeds = visual_embeds.cpu() #
            elif self.visual_embed_model=='detectron2':
                visual_embeds = self.visualembedder.visual_embeds_detectron2([cv2.imread(os.path.join(self.img_dir, example['img'].split('/')[-1]))])[0]

        except:
            # print("Error with Id: ", idx)
            if self.visual_embed_model=='vit':
                visual_embeds = np.zeros(shape=(197, 768), dtype=float)
            elif self.visual_embed_model=='detectron2':
                visual_embeds = np.zeros(shape=(100, 1024), dtype=float)

        visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.int64)
        visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.int64)

        inputs={"input_ids": tokens.squeeze(),
            "attention_mask": attn_mask.squeeze(),
            "token_type_ids": token_type_ids.squeeze(),
            "visual_embeds": visual_embeds.squeeze(),
            "visual_token_type_ids": visual_token_type_ids.squeeze(),
            "visual_attention_mask": visual_attention_mask.squeeze(),
            "label": targets.squeeze(),
            "caption_input_ids": caption_token.squeeze(),
            "caption_attention_mask": caption_attn_mask.squeeze(),
            "caption_token_type_ids": caption_token_type_ids.squeeze()
        }
        
        return inputs
  
    def __getitem__(self, index):
        inputs = self.tokenize_data(self.dataset[index])
        
        if self.print_text:
            for k in inputs.keys():
                print(k, inputs[k].shape, inputs[k].dtype)

        return inputs

## Fine-tune Model

### Model Architecture

In [30]:
from transformers import BertTokenizer, VisualBertModel, TrainingArguments, Trainer, VisualBertConfig
from torch.nn import CrossEntropyLoss
import torch.nn as nn
from transformers import BertTokenizer, BertModel
class VisualBERTClassifier(torch.nn.Module):
    def __init__(self, visual_embedder='vit'):
        """
        In the constructor we instantiate two nn.Linear modules and assign them as
        member variables.
        """
        super(VisualBERTClassifier, self).__init__()
        configuration = VisualBertConfig.from_pretrained('uclanlp/visualbert-nlvr2-coco-pre',
                                                hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1)
        self.visualbert = VisualBertModel.from_pretrained('uclanlp/visualbert-nlvr2-coco-pre', config=configuration)
        self.bertmodel = BertModel.from_pretrained('bert-base-uncased')
        for param in self.visualbert.parameters():
            param.requires_grad = False
        for param in self.bertmodel.parameters():
            param.requires_grad = False

        if visual_embedder=='vit':
            self.embed_cls = nn.Linear(768, 1024)
        elif visual_embedder=='detectron2':
            self.embed_cls = nn.Linear(1024, 1024)
        # self.visualbert = VisualBertModel.from_pretrained('uclanlp/visualbert-nlvr2-coco-pre')
        self.num_labels = 2
        self.dropout = nn.Dropout(0.3)
        self.cls=  nn.Linear(768, self.num_labels)

        # TODO: Calculate the weights for the loss function and weight balanced loss
        # nSamples = [5178, 2849]
        # normedWeights = [1 - (x / sum(nSamples)) for x in nSamples]
        # self.loss_fct = CrossEntropyLoss(weight=torch.FloatTensor(normedWeights))
        self.loss_fct = CrossEntropyLoss()
        
    
    def forward(self, input_ids, attention_mask, token_type_ids, visual_embeds, visual_attention_mask,
                visual_token_type_ids, labels,caption_input_ids, caption_attention_mask, caption_token_type_ids):
        """
        In the forward function we accept a Tensor of input data and we must return
        a Tensor of output data. We can use Modules defined in the constructor as
        well as arbitrary operators on Tensors.
        """
        visual_embeds_cls = self.embed_cls(visual_embeds)
        with torch.no_grad():
            outputs = self.visualbert(
                    input_ids,
                    attention_mask=attention_mask,
                    token_type_ids=token_type_ids,
                    visual_embeds=visual_embeds_cls,
                    visual_attention_mask=visual_attention_mask,
                    visual_token_type_ids=visual_token_type_ids,
                )
        
        visualbert_embedding = outputs[1]

        
        with torch.no_grad():
            caption_outputs = self.bert_model(caption_input_ids, attention_mask=caption_attention_mask, token_type_ids=caption_token_type_ids)
                
        # Get the embeddings of the [CLS] token
        caption_embeddings = caption_outputs.last_hidden_state[:,0,:].tolist() # output is a context vector of 768 dimensions
        

        ##TODO: add fusion model here
        logits = self.cls(#TODO)#change classification layer based on you model
        ##
        
        reshaped_logits = logits.view(-1, self.num_labels)
        loss = self.loss_fct(reshaped_logits, labels.view(-1))
      
        return loss, reshaped_logits

##### To check model architecture and shape for each layer

In [None]:
model=VisualBERTClassifier()
for layer_name, params in model.named_parameters():
    print(layer_name, params.shape)

In [31]:
from sklearn.metrics import roc_auc_score
from datasets import load_metric
acc_metric = load_metric('accuracy')
f1_metric = load_metric('f1')
precision_metric = load_metric('precision')
recall_metric = load_metric('recall')

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = acc_metric.compute(predictions=predictions, references=labels)
    f1 = f1_metric.compute(predictions=predictions, references=labels)
    precision = precision_metric.compute(predictions=predictions, references=labels)
    recall = recall_metric.compute(predictions=predictions, references=labels)
    auc_score = roc_auc_score(labels, predictions)
    return {"accuracy": acc['accuracy'], "auroc": auc_score,'f1':f1['f1'],'precision':precision['precision'],'recall':recall['recall']} 

##### Define training hyperparameters and all model used

In [35]:
from transformers import TrainingArguments, Trainer
from torch.utils.data import DataLoader
from transformers import BertTokenizer, VisualBertForPreTraining, AutoTokenizer
import time

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
train_data_path='../data/hateful_memes/train_df_wQuery_.jsonl'
validation_data_path='../data/hateful_memes/dev_seen_df_wQuery_.jsonl'
img_inpainted_dir='../data/hateful_memes/img_inpainted'
visual_embed_model='vit'
## For visual_embed_model='detectron2'
# cfg_path="COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x.yaml"
# MIN_BOXES=10 
# MAX_BOXES=100
# visualembedder_detectron2=VisualEmbedder(cfg_path=cfg_path, min_boxes=MIN_BOXES, max_boxes=MAX_BOXES)
##
output_dir=os.path.join('model-checkpoint', f'visualbert_{visual_embed_model}_{time.strftime("%Y%m%d%H%M")}')

batch_size = 24
seq_len = 50
model = VisualBERTClassifier()
model = model.cuda()


args = TrainingArguments(
    output_dir = output_dir,
    seed = 110, 
    evaluation_strategy = "steps",
    learning_rate=1e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs= 3,
    weight_decay=0.05,
    load_best_model_at_end=True,
    metric_for_best_model="auroc",
    eval_steps = 250,
    save_steps = 500,
    fp16 = False,
    report_to="tensorboard"
)

trainer = Trainer(
    model,
    args,
    train_dataset = HatefulMemesData(train_data_path, img_inpainted_dir, tokenizer, sequence_length=seq_len, visual_embed_model=visual_embed_model),
    eval_dataset =  HatefulMemesData(validation_data_path, img_inpainted_dir,tokenizer, sequence_length=seq_len, visual_embed_model=visual_embed_model),
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)



##### Train + evaluation on validation state

In [36]:
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,Auroc,F1,Precision,Recall
250,No log,0.721819,0.506,0.500144,0.023715,0.5,0.012146
500,0.676000,0.731503,0.508,0.502072,0.016,0.666667,0.008097
750,0.676000,0.735651,0.508,0.502024,0.008065,1.0,0.004049
1000,0.661600,0.737463,0.508,0.502024,0.008065,1.0,0.004049


TrainOutput(global_step=1065, training_loss=0.6695632540564023, metrics={'train_runtime': 1384.2411, 'train_samples_per_second': 18.422, 'train_steps_per_second': 0.769, 'total_flos': 0.0, 'train_loss': 0.6695632540564023, 'epoch': 3.0})

##### Inference on test dataset

In [None]:
##TODO: Inference and evaluate on test data