In [None]:
from PIL import Image
import pandas as pd
import jsonlines
import os


### Set up data paths and preview dev.jsonl

In [176]:
import os

# The data folder is resolved relative to the current working directory:
# two directory levels up, then into 'data'.
parent_directory = os.path.dirname(os.path.dirname(os.getcwd()))
data_directory = os.path.join(parent_directory, 'data')
dev_jsonl_path = os.path.join(data_directory, 'dev.jsonl')

# Stream the JSONL file and echo each parsed record.
with jsonlines.open(dev_jsonl_path) as reader:
    for data in reader:
        print(data)

{'id': 8291, 'img': 'img/08291.png', 'label': 1, 'text': 'white people is this a shooting range'}
{'id': 46971, 'img': 'img/46971.png', 'label': 1, 'text': 'bravery at its finest'}
{'id': 3745, 'img': 'img/03745.png', 'label': 1, 'text': 'your order comes to $37.50 and your white privilege discount brings the total to $37.50'}
{'id': 83745, 'img': 'img/83745.png', 'label': 1, 'text': 'it is time.. to send these parasites back to the desert'}
{'id': 80243, 'img': 'img/80243.png', 'label': 1, 'text': 'mississippi wind chime'}
{'id': 5279, 'img': 'img/05279.png', 'label': 1, 'text': "knowing white people , that's probably the baby father"}
{'id': 1796, 'img': 'img/01796.png', 'label': 1, 'text': 'life hack #23 how to get stoned with no weed'}
{'id': 53046, 'img': 'img/53046.png', 'label': 1, 'text': "you've heard of elf on a shelf, now get ready for"}
{'id': 82301, 'img': 'img/82301.png', 'label': 1, 'text': 'cooooooooooooon!!!!'}
{'id': 31752, 'img': 'img/31752.png', 'label': 1, 'text': 

### Load Transformer models from Hugging Face

##### VisualBERT and tokenizer

In [177]:
import torch 
from transformers import BertTokenizer, VisualBertModel

In [178]:
# Hugging Face Hub checkpoint identifiers.
visualbert_checkpoint = 'uclanlp/visualbert-vqa-coco-pre'
bert_tokenizer_checkpoint = 'google-bert/bert-base-uncased'

# Pretrained VisualBERT encoder and the BERT tokenizer loaded alongside it.
model = VisualBertModel.from_pretrained(visualbert_checkpoint)
tokenizer = BertTokenizer.from_pretrained(bert_tokenizer_checkpoint)

In [179]:
# Display the configuration object of the currently bound `model` as the cell output.
model.config

VisualBertConfig {
  "_name_or_path": "uclanlp/visualbert-vqa-coco-pre",
  "architectures": [
    "VisualBertForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "bypass_transformer": false,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "visual_bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "special_visual_initialize": true,
  "transformers_version": "4.39.3",
  "type_vocab_size": 2,
  "visual_embedding_dim": 2048,
  "vocab_size": 30522
}

##### Swinv2Model for image processing

In [None]:
from transformers import AutoImageProcessor, Swinv2Model

# The backbone and its image processor are loaded from the same Hub checkpoint.
# An alternative checkpoint noted previously in this cell:
# 'yusx-swapp/ofm-swinv2-base-patch4-window7-cifar100'
swinv2_checkpoint = "microsoft/swinv2-base-patch4-window12-192-22k"

# NOTE(review): this rebinds `model`, which an earlier cell bound to VisualBertModel.
model = Swinv2Model.from_pretrained(swinv2_checkpoint)
image_processor = AutoImageProcessor.from_pretrained(swinv2_checkpoint)

##### Create DataFrame from train.jsonl

In [None]:
import os

# The data folder is resolved relative to the current working directory:
# two directory levels up, then into 'data'.
parent_directory = os.path.dirname(os.path.dirname(os.getcwd()))
data_directory = os.path.join(parent_directory, 'data')

# Columns that must always be present (and come first), even if the file is empty.
base_columns = ['id', 'img', 'label', 'text']

# Read all records up front and build the frame once. Growing a DataFrame
# with pd.concat inside the loop copies the whole frame on every iteration,
# which is quadratic in the number of records.
with jsonlines.open(os.path.join(data_directory, 'train.jsonl')) as reader:
    train_records = list(reader)

train_dataset = pd.DataFrame(train_records)

# Keep the base columns first and append any additional keys found in the
# records after them; reindex also creates the base columns for an empty file.
extra_columns = [column for column in train_dataset.columns if column not in base_columns]
train_dataset = train_dataset.reindex(columns=base_columns + extra_columns)


#### Swinv2 vs. ViT

In [None]:
from transformers import AutoImageProcessor, Swinv2Model
import torch
from datasets import load_dataset
import numpy as np
from PIL import Image

# Load the first training image. Converting to RGB guarantees a
# (height, width, 3) array even when the PNG is stored in palette, grayscale
# or RGBA mode. The context manager releases the file handle once the pixel
# data has been decoded by convert().
with Image.open(os.path.join(data_directory, train_dataset.loc[0, 'img'])) as image_file:
    image = image_file.convert("RGB")
image_array = np.array(image)
tensor = torch.tensor(image_array)


image_processor = AutoImageProcessor.from_pretrained("microsoft/swinv2-base-patch4-window12-192-22k")
model = Swinv2Model.from_pretrained("microsoft/swinv2-base-patch4-window12-192-22k")

# Preprocess the image into model-ready PyTorch tensors.
inputs = image_processor(tensor, return_tensors="pt")

# Inference only: no autograd graph is needed for feature extraction.
with torch.no_grad():
    outputs = model(**inputs)

# Show the shape of the final hidden states as the cell output.
last_hidden_states = outputs.last_hidden_state
list(last_hidden_states.shape)

In [165]:
import torch
from transformers import ViTFeatureExtractor, ViTModel
from PIL import Image

# Use the GPU when one is available; otherwise fall back to the CPU instead
# of failing on a hard-coded .to('cuda').
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# NOTE(review): ViTFeatureExtractor is deprecated in transformers in favour of
# ViTImageProcessor; consider switching once no other cell needs the old name.
feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')
feature_model = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k').to(device)

# Preprocess the image tensor from the previous cell and move it to the model's device.
inputs = feature_extractor(images=tensor, return_tensors="pt").to(device)

# Inference only: skip building the autograd graph.
with torch.no_grad():
    outputs = feature_model(**inputs)

outputs['last_hidden_state'].shape




torch.Size([1, 197, 768])

#### Create input dataset for VisualBERT by connecting all the pieces

In [180]:
# Display the training DataFrame as the cell output (the rendering below is truncated).
train_dataset

Unnamed: 0,id,img,label,text
0,42953,img/42953.png,0,its their character not their color that matters
1,23058,img/23058.png,0,don't be afraid to love again everyone is not ...
2,13894,img/13894.png,0,putting bows on your pet
3,37408,img/37408.png,0,i love everything and everybody! except for sq...
4,82403,img/82403.png,0,"everybody loves chocolate chip cookies, even h..."
...,...,...,...,...
8495,10423,img/10423.png,1,nobody wants to hang auschwitz me
8496,98203,img/98203.png,1,when god grants you a child after 20 years of ...
8497,36947,img/36947.png,1,gays on social media: equality! body positivit...
8498,16492,img/16492.png,1,having a bad day? you could be a siamese twin ...


In [183]:
# Show the value of the 'text' column for index label 0.
train_dataset.loc[0,'text']

'its their character not their color that matters'

In [185]:
# Take the 'text' value of the row with index label 0 as a sample input.
input_text = train_dataset.loc[0, 'text']

# Tokenize it into PyTorch tensors, padded or truncated to exactly 512 tokens.
input_text_tokenized = tokenizer(
    input_text,
    truncation=True,
    padding='max_length',
    max_length=512,
    return_tensors='pt',
)

