In [2]:
from PIL import Image
import pandas as pd
import jsonlines
import os



### Create Directories

In [3]:
import os 

# Locate the project's data folder, two directory levels above the notebook's
# working directory. Both names are reused by later cells.
parent_directory = os.path.dirname(os.path.dirname(os.getcwd()))
data_directory = os.path.join(parent_directory, 'data')

# Number of dev records to echo. Printing the whole split floods the notebook
# output, so only a short preview is shown.
DEV_PREVIEW_ROWS = 10

# Stream dev.jsonl with jsonlines and stop once the preview is complete.
with jsonlines.open(os.path.join(data_directory, 'dev.jsonl')) as reader:
    for row_number, data in enumerate(reader):
        if row_number >= DEV_PREVIEW_ROWS:
            break
        print(data)

{'id': 8291, 'img': 'img/08291.png', 'label': 1, 'text': 'white people is this a shooting range'}
{'id': 46971, 'img': 'img/46971.png', 'label': 1, 'text': 'bravery at its finest'}
{'id': 3745, 'img': 'img/03745.png', 'label': 1, 'text': 'your order comes to $37.50 and your white privilege discount brings the total to $37.50'}
{'id': 83745, 'img': 'img/83745.png', 'label': 1, 'text': 'it is time.. to send these parasites back to the desert'}
{'id': 80243, 'img': 'img/80243.png', 'label': 1, 'text': 'mississippi wind chime'}
{'id': 5279, 'img': 'img/05279.png', 'label': 1, 'text': "knowing white people , that's probably the baby father"}
{'id': 1796, 'img': 'img/01796.png', 'label': 1, 'text': 'life hack #23 how to get stoned with no weed'}
{'id': 53046, 'img': 'img/53046.png', 'label': 1, 'text': "you've heard of elf on a shelf, now get ready for"}
{'id': 82301, 'img': 'img/82301.png', 'label': 1, 'text': 'cooooooooooooon!!!!'}
{'id': 31752, 'img': 'img/31752.png', 'label': 1, 'text': 

### Load in Transformer model from HuggingFace

##### VisualBert and Tokenizer

In [4]:
import torch 
from transformers import BertTokenizer, VisualBertModel

# Hub identifiers of the multimodal encoder and of the text tokenizer.
visualbert_checkpoint = 'uclanlp/visualbert-vqa-coco-pre'
bert_tokenizer_checkpoint = 'google-bert/bert-base-uncased'

# Instantiate both objects from their pretrained checkpoints.
model = VisualBertModel.from_pretrained(visualbert_checkpoint)
tokenizer = BertTokenizer.from_pretrained(bert_tokenizer_checkpoint)

In [6]:
# Display the configuration that was loaded together with the VisualBERT checkpoint.
model.config

VisualBertConfig {
  "_name_or_path": "uclanlp/visualbert-vqa-coco-pre",
  "architectures": [
    "VisualBertForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "bypass_transformer": false,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "visual_bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "special_visual_initialize": true,
  "transformers_version": "4.39.3",
  "type_vocab_size": 2,
  "visual_embedding_dim": 2048,
  "vocab_size": 30522
}

##### Swinv2Model for image processing

In [7]:
from transformers import Swinv2Model,AutoImageProcessor

# The vision backbone and its image processor are loaded from the same checkpoint.
# Alternative checkpoint kept for reference: 'yusx-swapp/ofm-swinv2-base-patch4-window7-cifar100'
swinv2_checkpoint = "microsoft/swinv2-base-patch4-window12-192-22k"

viz_model = Swinv2Model.from_pretrained(swinv2_checkpoint)
image_processor = AutoImageProcessor.from_pretrained(swinv2_checkpoint)



##### Create DataFrame from train.jsonl

In [8]:
import os 

# Locate the project's data folder, two directory levels above the notebook's
# working directory.
parent_directory = os.path.dirname(os.path.dirname(os.getcwd()))
data_directory = os.path.join(parent_directory, 'data')

# Columns every record is expected to provide; they are placed first, in this order.
expected_columns = ['id', 'img', 'label', 'text']

# Read all records first and build the DataFrame once. Growing a DataFrame with
# pd.concat inside the loop re-copies every previous row on each iteration,
# which is quadratic in the number of records.
with jsonlines.open(os.path.join(data_directory, 'train.jsonl')) as reader:
    train_records = list(reader)

train_dataset = pd.DataFrame(train_records)

# Keep the expected columns up front (filled with NaN if the file lacks one)
# and preserve any additional keys found in the file after them.
# NOTE(review): column dtypes are now inferred directly from the records; the
# incremental concat onto an empty seed frame could leave numeric columns such
# as id/label as object dtype. Confirm nothing downstream relies on that.
extra_columns = [name for name in train_dataset.columns if name not in expected_columns]
train_dataset = train_dataset.reindex(columns=expected_columns + extra_columns)


#### Swinv2 vs. ViT

In [9]:
from transformers import AutoImageProcessor, Swinv2Model
import torch
from datasets import load_dataset
import numpy as np
from PIL import Image

# Load the first training image. The file is opened in a context manager so its
# handle is released right away, and the picture is normalised to RGB because
# PNG files may carry an alpha channel or a palette while the image processor
# works on three-channel input.
with Image.open(os.path.join(data_directory, train_dataset.loc[0, 'img'])) as image_file:
    image = image_file.convert('RGB')
image_array = np.array(image)
tensor = torch.tensor(image_array)  # reused by the ViT comparison cell


image_processor = AutoImageProcessor.from_pretrained("microsoft/swinv2-base-patch4-window12-192-22k")
viz_model = Swinv2Model.from_pretrained("microsoft/swinv2-base-patch4-window12-192-22k")

# Resize/normalise the image into the batch format expected by the backbone.
inputs = image_processor(tensor, return_tensors="pt")

# Inference only: no gradients are needed for feature extraction.
with torch.no_grad():
    outputs = viz_model(**inputs)

# One embedding per visual token; the mask and the token-type ids are all-ones
# tensors covering every dimension of visual_embeds except the last one.
# NOTE(review): the VisualBERT config printed above lists
# visual_embedding_dim = 2048 -- confirm the last dimension of visual_embeds
# matches it (or add a projection) before passing these to the model.
visual_embeds = outputs.last_hidden_state
visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.int64)
visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.int64)



In [10]:
from transformers import ViTFeatureExtractor, ViTModel
from PIL import Image

# Same single-image feature extraction as the Swin V2 cell, using a ViT backbone.
vit_checkpoint = 'google/vit-base-patch16-224-in21k'

# AutoImageProcessor resolves the image processor that belongs to the
# checkpoint; the dedicated ViTFeatureExtractor class is deprecated in recent
# transformers releases.
feature_extractor = AutoImageProcessor.from_pretrained(vit_checkpoint)
feature_model = ViTModel.from_pretrained(vit_checkpoint)

inputs = feature_extractor(images=tensor, return_tensors="pt")

# Inference only: skip autograd bookkeeping, as the Swin V2 cell does, so the
# resulting embeddings do not carry a computation graph.
with torch.no_grad():
    outputs = feature_model(**inputs)

# One embedding per visual token; the mask and the token-type ids are all-ones
# tensors covering every dimension of visual_embeds except the last one.
visual_embeds = outputs['last_hidden_state']
visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.int64)
visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.int64)




#### Create input dataset for VisualBERT by connecting all the pieces

In [11]:
# Inspect the training DataFrame built from train.jsonl.
train_dataset

Unnamed: 0,id,img,label,text
0,42953,img/42953.png,0,its their character not their color that matters
1,23058,img/23058.png,0,don't be afraid to love again everyone is not ...
2,13894,img/13894.png,0,putting bows on your pet
3,37408,img/37408.png,0,i love everything and everybody! except for sq...
4,82403,img/82403.png,0,"everybody loves chocolate chip cookies, even h..."
...,...,...,...,...
8495,10423,img/10423.png,1,nobody wants to hang auschwitz me
8496,98203,img/98203.png,1,when god grants you a child after 20 years of ...
8497,36947,img/36947.png,1,gays on social media: equality! body positivit...
8498,16492,img/16492.png,1,having a bad day? you could be a siamese twin ...


In [12]:
# Caption text of the first training example.
train_dataset.loc[0,'text']

'its their character not their color that matters'

In [13]:
# Encode the caption of the first training example for the text side of the model.
input_text = train_dataset.loc[0, 'text']
input_text_tokenized = tokenizer(
    input_text,
    padding='max_length',   # pad up to max_length
    truncation=True,        # cut anything longer than max_length
    max_length=512,
    return_tensors='pt',    # PyTorch tensors
)



In [14]:
# Attach the image-side inputs to the tokenized text so that a single mapping
# carries both modalities.
input_text_tokenized["visual_embeds"] = visual_embeds
input_text_tokenized["visual_token_type_ids"] = visual_token_type_ids
input_text_tokenized["visual_attention_mask"] = visual_attention_mask

### Components of VisualBERT input

      1. Text input 
      2. Image input

    1. Text input requires three pieces: input_ids, attention_mask, token_type_ids

    - input_ids: represent the token IDs of the input tokens after tokenization. Each token in the input text is converted into a numerical ID based on the tokenizer's vocabulary.

    - attention_mask: indicates which tokens should be attended to during processing. Binary mask where each position corresponds to a token in the input.

    - token_type_ids: represent the segment IDs for token types in the context of sequence pairs.

    2. Image input requires three pieces: visual_embeds, visual_token_type_ids, visual_attention_mask

    - visual_embeds: represents the embeddings of visual features in the model.

    - visual_token_type_ids: represent the token type IDs for visual tokens. In multi-modal transformers, they distinguish tokens representing visual features from tokens representing text.

    - visual_attention_mask: guides the attention mechanism for visual tokens. Specifies which visual tokens should be attended to during processing.

### Create Dataset Class

In [35]:
# Preview the first captions only; listing every caption of the training set
# would flood the notebook output.
train_dataset['text'].head(15).to_list()

['its their character not their color that matters',
 "don't be afraid to love again everyone is not like your ex",
 'putting bows on your pet',
 'i love everything and everybody! except for squirrels i hate squirrels',
 'everybody loves chocolate chip cookies, even hitler',
 'go sports! do the thing! win the points!',
 "fine you're right. now can we fucking drop it?",
 'tattoos are bad for your health i know 5 million people that had tattoos and they all died',
 'how long can i run? till the chain tightens',
 'what is he hiding? we need to see his tax returns! let me stop you right there hillary you deleted 30,000 emails, used bleach bit on hard drives, then destroyed phones with hammers you have no right to talk about people hiding anything truth uncensored',
 'jew mad? get fuhrerious!',
 'removes polish with chemicals, nobody bats an eye',
 'brother... a day without a blast is a day wasted',
 "when you're the only dog left at the dog shelter and the black family wants to adopt you",

In [36]:
# Caption of the first example, kept under the same name as in the earlier cell.
input_text = train_dataset.loc[0, 'text']

# Tokenize every caption of the training set in one batch; each row is padded
# or truncated to 512 tokens.
all_captions = train_dataset['text'].to_list()
input_text_tokenized = tokenizer(
    all_captions,
    padding='max_length',
    truncation=True,
    max_length=512,
    return_tensors='pt',
)

In [41]:
# Number of encoded rows produced by the batch tokenization (one per caption).
len(input_text_tokenized['input_ids'])

8500

In [45]:
# Relative image paths of the training set as a NumPy array.
np.array(train_dataset['img'])

array(['img/42953.png', 'img/23058.png', 'img/13894.png', ...,
       'img/36947.png', 'img/16492.png', 'img/15937.png'], dtype=object)

In [None]:
from transformers import AutoImageProcessor, Swinv2Model
import torch
from datasets import load_dataset
import numpy as np
from PIL import Image

# Decode the first training image. The context manager releases the file
# handle immediately, and convert('RGB') guarantees three channels: PNG files
# may carry an alpha channel or a palette, which the image processor does not
# expect.
first_image_path = os.path.join(data_directory, train_dataset.loc[0, 'img'])
with Image.open(first_image_path) as image_file:
    image = image_file.convert('RGB')
image_array = np.array(image)
tensor = torch.tensor(image_array)


image_processor = AutoImageProcessor.from_pretrained("microsoft/swinv2-base-patch4-window12-192-22k")
viz_model = Swinv2Model.from_pretrained("microsoft/swinv2-base-patch4-window12-192-22k")

# Preprocess the image into the batch format the backbone consumes.
inputs = image_processor(tensor, return_tensors="pt")

# Feature extraction is inference only, so gradients are disabled.
with torch.no_grad():
    outputs = viz_model(**inputs)

# One embedding per visual token; the mask and the token-type ids are all-ones
# tensors covering every dimension of visual_embeds except the last one.
visual_embeds = outputs.last_hidden_state
visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.int64)
visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.int64)

In [30]:

import torch 
from transformers import BertTokenizer, VisualBertModel

# Checkpoint names for the VisualBERT encoder and the BERT tokenizer.
encoder_checkpoint = 'uclanlp/visualbert-vqa-coco-pre'
vocabulary_checkpoint = 'google-bert/bert-base-uncased'

model = VisualBertModel.from_pretrained(encoder_checkpoint)
tokenizer = BertTokenizer.from_pretrained(vocabulary_checkpoint)

class DatasetBuilder():
    """Prepare the text and image inputs of the meme dataset.

    The builder locates the project's ``data`` folder (two directory levels
    above the current working directory), loads a BERT tokenizer for the
    captions, and loads a Swin V2 backbone together with its image processor.

    Attributes:
        parent_directory: directory two levels above ``os.getcwd()``.
        data_directory: ``<parent_directory>/data``; JSONL files are read from it.
        tokenizer: the loaded ``BertTokenizer``.
        image_processor: image processor loaded from the vision checkpoint.
        viz_model: ``Swinv2Model`` loaded from the vision checkpoint.
    """

    # Checkpoints used elsewhere in this notebook. They are applied when the
    # constructor receives None, so that ``DatasetBuilder()`` is usable.
    DEFAULT_VISION_CHECKPOINT = "microsoft/swinv2-base-patch4-window12-192-22k"
    DEFAULT_TOKENIZER_CHECKPOINT = 'google-bert/bert-base-uncased'

    # Columns every record is expected to provide, in display order.
    COLUMNS = ['id', 'img', 'label', 'text']

    def __init__(self,model = None,tokenizer = None):
        """Resolve the data folder and load the tokenizer and vision model.

        Args:
            model: checkpoint name or path of the Swin V2 vision model and its
                image processor. ``None`` selects ``DEFAULT_VISION_CHECKPOINT``.
            tokenizer: checkpoint name or path of the BERT tokenizer. ``None``
                selects ``DEFAULT_TOKENIZER_CHECKPOINT``.
        """
        if model is None:
            model = self.DEFAULT_VISION_CHECKPOINT
        if tokenizer is None:
            tokenizer = self.DEFAULT_TOKENIZER_CHECKPOINT

        # Derive the data folder from this instance's own parent directory
        # instead of relying on a notebook-level global of the same name.
        self.parent_directory = os.path.dirname(os.path.dirname(os.getcwd()))
        self.data_directory = os.path.join(self.parent_directory, 'data')

        self.tokenizer = BertTokenizer.from_pretrained(tokenizer)
        self.image_processor = AutoImageProcessor.from_pretrained(model)
        self.viz_model = Swinv2Model.from_pretrained(model)

    def create_dataframe(self,json_file):
        """Load a JSON-lines file from the data folder into a DataFrame.

        Args:
            json_file: file name relative to ``self.data_directory``.

        Returns:
            pandas.DataFrame with one row per record and a 0..n-1 index. The
            columns in ``COLUMNS`` come first (filled with NaN when a record
            set lacks one), followed by any additional keys found in the file.
        """
        # Read everything first and build the frame once; concatenating row by
        # row re-copies all previous rows on every iteration.
        with jsonlines.open(os.path.join(self.data_directory, json_file)) as reader:
            records = list(reader)

        dataset = pd.DataFrame(records)
        extra_columns = [name for name in dataset.columns if name not in self.COLUMNS]
        return dataset.reindex(columns=self.COLUMNS + extra_columns)

    def tokenize(self, dataset, max_length=512):
        """Tokenize the ``text`` column of ``dataset`` with the builder's tokenizer.

        Args:
            dataset: DataFrame with a ``text`` column of captions.
            max_length: length every sequence is padded or truncated to.

        Returns:
            Whatever ``self.tokenizer`` returns for the list of captions when
            called with PyTorch tensors, max-length padding and truncation.
        """
        return self.tokenizer(
            dataset['text'].to_list(),
            return_tensors='pt',
            padding='max_length',
            max_length=max_length,
            truncation=True,
        )

    def tokenizer(self,dataset):
        """Tokenize ``dataset['text']``; kept so the original method name still exists.

        On instances this name is shadowed by the ``tokenizer`` attribute set in
        ``__init__``, so it is only reachable as
        ``DatasetBuilder.tokenizer(builder, dataset)``. Prefer ``tokenize``.
        """
        return self.tokenize(dataset)
        









In [32]:
# Build the training DataFrame through DatasetBuilder. The checkpoints are
# passed explicitly so the builder never hands None to from_pretrained.
dataset = DatasetBuilder(
    model="microsoft/swinv2-base-patch4-window12-192-22k",
    tokenizer='google-bert/bert-base-uncased',
).create_dataframe('train.jsonl')

In [49]:
# Show the resolved data directory.
data_directory

'/home/jramos/Documents/OMSCS/CS-7643 Deep Learning/Project/Hateful-Memes/data'

In [53]:
# Prefix every relative image path with the data directory and show the entry labelled 0.
(data_directory+'/'+train_dataset['img'])[0]

'/home/jramos/Documents/OMSCS/CS-7643 Deep Learning/Project/Hateful-Memes/data/img/42953.png'

In [54]:
def load_image(filepath):
    """Read the image stored at ``filepath`` and return its pixels as a NumPy array.

    Args:
        filepath: path of an image file that PIL can open.

    Returns:
        numpy.ndarray built from the opened image; no mode conversion or
        resizing is applied.
    """
    # The context manager releases the underlying file handle as soon as the
    # pixels have been copied, which matters when this function is applied to
    # thousands of paths.
    with Image.open(filepath) as image:
        return np.array(image)

# Apply the load_image function to every image path in the DataFrame
image_paths = data_directory + '/' + train_dataset['img']  # absolute path for each row
image_paths.apply(load_image)


0       [[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], ...
1       [[[9, 9, 9], [59, 60, 59], [59, 60, 59], [59, ...
2       [[[56, 55, 54], [90, 88, 87], [91, 88, 87], [9...
3       [[[3, 3, 3], [16, 16, 16], [17, 17, 17], [17, ...
4       [[[0, 0, 0], [44, 44, 44], [49, 49, 49], [49, ...
                              ...                        
8495    [[[0, 0, 0], [3, 3, 3], [4, 4, 4], [5, 5, 5], ...
8496    [[[0, 0, 0], [34, 40, 44], [39, 45, 50], [39, ...
8497    [[[143, 142, 138], [149, 150, 143], [149, 149,...
8498    [[[228, 227, 247], [227, 226, 247], [227, 226,...
8499    [[[3, 3, 2], [9, 8, 8], [9, 8, 8], [9, 8, 8], ...
Name: img, Length: 8500, dtype: object