In [None]:
from PIL import Image
import pandas as pd
import jsonlines
import os



### Locate the data directory and inspect dev.jsonl

In [None]:
import os

# The data folder is expected at <two levels above the working directory>/data.
parent_directory = os.path.dirname(os.path.dirname(os.getcwd()))
data_directory = os.path.join(parent_directory,'data')

# Only show a handful of records: printing the whole split floods the notebook
# output and bloats the saved file. Raise this number to inspect more.
PREVIEW_RECORDS = 5

# jsonlines yields one decoded JSON object per line of the file.
with jsonlines.open(os.path.join(data_directory,'dev.jsonl')) as reader:
    for record_index, record in enumerate(reader):
        if record_index >= PREVIEW_RECORDS:
            break
        print(record)

### Load Transformer models from Hugging Face

##### VisualBERT and tokenizer

In [None]:
import torch 
from transformers import BertTokenizer, VisualBertModel

In [None]:
# Hub identifiers for the multimodal encoder and the text tokenizer it is paired with here.
VISUALBERT_CHECKPOINT = 'uclanlp/visualbert-vqa-coco-pre'
BERT_TOKENIZER_CHECKPOINT = 'google-bert/bert-base-uncased'

# `model` consumes token ids plus visual embeddings; `tokenizer` produces the token ids.
tokenizer = BertTokenizer.from_pretrained(BERT_TOKENIZER_CHECKPOINT)
model = VisualBertModel.from_pretrained(VISUALBERT_CHECKPOINT)

In [None]:
# Display the configuration of the VisualBERT model loaded above.
model.config

##### Swinv2Model for image processing

In [None]:
from transformers import Swinv2Model,AutoImageProcessor

# The vision backbone and its preprocessing settings are loaded from the same checkpoint.
# Alternative checkpoint kept for reference: 'yusx-swapp/ofm-swinv2-base-patch4-window7-cifar100'
SWINV2_CHECKPOINT = "microsoft/swinv2-base-patch4-window12-192-22k"

image_processor = AutoImageProcessor.from_pretrained(SWINV2_CHECKPOINT)
viz_model = Swinv2Model.from_pretrained(SWINV2_CHECKPOINT)

##### Create DataFrame from train.jsonl

In [None]:
import os

# The data folder is expected at <two levels above the working directory>/data.
parent_directory = os.path.dirname(os.path.dirname(os.getcwd()))
data_directory = os.path.join(parent_directory,'data')

# Columns the rest of the notebook reads; they always exist and come first, in this order.
expected_columns = ['id','img','label','text']

# Read every record first and build the frame once. Growing a DataFrame with
# pd.concat inside the loop copies the whole frame on each iteration (quadratic
# time) and concatenates against an empty frame.
with jsonlines.open(os.path.join(data_directory,'train.jsonl')) as reader:
    train_records = list(reader)

train_dataset = pd.DataFrame(train_records)

# Keep any additional keys found in the file after the expected columns, and
# make sure the expected columns are present even if the file is empty.
extra_columns = [column for column in train_dataset.columns if column not in expected_columns]
train_dataset = train_dataset.reindex(columns=expected_columns + extra_columns)


#### Swinv2 vs. ViT

In [None]:
from transformers import AutoImageProcessor, Swinv2Model
import torch
from datasets import load_dataset
import numpy as np
from PIL import Image

# Load the first training image. The context manager closes the underlying file
# handle, and convert('RGB') returns a fully loaded copy with exactly three
# channels, so palette / grayscale / RGBA files do not break the processor.
with Image.open(os.path.join(data_directory,train_dataset.loc[0,'img'])) as image_file:
    image = image_file.convert('RGB')
image_array = np.array(image)
tensor = torch.tensor(image_array)  # reused by the ViT cell below


image_processor = AutoImageProcessor.from_pretrained("microsoft/swinv2-base-patch4-window12-192-22k")
viz_model = Swinv2Model.from_pretrained("microsoft/swinv2-base-patch4-window12-192-22k")

# Preprocess the image into the model's expected input and add a batch dimension.
inputs = image_processor(tensor, return_tensors="pt")

# Feature extraction only: no gradients needed.
with torch.no_grad():
    outputs = viz_model(**inputs)

# One embedding per visual token; the mask and token-type ids cover every
# visual token (all ones), shaped like visual_embeds without the feature axis.
visual_embeds = outputs.last_hidden_state
visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.int64)
visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.int64)

In [None]:
# ViTFeatureExtractor is deprecated in transformers in favour of the image
# processor classes; AutoImageProcessor (already used for Swinv2 in this
# notebook) resolves the right processor for the checkpoint.
from transformers import AutoImageProcessor, ViTModel
from PIL import Image

VIT_CHECKPOINT = 'google/vit-base-patch16-224-in21k'

feature_extractor = AutoImageProcessor.from_pretrained(VIT_CHECKPOINT)
feature_model = ViTModel.from_pretrained(VIT_CHECKPOINT)

# `tensor` is the image tensor created in the Swinv2 cell above.
inputs = feature_extractor(images=tensor, return_tensors="pt")

# Feature extraction only: skip autograd bookkeeping, matching the Swinv2 cell.
# (Runs on CPU; move the model and inputs to 'cuda' to use a GPU.)
with torch.no_grad():
    outputs = feature_model(**inputs)

# NOTE: these names overwrite the Swinv2 results from the previous cell, so the
# cells further down use the ViT features when the notebook is run in order.
visual_embeds = outputs['last_hidden_state']
visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.int64)
visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.int64)


#### Create input dataset for VisualBERT by connecting all the pieces

In [None]:
# Display the training DataFrame built from train.jsonl.
train_dataset

In [None]:
# Display the `text` field of the first training row.
train_dataset.loc[0,'text']

In [None]:
# Tokenize the text of the first training row into PyTorch tensors,
# padded / truncated to a fixed length of 512 tokens.
tokenizer_options = dict(
    return_tensors='pt',
    padding='max_length',
    max_length=512,
    truncation=True,
)
input_text = train_dataset.loc[0,'text']
input_text_tokenized = tokenizer(input_text, **tokenizer_options)



In [None]:
# Merge the image-side tensors into the tokenized text so a single mapping
# carries every argument for the VisualBERT forward pass.
visual_inputs = {
    "visual_embeds": visual_embeds,
    "visual_token_type_ids": visual_token_type_ids,
    "visual_attention_mask": visual_attention_mask,
}
input_text_tokenized.update(visual_inputs)

In [None]:
# Display the module structure of the VisualBERT model.
model

In [None]:
# VisualBERT projects the visual features with a layer sized by
# config.visual_embedding_dim. Features of any other width fail deep inside the
# model with an opaque matmul shape error, so check up front and say what is wrong.
expected_visual_dim = model.config.visual_embedding_dim
actual_visual_dim = input_text_tokenized["visual_embeds"].shape[-1]
if actual_visual_dim != expected_visual_dim:
    raise ValueError(
        f"visual_embeds has feature size {actual_visual_dim}, but this VisualBERT "
        f"checkpoint expects {expected_visual_dim}; project the image features to "
        "the expected size or load the model with a matching visual_embedding_dim."
    )

# Inference only: no autograd graph is needed for this forward pass.
with torch.no_grad():
    outputs = model(**input_text_tokenized)

In [None]:
# Scratch reproduction of the shape mismatch: a (36, 1024) matrix cannot be
# multiplied by a (2048, 768) matrix because the inner dimensions differ.
t1 = torch.rand(36,1024)
t2 = torch.rand(2048,768)

print(t1.shape)
print(t2.shape)

# The failure is the point of this cell, so catch and show it instead of
# letting it abort a Restart & Run All.
matmul_error = None
try:
    print(torch.matmul(t1,t2))
except RuntimeError as error:
    matmul_error = error
    print(f"matmul failed: {error}")

In [None]:
# Display the VisualBERT configuration again (same output as the earlier model.config cell).
model.config

In [None]:
# Shape of the first slice along axis 0 of the image array
# (presumably one pixel row, i.e. (width, channels) — TODO confirm).
image_array[0].shape

In [None]:
# Shape of the visual features currently bound to `visual_embeds`
# (the ViT cell overwrites the Swinv2 result when cells run in order).
visual_embeds.shape

In [None]:
# Shape of the tokenized text ids; the tokenizer call above pads/truncates to max_length=512.
input_text_tokenized['input_ids'].shape

In [None]:
# Display the VisualBERT configuration once more (duplicate of the earlier model.config cells).
model.config