In [None]:
from PIL import Image
import pandas as pd
import jsonlines
import os



In [None]:
import os 

# Resolve the data folder located two levels above the working directory.
# NOTE(review): later cells climb only one level from the working directory
# to find 'data' - confirm which layout is the intended one.
working_dir = os.getcwd()
parent_directory = os.path.dirname(os.path.dirname(working_dir))
data_directory = os.path.join(parent_directory, 'data')

# Stream dev.jsonl and echo every record it contains.
dev_jsonl_path = os.path.join(data_directory, 'dev.jsonl')
with jsonlines.open(dev_jsonl_path) as reader:
    for data in reader:
        print(data)

### Create Directories

### Load in Transformer model from HuggingFace

##### VisualBert and Tokenizer

In [None]:
import torch 
from transformers import BertTokenizer, VisualBertModel

# Hub identifiers for the multimodal encoder and its text tokenizer.
visualbert_checkpoint = 'uclanlp/visualbert-vqa-coco-pre'
bert_tokenizer_checkpoint = 'google-bert/bert-base-uncased'

model = VisualBertModel.from_pretrained(visualbert_checkpoint)
tokenizer = BertTokenizer.from_pretrained(bert_tokenizer_checkpoint)

##### Swinv2Model for image processing

In [None]:
from transformers import Swinv2Model,AutoImageProcessor

# The backbone and its preprocessor are loaded from the same Hub checkpoint.
# Alternative checkpoint kept for reference:
#   'yusx-swapp/ofm-swinv2-base-patch4-window7-cifar100'
swinv2_checkpoint = "microsoft/swinv2-base-patch4-window12-192-22k"

viz_model = Swinv2Model.from_pretrained(swinv2_checkpoint)
image_processor = AutoImageProcessor.from_pretrained(swinv2_checkpoint)

##### Create DataFrame from train.jsonl

In [None]:
import os 
import pandas as pd 
import jsonlines

# The data folder sits next to the current working directory.
parent_directory = os.path.dirname(os.getcwd())
data_directory = os.path.join(parent_directory, 'data')

# Columns the following cells rely on; they are kept first, in this order.
expected_columns = ['id', 'img', 'label', 'text']

# Read every record once and build the frame in a single step. Growing a
# DataFrame with pd.concat inside the loop re-copies all earlier rows on each
# iteration, which is quadratic in the number of records.
with jsonlines.open(os.path.join(data_directory, 'train.jsonl')) as reader:
    train_records = list(reader)

train_dataset = pd.DataFrame(train_records)

# reindex guarantees the expected columns exist (even for an empty file) and
# keeps any additional keys found in the records after them.
extra_columns = [name for name in train_dataset.columns if name not in expected_columns]
train_dataset = train_dataset.reindex(columns=expected_columns + extra_columns)


#### Swinv2 vs. ViT

In [None]:
from transformers import AutoImageProcessor, Swinv2Model
import torch
from datasets import load_dataset
import numpy as np
from PIL import Image
import os 

# Load the first training image. The context manager closes the file handle,
# and convert('RGB') hands the processor a 3-channel array even when the file
# is stored as grayscale, palette-based or with an alpha channel.
with Image.open(os.path.join(data_directory, train_dataset.loc[0, 'img'])) as image_file:
    image = image_file.convert('RGB')
image_array = np.array(image)
tensor = torch.tensor(image_array)

# image_processor and viz_model were already loaded in the
# "Swinv2Model for image processing" cell; reuse them instead of loading the
# same checkpoint a second time.
inputs = image_processor(tensor, return_tensors="pt")

# Feature extraction only: no autograd graph is needed.
with torch.no_grad():
    outputs = viz_model(**inputs)

visual_embeds = outputs.last_hidden_state
# One attention-mask entry and one token-type id per visual token, i.e. every
# dimension of visual_embeds except the last one.
visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.int64)
visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.int64)
print(visual_embeds.shape)

In [None]:
from transformers import ViTFeatureExtractor, ViTModel
from PIL import Image

# NOTE(review): ViTFeatureExtractor is deprecated in recent transformers
# releases in favour of ViTImageProcessor - confirm against the installed
# version and migrate together with the import above.
vit_checkpoint = 'google/vit-base-patch16-224-in21k'
feature_extractor = ViTFeatureExtractor.from_pretrained(vit_checkpoint)
feature_model = ViTModel.from_pretrained(vit_checkpoint)

inputs = feature_extractor(images=tensor, return_tensors="pt")

# Feature extraction only: skip autograd bookkeeping, as the Swin V2 cell does.
with torch.no_grad():
    outputs = feature_model(**inputs)

# These names overwrite the Swin V2 results of the previous cell, so the cells
# below consume the ViT features.
visual_embeds = outputs.last_hidden_state
visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.int64)
visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.int64)
print(visual_embeds.shape)

#### Create input dataset for VisualBERT by connecting all the pieces

In [None]:
# Tokenize the text of the first training example, padded/truncated to 512.
input_text = train_dataset.loc[0, 'text']
input_text_tokenized = tokenizer(
    input_text,
    return_tensors='pt',
    padding='max_length',
    max_length=512,
    truncation=True,
)

# Add the visual tensors computed above next to the text tensors so a single
# mapping can be unpacked into the model call.
visual_inputs = {
    "visual_embeds": visual_embeds,
    "visual_token_type_ids": visual_token_type_ids,
    "visual_attention_mask": visual_attention_mask,
}
input_text_tokenized.update(visual_inputs)


In [None]:
import torch 
from transformers import BertTokenizer, VisualBertModel

model = VisualBertModel.from_pretrained('uclanlp/visualbert-vqa-coco-pre')
#model = VisualBertModel.from_pretrained('uclanlp/visualbert-nlvr2-coco-pre')

# NOTE(review): the checkpoint's visual embedding size has to match the last
# dimension of visual_embeds produced above - confirm the chosen checkpoint
# and image backbone agree, otherwise the forward pass fails on a shape error.

# Inference only: run the forward pass without building an autograd graph,
# consistent with the feature-extraction cell.
with torch.no_grad():
    output = model(**input_text_tokenized)

### Components of VisualBERT input

      1. Text input 
      2. Image input

    1. Text input requires three pieces: input_ids, attention_mask, token_type_ids

    - input_ids: represent the token IDs of the input tokens after tokenization. Each token in the input text is converted into a numerical ID based on the tokenizer's vocabulary.

    - attention_mask: indicates which tokens should be attended to during processing. Binary mask where each position corresponds to a token in the input.

    - token_type_ids: represent the segment IDs for token types in the context of sequence pairs.

    2. Image input requires three pieces: visual_embeds, visual_token_type_ids, visual_attention_mask

    - visual_embeds: represents the embeddings of visual features in the model.

    - visual_token_type_ids: represent the token type IDs for visual tokens. In multi-modal transformers, they distinguish tokens representing visual features from tokens representing text.

    - visual_attention_mask: guides the attention mechanism for visual tokens. Specifies which visual tokens should be attended to during processing.

### Import Dataset Class

    Probably don't need what's below

In [None]:

import torch 
from transformers import BertTokenizer, VisualBertModel
from transformers import Swinv2Model,AutoImageProcessor
import os 
import pandas as pd
import jsonlines
from PIL import Image
import numpy as np


class DatasetBuilder():
    """Indexable dataset that turns JSONL records into VisualBERT-style inputs.

    Records are read from ``<parent of cwd>/data/<json_file>`` and are expected
    to provide ``id``, ``img`` (a path relative to the data directory),
    ``label`` and ``text``. Images are decoded eagerly when the instance is
    created; tokenization and visual feature extraction run per item in
    ``__getitem__``.

    Args:
        model: Checkpoint name or path given to both
            ``AutoImageProcessor.from_pretrained`` and
            ``Swinv2Model.from_pretrained``.
        tokenizer: Checkpoint name or path given to
            ``BertTokenizer.from_pretrained``.
        json_file: Name of the JSONL file inside the data directory.
    """

    # Columns every dataset frame exposes first, in this order.
    COLUMNS = ['id', 'img', 'label', 'text']

    def __init__(self,model = None,tokenizer = None,json_file = None):
        self.parent_directory = os.path.dirname(os.getcwd())
        self.data_directory = os.path.join(self.parent_directory,'data')
        self.json_file = json_file
        self.tokenizer = BertTokenizer.from_pretrained(tokenizer)
        self.image_processor = AutoImageProcessor.from_pretrained(model)
        self.viz_model = Swinv2Model.from_pretrained(model)
        self.dataset = self.create_dataframe()

    def create_dataframe(self):
        """Read the JSONL file into a DataFrame and decode every image.

        Returns:
            A DataFrame with a fresh 0..n-1 index whose ``img`` column holds
            the decoded image arrays instead of the relative paths.
        """
        # Collect all records first and build the frame once; concatenating
        # inside the loop re-copies the accumulated rows on every iteration.
        with jsonlines.open(os.path.join(self.data_directory, self.json_file)) as reader:
            records = list(reader)

        dataset = pd.DataFrame(records)
        # Keep the expected columns first (creating them for an empty file),
        # followed by any additional keys present in the records.
        extra_columns = [name for name in dataset.columns if name not in self.COLUMNS]
        dataset = dataset.reindex(columns=self.COLUMNS + extra_columns)

        dataset['img'] = dataset['img'].apply(
            lambda relative_path: self.load_image(os.path.join(self.data_directory, relative_path))
        )
        return dataset

    def tokenize_data(self,value):
        """Build the model inputs for one record.

        Args:
            value: Mapping-like row providing ``text``, ``label`` and ``img``.

        Returns:
            The tokenizer output extended with ``visual_embeds``,
            ``visual_token_type_ids``, ``visual_attention_mask`` and ``label``.
        """
        # NOTE(review): with return_tensors='pt' the tensors presumably keep a
        # leading batch dimension of size 1 - confirm the collator and model
        # expect that layout.
        encoded = self.tokenizer(value['text'], return_tensors='pt', padding='max_length', max_length=512, truncation=True)
        target = torch.tensor(value['label']).type(torch.int64)

        pixel_inputs = self.image_processor(value['img'], return_tensors="pt")

        # The vision backbone only produces features here, so no autograd
        # graph should be attached to the tensors handed to the data loader.
        with torch.no_grad():
            outputs = self.viz_model(**pixel_inputs)
        visual_embeds = outputs.last_hidden_state

        # One mask entry and one token-type id per visual token.
        visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.int64)
        visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.int64)
        encoded.update(
            {
                "visual_embeds": visual_embeds,
                "visual_token_type_ids": visual_token_type_ids,
                "visual_attention_mask": visual_attention_mask,
                "label":target
            }
        )

        return encoded

    def get_dataset(self):
        """Return the underlying DataFrame."""
        return self.dataset

    def __getitem__(self, index):
        """Return the model inputs for the row labelled ``index``."""
        return self.tokenize_data(self.dataset.loc[index])

    def __len__(self):
        """Return the number of records."""
        return len(self.dataset)

    def load_image(self,filepath):
        """Decode the image at ``filepath`` into an RGB array.

        The file handle is closed before returning, and the image is converted
        to RGB so every array has three channels regardless of how the file
        is stored.
        """
        with Image.open(filepath) as image:
            return np.array(image.convert('RGB'))

In [None]:
from transformers import TrainingArguments, Trainer
batch_size = 24
seq_len = 50

# Key of the evaluation metric used to select the best checkpoint.
metric_name = "auroc"

# Collect the options in a mapping first so they can be inspected or tweaked
# before the TrainingArguments object is built.
training_options = {
    "output_dir": "model-checkpoint",
    "seed": 110,
    "evaluation_strategy": "steps",
    "learning_rate": 1e-5,
    "per_device_train_batch_size": batch_size,
    "per_device_eval_batch_size": batch_size,
    "num_train_epochs": 100,
    "weight_decay": 0.05,
    "load_best_model_at_end": True,
    "metric_for_best_model": metric_name,
    "eval_steps": 50,
    "save_steps": 500,
    "fp16": False,
    "gradient_accumulation_steps": 2,
}
args = TrainingArguments(**training_options)

In [None]:

from sklearn.metrics import roc_auc_score
from datasets import list_metrics, load_metric
# Show which metric identifiers the installed `datasets` release exposes.
metrics_list = list_metrics()
print(metrics_list)

# Load the four classification metrics in one pass; the targets on the left
# are unpacked in the same order as the identifiers on the right.
acc_metric, f1_metric, precision_metric, recall_metric = (
    load_metric(metric_id)
    for metric_id in ('accuracy', 'f1', 'precision', 'recall')
)

def compute_metrics(eval_pred):
    """Compute classification metrics for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; ``logits`` has one column per
            class and ``labels`` holds the reference class ids.

    Returns:
        Dict with the keys ``accuracy``, ``auroc``, ``f1``, ``precision`` and
        ``recall``.
    """
    logits, labels = eval_pred
    logits = np.asarray(logits)
    predictions = np.argmax(logits, axis=-1)

    acc = acc_metric.compute(predictions=predictions, references=labels)
    f1 = f1_metric.compute(predictions=predictions, references=labels)
    precision = precision_metric.compute(predictions=predictions, references=labels)
    recall = recall_metric.compute(predictions=predictions, references=labels)

    # AUROC measures how well continuous scores rank positives above
    # negatives; feeding it hard 0/1 predictions collapses the curve to a
    # single operating point. For two-class logits the difference
    # (positive - negative) orders examples exactly like the softmax
    # probability of the positive class.
    if logits.ndim == 2 and logits.shape[-1] == 2:
        scores = logits[:, 1] - logits[:, 0]
    else:
        # Any other layout keeps the previous behaviour.
        scores = predictions
    auc_score = roc_auc_score(labels, scores)

    return {"accuracy": acc['accuracy'], "auroc": auc_score,'f1':f1['f1'],'precision':precision['precision'],'recall':recall['recall']}

In [None]:
from model.visbert import VisualBERT
import torch 
from transformers import BertTokenizer, VisualBertModel

# Hub identifiers reused by the dataset builders and the tokenizer below.
tokenizer_name = 'google-bert/bert-base-uncased'
swin_model = "microsoft/swinv2-base-patch4-window12-192-22k"

# Project-defined model wrapper, built with its default arguments.
model = VisualBERT()
tokenizer = BertTokenizer.from_pretrained(tokenizer_name)

In [None]:
# Same tokenizer checkpoint as the previous cell (tokenizer_name), loaded again.
tokenizer = BertTokenizer.from_pretrained(tokenizer_name)

In [None]:
# Each split is built by exactly one DatasetBuilder call. Wrapping a builder
# in a second DatasetBuilder passes the dataset object as the vision
# checkpoint name and leaves the tokenizer and JSONL file unset, so
# construction fails before training starts.
# NOTE(review): evaluation reads a `label` from every record - confirm that
# test.jsonl carries labels, otherwise evaluate on a labelled split.
trainer = Trainer(
    model,
    args,
    train_dataset=DatasetBuilder(swin_model, tokenizer_name, 'train.jsonl'),
    eval_dataset=DatasetBuilder(swin_model, tokenizer_name, 'test.jsonl'),
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)