In [1]:
# !pip install torch
# !pip install transformers
# !pip install sentence-transformers

In [2]:
# pip install --upgrade transformers

In [16]:
import pandas as pd

# f = pd.read_json(file_path, lines=True)
df_image_train = pd.read_json("data/train.jsonl", lines=True)
df_image_val = pd.read_json("data/dev.jsonl", lines=True)
df_image_test = pd.read_json("data/test.jsonl", lines=True)
df_image_train.head()

Unnamed: 0,id,img,label,text
0,42953,img/42953.png,0,its their character not their color that matters
1,23058,img/23058.png,0,don't be afraid to love again everyone is not ...
2,13894,img/13894.png,0,putting bows on your pet
3,37408,img/37408.png,0,i love everything and everybody! except for sq...
4,82403,img/82403.png,0,"everybody loves chocolate chip cookies, even h..."


In [17]:
import warnings
warnings.filterwarnings("ignore")

In [18]:
print(df_image_train.head())

      id            img  label  \
0  42953  img/42953.png      0   
1  23058  img/23058.png      0   
2  13894  img/13894.png      0   
3  37408  img/37408.png      0   
4  82403  img/82403.png      0   

                                                text  
0   its their character not their color that matters  
1  don't be afraid to love again everyone is not ...  
2                           putting bows on your pet  
3  i love everything and everybody! except for sq...  
4  everybody loves chocolate chip cookies, even h...  


In [19]:
from transformers import AutoTokenizer, AutoModel
import torchvision.models as models
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from PIL import Image
import pandas as pd
import os
import torchvision.transforms as T
from sklearn.metrics import precision_score, recall_score, roc_auc_score

# Transformers model for text
text_model = AutoModel.from_pretrained("bert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Torchvision model for images
image_model = models.resnet101(pretrained=True)
image_model = nn.Sequential(*list(image_model.children())[:-1])  # Remove the last FC layer

def collate_fn(batch):
    images, texts, labels = zip(*batch)
    input_ids = pad_sequence([t['input_ids'] for t in texts])
    attention_mask = pad_sequence([t['attention_mask'] for t in texts])

    input_ids = torch.as_tensor(input_ids, dtype=torch.long)
    attention_mask = torch.as_tensor(attention_mask, dtype=torch.long)
    labels = torch.as_tensor(labels, dtype=torch.long)
    images = torch.stack(images).transpose(0, 1)
    
    return images, {'input_ids': input_ids, 'attention_mask': attention_mask}, labels

class HatefulMemesDataset(Dataset):
    def __init__(self, json_file, img_dir, transforms=None):
        self.data = pd.read_json(json_file, lines=True)
        self.img_dir = img_dir
        self.transforms = transforms

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img_id = self.data.loc[idx, 'img']
        img_path = os.path.join(self.img_dir, img_id)
        image = Image.open(img_path).convert('RGB')

        if self.transforms:
            image = self.transforms(image)
        
        text = self.data.loc[idx, 'text']
        text = tokenizer(text, return_tensors='pt', padding="max_length", truncation=True, max_length=64)
        label = self.data.loc[idx, 'label']
        return image, text, label

# Define your transforms
transforms = T.Compose([
    T.Resize((224, 224)),
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

dataset = HatefulMemesDataset(json_file="data/train.jsonl",
                              img_dir="data",
                              transforms=transforms)

# Compute class weights
class_counts = dataset.data['label'].value_counts().to_dict()
total_samples = sum(class_counts.values())
class_weights = {cls: total_samples/count for cls, count in class_counts.items()}

# DataLoader
dataloader = DataLoader(dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)

val_dataset = HatefulMemesDataset(json_file="data/dev.jsonl",
                                  img_dir="data",
                                  transforms=transforms)

val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)

# Model for classification
class CombinedModel(nn.Module):
    def __init__(self, text_model, image_model):
        super(CombinedModel, self).__init__()
        self.text_model = text_model
        self.image_model = image_model
        self.classifier = nn.Linear(text_model.config.hidden_size + 2048, 2)

    def forward(self, image, input_ids, attention_mask):
        text_features = self.text_model(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state[:, 0, :]
        image_features = self.image_model(image).view(image.size(0), -1)
        combined = torch.cat((text_features, image_features), dim=1)
        logits = self.classifier(combined)
        return logits

model = CombinedModel(text_model, image_model)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

checkpoint_path = "combined_model_resnet_bert_epoch_3.pth"
if torch.cuda.is_available():
    model.load_state_dict(torch.load(checkpoint_path))
else:
    model.load_state_dict(torch.load(checkpoint_path, map_location=torch.device('cpu')))

# Prepare class weights
weights = torch.tensor([class_weights[cls] for cls in sorted(class_counts.keys())], dtype=torch.float)
weights = weights.to(device)

# Loss and Optimizer
criterion = nn.CrossEntropyLoss(weight=weights)
optimizer = optim.Adam(model.parameters(), lr=0.0001)

num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    for images, texts, labels in dataloader:
        images = images.permute(1, 0, 2, 3)
        images = images.to(device)
        input_ids = texts['input_ids'].squeeze().to(device)
        attention_mask = texts['attention_mask'].squeeze().to(device)
        labels = labels.to(device)
        
        logits = model(images, input_ids, attention_mask)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    # Validation after each epoch
    model.eval()
    with torch.no_grad():
        val_losses = []
        correct = 0
        total = 0
        all_labels = []
        all_predictions = []
        for images, texts, labels in val_dataloader:
            images = images.permute(1, 0, 2, 3)
            images = images.to(device)
            input_ids = texts['input_ids'].squeeze().to(device)
            attention_mask = texts['attention_mask'].squeeze().to(device)
            labels = labels.to(device)

            logits = model(images, input_ids, attention_mask)
            val_loss = criterion(logits, labels)
            val_losses.append(val_loss.item())
            
            _, predicted = torch.max(logits.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            all_labels.extend(labels.cpu().numpy())
            all_predictions.extend(predicted.cpu().numpy())

        avg_val_loss = sum(val_losses) / len(val_losses)
        val_acc = correct / total
        precision = precision_score(all_labels, all_predictions, average='weighted')
        recall = recall_score(all_labels, all_predictions, average='weighted')
        roc_auc = roc_auc_score(all_labels, all_predictions)
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item()}, Val Loss: {avg_val_loss}, Val Accuracy: {val_acc * 100}%, Precision: {precision * 100}%, Recall: {recall * 100}%, AUROC: {roc_auc}')

    # Save the model after each epoch
    if (epoch + 1) % 5 == 0:
        torch.save(model.state_dict(), f"combined_model_resnet_bert_class_weights_epoch_{epoch+1}.pth")


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch [1/20], Loss: 0.5685920119285583, Val Loss: 0.7641576902260856, Val Accuracy: 50.4%, Precision: 50.45657096939149%, Recall: 50.4%, AUROC: 0.504
Epoch [2/20], Loss: 0.31835445761680603, Val Loss: 0.9863109810957833, Val Accuracy: 52.6%, Precision: 53.81285342218259%, Recall: 52.6%, AUROC: 0.526
Epoch [3/20], Loss: 0.5039176940917969, Val Loss: 0.8632743278193096, Val Accuracy: 52.6%, Precision: 53.8643551877482%, Recall: 52.6%, AUROC: 0.526
Epoch [4/20], Loss: 0.4317115843296051, Val Loss: 0.769026108677425, Val Accuracy: 55.2%, Precision: 55.3229821802935%, Recall: 55.2%, AUROC: 0.5519999999999999
Epoch [5/20], Loss: 0.6014852523803711, Val Loss: 0.8528119131686196, Val Accuracy: 53.0%, Precision: 53.48182949248853%, Recall: 53.0%, AUROC: 0.5299999999999999
Epoch [6/20], Loss: 0.4335237443447113, Val Loss: 0.8579541265018402, Val Accuracy: 53.400000000000006%, Precision: 53.756341588446375%, Recall: 53.400000000000006%, AUROC: 0.5339999999999999
Epoch [7/20], Loss: 0.574969172477