In [1]:
import pandas as pd

# f = pd.read_json(file_path, lines=True)
df_image_train = pd.read_json("/kaggle/input/facebook-hateful-meme-dataset/data/train.jsonl", lines=True)
df_image_val = pd.read_json("/kaggle/input/facebook-hateful-meme-dataset/data/dev.jsonl", lines=True)
df_image_test = pd.read_json("/kaggle/input/facebook-hateful-meme-dataset/data/test.jsonl", lines=True)
df_image_train.head()

Unnamed: 0,id,img,label,text
0,42953,img/42953.png,0,its their character not their color that matters
1,23058,img/23058.png,0,don't be afraid to love again everyone is not ...
2,13894,img/13894.png,0,putting bows on your pet
3,37408,img/37408.png,0,i love everything and everybody! except for sq...
4,82403,img/82403.png,0,"everybody loves chocolate chip cookies, even h..."


In [2]:
import os
import pandas as pd
import torch
import torchvision.transforms as T
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from transformers import AutoFeatureExtractor, AutoModel
from sklearn.metrics import precision_score, recall_score, roc_auc_score, accuracy_score

# Read JSON files
df_image_train = pd.read_json("/kaggle/input/facebook-hateful-meme-dataset/data/train.jsonl", lines=True)
df_image_val = pd.read_json("/kaggle/input/facebook-hateful-meme-dataset/data/dev.jsonl",lines = True)

# Get image paths
def load_image_paths(df):
    image_paths = [os.path.join("/kaggle/input/facebook-hateful-meme-dataset/data", img_path) for img_path in df['img']]
    return image_paths

# Prepare data
X_train_images = load_image_paths(df_image_train)
y_train = df_image_train['label']

X_val_images = load_image_paths(df_image_val)
y_val = df_image_val['label']

# Dataset
class ImageDataset(Dataset):
    def __init__(self, image_paths, labels, transforms=None):
        self.image_paths = image_paths
        self.labels = labels
        self.transforms = transforms

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        image_path = self.image_paths[idx]
        image = Image.open(image_path).convert('RGB')
        if self.transforms:
            image = self.transforms(image)
        label = self.labels[idx]
        return image, label

# Define the transformations
transform = T.Compose([
    T.Resize((224, 224)),  # Resize to the size expected by DINO
    T.ToTensor(),
])

# Create the datasets
train_dataset = ImageDataset(X_train_images, y_train, transforms=transform)

# Create the dataloaders
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# Create the datasets
val_dataset = ImageDataset(X_val_images, y_train, transforms=transform)

# Create the dataloaders
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=True)

# Load pretrained models
extractor = AutoFeatureExtractor.from_pretrained("facebook/dino-vits8")
model = AutoModel.from_pretrained("facebook/dino-vits8")

# Add a classification head to the model
num_classes = 2
model.classifier = nn.Linear(model.config.hidden_size, num_classes)

# Set up loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.00001)

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Training loop
num_epochs = 20
model.train()

for epoch in range(num_epochs):
    for images, labels in train_dataloader:
        images = images.to(device)
        labels = labels.to(device)

        outputs = model(images)
        logits = model.classifier(outputs.pooler_output)

        loss = criterion(logits, labels)

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    
    print(f"Epoch: {epoch}, Loss: {loss.item()}")
    
    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():  # Do not calculate gradients (saves memory and computation)
        all_labels = []
        all_predictions = []

        for images, labels in val_dataloader:
            images = images.to(device)
            labels = labels.to(device)

            outputs = model(images)
            logits = model.classifier(outputs.pooler_output)

            _, predicted = torch.max(logits, 1)
            all_labels.extend(labels.cpu().numpy())
            all_predictions.extend(predicted.cpu().numpy())

        precision = precision_score(all_labels, all_predictions, average='macro')
        recall = recall_score(all_labels, all_predictions, average='macro')
        accuracy = accuracy_score(all_labels, all_predictions)

        print(f"Validation Precision: {precision}")
        print(f"Validation Recall: {recall}")
        print(f"Validation Accuracy: {accuracy}")
        
        if (epoch + 1) % 4 == 0:
            torch.save(model.state_dict(), f"vit_final_{epoch+1}.pth")


Downloading (…)rocessor_config.json:   0%|          | 0.00/244 [00:00<?, ?B/s]

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


Downloading (…)lve/main/config.json:   0%|          | 0.00/452 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/86.8M [00:00<?, ?B/s]

Some weights of ViTModel were not initialized from the model checkpoint at facebook/dino-vits8 and are newly initialized: ['pooler.dense.weight', 'pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch: 0, Loss: 0.7275950908660889
Validation Precision: 0.5081126687435099
Validation Recall: 0.5053333333333333
Validation Accuracy: 0.682
Epoch: 1, Loss: 0.5587728023529053
Validation Precision: 0.5065968630596778
Validation Recall: 0.5066666666666666
Validation Accuracy: 0.628
Epoch: 2, Loss: 0.6088539958000183
Validation Precision: 0.5328947368421053
Validation Recall: 0.532
Validation Accuracy: 0.654
Epoch: 3, Loss: 0.2793712615966797
Validation Precision: 0.5146775632896529
Validation Recall: 0.5066666666666667
Validation Accuracy: 0.708
Epoch: 4, Loss: 0.13934367895126343
Validation Precision: 0.5165762507534659
Validation Recall: 0.5146666666666667
Validation Accuracy: 0.656
Epoch: 5, Loss: 0.16861902177333832
Validation Precision: 0.49588382507903056
Validation Recall: 0.4946666666666667
Validation Accuracy: 0.538
Epoch: 6, Loss: 0.4310924708843231
Validation Precision: 0.5077962577962578
Validation Recall: 0.508
Validation Accuracy: 0.626
Epoch: 7, Loss: 0.000676225521601736

In [13]:
#### Code for Validation
# Path of the saved model
model_path = "vit_final_20.pth"  # modify this to your actual path

# Initialize the same model architecture
model = AutoModel.from_pretrained("facebook/dino-vits8")
model.classifier = nn.Linear(model.config.hidden_size, num_classes)
model = model.to(device)

# Load the saved model parameters
model.load_state_dict(torch.load(model_path))

df_image_test = pd.read_json("/kaggle/input/facebook-hateful-meme-dataset/data/dev.jsonl", lines=True)
X_test_images = load_image_paths(df_image_test)
y_test = df_image_test['label']

# Create the test dataset and dataloader, assuming that you have already loaded the test data
# into X_test_images and y_test
test_dataset = ImageDataset(X_test_images, y_test, transforms=transform)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Testing loop
# Testing loop
model.eval()  # Set the model to evaluation mode
with torch.no_grad():  # Do not calculate gradients (saves memory and computation)
    all_labels = []
    all_predictions = []
    all_probabilities = []

    for images, labels in test_dataloader:
        images = images.to(device)
        labels = labels.to(device)

        outputs = model(images)
        logits = model.classifier(outputs.pooler_output)

        probabilities = nn.functional.softmax(logits, dim=1)  # probabilities
        _, predicted = torch.max(logits, 1)  # predicted class
        
        all_labels.extend(labels.cpu().numpy())
        all_predictions.extend(predicted.cpu().numpy())
        all_probabilities.extend(probabilities[:, 1].cpu().numpy())  # probabilities of class 1

    precision = precision_score(all_labels, all_predictions, average='macro')
    recall = recall_score(all_labels, all_predictions, average='macro')
    accuracy = accuracy_score(all_labels, all_predictions)
    roc_auc = roc_auc_score(all_labels, all_probabilities)  # calculate roc_auc

    print(f"Validation Precision: {precision}")
    print(f"Validation Recall: {recall}")
    print(f"Validation Accuracy: {accuracy}")
    print(f"Validation AUROC: {roc_auc}")

Some weights of ViTModel were not initialized from the model checkpoint at facebook/dino-vits8 and are newly initialized: ['pooler.dense.weight', 'pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Validation Precision: 0.5213476536556672
Validation Recall: 0.518
Validation Accuracy: 0.518
Validation AUROC: 0.512832


In [12]:
#### Code for Testing
# Path of the saved model
model_path = "vit_final_20.pth"  # modify this to your actual path

# Initialize the same model architecture
model = AutoModel.from_pretrained("facebook/dino-vits8")
model.classifier = nn.Linear(model.config.hidden_size, num_classes)
model = model.to(device)

# Load the saved model parameters
model.load_state_dict(torch.load(model_path))

df_image_test = pd.read_json("/kaggle/input/dev-seen-memes/dev_seen.jsonl", lines=True)
X_test_images = load_image_paths(df_image_test)
y_test = df_image_test['label']

# Create the test dataset and dataloader, assuming that you have already loaded the test data
# into X_test_images and y_test
test_dataset = ImageDataset(X_test_images, y_test, transforms=transform)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Testing loop
# Testing loop
model.eval()  # Set the model to evaluation mode
all_predictions_vit = []
with torch.no_grad():  # Do not calculate gradients (saves memory and computation)
    all_labels = []
    all_predictions = []
    all_probabilities = []

    for images, labels in test_dataloader:
        images = images.to(device)
        labels = labels.to(device)

        outputs = model(images)
        logits = model.classifier(outputs.pooler_output)

        probabilities = nn.functional.softmax(logits, dim=1)  # probabilities
        _, predicted = torch.max(logits, 1)  # predicted class
        
        all_labels.extend(labels.cpu().numpy())
        all_predictions.extend(predicted.cpu().numpy())
        all_probabilities.extend(probabilities[:, 1].cpu().numpy())  # probabilities of class 1

    precision = precision_score(all_labels, all_predictions, average='macro')
    recall = recall_score(all_labels, all_predictions, average='macro')
    accuracy = accuracy_score(all_labels, all_predictions)
    roc_auc = roc_auc_score(all_labels, all_probabilities)  # calculate roc_auc

    print(f"Test Precision: {precision}")
    print(f"Test Recall: {recall}")
    print(f"Test Accuracy: {accuracy}")
    print(f"Test AUROC: {roc_auc}")
    all_predictions_vit = all_predictions

Some weights of ViTModel were not initialized from the model checkpoint at facebook/dino-vits8 and are newly initialized: ['pooler.dense.weight', 'pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Test Precision: 0.5209017248904153
Test Recall: 0.517626538221504
Test Accuracy: 0.52
Test AUROC: 0.5114016418364244
