# Project Overview

This notebook implements a benchmark of four distinct Vision Language Models (VLMs) on the Hateful Memes Challenge Dataset (HMCD). This version loads the dataset from local files, draws a class-balanced sample, and evaluates models from four different families on the binary classification task of labelling each meme as hateful or non-hateful. Note: the run recorded below benchmarks only `llava:7b`.

In [51]:
import ollama
import pandas as pd
import os
from PIL import Image
import matplotlib.pyplot as plt


## Step 1: Load Dataset from Local Files & Prepare Sample

In [52]:
# Root folder of the locally stored dataset; a relative path, so it is resolved
# against the notebook's current working directory.
DATASET_FOLDER = 'data' 
# JSON-lines annotation file that is read into the main DataFrame.
ANNOTATION_FILE = os.path.join(DATASET_FOLDER, 'dev.jsonl')
# Base directory that each row's `img` value is joined onto to build `img_path`.
IMG_DIR = os.path.join(DATASET_FOLDER, 'img')

In [53]:
# Fail fast with an actionable message when the annotation file is missing.
# Printing and carrying on would leave `df` undefined, so the following cells
# would break with an unrelated NameError instead.
if not os.path.exists(ANNOTATION_FILE):
    raise FileNotFoundError(
        f"Annotation file not found: {ANNOTATION_FILE!r} "
        f"(current working directory: {os.getcwd()!r})."
    )

# The annotation file is JSON-lines: one JSON record per line.
df = pd.read_json(ANNOTATION_FILE, lines=True)
# Create the full path to each image file.
# NOTE(review): the sample output shows `img` values such as 'img/49360.png',
# so the joined paths look like 'data/img/img/49360.png' — confirm this matches
# the on-disk layout.
df['img_path'] = df['img'].apply(lambda x: os.path.join(IMG_DIR, x))
print("Local dataset loaded successfully.")


Local dataset loaded successfully.


### Balanced Sampling Strategy

In [54]:
# Human-readable class names for the numeric `label` column (0 / 1).
label_names = {0: 'non-hateful', 1: 'hateful'}
df['label_text'] = df['label'].map(label_names)

In [55]:
# Seeded random draw of 125 rows from the hateful class (label == 1).
is_hateful = df['label'] == 1
hateful_df = df.loc[is_hateful].sample(n=125, random_state=42)

In [56]:
# Seeded random draw of 125 rows from the non-hateful class (label == 0).
is_non_hateful = df['label'] == 0
non_hateful_df = df.loc[is_non_hateful].sample(n=125, random_state=42)

In [57]:
# Stack both class samples, shuffle the rows with a fixed seed, and renumber
# the index from 0 so it no longer reflects the original row positions.
df_sample = (
    pd.concat([hateful_df, non_hateful_df])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [58]:
# Preview the first rows of the shuffled, balanced sample.
df_sample.head()

Unnamed: 0,id,img,label,text,img_path,label_text
0,49360,img/49360.png,0,who wants ice cream me me me,data/img/img/49360.png,non-hateful
1,7198,img/07198.png,1,islam is a religion of peace stop criticizing ...,data/img/img/07198.png,hateful
2,92738,img/92738.png,1,they are the best in basketball because they c...,data/img/img/92738.png,hateful
3,50261,img/50261.png,1,haters will say it's photoshop,data/img/img/50261.png,hateful
4,43175,img/43175.png,1,when was the last time you saw so many mentall...,data/img/img/43175.png,hateful


In [59]:
# Sanity check: report the sample size and how many rows each class contributes.
label_counts = df_sample['label_text'].value_counts()
sample_size = len(df_sample)
print(f"Created a balanced test set with {sample_size} samples.")
print("Label distribution:")
print(label_counts)

Created a balanced test set with 250 samples.
Label distribution:
label_text
non-hateful    125
hateful        125
Name: count, dtype: int64


## Step 2: Playing with the Prompt

In [60]:
def create_zero_shot_prompt():
    """Build the zero-shot prompt.

    Returns:
        str: A two-line instruction that assigns the moderator role, asks for a
        'hateful' / 'non-hateful' classification, and restricts the answer to a
        single word.
    """
    task_instruction = "You are an expert content moderation analyst. Classify the provided meme as either 'hateful' or 'non-hateful'.\n"
    answer_format = "Respond with only a single word: `hateful` or `non-hateful`."
    return task_instruction + answer_format

In [61]:
def create_cot_prompt():
    """Build the chain-of-thought prompt.

    Returns:
        str: Three numbered analysis steps (visuals, text, combined meaning)
        followed by the instruction to answer with the single-word
        classification only.
    """
    analysis_steps = "".join([
        "Perform the following steps:\n",
        "1. Briefly describe the visual elements in the image.\n",
        "2. Analyze the text on the meme.\n",
        "3. Consider the combined meaning of the visual elements and the text.\n\n",
    ])
    verdict_instruction = "".join([
        "Based on your step-by-step analysis, classify the meme as 'hateful' or 'non-hateful'. ",
        "Your final response must be only the single word classification.",
    ])
    return analysis_steps + verdict_instruction


In [62]:
# Prompt sent with every image in the benchmark loop. To run the
# chain-of-thought variant instead, comment out the first line and uncomment
# the second.
prompt_template = create_zero_shot_prompt()
# prompt_template = create_cot_prompt()

In [63]:
def parse_response(response_text):
    """Extract the classification label from a model's free-text response.

    The negative label is checked first because the word 'hateful' is itself a
    substring of every negative form. Besides the canonical 'non-hateful', the
    common variants 'not hateful', 'non hateful', 'nonhateful', 'non_hateful'
    and Unicode-hyphen spellings are accepted, so they are no longer misread
    as 'hateful'.

    Args:
        response_text: Raw text returned by the model.

    Returns:
        str: 'non-hateful', 'hateful', or 'error' when no label is found or the
        input is not a string.
    """
    import re  # local import keeps this cell runnable on its own

    if not isinstance(response_text, str):
        return 'error'

    cleaned_text = response_text.lower().strip().replace('.', '')

    # "non"/"not", optional separators (whitespace, underscore, ASCII or
    # Unicode hyphen/dash), then "hateful".
    negative_label = r"no[nt][\s_\-\u2010-\u2015]*hateful"
    if re.search(negative_label, cleaned_text):
        return 'non-hateful'
    if 'hateful' in cleaned_text:
        return 'hateful'
    return 'error'

In [64]:
def classify_with_ollama(model_name, image_path, prompt):
    """Classify one image with an Ollama-served model.

    Sends a single user message containing `prompt` and the image at
    `image_path` to `ollama.chat`, then maps the reply to a label with
    `parse_response`.

    Args:
        model_name: Ollama model tag to query (e.g. 'llava:7b').
        image_path: Path of the image passed in the message's `images` list.
        prompt: Instruction text sent as the message content.

    Returns:
        str: The label returned by `parse_response`, or 'error' if the call or
        the response handling raised an exception.
    """
    import logging  # local import keeps this cell runnable on its own

    try:
        response = ollama.chat(
            model=model_name,
            messages=[{'role': 'user', 'content': prompt, 'images': [image_path]}],
        )
        return parse_response(response['message']['content'])
    except Exception as e:
        # Still best-effort (one bad image must not abort the benchmark), but
        # record the cause so failed calls can be told apart from replies that
        # merely could not be parsed.
        logging.getLogger(__name__).warning(
            "Ollama classification failed for model %r on %r: %s: %s",
            model_name, image_path, type(e).__name__, e,
        )
        return 'error'

In [65]:
import base64

In [66]:
def image_to_base64(image_path):
    """Return the contents of the file at `image_path` as a base64 text string.

    The file is read in binary mode, base64-encoded, and the encoded bytes are
    decoded as UTF-8 into a `str`.
    """
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode('utf-8')

In [67]:
from tqdm import tqdm

In [68]:
# Registry of models to benchmark: model tag -> function that classifies one image.
models_to_test = {'llava:7b': classify_with_ollama}
# One (initially empty) list per model; not filled by this cell.
results_data = {name: [] for name in models_to_test}

for model_name, classification_func in models_to_test.items():
    print(f"\n--- Benchmarking model: {model_name} ---")
    # Classifiers with 'ollama' in their function name take the model tag as
    # an extra leading argument; any other classifier gets (image path, prompt).
    takes_model_name = 'ollama' in classification_func.__name__
    progress = tqdm(df_sample.iterrows(), total=len(df_sample), desc=f"Processing {model_name}")
    predictions = []
    for index, row in progress:
        call_args = (row['img_path'], prompt_template)
        if takes_model_name:
            call_args = (model_name,) + call_args
        pred = classification_func(*call_args)
        predictions.append(pred)
    # Store this model's predictions alongside the ground truth.
    df_sample[f'prediction_{model_name}'] = predictions

print("\n--- Benchmark Complete! ---")
prediction_columns = [f'prediction_{model}' for model in models_to_test.keys()]
display(df_sample[['id', 'label_text'] + prediction_columns].head())




--- Benchmarking model: llava:7b ---


Processing llava:7b: 100%|██████████| 250/250 [46:59<00:00, 11.28s/it]


--- Benchmark Complete! ---





Unnamed: 0,id,label_text,prediction_llava:7b
0,49360,non-hateful,non-hateful
1,7198,hateful,hateful
2,92738,hateful,error
3,50261,hateful,error
4,43175,hateful,non-hateful


In [69]:
# Reference labels ('hateful' / 'non-hateful') that the predictions are scored against.
ground_truth = df_sample['label_text']


In [70]:
from sklearn.metrics import classification_report

for model_name in models_to_test.keys():
    print(f"\n--- Evaluation Report for: {model_name} ---")
    model_predictions = df_sample[f'prediction_{model_name}']
    # 'error' predictions are not listed in `labels`, so they get no row in the
    # report and only show up indirectly as lower recall. Print their count so
    # the scores below can be interpreted correctly.
    error_count = int((model_predictions == 'error').sum())
    print(f"Predictions marked 'error' (failed or unparseable): {error_count} of {len(model_predictions)}")
    report = classification_report(
        ground_truth,
        model_predictions,
        labels=['hateful', 'non-hateful'],
        zero_division=0,
    )
    print(report)


--- Evaluation Report for: llava:7b ---
              precision    recall  f1-score   support

     hateful       0.49      0.45      0.47       125
 non-hateful       0.51      0.38      0.44       125

   micro avg       0.50      0.42      0.45       250
   macro avg       0.50      0.42      0.45       250
weighted avg       0.50      0.42      0.45       250



In [71]:
# Keep the ground-truth label plus every column whose name contains 'prediction'.
columns_to_show = ['label_text']
columns_to_show += [col for col in df_sample.columns if 'prediction' in col]
full_results_df = df_sample[columns_to_show]
display(full_results_df)


Unnamed: 0,label_text,prediction_llava:7b
0,non-hateful,non-hateful
1,hateful,hateful
2,hateful,error
3,hateful,error
4,hateful,non-hateful
...,...,...
245,hateful,non-hateful
246,hateful,non-hateful
247,hateful,non-hateful
248,non-hateful,non-hateful


In [72]:
# Write the label / prediction table to a CSV file (relative path, so it lands
# in the current working directory); the DataFrame index is not written.
full_results_df.to_csv('full_benchmark_results.csv', index=False)