# Project Overview

This notebook implements a benchmark of four distinct Vision Language Models (VLMs) on the Hateful Memes Challenge Dataset (HMCD). This version loads the dataset from local files, draws a class-balanced sample, and is designed to evaluate models from four different families so that their hateful / non-hateful classification performance can be compared. In its current state only `llava:7b` is wired into the benchmark loop.

In [1]:
import ollama
import pandas as pd
import os

## Step 1: Load Dataset from Local Files & Prepare Sample

In [2]:
# Dataset locations, all expressed relative to a single root folder.
DATASET_FOLDER = 'data'
# Folder that the image paths are resolved against.
IMG_DIR = os.path.join(DATASET_FOLDER, 'img')
# JSON-lines annotation file (read with lines=True when the data is loaded).
ANNOTATION_FILE = os.path.join(DATASET_FOLDER, 'dev.jsonl')

In [3]:
# Load the annotation file (one JSON object per line) into `df`.
# Fail fast when it is missing: every later cell needs `df`, so merely printing
# a message here would only postpone the failure to a confusing NameError.
if not os.path.exists(ANNOTATION_FILE):
    raise FileNotFoundError(
        f"Annotation file not found: {ANNOTATION_FILE!r}. "
        f"Place the dataset files under {DATASET_FOLDER!r} and re-run this cell."
    )

df = pd.read_json(ANNOTATION_FILE, lines=True)
# Create the full path to each image file
# NOTE(review): the 'img' values already start with 'img/' (e.g. 'img/93541.png'),
# so joining them with IMG_DIR yields 'data/img/img/93541.png'. Confirm this
# matches the on-disk layout; otherwise join with DATASET_FOLDER instead.
df['img_path'] = df['img'].apply(lambda x: os.path.join(IMG_DIR, x))

# Surface unreadable image paths now rather than as per-row 'error' predictions later.
missing_images = int((~df['img_path'].map(os.path.exists)).sum())
if missing_images:
    print(f"Warning: {missing_images} of {len(df)} image files were not found on disk.")
print("Local dataset loaded successfully.")


Local dataset loaded successfully.


### Balanced Sampling Strategy

In [4]:
# Attach a human-readable class name next to the numeric label.
df['label_text'] = df['label'].map({
    0: 'non-hateful',
    1: 'hateful',
})

In [5]:
# Draw 250 hateful rows (label == 1) with a fixed seed so the sample is reproducible.
hateful_df = df.loc[df['label'].eq(1)].sample(n=250, random_state=42)

In [6]:
# Draw 250 non-hateful rows (label == 0) with the same fixed seed.
non_hateful_df = df.loc[df['label'].eq(0)].sample(n=250, random_state=42)

In [7]:
# Stack both classes, shuffle the rows deterministically, and renumber the index.
df_sample = (
    pd.concat([hateful_df, non_hateful_df])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [8]:
# Preview the first rows of the shuffled, balanced sample.
df_sample.head()

Unnamed: 0,id,img,label,text,img_path,label_text
0,93541,img/93541.png,0,the jihad squad,data/img/img/93541.png,non-hateful
1,83946,img/83946.png,1,when a white kid shoots up the school the same...,data/img/img/83946.png,hateful
2,62703,img/62703.png,0,they shot the wrong gorilla,data/img/img/62703.png,non-hateful
3,85761,img/85761.png,1,no wonder why the pipes are not working,data/img/img/85761.png,hateful
4,39578,img/39578.png,1,one of the bullets they dug out of a victim fr...,data/img/img/39578.png,hateful


In [9]:
# Report the sample size and the per-class counts as a sanity check on the balance.
print(
    f"Created a balanced test set with {len(df_sample)} samples.",
    "Label distribution:",
    df_sample['label_text'].value_counts(),
    sep="\n",
)

Created a balanced test set with 500 samples.
Label distribution:
label_text
non-hateful    250
hateful        250
Name: count, dtype: int64


## Step 2: Playing with the Prompt

In [10]:
def create_zero_shot_prompt():
    """Build the zero-shot prompt.

    The prompt asks for a 'hateful' / 'non-hateful' label followed by a
    one-sentence justification, and shows two example replies in the
    requested format.

    Returns:
        str: The complete prompt text.
    """
    role_and_task = (
        "You are an expert content moderation analyst. Your task is to determine "
        "if the provided meme is 'hateful' or 'non-hateful'. "
    )
    justification_rule = (
        "After your classification, you MUST provide a single sentence "
        "justification for your decision. "
    )
    response_format = "Format your response as: 'CLASSIFICATION. Justification: YOUR_REASON_HERE.'"
    example_replies = [
        "'hateful. Justification: The text uses a derogatory slur against a protected group.'",
        "'non-hateful. Justification: The meme is a harmless joke about animals and does not target any group.'",
    ]
    # Examples follow a blank line, one per line, each prefixed with "Example: ".
    examples_block = "\n\n" + "\n".join("Example: " + reply for reply in example_replies)
    return role_and_task + justification_rule + response_format + examples_block

In [11]:
def create_cot_prompt():
    """Build the chain-of-thought prompt.

    The prompt lists three numbered analysis steps and then asks for a final
    answer consisting only of the single-word classification.

    Returns:
        str: The complete prompt text.
    """
    analysis_steps = [
        "Briefly describe the visual elements in the image.",
        "Analyze the text on the meme.",
        "Consider the combined meaning of the visual elements and the text.",
    ]
    numbered_steps = "\n".join(
        f"{number}. {step}" for number, step in enumerate(analysis_steps, start=1)
    )
    final_instruction = (
        "Based on your step-by-step analysis, classify the meme as 'hateful' or 'non-hateful'. "
        "Your final response must be only the single word classification."
    )
    return "Perform the following steps:\n" + numbered_steps + "\n\n" + final_instruction


In [12]:
# Select the prompt used for every request in the benchmark loop.
# The zero-shot prompt is active; swap the comment to use the chain-of-thought prompt.
prompt_template = create_zero_shot_prompt()
# prompt_template = create_cot_prompt()

In [14]:
def parse_response(response_text):
    """Extract the predicted label and its justification from a raw model reply.

    The zero-shot prompt asks for replies shaped like
    ``hateful. Justification: <reason>.``. Matching is case-insensitive, and
    leading quotes or markdown emphasis characters wrapped around the answer
    are ignored when looking for the label.

    Args:
        response_text: Raw text returned by the model. ``None`` is treated as
            an empty reply.

    Returns:
        tuple[str, str]: ``(classification, justification)``.
        ``classification`` is 'hateful', 'non-hateful', or 'error' when the
        reply does not start with one of the two labels. ``justification`` is
        the lower-cased text after the first 'justification:' marker, or ''
        when the marker is absent.
    """
    cleaned_text = (response_text or '').lower().strip()

    # partition() never raises; `marker` is empty when the reply has no justification.
    _, marker, tail = cleaned_text.partition('justification:')
    justification = tail.strip() if marker else ''

    # Models often echo the quotes from the prompt's examples or add markdown
    # emphasis, so drop that leading decoration before matching the label.
    label_text = cleaned_text.lstrip('\'"`*_ \t\n')
    if label_text.startswith('non-hateful'):
        classification = 'non-hateful'
    elif label_text.startswith('hateful'):
        classification = 'hateful'
    else:
        classification = 'error'

    # Return a 2-tuple; the previous `classification. justification` was an
    # attribute lookup on a str and raised AttributeError on every call.
    return classification, justification

In [15]:
def classify_with_ollama(model_name, image_path, prompt):
    """Classify one image with a model served by Ollama.

    Sends a single user message containing the prompt and the image path to
    ``ollama.chat`` and parses the reply text with ``parse_response``.

    Args:
        model_name: Name of the Ollama model to query (e.g. 'llava:7b').
        image_path: Path to the image, passed through in the message's
            'images' list.
        prompt: Prompt text sent as the message content.

    Returns:
        tuple[str, str]: ``(classification, justification)``. On any failure
        the classification is 'error' and the justification slot carries the
        exception type and message, so the return shape is always a 2-tuple.
    """
    try:
        response = ollama.chat(
            model=model_name,
            messages=[{'role': 'user', 'content': prompt, 'images': [image_path]}],
        )
        return parse_response(response['message']['content'])
    except Exception as e:
        # Best-effort by design: one failed request must not abort a long
        # benchmark run. Keep the cause instead of discarding it so that
        # 'error' rows can be diagnosed afterwards.
        return 'error', f"{type(e).__name__}: {e}"

In [None]:
# import base64

In [None]:
# def image_to_base64(image_path):
#     with open(image_path, "rb") as image_file:
#         return base64.b64encode(image_file.read()).decode('utf-8')

In [67]:
from tqdm import tqdm

In [None]:
# Registry of models to benchmark: model name -> function that classifies one image.
models_to_test = {
    'llava:7b': classify_with_ollama
}
results_data = {model: [] for model in models_to_test}

for model_name, classification_func in models_to_test.items():
    print(f"\n--- Benchmarking model: {model_name} ---")
    for _, row in tqdm(df_sample.iterrows(), total=len(df_sample), desc=f"Processing {model_name}"):
        # Ollama-backed classifiers also need the model name; other backends
        # are called with just the image path and the prompt.
        if 'ollama' in classification_func.__name__:
            result = classification_func(model_name, row['img_path'], prompt_template)
        else:
            result = classification_func(row['img_path'], prompt_template) ## for gemini or claude. will use later

        # Normalise the return shape: a classifier may hand back a
        # (prediction, justification) tuple or a bare prediction string.
        # Resetting the justification per row avoids reusing a stale value
        # (or hitting a NameError) when none is returned.
        if isinstance(result, tuple):
            pred, just = result
        else:
            pred, just = result, ''
        results_data[model_name].append({'prediction': pred, 'justification': just})

# Attach each model's predictions and justifications as new columns on the sample.
for model_name, data in results_data.items():
    df_sample[f'prediction_{model_name}'] = [item['prediction'] for item in data]
    df_sample[f'justification_{model_name}'] = [item['justification'] for item in data]

print("\n--- Benchmark Complete! ---")
display(df_sample[['id', 'label_text'] + [f'prediction_{model}' for model in models_to_test.keys()]].head())




--- Benchmarking model: llava:7b ---


Processing llava:7b: 100%|██████████| 250/250 [46:59<00:00, 11.28s/it]


--- Benchmark Complete! ---





Unnamed: 0,id,label_text,prediction_llava:7b
0,49360,non-hateful,non-hateful
1,7198,hateful,hateful
2,92738,hateful,error
3,50261,hateful,error
4,43175,hateful,non-hateful


In [69]:
# Reference labels ('hateful' / 'non-hateful') that the predictions are scored against.
ground_truth = df_sample['label_text']


In [70]:
from sklearn.metrics import classification_report

# Score every benchmarked model against the ground-truth labels.
for model_name in models_to_test.keys():
    print(f"\n--- Evaluation Report for: {model_name} ---")
    model_predictions = df_sample[f'prediction_{model_name}']

    # Predictions outside the two real classes (e.g. 'error') are excluded
    # from the table by `labels=` below, yet they still count as misses for
    # recall. Report how many there are so the numbers can be interpreted.
    n_invalid = int((~model_predictions.isin(['hateful', 'non-hateful'])).sum())
    print(f"Failed or unparseable predictions: {n_invalid} of {len(model_predictions)}")

    report = classification_report(ground_truth, model_predictions, labels=['hateful', 'non-hateful'], zero_division=0)
    print(report)


--- Evaluation Report for: llava:7b ---
              precision    recall  f1-score   support

     hateful       0.49      0.45      0.47       125
 non-hateful       0.51      0.38      0.44       125

   micro avg       0.50      0.42      0.45       250
   macro avg       0.50      0.42      0.45       250
weighted avg       0.50      0.42      0.45       250



In [71]:
# Results view: the true label followed by every column whose name contains 'prediction'.
columns_to_show = ['label_text']
columns_to_show.extend(name for name in df_sample.columns if 'prediction' in name)
full_results_df = df_sample[columns_to_show]
display(full_results_df)


Unnamed: 0,label_text,prediction_llava:7b
0,non-hateful,non-hateful
1,hateful,hateful
2,hateful,error
3,hateful,error
4,hateful,non-hateful
...,...,...
245,hateful,non-hateful
246,hateful,non-hateful
247,hateful,non-hateful
248,non-hateful,non-hateful


In [72]:
# Persist the label/prediction table to CSV in the working directory, without the row index.
full_results_df.to_csv('full_benchmark_results.csv', index=False)