# Project Overview

This notebook implements an advanced benchmark of four distinct Vision Language Models (VLMs) on the Hateful Memes Challenge Dataset (HMCD). This version loads the dataset from local files, uses a balanced dataset sample, and evaluates models from four different families to capture classification.

In [1]:
import ollama
import pandas as pd
import os

## Step 1: Load Dataset from Local Files & Prepare Sample

In [2]:
DATASET_FOLDER = 'data' 
ANNOTATION_FILE = os.path.join(DATASET_FOLDER, 'dev.jsonl')
IMG_DIR = os.path.join(DATASET_FOLDER, 'img')

In [3]:
if not os.path.exists(ANNOTATION_FILE):
    print(f"Error'")
else:
    df = pd.read_json(ANNOTATION_FILE, lines=True)
    # Create the full path to each image file
    df['img_path'] = df['img'].apply(lambda x: os.path.join(IMG_DIR, x))
    print("Local dataset loaded successfully.")


Local dataset loaded successfully.


### Balanced Sampling Strategy

In [4]:
df['label_text'] = df['label'].map({0: 'non-hateful', 1: 'hateful'})

In [5]:
hateful_df = df[df['label'] == 1].sample(n=250, random_state=42)

In [6]:
non_hateful_df = df[df['label'] == 0].sample(n=250, random_state=42)

In [7]:
df_sample = pd.concat([hateful_df, non_hateful_df])
df_sample = df_sample.sample(frac=1, random_state=50).reset_index(drop=True)

In [8]:
df_sample.head()

Unnamed: 0,id,img,label,text,img_path,label_text
0,39018,img/39018.png,0,"she said ""i want a ring"" i said ""bitch take yo...",data/img/img/39018.png,non-hateful
1,62703,img/62703.png,0,they shot the wrong gorilla,data/img/img/62703.png,non-hateful
2,54129,img/54129.png,0,dump pipe to the ocean,data/img/img/54129.png,non-hateful
3,1742,img/01742.png,0,in just one hour from now i'll only have 4 hou...,data/img/img/01742.png,non-hateful
4,85237,img/85237.png,0,these are all the weapons that were seized fro...,data/img/img/85237.png,non-hateful


In [9]:
print(f"Created a balanced test set with {len(df_sample)} samples.")
print("Label distribution:")
print(df_sample['label_text'].value_counts())

Created a balanced test set with 500 samples.
Label distribution:
label_text
non-hateful    250
hateful        250
Name: count, dtype: int64


## Step 2: Playing with the Prompt

In [10]:
def create_zero_shot_prompt():
    """Asks for a direct classification."""
    return (
    "You are a content moderation AI. Your task is to classify a meme as either 'hateful' or 'non-hateful'. "
    "Analyze both the image and the accompanying text.\n"
    "Respond in *exactly* this format (no explanation before or after):\n"
    "Classification: hateful or non-hateful\n"
    "Justification: <your short reason>\n"
    "Example:\n"
    "Classification: non-hateful\n"
    "Justification: The meme contains no offensive content and appears neutral.\n"
)


In [11]:
def create_cot_prompt():
    """Guide the model to reason step-by-step before classifying."""
    return (
        "Perform the following steps:\n"
        "1. Briefly describe the visual elements in the image.\n"
        "2. Analyze the text on the meme.\n"
        "3. Consider the combined meaning of the visual elements and the text.\n\n"
        "Based on your step-by-step analysis, classify the meme as 'hateful' or 'non-hateful'. "
        "Your final response must be only the single word classification."
    )


In [12]:
prompt_template = create_zero_shot_prompt()
# prompt_template = create_cot_prompt()

In [13]:
# def parse_response(response_text):
#     cleaned_text = response_text.lower().strip()

#     classification = 'error'
#     justification = '' ''## will modify this later. Adding for debugging

#     if 'justification:' in cleaned_text:
#         try:
#             parts = cleaned_text.split('justification:', 1)
#             classification_part = parts[0]
#             justification = parts[1].strip()
#         except IndexError:
#             classification_part = cleaned_text
#             justification = "Could not parse justification part."
#     else:
#         classification_part = cleaned_text

#     if 'non-hateful' in classification_part:
#         classification = 'non-hateful'
#     elif 'hateful' in classification_part:
#         classification = 'hateful'
        
#     return classification, justification

In [14]:
## Parse function v3
import re

def parse_response(response_text):
    """
    A robust function to extract classification and justification using regular expressions.
    """
    cleaned_text = response_text.lower().strip()

    classification = 'error'
    justification = ''

    classification_match = re.search(r"classification:\s*(hateful|non-hateful)", cleaned_text) 
    
    if classification_match:
        classification = classification_match.group(1).strip()
    else:
        if 'non-hateful' in cleaned_text:
            classification = 'non-hateful'
        elif 'hateful' in cleaned_text:
            classification = 'hateful'

    justification_match = re.search(r"justification:\s*(.*)", cleaned_text, re.DOTALL)
    if justification_match:
        justification = justification_match.group(1).strip()
    elif not justification and classification != 'error':
        justification = "Justification not provided in response."
    else:
        justification = cleaned_text # For 'error' cases, store the raw output for debugging

    return classification, justification



In [15]:
# def parse_response(response_text): v2
#     cleaned_text = response_text.lower().strip()

#     classification = 'error'
#     justification = ''

#     # Try to extract classification line
#     if 'classification:' in cleaned_text:
#         try:
#             classification_line = cleaned_text.split('classification:')[1].split('\n')[0].strip()
#             if 'non-hateful' in classification_line:
#                 classification = 'non-hateful'
#             elif 'hateful' in classification_line:
#                 classification = 'hateful'
#         except Exception:
#             classification = 'error'

#     # Fallback: try first word
#     # if classification == 'error':
#     #     if cleaned_text.startswith('non-hateful'):
#     #         classification = 'non-hateful'
#     #     elif cleaned_text.startswith('hateful'):
#     #         classification = 'hateful'

#     # Extract justification
#     if 'justification:' in cleaned_text:
#         try:
#             justification = cleaned_text.split('justification:')[1].strip()
#         except IndexError:
#             justification = 'Could not parse justification.'

#     return classification, justification


In [16]:
# def classify_with_ollama(model_name, image_path, prompt):
#     try:
#         response = ollama.chat(
#             model=model_name,
#             messages=[{'role': 'user', 'content': prompt, 'images': [image_path]}]
#         )
#         content = response['message']['content']
#         pred = parse_response(content)
#         return pred, content
#     except Exception as e:
#         return 'error', 'error'


In [17]:
def classify_with_ollama(model_name, image_path, prompt):
    try:
        response = ollama.chat(model=model_name, messages=[{'role': 'user', 'content': prompt, 'images': [image_path]}])
        return parse_response(response['message']['content'])
    except Exception as e:
        return 'error', f"Ollama API Error: {str(e)}"

In [18]:
# import base64

In [19]:
# def image_to_base64(image_path):
#     with open(image_path, "rb") as image_file:
#         return base64.b64encode(image_file.read()).decode('utf-8')

In [20]:
from tqdm import tqdm

In [21]:
models_to_test = {
    'qwen2.5vl:7b': classify_with_ollama
}
results_data = {model: [] for model in models_to_test}

for model_name, classification_func in models_to_test.items():
    print(f"\n--- Benchmarking model: {model_name} ---")
    predictions = []
    justifications = []
    for index, row in tqdm(df_sample.iterrows(), total=len(df_sample), desc=f"Processing {model_name}"):
        pred, just = classification_func(model_name, row['img_path'], prompt_template)
        predictions.append(pred)
        justifications.append(just)

    df_sample[f'prediction_{model_name}'] = predictions
    df_sample[f'justification_{model_name}'] = justifications

# for model_name, data in results_data.items():
#     df_sample[f'prediction_{model_name}'] = [item['prediction'] for item in data]
#     df_sample[f'justification_{model_name}'] = [item['justification'] for item in data]

#     df_sample['prediction_{model_name}'] = df_sample['prediction_{model_name}'].apply(lambda x: x[0] if isinstance(x, tuple) else x)
#     df_sample['justification_{model_name}'] = df_sample['justification_{model_name}'].apply(lambda x: x[1] if isinstance(x, tuple) else x)


print("\n--- Benchmark Complete! ---")
# display(df_sample[['id', 'label_text'] + [f'prediction_{model}' for model in models_to_test.keys()]].head())

display(df_sample.head())



--- Benchmarking model: qwen2.5vl:7b ---


Processing qwen2.5vl:7b: 100%|██████████| 500/500 [2:01:39<00:00, 14.60s/it]  


--- Benchmark Complete! ---





Unnamed: 0,id,img,label,text,img_path,label_text,prediction_qwen2.5vl:7b,justification_qwen2.5vl:7b
0,39018,img/39018.png,0,"she said ""i want a ring"" i said ""bitch take yo...",data/img/img/39018.png,non-hateful,non-hateful,the meme contains no offensive content and app...
1,62703,img/62703.png,0,they shot the wrong gorilla,data/img/img/62703.png,non-hateful,non-hateful,the meme is a tribute to a gorilla and does no...
2,54129,img/54129.png,0,dump pipe to the ocean,data/img/img/54129.png,non-hateful,non-hateful,the image and text do not contain any offensiv...
3,1742,img/01742.png,0,in just one hour from now i'll only have 4 hou...,data/img/img/01742.png,non-hateful,non-hateful,the meme contains no offensive content and app...
4,85237,img/85237.png,0,these are all the weapons that were seized fro...,data/img/img/85237.png,non-hateful,non-hateful,the meme depicts a collection of weapons seize...


In [22]:
ground_truth = df_sample['label_text']


In [23]:
from sklearn.metrics import classification_report

for model_name in models_to_test.keys():
    print(f"\n--- Evaluation Report for: {model_name} ---")
    model_predictions = df_sample[f'prediction_{model_name}']
    model_predictions = model_predictions.apply(lambda x: x[0] if isinstance(x, tuple) else x)

    report = classification_report(
        ground_truth, 
        model_predictions, 
        labels=['hateful', 'non-hateful'], 
        zero_division=0)
    print(report)


--- Evaluation Report for: qwen2.5vl:7b ---
              precision    recall  f1-score   support

     hateful       0.70      0.34      0.45       250
 non-hateful       0.56      0.86      0.68       250

    accuracy                           0.60       500
   macro avg       0.63      0.60      0.57       500
weighted avg       0.63      0.60      0.57       500



In [24]:
# def analyze_errors_with_justification(model_name):
#     error_df = df_sample[df_sample['label_text'] != df_sample[f'prediction_{model_name}']]
#     print(f"\n--- Error Analysis for {model_name} ---")
#     print(f"Found {len(error_df)} errors out of {len(df_sample)} samples.")

In [25]:
columns_to_keep = ['id', 'text', 'label_text']
for model_name in models_to_test.keys():
    columns_to_keep.append(f'prediction_{model_name}')
    columns_to_keep.append(f'justification_{model_name}')

In [26]:
full_results_df = df_sample[columns_to_keep]

In [27]:
full_results_df.to_csv('full_benchmark_results with justification qwen.csv', index=False)