# Project Overview

This notebook implements a benchmark of four distinct Vision Language Models (VLMs) on the Hateful Memes Challenge Dataset (HMCD). This version loads the dataset from local files, draws a class-balanced sample (250 hateful and 250 non-hateful memes), and evaluates models from four different families on the binary hateful vs. non-hateful classification task.

In [192]:
import ollama
import pandas as pd
import os

## Step 1: Load Dataset from Local Files & Prepare Sample

In [193]:
# Folder that holds the local dataset files; the two paths below are built from it.
DATASET_FOLDER = 'data' 
# Annotation file, read as JSON-lines by the loading cell.
ANNOTATION_FILE = os.path.join(DATASET_FOLDER, 'dev.jsonl')
# Directory that is joined in front of each row's `img` value to build `img_path`.
IMG_DIR = os.path.join(DATASET_FOLDER, 'img')

In [194]:
# Fail fast with an actionable message: without the annotation file `df` is never
# created, and every later cell would otherwise die with an unrelated NameError.
if not os.path.exists(ANNOTATION_FILE):
    raise FileNotFoundError(
        f"Annotation file not found: {ANNOTATION_FILE!r}. "
        f"Place the dataset files under {DATASET_FOLDER!r} before running this notebook."
    )

df = pd.read_json(ANNOTATION_FILE, lines=True)

# Create the full path to each image file.
# NOTE(review): the `img` values already start with 'img/' (see the preview output),
# so the joined paths look like 'data/img/img/<id>.png' - confirm this matches the
# on-disk layout.
df['img_path'] = df['img'].apply(lambda x: os.path.join(IMG_DIR, x))

# Surface broken image paths here instead of as 500 per-row model errors later.
missing_images = int((~df['img_path'].map(os.path.exists)).sum())
if missing_images:
    print(f"Warning: {missing_images} of {len(df)} image files were not found under {IMG_DIR!r}.")

print("Local dataset loaded successfully.")


Local dataset loaded successfully.


### Balanced Sampling Strategy

In [195]:
# Human-readable label column: 0 -> 'non-hateful', 1 -> 'hateful' (any other value maps to NaN).
df['label_text'] = df['label'].map({0: 'non-hateful', 1: 'hateful'})

In [196]:
# Draw 250 hateful memes (label 1) with a fixed seed so the sample is reproducible.
hateful_df = df.loc[df['label'].eq(1)].sample(n=250, random_state=70)

In [197]:
# Draw 250 non-hateful memes (label 0) with the same fixed seed.
non_hateful_df = df.loc[df['label'].eq(0)].sample(n=250, random_state=70)

In [198]:
# Stack both class samples, shuffle the rows with a fixed seed, and renumber the index.
df_sample = (
    pd.concat([hateful_df, non_hateful_df])
    .sample(frac=1, random_state=50)
    .reset_index(drop=True)
)

In [199]:
# Preview the first rows of the shuffled, balanced sample.
df_sample.head()

Unnamed: 0,id,img,label,text,img_path,label_text
0,16704,img/16704.png,0,"last summer camp was so ""lit""",data/img/img/16704.png,non-hateful
1,17950,img/17950.png,0,"i was excoriated for a harmless ""tweet"" and ""m...",data/img/img/17950.png,non-hateful
2,37814,img/37814.png,0,this is el paso shooter's father every. single...,data/img/img/37814.png,non-hateful
3,19243,img/19243.png,0,"surprise!! wait, where is everyone?",data/img/img/19243.png,non-hateful
4,2145,img/02145.png,0,"and then i asked mom, what's for dinner?",data/img/img/02145.png,non-hateful


In [200]:
# Sanity check: total sample size and the per-class counts.
print(f"Created a balanced test set with {len(df_sample)} samples.")
print("Label distribution:", df_sample['label_text'].value_counts(), sep="\n")

Created a balanced test set with 500 samples.
Label distribution:
label_text
non-hateful    250
hateful        250
Name: count, dtype: int64


## Step 2: Playing with the Prompt

In [201]:
def create_zero_shot_prompt():
    """Build the zero-shot prompt that asks for a direct classification.

    The prompt tells the model to answer with a ``Classification:`` line
    followed by a ``Justification:`` line, and ends with a worked example.

    Returns:
        str: The full prompt text, terminated by a newline.
    """
    instruction = (
        "You are a content moderation AI. Your task is to classify a meme as either 'hateful' or 'non-hateful'. "
        "Analyze both the image and the accompanying text."
    )
    prompt_lines = [
        instruction,
        "Respond in *exactly* this format (no explanation before or after):",
        "Classification: hateful or non-hateful",
        "Justification: <your short reason>",
        "Example:",
        "Classification: non-hateful",
        "Justification: The meme contains no offensive content and appears neutral.",
    ]
    # Every line, including the last, is newline-terminated.
    return "\n".join(prompt_lines) + "\n"


In [202]:
def create_cot_prompt():
    """Build the chain-of-thought prompt: reason step by step, then classify.

    Returns:
        str: Three numbered analysis steps, a blank line, and the final
        instruction to answer with only the single-word classification.
    """
    steps = [
        "Briefly describe the visual elements in the image.",
        "Analyze the text on the meme.",
        "Consider the combined meaning of the visual elements and the text.",
    ]
    numbered_steps = "".join(
        f"{number}. {step}\n" for number, step in enumerate(steps, start=1)
    )
    verdict_instruction = (
        "Based on your step-by-step analysis, classify the meme as 'hateful' or 'non-hateful'. "
        "Your final response must be only the single word classification."
    )
    return "Perform the following steps:\n" + numbered_steps + "\n" + verdict_instruction


In [203]:
# Prompt sent with every image in the benchmark loop. The zero-shot prompt is active;
# comment it out and uncomment the line below to run the chain-of-thought variant instead.
prompt_template = create_zero_shot_prompt()
# prompt_template = create_cot_prompt()

In [204]:
# def parse_response(response_text):
#     cleaned_text = response_text.lower().strip()

#     classification = 'error'
#     justification = '' ''## will modify this later. Adding for debugging

#     if 'justification:' in cleaned_text:
#         try:
#             parts = cleaned_text.split('justification:', 1)
#             classification_part = parts[0]
#             justification = parts[1].strip()
#         except IndexError:
#             classification_part = cleaned_text
#             justification = "Could not parse justification part."
#     else:
#         classification_part = cleaned_text

#     if 'non-hateful' in classification_part:
#         classification = 'non-hateful'
#     elif 'hateful' in classification_part:
#         classification = 'hateful'
        
#     return classification, justification

In [None]:
## Parse function v3
import re

# Spellings accepted as the negative class. They are always tested before the bare
# word 'hateful', because 'hateful' is a substring of every one of them.
_NON_HATEFUL_PATTERN = r"non[-\s]?hateful|not\s+hateful"

# "classification" followed by any punctuation/markdown decoration (':', '**', quotes,
# whitespace, ...) and then the label, e.g. "**Classification**: hateful".
_LABELLED_RE = re.compile(r"classification[\W_]*(" + _NON_HATEFUL_PATTERN + r"|hateful)")
_NON_HATEFUL_RE = re.compile(_NON_HATEFUL_PATTERN)

# "justification:" with optional markdown emphasis around the keyword or colon.
_JUSTIFICATION_RE = re.compile(r"justification[\s*_]*:[\s*_]*(.*)", re.DOTALL)


def _find_label(text):
    """Return 'non-hateful', 'hateful' or None for free text (negative form wins)."""
    if _NON_HATEFUL_RE.search(text):
        return 'non-hateful'
    if 'hateful' in text:
        return 'hateful'
    return None


def parse_response(response_text):
    """
    Extract the classification and justification from a raw model response.

    The response is lower-cased before parsing, so both returned strings are
    lower-case (apart from the fixed placeholder message).

    Args:
        response_text: Raw text returned by the model. ``None`` is treated as
            an empty response; other non-string values are converted with ``str``.

    Returns:
        tuple[str, str]: ``(classification, justification)`` where
        ``classification`` is 'hateful', 'non-hateful' or 'error'.
        ``justification`` is the text after "justification:", or
        "Justification not provided in response." when a label was found without
        one, or the whole cleaned response when no label could be parsed (kept
        for debugging).
    """
    if response_text is None:
        response_text = ''
    elif not isinstance(response_text, str):
        response_text = str(response_text)
    cleaned_text = response_text.lower().strip()

    classification = 'error'

    labelled_match = _LABELLED_RE.search(cleaned_text)
    if labelled_match:
        # Normalise 'not hateful' / 'non hateful' / 'nonhateful' to the canonical label.
        classification = 'hateful' if labelled_match.group(1) == 'hateful' else 'non-hateful'
    else:
        # Fallback for free-form answers. Look at the text before the justification
        # first, so a label word mentioned inside the justification cannot override
        # the answer itself; only then scan the whole response.
        answer_part = cleaned_text.split('justification', 1)[0]
        classification = _find_label(answer_part) or _find_label(cleaned_text) or 'error'

    justification_match = _JUSTIFICATION_RE.search(cleaned_text)
    if justification_match:
        justification = justification_match.group(1).strip()
    elif classification != 'error':
        justification = "Justification not provided in response."
    else:
        justification = cleaned_text  # For 'error' cases, store the raw output for debugging

    return classification, justification



In [206]:
# def parse_response(response_text): v2
#     cleaned_text = response_text.lower().strip()

#     classification = 'error'
#     justification = ''

#     # Try to extract classification line
#     if 'classification:' in cleaned_text:
#         try:
#             classification_line = cleaned_text.split('classification:')[1].split('\n')[0].strip()
#             if 'non-hateful' in classification_line:
#                 classification = 'non-hateful'
#             elif 'hateful' in classification_line:
#                 classification = 'hateful'
#         except Exception:
#             classification = 'error'

#     # Fallback: try first word
#     # if classification == 'error':
#     #     if cleaned_text.startswith('non-hateful'):
#     #         classification = 'non-hateful'
#     #     elif cleaned_text.startswith('hateful'):
#     #         classification = 'hateful'

#     # Extract justification
#     if 'justification:' in cleaned_text:
#         try:
#             justification = cleaned_text.split('justification:')[1].strip()
#         except IndexError:
#             justification = 'Could not parse justification.'

#     return classification, justification


In [207]:
# def classify_with_ollama(model_name, image_path, prompt):
#     try:
#         response = ollama.chat(
#             model=model_name,
#             messages=[{'role': 'user', 'content': prompt, 'images': [image_path]}]
#         )
#         content = response['message']['content']
#         pred = parse_response(content)
#         return pred, content
#     except Exception as e:
#         return 'error', 'error'


In [208]:
def classify_with_ollama(model_name, image_path, prompt):
    """Send one image plus prompt to an Ollama model and parse the reply.

    Args:
        model_name: Name of the Ollama model to query.
        image_path: Path of the image attached to the user message.
        prompt: Prompt text sent as the user message content.

    Returns:
        tuple[str, str]: The ``(classification, justification)`` pair produced by
        ``parse_response``. If anything inside the ``try`` raises (the chat call,
        reading the reply, or parsing it), returns ``('error', <message>)`` with
        the message prefixed by "Ollama API Error: ".
    """
    user_message = {'role': 'user', 'content': prompt, 'images': [image_path]}
    try:
        reply = ollama.chat(model=model_name, messages=[user_message])
        raw_text = reply['message']['content']
        return parse_response(raw_text)
    except Exception as exc:
        return 'error', f"Ollama API Error: {exc}"

In [209]:
# import base64

In [210]:
# def image_to_base64(image_path):
#     with open(image_path, "rb") as image_file:
#         return base64.b64encode(image_file.read()).decode('utf-8')

In [211]:
from tqdm import tqdm

In [212]:
# Model name -> function that classifies one image with that model.
models_to_test = {
    'llava:7b': classify_with_ollama
}
results_data = {model: [] for model in models_to_test}

for model_name, classification_func in models_to_test.items():
    print(f"\n--- Benchmarking model: {model_name} ---")

    # One (prediction, justification) pair per sampled meme, in row order.
    outcomes = [
        classification_func(model_name, img_path, prompt_template)
        for img_path in tqdm(
            df_sample['img_path'],
            total=len(df_sample),
            desc=f"Processing {model_name}",
        )
    ]

    # Store this model's results as two new columns on the sample frame.
    df_sample[f'prediction_{model_name}'] = [prediction for prediction, _ in outcomes]
    df_sample[f'justification_{model_name}'] = [justification for _, justification in outcomes]


print("\n--- Benchmark Complete! ---")

display(df_sample.head())



--- Benchmarking model: llava:7b ---


Processing llava:7b: 100%|██████████| 500/500 [55:13<00:00,  6.63s/it]


--- Benchmark Complete! ---





Unnamed: 0,id,img,label,text,img_path,label_text,prediction_llava:7b,justification_llava:7b
0,16704,img/16704.png,0,"last summer camp was so ""lit""",data/img/img/16704.png,non-hateful,non-hateful,"the image shows people at a construction site,..."
1,17950,img/17950.png,0,"i was excoriated for a harmless ""tweet"" and ""m...",data/img/img/17950.png,non-hateful,non-hateful,the meme does not contain any offensive langua...
2,37814,img/37814.png,0,this is el paso shooter's father every. single...,data/img/img/37814.png,non-hateful,non-hateful,the meme is a humorous depiction of a characte...
3,19243,img/19243.png,0,"surprise!! wait, where is everyone?",data/img/img/19243.png,non-hateful,non-hateful,"the meme contains a woman in a room, surrounde..."
4,2145,img/02145.png,0,"and then i asked mom, what's for dinner?",data/img/img/02145.png,non-hateful,hateful,the meme includes text that expresses a desire...


In [213]:
# Reference labels ('hateful' / 'non-hateful') that the model predictions are scored against.
ground_truth = df_sample['label_text']


In [214]:
from sklearn.metrics import classification_report

for model_name in models_to_test:
    print(f"\n--- Evaluation Report for: {model_name} ---")

    # Defensive unwrap: if a cell still holds a (prediction, justification) tuple,
    # score only its first element.
    model_predictions = df_sample[f'prediction_{model_name}'].apply(
        lambda value: value[0] if isinstance(value, tuple) else value
    )

    # Only the two real classes are reported; any other prediction value
    # (e.g. 'error') gets no row of its own.
    report = classification_report(
        ground_truth,
        model_predictions,
        labels=['hateful', 'non-hateful'],
        zero_division=0,
    )
    print(report)


--- Evaluation Report for: llava:7b ---
              precision    recall  f1-score   support

     hateful       0.51      0.40      0.45       250
 non-hateful       0.50      0.61      0.55       250

    accuracy                           0.51       500
   macro avg       0.51      0.51      0.50       500
weighted avg       0.51      0.51      0.50       500



In [None]:
# def analyze_errors_with_justification(model_name):
#     error_df = df_sample[df_sample['label_text'] != df_sample[f'prediction_{model_name}']]
#     print(f"\n--- Error Analysis for {model_name} ---")
#     print(f"Found {len(error_df)} errors out of {len(df_sample)} samples.")

In [216]:
# Identifying columns first, then a prediction/justification pair per benchmarked model.
columns_to_keep = ['id', 'text', 'label_text']
for model_name in models_to_test:
    columns_to_keep.extend(
        [f'prediction_{model_name}', f'justification_{model_name}']
    )

In [217]:
# Keep only the identifying columns plus each model's prediction and justification.
full_results_df = df_sample[columns_to_keep]

In [218]:
# Write the trimmed results to CSV in the working directory; index=False leaves out the row index.
full_results_df.to_csv('full_benchmark_results with justification.csv', index=False)