In [None]:
import base64
import glob
import os
import re
import sys
import time
from pathlib import Path

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI
from tqdm import tqdm

In [None]:
# Load the meme sheet and fail fast if a column that later cells read
# ('id', 'image_name', 'ocr_txt') is absent, so a wrong or renamed sheet is
# reported here instead of partway through the paid API loop.
df = pd.read_csv('Data/New230SHEETS.csv')

missing_columns = {'id', 'image_name', 'ocr_txt'} - set(df.columns)
if missing_columns:
    raise KeyError(
        f"Data/New230SHEETS.csv is missing required columns: {sorted(missing_columns)}"
    )

# Preview only the first rows; the full frame remains available as `df`.
df.head()

In [None]:
# Read the prompt template as UTF-8 text; its {image_text} and {image_path}
# placeholders are substituted per meme further down.
prompt_template = Path('Prompt/filter.md').read_text(encoding='utf-8')

In [None]:
# Preview the exact prompt for the first meme before spending any API calls
test_row = df.iloc[0]
image_text = test_row['ocr_txt']
image_name = test_row['image_name']

# Fill in the placeholders.
# - A missing OCR value is NaN after read_csv; str(NaN) would put the literal
#   word "nan" into the prompt, so substitute an empty string instead.
# - image_name is coerced to str because str.replace rejects non-string
#   replacements (e.g. a purely numeric file name parsed as a number).
final_prompt = prompt_template.replace(
    '{image_text}', '' if pd.isna(image_text) else str(image_text)
)
final_prompt = final_prompt.replace('{image_path}', str(image_name))

# Show the final prompt
print("FINAL PROMPT THAT WILL BE SENT TO OPENAI:")
print("="*60)
print(final_prompt)
print("="*60)

In [None]:
def encode_image_to_base64(image_path):
    """Return the bytes of the file at ``image_path`` as a base64 text string.

    The file is opened in binary mode, read in full, base64-encoded and the
    resulting bytes decoded as UTF-8 so the value can be embedded in text.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode('utf-8')

In [None]:
# Load environment variables (python-dotenv also picks up a local .env file)
load_dotenv()

# Verify the API key BEFORE building the client. The OpenAI constructor
# rejects a missing key with its own error, so a check placed after it would
# never get the chance to show this more actionable message.
if not os.getenv('OPENAI_API_KEY'):
    raise ValueError("OPENAI_API_KEY not found in environment variables. Please create a .env file with your API key.")

# Initialize OpenAI client with API key from environment
client = OpenAI(
    api_key=os.getenv('OPENAI_API_KEY')
)

def _label_from_response(text):
    """Map the model's raw reply onto 'Unsafe', 'Safe' or 'Unclear'."""
    lowered = text.lower()
    # 'unsafe' contains 'safe', so the longer label must be tested first.
    if "unsafe" in lowered:
        return "Unsafe"
    if "safe" in lowered:
        return "Safe"
    return "Unclear"


def classify_meme(row):
    """Classify a single meme using the OpenAI chat completions vision input.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing
            'image_name' (file name under ``Data/Filtered 230/``) and
            'ocr_txt' (text extracted from the image; may be missing/NaN).

    Returns:
        dict with three keys:
            'classification': 'Safe', 'Unsafe', 'Unclear' (reply contained
                neither label) or 'Error' (any step failed).
            'full_response': the stripped model reply, or the exception text
                when classification is 'Error'.
            'tokens_used': total tokens reported by the API, 0 on error.

    Never raises for per-row problems: a missing image file, a malformed row
    or an API failure all yield an 'Error' record so that a batch loop can
    keep going and the failure is visible in the results.
    """
    try:
        # Everything that can fail for a single row lives inside the try,
        # including reading the image from disk and building the prompt.
        image_name = str(row['image_name'])
        image_text = row['ocr_txt']

        # Encode image
        base64_image = encode_image_to_base64(f"Data/Filtered 230/{image_name}")

        # Create prompt by replacing placeholders. A missing OCR value is
        # NaN; str(NaN) would inject the literal word "nan" into the prompt.
        prompt = prompt_template.replace(
            '{image_text}', '' if pd.isna(image_text) else str(image_text)
        )
        prompt = prompt.replace('{image_path}', image_name)

        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": prompt
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                # NOTE: the data URL always declares image/jpeg,
                                # whatever the file's actual extension is.
                                "url": f"data:image/jpeg;base64,{base64_image}",
                                "detail": "high"
                            }
                        }
                    ]
                }
            ],
            max_tokens=10,
            temperature=0
        )

        # content can be None (no text in the reply); treat that as an empty
        # answer, which is reported as 'Unclear' instead of a cryptic
        # AttributeError.
        result = (response.choices[0].message.content or "").strip()

        return {
            'classification': _label_from_response(result),
            'full_response': result,
            'tokens_used': response.usage.total_tokens
        }

    except Exception as e:
        return {
            'classification': 'Error',
            'full_response': str(e),
            'tokens_used': 0
        }

In [None]:
# Process all memes: one classify_meme call per row, collecting plain dict
# records that are turned into a DataFrame afterwards.
results = []
for idx, row in tqdm(df.iterrows(), total=len(df), desc="Processing memes"):
    result = classify_meme(row)
    results.append({
        'id': row['id'],
        'image_name': row['image_name'],
        'ocr_txt': row['ocr_txt'],
        'classification': result['classification'],
        'full_response': result['full_response'],
        'tokens_used': result['tokens_used']
    })

    # Optional: Add delay to respect rate limits
    # (`time` is imported once with the other imports, not on every iteration)
    time.sleep(1)

# Show a short sample rather than echoing every record into the cell output;
# the complete list stays available as `results`.
results[:5]

In [None]:
# Create results DataFrame
results_df = pd.DataFrame(results)

# Save results
results_df.to_csv('meme_classification_results.csv', index=False)

# Print summary. Count each label once with value_counts and report every
# category classify_meme can emit, including 'Unclear', so the per-label
# lines add up to the total.
label_counts = results_df['classification'].value_counts()
print(f"Total processed: {len(results_df)}")
print(f"Safe: {label_counts.get('Safe', 0)}")
print(f"Unsafe: {label_counts.get('Unsafe', 0)}")
print(f"Unclear: {label_counts.get('Unclear', 0)}")
print(f"Errors: {label_counts.get('Error', 0)}")