In [1]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from openai import OpenAI
import base64
from tqdm import tqdm
import time
import json
import re
from dotenv import load_dotenv
import numpy as np
import pandas as pd
import csv
import shutil

### Across Articles' Image comparison with GPT-4o

Here we first supply the 'training' similarity scores in the form of a matrix for the images given in the train folder<br>The matrix for n images is an n x n *symmetric* matrix with (nC2) comparisons
e.g,
|       | img0  | img1  | 
|-------|-------|-------|
| **img0** | 1.0   | x  | 
| **img1** | x  | 1.0   | 


This similarity matrix serves as the few shot example training for the LLM<br>Feel free to change/add to the training images in the train folder and redefine the training similarity matrix

**Note: api calls charged per usage ~0.5 usd for every 5 image comparisons (5x5 simlarity matrix - 10 comparisons) 

In [21]:
load_dotenv()
openai_api_key = os.getenv('api_key')
api_key = openai_api_key
gpt_client = OpenAI(api_key=openai_api_key)

In [2]:
from google.auth.transport.requests import Request
from google.oauth2.service_account import Credentials
from anthropic import AnthropicVertex

key_path = 'caching-436119-3f7e7f2329ed.json'

credentials = Credentials.from_service_account_file(
    key_path,
    scopes=['https://www.googleapis.com/auth/cloud-platform'])

if credentials.expired:
    credentials.refresh(Request())

PROJECT_ID = 'caching-436119'
REGION = 'us-east5'

claude_client = AnthropicVertex(project_id=PROJECT_ID, region=REGION, credentials=credentials)

In [3]:
def extract_number(filename, article_num=False):
    match = re.search(r'image_(\d+)_(\d+)', filename) 
    group_num = 1 if article_num else 2
    return int(match.group(group_num))

def load_image_as_base64(image_path):
    with open(image_path, 'rb') as img_file:
        return base64.b64encode(img_file.read()).decode('utf-8')

def read_matrix_from_csv(filepath):
    with open(filepath, 'r') as csv_file:
        reader = csv.reader(csv_file)
        matrix = [list(map(float, row)) for row in reader]
    return np.array(matrix)

In [11]:
def find_similar_category(category, train_dir='train', model_type='gpt'):
    categories = [d for d in os.listdir(train_dir) if os.path.isdir(os.path.join(train_dir, d))]
    prompt = f"Which of these categories: {', '.join(categories)} is most similar to '{category}'? You CANNOT choose a category not mentioned in the given list \
    Respond ONLY with the category name without any additional words or punctuation."

    try:
        if model_type == 'gpt':
            response = gpt_client.chat.completions.create(
                model="gpt-4o",  
                messages=[{"role": "user", "content": prompt}],
                max_tokens=10
            )
            similar_category = response.choices[0].message['content'].strip()
        
        elif model_type == 'claude':
            response = claude_client.messages.create(
                model="claude-3-5-sonnet@20240620",
                max_tokens=10,
                stream = False,
                messages=[
                    {
                        "role": "user",
                        "content": prompt,
                    }
                ]
            )
            similar_category = response.content[0].text
    
    except Exception as e:
        print(f"Error in finding similar category: {e}")
        return None
    
    return similar_category

In [6]:
def few_shot(category, static_folder=None, model_type='gpt'):
    if static_folder is None:
        similar_category = find_similar_category(category, model_type=model_type)
        print(f'category similar to {category}: {similar_category}')
        images_path = os.path.join('train', similar_category)
    else:
        images_path = static_folder

    labels_csv = os.path.join(images_path, 'labels.csv')
    similarity_csv = os.path.join(images_path, 'similarity.csv')

    image_paths = sorted(
        [os.path.join(images_path, f) for f in os.listdir(images_path) if f.endswith(('jpg', 'jpeg'))],
        key=extract_number
    )
    
    labels = pd.read_csv(labels_csv)
    similarity_scores = read_matrix_from_csv(similarity_csv)
    few_shot_examples = create_examples(image_paths, labels, similarity_scores, model_type)
    
    return few_shot_examples

def create_examples(image_paths, labels, similarity_scores, model_type='gpt'):
    few_shot_examples = []
    for i in range(len(image_paths)):
        for j in range(i + 1, len(image_paths)):
            base64_img1 = load_image_as_base64(image_paths[i])
            base64_img2 = load_image_as_base64(image_paths[j])
            
            img1_number = extract_number(image_paths[i])
            img2_number = extract_number(image_paths[j])
            
            alt_text1 = labels.loc[labels['image number'] == img1_number, 'alt'].values[0]
            heading1 = labels.loc[labels['image number'] == img1_number, 'article_heading'].values[0]
            alt_text2 = labels.loc[labels['image number'] == img2_number, 'alt'].values[0]
            heading2 = labels.loc[labels['image number'] == img2_number, 'article_heading'].values[0]

            image1_content = {
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{base64_img1}"}
            }
            image2_content = {
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{base64_img2}"}
            }
            
            if model_type == 'claude':
                image1_content = {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": "image/jpeg",
                        "data": base64_img1
                    }
                }
                image2_content = {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": "image/jpeg",
                        "data": base64_img2
                    }
                }

            few_shot_examples.append({
                "role": "user",
                "content": [
                {
                    "type": "text",
                    "text": f"Image 1 Alt Text: {alt_text1}\nImage 1 Heading: {heading1}"
                },
                image1_content,
                {
                    "type": "text",
                    "text": f"Image 2 Alt Text: {alt_text2}\nImage 2 Heading: {heading2}"
                },
                image2_content,
                {
                    "type": "text",
                    "text": f"Question: Compute semantic similarity score (on 0-4 scale) for the above pair of images,\
                        considering the content, context, alt text, and article headings i.e, how replaceable is one image \
                        with the another (0-4 scale). \nAnswer: {similarity_scores[i][j]}"
                }]
            })
    return few_shot_examples



In [7]:
def compare_images(image_paths, labels, few_shot_examples, model_type='gpt'):
    user_messages = []

    for i in range(len(image_paths)):
        for j in range(i + 1, len(image_paths)):
            article_num1 = extract_number(image_paths[i], article_num=True)
            article_num2 = extract_number(image_paths[j], article_num=True)

            if article_num1 == article_num2:
                user_messages.append('')
                continue

            base64_img1 = load_image_as_base64(image_paths[i])
            base64_img2 = load_image_as_base64(image_paths[j])

            if model_type == 'gpt':
                image_content1 = {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_img1}"}}
                image_content2 =  {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_img2}"}}
            elif model_type == 'claude':
                image_content1 = {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": "image/jpeg",
                        "data": base64_img1  
                    }
                }
                image_content2 = {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": "image/jpeg",
                        "data": base64_img2 
                    }
                }
            
            img1_number = extract_number(image_paths[i]) 
            img2_number = extract_number(image_paths[j])
            alt_text1 = labels.loc[(labels['article_number'] == article_num1) & (labels['image number'] == img1_number), 'alt'].values[0]
            heading1 = labels.loc[(labels['article_number'] == article_num1) & (labels['image number'] == img1_number), 'article_heading'].values[0]

            alt_text2 = labels.loc[(labels['article_number'] == article_num2) & (labels['image number'] == img2_number), 'alt'].values[0]
            heading2 = labels.loc[(labels['article_number'] == article_num2) & (labels['image number'] == img2_number), 'article_heading'].values[0]

            user_messages.append({
                "role": "user",
                "content": [
                    {"type": "text", "text": f"Image 1 Alt Text: {alt_text1}\nImage 1 Heading: {heading1}"},
                    image_content1,

                    {"type": "text", "text": f"Image 2 Alt Text: {alt_text2}\nImage 2 Heading: {heading2}"},
                    image_content2, 

                    {"type": "text",
                    "text": "Question: On a scale from 0 to 4 (0: Not replaceable, 1: Slightly replaceable, 2: Moderately replaceable, \
                        3: Very replaceable, 4: Completely replaceable), rate the similarity of these two images based on the images themselves, \
                        their alt text descriptions, and the article headings where they are used. Don't be too generous with the ratings. Please respond with ONLY the numerical score, \
                        without any additional text or punctuation. "}
                ]
            })

    responses = []
    for message in tqdm(user_messages, desc="Processing image pairs"):
        if not message:
            responses.append(0)
            continue
        
        try:    
            if model_type == 'gpt':
                response = gpt_client.chat.completions.create(
                    model="gpt-4o", 
                    messages=few_shot_examples + [message],
                    max_tokens=10
                )
                res = response.choices[0].message['content'].strip()

            elif model_type == 'claude':
                response = claude_client.messages.create(
                    model="claude-3-5-sonnet@20240620",
                    max_tokens=10,
                    messages=[
                        {
                            "role": "user",
                            "content": message['content'],
                        }
                    ],
                    stream=False
                )
                res = response.content[0].text

            score = re.findall(r'-?\d*\.?\d+', res)[0]
            responses.append(int(score))
            time.sleep(15)

        except Exception as e:
            print(e)
            responses.append(0)
        
    return responses

In [8]:
def get_filename(image_path):
    return os.path.splitext(os.path.basename(image_path))[0]

def make_matrix(image_paths, similarity_scores):
    image_names = [get_filename(path) for path in image_paths]
    
    n = len(image_paths)
    similarity_matrix = [[4 if i == j else 0 for j in range(n)] for i in range(n)]
    
    idx = 0
    for i in range(n):
        for j in range(i + 1, n):
            similarity_matrix[i][j] = int(similarity_scores[idx])
            similarity_matrix[j][i] = int(similarity_scores[idx])
            idx += 1
    
    df = pd.DataFrame(similarity_matrix, index=image_names, columns=image_names)
    
    return df

def process_categories(base_dir, static_folder=None, zero_shot=False, model_type='gpt'):
    def get_dirs(base_dir):
        return [d for d in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, d))]

    website_folders = get_dirs(base_dir)
    

    for website in website_folders:
        website_dir = os.path.join(base_dir, website)
        category_folders = get_dirs(website_dir)

        for category in category_folders:
            category_dir = os.path.join(website_dir, category)
            labels = pd.read_csv(os.path.join(category_dir, 'image_data.csv'))

            image_paths = sorted(
                [os.path.join(category_dir, f) for f in os.listdir(category_dir) if f.endswith(('jpg', 'jpeg'))],
                key=lambda x: (extract_number(x, article_num=True), extract_number(x))
            )
            if len(image_paths) < 2:
                continue

            few_shot_examples = []
            if not zero_shot:
                few_shot_examples = few_shot(category, static_folder, model_type=model_type)

            similarity_scores = compare_images(image_paths, labels, few_shot_examples, model_type=model_type)
            similarity_df = make_matrix(image_paths, similarity_scores)

            few_shot_out = 'with_fewshot' if not zero_shot else f'without_fewshot'
            done_testing_dir = os.path.join(base_dir, '..', f'done_testing/{few_shot_out}', website)
            os.makedirs(done_testing_dir, exist_ok=True)

            shutil.move(category_dir, os.path.join(done_testing_dir, category))
            similarity_df.to_csv(f'pred_labels/{few_shot_out}/{model_type}/{website}_{category}.csv')
            
        shutil.rmtree(website_dir)

#### Helper functions to compute RMSE

In [26]:
def compute_rmse_with_filter(pred_df, true_df, tolerance=0):
    valid_pairs_pred = []
    valid_pairs_true = []
    
    columns = pred_df.columns
    for i in range(len(columns)):
        for j in range(i + 1, len(columns)):
            img_i = columns[i].split('_')[1]
            img_j = columns[j].split('_')[1]
            
            if img_i != img_j:
                valid_pairs_pred.append(pred_df.iloc[i, j])
                valid_pairs_true.append(true_df.iloc[i, j])

    if valid_pairs_pred and valid_pairs_true:
        squared_errors = []
        for pred_val, true_val in zip(valid_pairs_pred, valid_pairs_true):
            difference = abs(pred_val - true_val)
            error = max(0, difference - tolerance)
            squared_errors.append(error ** 2)

        if squared_errors:
            mse = np.mean(squared_errors)
            rmse = np.sqrt(mse)
            return rmse/4.0
    return None

def compute_average_rmse(pred_dir, true_dir, tolerance=0):
    rmse_scores = []

    pred_files = [f for f in os.listdir(pred_dir) if f.endswith('.csv')]

    for pred_file in pred_files:
        parts = pred_file.split('_')
        website_name = parts[0]
        category = "_".join(parts[1:]).replace('.csv', '')

        true_file = os.path.join(true_dir, website_name, f'{website_name} - {category}.csv')

        if os.path.exists(true_file):
            pred_df = pd.read_csv(os.path.join(pred_dir, pred_file), index_col=0)
            true_df = pd.read_csv(true_file, index_col=0)

            if pred_df.shape == true_df.shape:
                rmse = compute_rmse_with_filter(pred_df, true_df, tolerance=tolerance)
                if rmse is not None:
                    rmse_scores.append(rmse)
                    print(f"RMSE for {website_name} - {category}: {rmse}")
                else:
                    print(f"No valid pairs found for {website_name} - {category}")
            else:
                print(f"Shape mismatch for {website_name} - {category}")
        else:
            print(f"True file not found for {website_name} - {category}")

    if rmse_scores:
        avg_rmse = sum(rmse_scores) / len(rmse_scores)
        print(f"Average RMSE: {avg_rmse}")
        return avg_rmse
    else:
        print("No RMSE scores were computed.")
        return None

#### Compute Weighted Cohen's Kappa as error measure

In [2]:
from sklearn.metrics import cohen_kappa_score

def compute_kappa_with_filter(pred_df, true_df, weights):
    valid_pairs_pred = []
    valid_pairs_true = []
    
    columns = pred_df.columns
    for i in range(len(columns)):
        for j in range(i + 1, len(columns)):  
            img_i = columns[i].split('_')[1]
            img_j = columns[j].split('_')[1]
            
            if img_i != img_j: 
                valid_pairs_pred.append(pred_df.iloc[i, j])
                valid_pairs_true.append(true_df.iloc[i, j])

    if valid_pairs_pred and valid_pairs_true:
        return cohen_kappa_score(valid_pairs_true, valid_pairs_pred, weights=weights)
    else:
        return None

def compute_average_kappa(pred_dir, true_dir, weights="linear"):
    kappa_scores = []

    pred_files = [f for f in os.listdir(pred_dir) if f.endswith('.csv')]

    for pred_file in pred_files:
        parts = pred_file.split('_')
        website_name = parts[0]
        category = "_".join(parts[1:]).replace('.csv', '')

        true_file = os.path.join(true_dir, website_name, f'{website_name} - {category}.csv')
        
        if os.path.exists(true_file):
            pred_df = pd.read_csv(os.path.join(pred_dir, pred_file), index_col=0)
            true_df = pd.read_csv(true_file, index_col=0)
            
            if pred_df.shape == true_df.shape:
                kappa = compute_kappa_with_filter(pred_df, true_df, weights=weights)
                if kappa is not None:
                    kappa_scores.append(kappa)
                    print(f"Cohen's Kappa for {website_name} - {category}: {kappa}")
            else:
                print(f"Shape mismatch for {website_name} - {category}")
        else:
            print(f"True file not found for {website_name} - {category}")

    if kappa_scores:
        avg_kappa = sum(kappa_scores) / len(kappa_scores)
        print(f"Average Cohen's Kappa: {avg_kappa}")
        return avg_kappa
    else:
        print("No Kappa scores were computed.")
        return None


#### Compute Average of True Labels between two raters and ICC score

In [26]:
def write_matrix_to_csv(output_file, avg_matrix):
    with open(output_file, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerows(avg_matrix.astype(int))

def compute_average_matrix(matrix1, matrix2):
    return np.round((matrix1 + matrix2) / 2, 0)

def extract_base_filename(filename):
    base_name, _ = os.path.splitext(filename) 
    return re.sub(r'\d+$', '', base_name)

def process_labels(base_dir):
    for category in os.listdir(base_dir):
        category_path = os.path.join(base_dir, category)
        
        if not os.path.isdir(category_path):
            continue
        
        files = sorted([f for f in os.listdir(category_path) if f.endswith('.csv')])

        for i in range(0, len(files), 2):
            file1 = files[i]
            file2 = files[i+1] if i+1 < len(files) else None
            
            if file2 is None:
                continue
            
            matrix1 = read_matrix_from_csv(os.path.join(category_path, file1))
            matrix2 = read_matrix_from_csv(os.path.join(category_path, file2))
            
            avg_matrix = compute_average_matrix(matrix1, matrix2)

            output_file = f'true_labels/{category}_{extract_base_filename(file1)}.csv'
            os.makedirs(os.path.dirname(output_file), exist_ok=True)
            
            write_matrix_to_csv(output_file, avg_matrix)
    
    print('Average similarity matrices(labels) written to files')

#process_labels('true_labels/old')

In [None]:
import pingouin as pg

def flatten_upper_triangle(matrix):
    return matrix[np.triu_indices_from(matrix, 1)]

def icc(flat_matrices_by_rater):
    df = pd.DataFrame(flat_matrices_by_rater)
    
    df['Target'] = df.index 
    df_long = pd.melt(df, id_vars='Target', var_name='Rater', value_name='Similarity')
    
    icc_results = pg.intraclass_corr(data=df_long, targets='Target', raters='Rater', ratings='Similarity')
    icc_value = icc_results.iloc[0]['ICC']
    
    return round(icc_value, 2)

def find_icc_per_website(base_dir):
    for category in os.listdir(base_dir):
        category_path = os.path.join(base_dir, category)
        
        if not os.path.isdir(category_path):
            continue
        
        files = sorted([f for f in os.listdir(category_path) if f.endswith('.csv')])

        matrices_by_rater = {}
        
        for file in files:
            filename, _ = os.path.splitext(file)
            rater = filename[-1]
            
            if rater not in matrices_by_rater:
                matrices_by_rater[rater] = []
            
            matrix = read_matrix_from_csv(os.path.join(category_path, file))
            matrices_by_rater[rater].append(matrix)
        
        flat_matrices_by_rater = {}
        
        for rater, matrices in matrices_by_rater.items():
            flat_matrices = [flatten_upper_triangle(matrix) for matrix in matrices]
            flat_data = np.concatenate(flat_matrices)
            flat_matrices_by_rater[f'Rater_{rater}'] = flat_data
        
        if len(flat_matrices_by_rater) > 1:
            inter_rater_compatibility = icc(flat_matrices_by_rater)
            print(f'Website: {category}')
            print(f'Inter-rater reliability (ICC): {inter_rater_compatibility}')
        else:
            print(f'Not enough matrices to compute ICC for category: {category}')

find_icc_per_website('true_labels')

In [16]:
test_dir = 'test'
process_categories(test_dir, static_folder=None, model_type='claude') # dynamic few shot

Processing image pairs: 100%|██████████| 78/78 [23:49<00:00, 18.33s/it]


In [23]:
pred_dir = 'pred_labels/with_fewshot/claude/'
true_dir = 'true_labels/new/'

average_kappa = compute_average_kappa(pred_dir, true_dir, weights="quadratic")

Cohen's Kappa for NDTV - Education: 0.4130434782608695
Cohen's Kappa for NDTV - People: 0.37823834196891193
Cohen's Kappa for NDTV - Auto: 0.31411530815109345
Cohen's Kappa for NDTV - India: 0.5029239766081872
Cohen's Kappa for apnnews.com - NFL: 0.09021856303189346
Cohen's Kappa for edition.cnn.com - climate_solutions: 0.04453591009212665
Cohen's Kappa for apnnews.com - Music: 0.0
Cohen's Kappa for NDTV - Science: 0.784
Cohen's Kappa for nbcnews.com - culture: 0.0
Cohen's Kappa for apnnews.com - Fact Check: 0.1629569012547737
Cohen's Kappa for apnnews.com - US Supreme Court: 0.77947932618683
Cohen's Kappa for edition.cnn.com - sleep: 0.4419667336867117
Cohen's Kappa for nbcnews.com - business: 0.530842745438749
Cohen's Kappa for nbcnews.com - 2024_elections: 0.30219435736677125
Cohen's Kappa for edition.cnn.com - politics_congress: 0.5435984687367077
Cohen's Kappa for edition.cnn.com - world_middleeast_israel: 0.3586834957660969
Cohen's Kappa for apnnews.com - Health: 0.56633045340380

In [27]:
rmse = compute_average_rmse(pred_dir, true_dir)

RMSE for NDTV - Education: 0.18257418583505536
RMSE for NDTV - People: 0.14907119849998599
RMSE for NDTV - Auto: 0.17873008824606013
RMSE for NDTV - India: 0.18633899812498247
RMSE for apnnews.com - NFL: 0.3529241795756714
RMSE for edition.cnn.com - climate_solutions: 0.2023192986137242
RMSE for apnnews.com - Music: 0.09128709291752768
RMSE for NDTV - Science: 0.12909944487358055
RMSE for nbcnews.com - culture: 0.09415544714433868
RMSE for apnnews.com - Fact Check: 0.3312342935223347
RMSE for apnnews.com - US Supreme Court: 0.1767766952966369
RMSE for edition.cnn.com - sleep: 0.13757200419426974
RMSE for nbcnews.com - business: 0.15214515486254615
RMSE for nbcnews.com - 2024_elections: 0.27094777801764386
RMSE for edition.cnn.com - politics_congress: 0.22132891019134054
RMSE for edition.cnn.com - world_middleeast_israel: 0.1739800514310883
RMSE for apnnews.com - Health: 0.14223909013659572
RMSE for edition.cnn.com - markets_nightcap: 0.11180339887498948
RMSE for nbcnews.com - science_s

### Value Add of Images IN-ARTICLE

In [None]:
def get_article_links(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    article_links = []

    for a_tag in soup.find_all('a', href=True):
        if 'article' in a_tag['href']:
            heading = a_tag.get('aria-label', None) or a_tag.text.strip()
            if heading:  
                article_links.append({
                    'url': a_tag['href'],
                    'heading': heading
                })

    for heading_tag in soup.find_all(['h1', 'h2', 'h3']):
        try:
            a_tag = heading_tag.find('a', href=True)
            if a_tag in a_tag['href']:
                heading = heading_tag.text.strip()
                if heading: 
                    if not any(link['url'] == a_tag['href'] for link in article_links):
                        article_links.append({
                            'url': a_tag['href'],
                            'heading': heading
                        })
        except:
            pass

    return article_links

def scrape_article(url, heading):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    article_heading = heading

    images = []
    for img in soup.find_all('img'):
        if img.has_attr('width') and img.has_attr('height'):
            width = float(img['width'])
            height = float(img['height'])
            if width > 150 and height > 150:
                images.append({
                    'image_url': img['src'],
                    'image_alt': img.get('alt', 'No Alt')
                })
    
    main_content = soup.find(id='main-content') or soup.find(id='maincontent') or soup.find(class_='main-content') or soup.find(class_='maincontent')
    
    content = ''
    if main_content:
        paragraphs = main_content.find_all('p')
        content = ' '.join([p.text for p in paragraphs])

    return {
        'article_url': url,
        'heading': article_heading,
        'images': images,
        'content': content
    }

In [None]:
base_url = 'https://www.theguardian.com/uk'
article_links = get_article_links(base_url)

scraped_data = []
for article in article_links:
    full_url = article['url'] if article['url'].startswith('http') else urljoin(base_url,article["url"])
    article_data = scrape_article(full_url, article['heading'])
    scraped_data.append(article_data)

df = pd.DataFrame(scraped_data)
df = df.drop_duplicates(subset='article_url').reset_index(drop=True)

base_url_name = urlparse(base_url).netloc.split('.')[1]
df.to_csv(f"article_csvs/{base_url_name}.csv", index=False)

In [None]:
df = df[df['images'].apply(lambda x: len(x) > 1)].reset_index(drop=True)

df.head()

In [None]:
import ast

def convert_to_list(text):
    return ast.literal_eval(text)

true_labels = pd.read_csv('article_csvs/true_labels.csv', converters={'true_labels': convert_to_list})

true_labels.head()

In [None]:
load_dotenv()
openai_api_key = os.getenv('api_key')
client = OpenAI(api_key=openai_api_key)

few_shot_example = merged_df.iloc[5]
test_df = merged_df.drop(index=5).reset_index(drop=True)

few_shot_prompt = (
    f"Imagine you are an editor evaluating the contribution of images to an article. "
    f"Your task is to rate each image based on how much it enhances the reader's understanding and engagement with the article. "
    f"Consider the following criteria for rating each image:\n"
    f"- **0: Not Important**: The image does not add any value to the article, is redundant, or is irrelevant.\n"
    f"- **1: Slightly Important**: The image adds minimal value, such as illustrating a minor point or providing a generic visual.\n"
    f"- **2: Somewhat Important**: The image supports the content but is not crucial for understanding the article.\n"
    f"- **3: Moderately Important**: The image is helpful in conveying key ideas or themes, making the article more engaging.\n"
    f"- **4: Very Important**: The image is essential, significantly enhancing the article by illustrating a central point, setting context, or evoking strong emotions.\n\n"
    f"Please consider the following factors when rating each image:\n"
    f"- **Relevance**: How directly does the image relate to the article's main content?\n"
    f"- **Visual Impact**: How much does the image contribute to the reader's emotional engagement or understanding of the article?\n"
    f"- **Context**: Is the image contextually appropriate, and does it fit well within the article's narrative?\n"
    f"- **Redundancy**: Avoid giving high ratings to images that repeat information or perspectives already provided by previous images.\n\n"
    f"Here is an example to guide your ratings:\n\n"
    f"**Article Heading**: {few_shot_example['heading']}\n"
    f"**Article URL**: {few_shot_example['article_url']}\n"
    f"**Content**: {few_shot_example['content']}\n\n"
    "### Images and their Alt Texts:\n"
)

for idx, image_info in enumerate(few_shot_example['images']):
    few_shot_prompt += (
        f"Image {idx + 1} URL: {image_info['image_url']}\n"
        f"Image {idx + 1} Alt Text: {image_info['image_alt']}\n"
    )

few_shot_prompt += f"\nTrue Labels: {few_shot_example['true_labels']}\n"

predicted_labels = []

for index, row in test_df.iterrows():
    prompt = (
        f"{few_shot_prompt}\n\n"
        f"Now, please rate the images for the following article in the same way (ONLY respond with the ratings - don't add any words):\n"
        f"Article Heading: {row['heading']}\n"
        f"Article URL: {row['article_url']}\n"
        f"Content: {row['content']}\n\n"
        "Images and their Alt Texts:\n"
    )

    for idx, image_info in enumerate(row['images']):
        prompt += (
            f"Image {idx + 1} URL: {image_info['image_url']}\n"
            f"Image {idx + 1} Alt Text: {image_info['image_alt']}\n"
        )

    prompt += "\nPlease provide your ratings:"

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are an assistant that rates image importance in news articles."},
            {"role": "user", "content": prompt}
        ]
    )

    predicted_rating = response.choices[0].message.content.strip()
    cleaned_rating = predicted_rating.replace('[', '').replace(']', '')
    predicted_labels.append([int(x.strip()) for x in cleaned_rating.split(',')])


test_df.loc[:, 'pred_labels'] = predicted_labels

test_df.head()

In [None]:
def compute_rmse(df, tolerance=0):
    total_squared_errors = []

    for index, row in df.iterrows():
        true_labels = row['true_labels']
        pred_labels = row['pred_labels']
        
        if len(true_labels) != len(pred_labels):
            raise ValueError(f"Length of true_labels and pred_labels do not match for index {index}.")

        squared_errors = []
        for true_label, pred_label in zip(true_labels, pred_labels):
            difference = abs(pred_label - true_label)
            error = max(0, difference - tolerance)
            squared_errors.append(error ** 2)
        
        total_squared_errors.extend(squared_errors)

    mse = np.mean(total_squared_errors)
    rmse = np.sqrt(mse)
    
    return rmse / 4.0

In [None]:
compute_rmse(test_df, tolerance=1)