In [8]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from openai import OpenAI
import base64
from tqdm import tqdm
import time
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
import re
from dotenv import load_dotenv
from io import BytesIO
from PIL import Image
import numpy as np
import pandas as pd
import csv
from datetime import datetime

### Web scraping to download images per category in any given news website

In [11]:
def get_news_categories(url, timeout=30):
    """Collect (category name, absolute URL) pairs from a site's <nav> elements.

    Args:
        url: Page to fetch; also used as the base for resolving relative hrefs.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of ``(category, category_url)`` tuples, one per <nav> link that
        has both visible text and an href. An empty list if the page could not
        be fetched.
    """
    try:
        # Without a timeout requests may block indefinitely on a stalled server.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching the URL: {e}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    categories = []
    for nav in soup.find_all('nav'):
        for link in nav.find_all('a'):
            category = link.get_text(strip=True)
            category_url = link.get('href')
            if category and category_url:
                categories.append((category, urljoin(url, category_url)))

    return categories

def create_directories(base_url, categories):
    """Create the output tree ``outputs/<site host>/<category>``.

    Args:
        base_url: Site URL; its network location names the site folder.
        categories: Iterable of ``(category, url)`` pairs; only the category
            name is used.

    Returns:
        Path of the site folder (``outputs/<site host>``).
    """
    base_dir = os.path.join("outputs", urlparse(base_url).netloc)
    os.makedirs(base_dir, exist_ok=True)

    for category, _ in categories:
        # Same sanitising as the download loop, so a name containing '/'
        # maps to one folder instead of a nested path.
        category_dir = os.path.join(base_dir, category.replace('/', '_'))
        os.makedirs(category_dir, exist_ok=True)

    return base_dir

In [12]:
def download_image(img_url, save_dir, img_name):
    """Save one image into ``save_dir``, reporting (not raising) any failure.

    ``data:`` URIs are handed to ``save_data_uri_image``; every other URL is
    handed to ``save_url_image``.
    """
    try:
        is_inline = img_url.startswith('data:')
        saver = save_data_uri_image if is_inline else save_url_image
        saver(img_url, save_dir, img_name)
    except Exception as e:
        print(f'Error downloading image {img_url}: {e}')

def get_extension_from_header(header):
    """Map the MIME type mentioned in ``header`` to a file extension.

    Returns ``".jpg"``, ``".png"`` or ``".xml"``, or ``None`` when none of the
    known MIME types appears in the header.
    """
    # Checked in order; the first MIME type found in the header wins.
    known_types = (
        ("image/jpeg", ".jpg"),
        ("image/png", ".png"),
        ("image/xml", ".xml"),
    )
    for mime_type, extension in known_types:
        if mime_type in header:
            return extension
    return None

def save_data_uri_image(img_url, save_dir, img_name):
    """Decode a base64 ``data:`` URI and write it into ``save_dir``.

    The file keeps the part of ``img_name`` before its first dot and takes the
    extension derived from the URI header. Nothing is written when the header
    does not name a recognised image type.
    """
    meta, payload = img_url.split(',', 1)
    raw_bytes = base64.b64decode(payload)
    extension = get_extension_from_header(meta)
    if not extension:
        return

    stem = img_name.split('.')[0]
    target_path = os.path.join(save_dir, stem + extension)
    with open(target_path, 'wb') as out:
        out.write(raw_bytes)

def save_url_image(img_url, save_dir, img_name, timeout=30):
    """Download ``img_url`` and write the response body to ``save_dir/img_name``.

    Args:
        img_url: HTTP(S) address of the image.
        save_dir: Existing directory to write into.
        img_name: File name to use inside ``save_dir``.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: On connection problems, timeouts or a
            non-success HTTP status.
    """
    # Without a timeout a single stalled image server would block the whole scrape.
    response = requests.get(img_url, timeout=timeout)
    response.raise_for_status()

    img_path = os.path.join(save_dir, img_name)
    with open(img_path, 'wb') as f:
        f.write(response.content)

In [89]:
def process_article(link_tup, base_url, category_url, save_dir, csv_writer, timeout=30):
    """Fetch one article, pick its main-body images, download and log them.

    Args:
        link_tup: ``(article_number, link)`` where ``link`` is the anchor tag
            pointing at the article.
        base_url: Site URL, recorded in the CSV.
        category_url: Category page the link came from; used to resolve the href.
        save_dir: Directory the images are saved into.
        csv_writer: ``csv.writer`` receiving one row per image.
        timeout: Seconds to wait for the article page before giving up.

    Any failure is printed and swallowed so one bad article does not stop the
    category run.
    """
    article_number, link = link_tup

    article_url = urljoin(category_url, link.get('href'))
    try:
        # Without a timeout a stalled article page would hang the whole run.
        article_response = requests.get(article_url, timeout=timeout)
        article_response.raise_for_status()

        article_soup = BeautifulSoup(article_response.content, 'html.parser')
        article_heading = extract_heading(link, article_soup)

        images = article_soup.find_all('img', src=True)
        ############# SPECIFIC FOR WEBSITE #####################
        # Keep images inside the article body; drop related-content teasers
        # and video carousel thumbnails.
        images_first_filter = [
                img for img in images
                if img.find_parent('main', class_='article__main') and
                not img.find_parent('a', class_='related-content__link') and
                not img.find_parent('div', class_='video-inline_carousel__wrapper')
        ]
        ############# SPECIFIC FOR WEBSITE #####################

        images_to_download = filter_images(images_first_filter)
        download_and_record_images(article_number, images_to_download, base_url, article_url, save_dir, csv_writer, article_heading)

    except Exception as e:
        print(f'Error processing the article {article_url}: {e}')

def extract_heading(link, soup):
    """Return the best available heading for an article.

    Sources are tried in order: the link's ``aria-label``, the page's first
    ``<h1>``, first ``<h2>``, the ``og:title`` meta tag's ``content`` attribute,
    and finally ``<title>``. A missing tag is skipped rather than raising.

    Args:
        link: Anchor tag that pointed at the article.
        soup: Parsed article page.

    Returns:
        The first non-empty heading found, or ``'No Heading'``.
    """
    heading = link.get('aria-label')
    if heading:
        return heading

    for tag_name in ('h1', 'h2'):
        tag = soup.find(tag_name)  # None when the page has no such tag
        if tag is not None:
            heading = tag.get_text(strip=True)
            if heading:
                return heading

    # A <meta> tag has no text; its value lives in the "content" attribute.
    meta_title = soup.find('meta', attrs={'property': 'og:title'})
    if meta_title is not None:
        heading = (meta_title.get('content') or '').strip()
        if heading:
            return heading

    title_tag = soup.find('title')
    if title_tag is not None:
        heading = title_tag.get_text(strip=True)
        if heading:
            return heading

    return 'No Heading'

def download_and_record_images(article_number, images_to_download, base_url, article_url, save_dir, csv_writer, article_heading):
    """Download each image of an article and append one CSV row per image.

    Files are named ``image_<article_number>_<k>.jpg`` with ``k`` starting at 0,
    while the image number written to the CSV starts at 1.
    """
    category_name = os.path.basename(save_dir)
    for position, img in enumerate(images_to_download):
        img_url = urljoin(article_url, img['src'])
        img_name = f'image_{article_number}_{position}.jpg'
        alt_text = img.get('alt', '')

        download_image(img_url, save_dir, img_name)
        csv_writer.writerow([
            base_url,
            category_name,
            article_number,
            position + 1,  # CSV numbering is one-based
            alt_text,
            article_heading,
            article_url,
        ])

        time.sleep(0.001)

# hrefs already seen during this kernel session; shared across categories so
# an article listed under several categories is only processed once.
processed_articles = set()
def download_images(base_url, category_url, save_dir, csv_writer, max_articles=10, timeout=30):
    """Scrape the newest articles on a category page and download their images.

    Args:
        base_url: Site URL, recorded in the CSV.
        category_url: Category listing page to scan for article links.
        save_dir: Directory the images are saved into.
        csv_writer: ``csv.writer`` receiving one row per image.
        max_articles: Maximum number of articles to process for this category.
        timeout: Seconds to wait for the category page before giving up.

    Raises:
        requests.RequestException: If the category page cannot be fetched.
    """
    response = requests.get(category_url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    ##################### GET ARTICLE LINKS SPECIFIC TO WEBSITE ###################
    def extract_date_from_url(a_tag):
        # Expects hrefs shaped like /YYYY/MM/DD/...; anything else sorts last.
        date_part = a_tag['href'].split("/")[1:4]
        try:
            return datetime.strptime("-".join(date_part), "%Y-%m-%d")
        except ValueError:
            return datetime.min

    section = soup.find('section', class_='layout__main layout-no-rail__main')
    if section is None:
        # Page layout differs from what this scraper expects; nothing to do.
        print(f'No article listing section found on {category_url}')
        return

    # A list (not a set) keeps page order, so same-day articles sort
    # deterministically; the href check below already prevents duplicates.
    articles = []
    for link in section.find_all('a', href=True):
        if '/2024/' in link['href'] and link['href'] not in processed_articles:
            articles.append(link)
            processed_articles.add(link['href'])

    sorted_links = sorted(articles, key=extract_date_from_url, reverse=True)
    ####################################################################################

    for article_number, link in enumerate(sorted_links[:max_articles], start=1):
        process_article((article_number, link), base_url, category_url, save_dir, csv_writer)

def _width_is_large(width):
    """Return True for a percentage above 60 or an integer pixel width above 250."""
    try:
        if '%' in width:
            return float(width.replace('%', '')) > 60
        return width.isdigit() and float(width) > 250
    except ValueError:
        # e.g. "calc(100% - 2px)": not a plain number, so treat as not large.
        return False


def filter_images(images):
    """Keep only images whose declared width marks them as large.

    The width comes from the tag's ``width`` attribute or, failing that, from a
    ``width=<digits>`` parameter in its ``src``. Images with no usable width,
    or a width that cannot be parsed, are dropped.

    Args:
        images: Iterable of objects exposing ``.get(attr, default)``.

    Returns:
        The subset of ``images`` considered large, in the original order.
    """
    images_to_download = []
    for img in images:
        width = img.get('width')

        if not width:
            width_from_url = re.search(r'width=(\d+)', img.get('src', ''))
            if not width_from_url:
                continue
            width = width_from_url.group(1)

        if _width_is_large(width):
            images_to_download.append(img)

    return images_to_download

In [90]:
def setup_csv(base_dir):
    """Open ``<base_dir>/image_data.csv`` for writing and emit the header row.

    Returns:
        ``(file_handle, writer)``. The caller owns the handle and must close it.
    """
    header = ['website', 'category', 'article_number', 'image number', 'alt', 'article_heading', 'article_url']

    handle = open(os.path.join(base_dir, 'image_data.csv'), mode='w', newline='', encoding='utf-8')
    writer = csv.writer(handle)
    writer.writerow(header)

    return handle, writer

def filter_categories(categories, exclude_keywords):
    """Drop categories whose name contains any excluded keyword as a whole word.

    The name is lower-cased and split on whitespace before matching, so the
    keywords are expected in lower case.

    Returns:
        List of the ``(category, category_url)`` pairs that were kept.
    """
    def is_excluded(name):
        words = name.lower().split()
        for keyword in exclude_keywords:
            if keyword in words:
                return True
        return False

    return [(name, link) for name, link in categories if not is_excluded(name)]

exclude_keywords = ['sign', 'login', 'subscribe', 'advertisement', 'privacy', 'terms', 'contact']

In [91]:
# Categories to scrape: (folder name, category landing page).
categories = [
    ("business", "https://edition.cnn.com/business"),
    ("sports", "https://edition.cnn.com/sport"),
    ("world", "https://edition.cnn.com/world"),
    ("entertainment", "https://edition.cnn.com/entertainment"),
    ("science", "https://edition.cnn.com/science")
]

base_url = "https://edition.cnn.com"
base_dir = create_directories(base_url, categories)

csv_file, csv_writer = setup_csv(base_dir)

# Close the CSV even if a category fails, so rows written so far are flushed
# to disk and the handle is not leaked in the kernel.
try:
    for category, category_url in tqdm(categories, desc='Downloading images for each category'):
        category_dir = os.path.join(base_dir, category.replace('/', '_'))
        os.makedirs(category_dir, exist_ok=True)
        download_images(base_url, category_url, category_dir, csv_writer)
finally:
    csv_file.close()

Downloading images for each category: 100%|██████████| 5/5 [01:17<00:00, 15.53s/it]


In [21]:
# NOTE(review): this debugging helper reuses the name of the four-argument
# download_images defined earlier in the notebook; running this cell rebinds
# the name, so re-run the earlier definition before scraping again.
def download_images(category_url, max_links=15, timeout=30):
    """Print the hrefs of the most recent 2024 articles on a category page.

    Args:
        category_url: Category listing page to scan.
        max_links: Maximum number of hrefs to print.
        timeout: Seconds to wait for the page before giving up.

    Raises:
        requests.RequestException: If the page cannot be fetched.
    """
    response = requests.get(category_url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    ##################### GET ARTICLE LINKS SPECIFIC TO WEBSITE ###################
    def extract_date_from_url(url):
        # Expects hrefs shaped like /YYYY/MM/DD/...; anything else sorts last.
        date_part = url.split("/")[1:4]
        try:
            return datetime.strptime("-".join(date_part), "%Y-%m-%d")
        except ValueError:
            return datetime.min

    section = soup.find('section', class_='layout__main layout-no-rail__main')
    if section is None:
        print(f'No article listing section found on {category_url}')
        return

    articles = set()
    for link in section.find_all('a', href=True):
        if '/2024/' in link['href']:
            articles.add(link['href'])

    sorted_links = sorted(articles, key=extract_date_from_url, reverse=True)
    article_links = sorted_links[:max_links]  # Limit to the newest articles for each category
    ####################################################################################
    for link in article_links:
        print(link)

download_images("https://edition.cnn.com/business")

/2024/09/05/tech/verizon-frontier-deal/index.html
/2024/09/05/business/burberry-ftse-100-drop/index.html
/2024/09/05/business/trump-economy-tariffs/index.html
/2024/09/05/tech/nvidia-stock-falling-nightcap/index.html
/2024/09/04/tech/from-the-river-to-the-sea-meta-oversight-board/index.html
/2024/09/04/business/chipotle-spirit-halloween-costume/index.html
/2024/09/04/business/cpsc-shein-temu-investigation/index.html
/2024/09/04/business/us-steel-nippon-steel-chances/index.html
/2024/09/04/economy/us-jolts-job-openings-hires-layoffs-july/index.html
/2024/09/04/economy/economy-jobs-report-harris-trump-nightcap/index.html
/2024/09/04/tech/brazil-lula-elon-musk-x-suspension-hnk-intl/index.html
/2024/09/04/business/ll-flooring-liquidation/index.html
/2024/09/04/business/hotel-strike-san-diego/index.html
/2024/09/04/business/dollar-stores-walmart-low-income-consumers/index.html
/2024/09/04/tech/nvidia-is-in-trouble/index.html


In [None]:
# folders missing images: list category folders that ended up empty
main_dir = os.path.join(os.getcwd(), 'outputs/theguardian.com')
total = 0
# `folder_name` rather than `dir`: a top-level loop variable lives in the
# notebook's global namespace and would hide the builtin dir() in later cells.
for folder_name in os.listdir(main_dir):
    full_path = os.path.join(main_dir, folder_name)
    if os.path.isdir(full_path) and len(os.listdir(full_path)) == 0:
        total += 1
        print(folder_name)

print(total)

Change the base url to any news website for which you want to download images (for every category) 

Tried for: theguardian.com, time.com, tribune.pk

### Feeding images to GPT-4o (chat completion module) for their similarity scores

Here we first supply the 'training' similarity scores in the form of a matrix for the images given in the train folder.<br>The matrix for n images is an n x n *symmetric* matrix covering all $\binom{n}{2}$ pairwise comparisons,
e.g.,
|       | img0  | img1  | 
|-------|-------|-------|
| **img0** | 1.0   | x  | 
| **img1** | x  | 1.0   | 


This similarity matrix serves as the few shot example training for the LLM<br>Feel free to change/add to the training images in the train folder and redefine the training similarity matrix

**Note:** API calls are charged per usage: roughly 0.5 USD for every 5 images compared (a 5x5 similarity matrix, i.e. 10 pairwise comparisons).

In [3]:
# Load variables from a local .env file, then read the OpenAI key stored
# under the name 'api_key'. Both names are kept for later cells.
load_dotenv()
api_key = openai_api_key = os.getenv('api_key')

In [4]:
def extract_number(filename):
    """Return the integer following ``image_`` in ``filename``.

    Names without an ``image_<digits>`` part yield ``float('inf')`` so they
    sort after every numbered image.
    """
    found = re.search(r'image_(\d+)', filename)
    if found is None:
        return float('inf')
    return int(found.group(1))

def load_image_as_base64(image_path):
    """Read a file in binary mode and return its contents base64-encoded as text."""
    with open(image_path, 'rb') as img_file:
        raw_bytes = img_file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')

def read_matrix_from_csv(filepath):
    """Load a header-less numeric CSV into a 2-D float array.

    Blank lines (for example a trailing newline at the end of the file) are
    skipped; otherwise they would become empty rows and make the array ragged.

    Args:
        filepath: Path of a CSV whose cells all parse as floats.

    Returns:
        ``numpy.ndarray`` of floats with one row per non-blank CSV line.

    Raises:
        ValueError: If a cell cannot be converted to float.
    """
    # newline='' lets the csv module handle line endings itself.
    with open(filepath, 'r', newline='') as csv_file:
        reader = csv.reader(csv_file)
        matrix = [
            [float(cell) for cell in row]
            for row in reader
            if any(cell.strip() for cell in row)
        ]
    return np.array(matrix)

In [5]:
client = OpenAI(api_key=api_key)

def find_similar_category(category, train_dir='train'):
    """Ask the model which training category is closest to ``category``.

    Args:
        category: Name of the category being evaluated.
        train_dir: Folder whose sub-folders are the available training categories.

    Returns:
        The name of a sub-folder of ``train_dir`` (matched case-insensitively
        against the model's answer), or ``None`` when there are no training
        categories, the API call fails, or the answer is not one of them.
    """
    categories = [d for d in os.listdir(train_dir) if os.path.isdir(os.path.join(train_dir, d))]
    if not categories:
        print(f"No training categories found in '{train_dir}'")
        return None

    prompt = f"Which of these categories: {', '.join(categories)} is most similar to '{category}'? Respond ONLY with the category name without any additional words or punctuation."

    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            max_tokens=10
        )
        answer = response.choices[0].message.content.strip()
    except Exception as e:
        print(f"Error in finding similar category: {e}")
        return None

    # The model may add quotes, a full stop or different casing; only accept
    # an answer that maps onto a real training folder.
    normalised = answer.strip(" \t\n'\"`.").lower()
    for candidate in categories:
        if candidate.lower() == normalised:
            return candidate

    print(f"Model answered '{answer}', which is not one of the training categories: {categories}")
    return None

In [6]:
def few_shot(category, static_folder=None):
    """Build few-shot example messages for ``category``.

    Args:
        category: Category being evaluated; used to pick the most similar
            training folder when ``static_folder`` is not given.
        static_folder: Optional folder to use as the example source instead of
            ``train/<similar category>``.

    Returns:
        List of chat messages produced by ``create_examples``.

    Raises:
        ValueError: If no similar training category could be determined.
        FileNotFoundError: If the chosen example folder does not exist.
    """
    if static_folder is None:
        similar_category = find_similar_category(category)
        if not similar_category:
            # find_similar_category returns None on failure; fail with a clear message.
            raise ValueError(f"Could not determine a training category similar to '{category}'")
        images_path = os.path.join('train', similar_category)
    else:
        images_path = static_folder

    if not os.path.isdir(images_path):
        raise FileNotFoundError(f"Few-shot folder not found: {images_path}")

    labels_csv = os.path.join(images_path, 'labels.csv')
    similarity_csv = os.path.join(images_path, 'similarity.csv')

    image_paths = sorted(
        [os.path.join(images_path, f) for f in os.listdir(images_path) if f.endswith(('jpg', 'jpeg', 'png'))],
        key=extract_number
    )

    labels = pd.read_csv(labels_csv)
    similarity_scores = read_matrix_from_csv(similarity_csv)
    few_shot_examples = create_examples(image_paths, labels, similarity_scores)

    return few_shot_examples

def create_examples(image_paths, labels, similarity_scores):
    """Build one few-shot chat message per unordered pair of training images.

    Each message carries both images (base64), their alt text and article
    heading, and the known similarity score as the answer.

    Args:
        image_paths: Paths of the training images; pair ``(i, j)`` uses
            ``similarity_scores[i][j]``.
        labels: DataFrame with columns ``image number``, ``alt`` and
            ``article_heading``; rows are matched on the number that
            ``extract_number`` pulls from each path.
        similarity_scores: Square matrix of scores indexed like ``image_paths``.

    Returns:
        List of ``{"role": "user", "content": [...]}`` messages.

    Raises:
        IndexError: If an image has no matching row in ``labels``.
    """
    if len(image_paths) < 2:
        return []

    # Read, encode and look up every image once instead of once per pair.
    encoded_images, alt_texts, headings = [], [], []
    for path in image_paths:
        rows = labels.loc[labels['image number'] == extract_number(path)]
        encoded_images.append(load_image_as_base64(path))
        alt_texts.append(rows['alt'].values[0])
        headings.append(rows['article_heading'].values[0])

    few_shot_examples = []
    for i in range(len(image_paths)):
        for j in range(i + 1, len(image_paths)):
            base64_img1, base64_img2 = encoded_images[i], encoded_images[j]
            alt_text1, heading1 = alt_texts[i], headings[i]
            alt_text2, heading2 = alt_texts[j], headings[j]

            few_shot_examples.append({
                "role": "user",
                "content": [
                {
                    "type": "text",
                    "text": f"Image 1 Alt Text: {alt_text1}\nImage 1 Heading: {heading1}"
                },
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/jpeg;base64,{base64_img1}"}
                },
                {
                    "type": "text",
                    "text": f"Image 2 Alt Text: {alt_text2}\nImage 2 Heading: {heading2}"
                },
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/jpeg;base64,{base64_img2}"}
                },
                {
                    "type": "text",
                    "text": f"Question: Compute semantic similarity score (on 0-4 scale) for the above pair of images,\
                        considering the content, context, alt text, and article headings i.e, how replaceable is one image \
                        with the another (0-4 scale). \nAnswer: {similarity_scores[i][j]}"
                }]
            })
    return few_shot_examples


In [7]:
def compare_images(image_paths, labels, few_shot_examples):
    """Ask the model for a 0-4 similarity score for every unordered image pair.

    Args:
        image_paths: Paths of the images to compare.
        labels: DataFrame with columns ``image number``, ``alt`` and
            ``article_heading``; rows are matched on the number that
            ``extract_number`` pulls from each path.
        few_shot_examples: Messages sent before each question (may be empty).

    Returns:
        List of score strings, ordered by pair ``(i, j)`` with ``i < j``,
        row by row.

    Raises:
        IndexError: If an image has no matching row in ``labels``.
        ValueError: If a model response contains no number.
    """
    if len(image_paths) < 2:
        return []

    # Read, encode and look up every image once instead of once per pair.
    encoded_images, alt_texts, headings = [], [], []
    for path in image_paths:
        rows = labels.loc[labels['image number'] == extract_number(path)]
        encoded_images.append(load_image_as_base64(path))
        alt_texts.append(rows['alt'].values[0])
        headings.append(rows['article_heading'].values[0])

    question = "Question: On a scale from 0 to 4 (0: Not replaceable, 1: Somewhat replaceable, 2: Moderately replaceable, \
                        3: Very replaceable, 4: Completely replaceable), rate the similarity of these two images based on the images themselves, \
                        their alt text descriptions, and the article headings where they are used. Please respond with ONLY the numerical score, \
                        without any additional text or punctuation."

    pairs = [(i, j) for i in range(len(image_paths)) for j in range(i + 1, len(image_paths))]

    responses = []
    for i, j in tqdm(pairs, desc="Processing image pairs"):
        # Build each message just before sending it so only one pair's
        # payload is held at a time.
        message = {
            "role": "user",
            "content": [
                {"type": "text", "text": f"Image 1 Alt Text: {alt_texts[i]}\nImage 1 Heading: {headings[i]}"},
                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encoded_images[i]}"}},

                {"type": "text", "text": f"Image 2 Alt Text: {alt_texts[j]}\nImage 2 Heading: {headings[j]}"},
                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encoded_images[j]}"}},

                {"type": "text", "text": question}
            ]
        }

        response = client.chat.completions.create(
            model="gpt-4o",
            messages=few_shot_examples + [message],
            max_tokens=10
        )
        time.sleep(1)  # pause between consecutive API calls
        res = response.choices[0].message.content.strip()
        numbers = re.findall(r'-?\d*\.?\d+', res)
        if not numbers:
            raise ValueError(
                f"No numeric score in the model response for {image_paths[i]!r} vs {image_paths[j]!r}: {res!r}"
            )
        responses.append(numbers[0])

    return responses

In [8]:
def make_matrix(image_paths, similarity_scores):
    """Arrange pairwise scores into a symmetric n x n integer matrix.

    Args:
        image_paths: The compared images; only their count is used.
        similarity_scores: Scores (numbers or numeric strings) for the pairs
            ``(i, j)`` with ``i < j``, ordered row by row.

    Returns:
        List of lists with 4 on the diagonal and the rounded score of each
        pair mirrored across it.
    """
    n = len(image_paths)
    similarity_matrix = [[4 if i == j else 0 for j in range(n)] for i in range(n)]

    idx = 0
    for i in range(n):
        for j in range(i + 1, n):
            # Scores arrive as text and may be decimal ("3.5"), which int()
            # alone rejects; parse as float and round to the integer scale.
            score = int(round(float(similarity_scores[idx])))
            similarity_matrix[i][j] = score
            similarity_matrix[j][i] = score
            idx += 1

    return similarity_matrix

def process_categories(base_dir, static_folder=None, zero_shot=False):
    """Score every category folder in ``base_dir`` and write one matrix CSV each.

    Results go to ``pred_labels/with_fewshot/<category>.csv`` or, when
    ``zero_shot`` is true, ``pred_labels/without_fewshot/<category>.csv``.

    Args:
        base_dir: Folder whose sub-folders each hold images plus ``labels.csv``.
        static_folder: Optional fixed few-shot example folder passed to ``few_shot``.
        zero_shot: When true, no few-shot examples are sent.
    """
    output_dir = os.path.join('pred_labels', 'with_fewshot' if not zero_shot else 'without_fewshot')
    # Create the destination before any paid API call, so results are never
    # lost to a missing folder at write time.
    os.makedirs(output_dir, exist_ok=True)

    category_folders = [d for d in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, d))]

    for category in category_folders:
        category_path = os.path.join(base_dir, category)

        image_paths = sorted(
            [os.path.join(category_path, f) for f in os.listdir(category_path) if f.endswith(('jpg', 'jpeg', 'png'))],
            key=extract_number
        )
        if len(image_paths) < 2:
            # Nothing to compare; skip before touching labels.csv.
            continue

        labels = pd.read_csv(os.path.join(category_path, 'labels.csv'))

        few_shot_examples = []
        if not zero_shot:
            few_shot_examples = few_shot(category, static_folder)

        similarity_scores = compare_images(image_paths, labels, few_shot_examples)
        similarity_matrix = make_matrix(image_paths, similarity_scores)

        with open(os.path.join(output_dir, f'{category}.csv'), 'w', newline='') as csv_file:
            writer = csv.writer(csv_file)
            writer.writerows(similarity_matrix)

Helper functions to compute RMSE

In [9]:
# NOTE(review): the notebook already defines read_matrix_from_csv in an
# earlier cell; keep the two in sync or drop one.
def read_matrix_from_csv(filepath):
    """Load a header-less numeric CSV into a 2-D float array.

    Blank lines (for example a trailing newline at the end of the file) are
    skipped; otherwise they would become empty rows and make the array ragged.

    Args:
        filepath: Path of a CSV whose cells all parse as floats.

    Returns:
        ``numpy.ndarray`` of floats with one row per non-blank CSV line.

    Raises:
        ValueError: If a cell cannot be converted to float.
    """
    # newline='' lets the csv module handle line endings itself.
    with open(filepath, 'r', newline='') as csv_file:
        reader = csv.reader(csv_file)
        matrix = [
            [float(cell) for cell in row]
            for row in reader
            if any(cell.strip() for cell in row)
        ]
    return np.array(matrix)

def compute_rmse(pred_folder, labels_folder, tolerance=0):
    """Root-mean-square error between predicted and true matrices, divided by 4.

    Every ``.csv`` in ``pred_folder`` is compared cell by cell with the file of
    the same name in ``labels_folder``. Absolute differences are reduced by
    ``tolerance`` (never below zero) before squaring.

    Raises:
        ValueError: If the folders hold different numbers of CSV files, a
            counterpart file is missing, or two matrices differ in shape.
    """
    def csv_names(folder):
        return sorted([name for name in os.listdir(folder) if name.endswith('.csv')])

    pred_names = csv_names(pred_folder)
    label_names = csv_names(labels_folder)

    if len(pred_names) != len(label_names):
        raise ValueError("The number of files in test_folder and labels_folder does not match.")

    all_squared_errors = []
    for name in pred_names:
        pred_path = os.path.join(pred_folder, name)
        label_path = os.path.join(labels_folder, name)

        if not (os.path.isfile(pred_path) and os.path.isfile(label_path)):
            raise ValueError(f"Corresponding file for '{name}' is missing in one of the folders.")

        predicted = read_matrix_from_csv(pred_path)
        expected = read_matrix_from_csv(label_path)

        if predicted.shape != expected.shape:
            raise ValueError(f"Matrix shapes for '{name}' do not match.")

        for pred_row, label_row in zip(predicted, expected):
            all_squared_errors.extend(
                max(0, abs(pred_value - true_value) - tolerance) ** 2
                for pred_value, true_value in zip(pred_row, label_row)
            )

    # Dividing by 4.0 rescales the error relative to the 0-4 score range.
    return np.sqrt(np.mean(all_squared_errors)) / 4.0

Compute Average of True Labels between two raters and ICC score

In [60]:
def write_matrix_to_csv(output_file, avg_matrix):
    """Write ``avg_matrix`` to ``output_file`` as CSV, with values cast to int."""
    with open(output_file, 'w', newline='') as handle:
        rows_as_int = avg_matrix.astype(int)
        csv.writer(handle).writerows(rows_as_int)

def compute_average_matrix(matrix1, matrix2):
    """Element-wise mean of two matrices, rounded to zero decimal places."""
    summed = matrix1 + matrix2
    return np.round(summed / 2, decimals=0)

def extract_base_filename(filename):
    """Strip the extension and any trailing digits from ``filename``."""
    stem = os.path.splitext(filename)[0]
    return re.sub(r'\d+$', '', stem)

def process_labels(base_dir, output_dir='true_labels'):
    """Average each pair of rater matrices and write the result as a CSV.

    Inside every sub-folder of ``base_dir`` the ``.csv`` files are sorted by
    name and taken two at a time; each pair is averaged (rounded) and written
    to ``<output_dir>/<sub-folder>_<base name of the first file>.csv``.

    Args:
        base_dir: Folder containing one sub-folder of rater CSVs per website.
        output_dir: Folder receiving the averaged matrices.
    """
    for category in os.listdir(base_dir):
        category_path = os.path.join(base_dir, category)

        if not os.path.isdir(category_path):
            continue

        files = sorted([f for f in os.listdir(category_path) if f.endswith('.csv')])

        if len(files) % 2:
            # The last file has no partner to average with; say so instead of
            # dropping it silently.
            print(f"Skipping unpaired file '{files[-1]}' in '{category_path}'")

        for file1, file2 in zip(files[0::2], files[1::2]):
            matrix1 = read_matrix_from_csv(os.path.join(category_path, file1))
            matrix2 = read_matrix_from_csv(os.path.join(category_path, file2))

            avg_matrix = compute_average_matrix(matrix1, matrix2)

            output_file = os.path.join(output_dir, f'{category}_{extract_base_filename(file1)}.csv')
            os.makedirs(output_dir, exist_ok=True)

            write_matrix_to_csv(output_file, avg_matrix)

    print('Average similarity matrices(labels) written to files')

process_labels('true_labels')

Average similarity matrices(labels) written to files


In [11]:
import pingouin as pg

def flatten_upper_triangle(matrix):
    """Return the entries strictly above the main diagonal as a 1-D array."""
    row_idx, col_idx = np.triu_indices_from(matrix, k=1)
    return matrix[row_idx, col_idx]

def icc(flat_matrices_by_rater):
    """Compute an intraclass correlation from per-rater score vectors.

    Args:
        flat_matrices_by_rater: Mapping of rater name to a 1-D sequence of
            scores; position ``k`` in every sequence is treated as the same
            rated target.

    Returns:
        The ``ICC`` value from the first row of pingouin's result table,
        rounded to two decimals.
    """
    wide = pd.DataFrame(flat_matrices_by_rater)
    wide['Target'] = wide.index

    # pingouin expects long format: one row per (target, rater) rating.
    long_form = wide.melt(id_vars='Target', var_name='Rater', value_name='Similarity')

    result_table = pg.intraclass_corr(data=long_form, targets='Target', raters='Rater', ratings='Similarity')
    first_row = result_table.iloc[0]

    return round(first_row['ICC'], 2)

def find_icc_per_website(base_dir):
    """Print the inter-rater ICC for each sub-folder of ``base_dir``.

    Within a sub-folder, CSV files are grouped by the last character of their
    name (before the extension), which is taken as the rater id. Each rater's
    matrices are flattened to their upper triangles and concatenated.
    """
    for category in os.listdir(base_dir):
        category_path = os.path.join(base_dir, category)
        if not os.path.isdir(category_path):
            continue

        csv_names = sorted([f for f in os.listdir(category_path) if f.endswith('.csv')])

        grouped = {}
        for name in csv_names:
            stem = os.path.splitext(name)[0]
            rater_id = stem[-1]
            grouped.setdefault(rater_id, []).append(
                read_matrix_from_csv(os.path.join(category_path, name))
            )

        flattened = {
            f'Rater_{rater_id}': np.concatenate([flatten_upper_triangle(m) for m in rater_matrices])
            for rater_id, rater_matrices in grouped.items()
        }

        if len(flattened) <= 1:
            print(f'Not enough matrices to compute ICC for category: {category}')
            continue

        inter_rater_compatibility = icc(flattened)
        print(f'Website: {category}')
        print(f'Inter-rater reliability (ICC): {inter_rater_compatibility}')

find_icc_per_website('true_labels')

Website: theguardian
Inter-rater reliability (ICC): 0.95


Computing the similarity matrix for each category in the test folder and saving the results to test.json<br>
Feel free to add to / remove from the test folder - **I first generate images using the web scraping module above and then only pick 2-3 categories with 4-5 images each to test because of the cost of api usage**

***** Please only run the below cell for *new* images added to the test folder because previous results have already been computed and kept in the test.json file - rerunning for the same images will only cost more without any benefit

In [None]:
# Dynamic few-shot run: the example folder is chosen per category.
test_dir = 'test'
process_categories(base_dir=test_dir)

In [None]:
# Score the few-shot predictions against the averaged rater labels.
rmse = compute_rmse('pred_labels/with_fewshot', 'true_labels', tolerance=1)
print('Normalised rmse (dynamic few shot): {}'.format(rmse))

In [None]:
# Zero-shot run: no example messages are sent.
process_categories(base_dir=test_dir, static_folder=None, zero_shot=True)

In [92]:
# Score the zero-shot predictions against the averaged rater labels.
rmse = compute_rmse('pred_labels/without_fewshot', 'true_labels', tolerance=1)
print('Normalised rmse (zero shot): {}'.format(rmse))

Normalised rmse (zero shot): 0.18257418583505536


#### Value Add of Images in an Article

In [26]:
def get_article_links(url, timeout=30):
    """Collect article links (href plus heading) from a listing page.

    A link qualifies when its href contains the substring ``'article'``. Links
    are taken first from all anchors, using ``aria-label`` or the anchor text as
    heading, then from anchors inside ``<h1>``/``<h2>``/``<h3>`` whose href was
    not already collected, using the heading text.

    Args:
        url: Listing page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        List of ``{'url': href, 'heading': text}`` dicts; hrefs are returned
        as found in the page (possibly relative).
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    article_links = []
    seen_urls = set()  # hrefs already collected, for the heading pass below

    for a_tag in soup.find_all('a', href=True):
        if 'article' in a_tag['href']:
            heading = a_tag.get('aria-label', None) or a_tag.text.strip()
            if heading:
                article_links.append({
                    'url': a_tag['href'],
                    'heading': heading
                })
                seen_urls.add(a_tag['href'])

    for heading_tag in soup.find_all(['h1', 'h2', 'h3']):
        a_tag = heading_tag.find('a', href=True)
        # The original test was `a_tag in a_tag['href']`, which always raised
        # and was silently swallowed. NOTE(review): the 'article' filter is
        # assumed to be the intent, matching the first loop; confirm.
        if a_tag is None or 'article' not in a_tag['href']:
            continue

        heading = heading_tag.text.strip()
        if heading and a_tag['href'] not in seen_urls:
            article_links.append({
                'url': a_tag['href'],
                'heading': heading
            })
            seen_urls.add(a_tag['href'])

    return article_links

def scrape_article(url, heading, timeout=30):
    """Fetch an article and extract its larger images and body text.

    An image is kept when it has a ``src`` and numeric ``width`` and ``height``
    attributes that are both above 150. Body text is the space-joined ``<p>``
    content of the first element matching id ``main-content``/``maincontent``
    or class ``main-content``/``maincontent``.

    Args:
        url: Absolute article URL.
        heading: Heading to record for the article.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        Dict with keys ``article_url``, ``heading``, ``images`` (list of
        ``{'image_url', 'image_alt'}``) and ``content``.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    article_heading = heading

    images = []
    for img in soup.find_all('img'):
        if not (img.has_attr('src') and img.has_attr('width') and img.has_attr('height')):
            continue
        try:
            width = float(img['width'])
            height = float(img['height'])
        except ValueError:
            # Dimensions such as "100%" or "auto" are not numeric; skip the
            # image rather than abort the article.
            continue
        if width > 150 and height > 150:
            images.append({
                'image_url': img['src'],
                'image_alt': img.get('alt', 'No Alt')
            })

    main_content = soup.find(id='main-content') or soup.find(id='maincontent') or soup.find(class_='main-content') or soup.find(class_='maincontent')

    content = ''
    if main_content:
        paragraphs = main_content.find_all('p')
        content = ' '.join([p.text for p in paragraphs])

    return {
        'article_url': url,
        'heading': article_heading,
        'images': images,
        'content': content
    }


In [59]:
base_url = 'https://www.theguardian.com/uk'
article_links = get_article_links(base_url)

scraped_data = []
for article in article_links:
    # urljoin returns absolute URLs unchanged and resolves relative ones
    # against base_url, so no separate "starts with http" check is needed.
    full_url = urljoin(base_url, article['url'])
    article_data = scrape_article(full_url, article['heading'])
    scraped_data.append(article_data)

df = pd.DataFrame(scraped_data)
df = df.drop_duplicates(subset='article_url').reset_index(drop=True)

# Second dot-separated label of the host, e.g. 'theguardian' for www.theguardian.com.
base_url_name = urlparse(base_url).netloc.split('.')[1]
# Make sure the target folder exists so the scrape is not lost at write time.
os.makedirs('article_csvs', exist_ok=True)
df.to_csv(f"article_csvs/{base_url_name}.csv", index=False)

In [60]:
# Keep only articles that carry more than one image.
has_multiple_images = df['images'].map(len) > 1
df = df.loc[has_multiple_images].reset_index(drop=True)

df.head()

Unnamed: 0,article_url,heading,images,content
0,https://www.theguardian.com/music/article/2024...,‘Harry would approve’: rail firm protects Ches...,[{'image_url': 'https://i.guim.co.uk/img/media...,It has survived two world wars and nearly 200 ...
1,https://www.theguardian.com/us-news/article/20...,"Kamala Harris’s much-hyped, first big intervie...",[{'image_url': 'https://i.guim.co.uk/img/media...,Donald Trump spent Thursday in Michigan raving...
2,https://www.theguardian.com/global-development...,"‘When life gets hard, you must be harder’: run...",[{'image_url': 'https://i.guim.co.uk/img/media...,The town of Qaim lies on the border of Iraq an...
3,https://www.theguardian.com/society/article/20...,‘I don’t see how it’s enforceable’: pubgoers r...,[{'image_url': 'https://i.guim.co.uk/img/media...,"On an average weekend in Moseley, a suburb of ..."
4,https://www.theguardian.com/music/article/2024...,"‘I’m intrigued by failure’: Kim Deal on death,...",[{'image_url': 'https://i.guim.co.uk/img/media...,In all of her decades as one of rock’s great f...


In [107]:
import ast


def convert_to_list(text):
    """Evaluate the string form of a Python literal and return the resulting object.

    Used as a CSV column converter so that a cell such as ``"[0, 4, 4]"`` is
    loaded as a real list instead of a string. Only literals are evaluated
    (via ``ast.literal_eval``); arbitrary expressions are rejected.
    """
    parsed_value = ast.literal_eval(text)
    return parsed_value

# Load the hand-annotated ratings; the 'true_labels' column is stored as the
# text of a Python list, so it is parsed back into a list while reading.
true_labels = pd.read_csv(
    'article_csvs/true_labels.csv',
    converters={'true_labels': convert_to_list},
)

true_labels.head()

Unnamed: 0,article_url,true_labels
0,https://www.theguardian.com/technology/article...,"[0, 4, 4, 4, 4, 0, 0]"
1,https://www.theguardian.com/music/article/2024...,"[4, 3, 0, 0]"
2,https://www.theguardian.com/us-news/article/20...,"[0, 2]"
3,https://www.theguardian.com/global-development...,"[3, 2, 1]"
4,https://www.theguardian.com/society/article/20...,"[3, 2, 0, 0]"


In [136]:
# Inner join on the article URL: only articles present in both the scraped
# frame and the label frame are kept.
merged_df = pd.merge(df, true_labels, how='inner', on='article_url')
merged_df.head()

Unnamed: 0,article_url,heading,images,content,true_labels
0,https://www.theguardian.com/music/article/2024...,‘Harry would approve’: rail firm protects Ches...,[{'image_url': 'https://i.guim.co.uk/img/media...,It has survived two world wars and nearly 200 ...,"[4, 3, 0, 0]"
1,https://www.theguardian.com/us-news/article/20...,"Kamala Harris’s much-hyped, first big intervie...",[{'image_url': 'https://i.guim.co.uk/img/media...,Donald Trump spent Thursday in Michigan raving...,"[0, 2]"
2,https://www.theguardian.com/global-development...,"‘When life gets hard, you must be harder’: run...",[{'image_url': 'https://i.guim.co.uk/img/media...,The town of Qaim lies on the border of Iraq an...,"[3, 2, 1]"
3,https://www.theguardian.com/society/article/20...,‘I don’t see how it’s enforceable’: pubgoers r...,[{'image_url': 'https://i.guim.co.uk/img/media...,"On an average weekend in Moseley, a suburb of ...","[3, 2, 0, 0]"
4,https://www.theguardian.com/film/article/2024/...,"‘You laugh the hardest in grief’: And Mrs, the...",[{'image_url': 'https://i.guim.co.uk/img/media...,Finding the funny side of bereavement may be t...,"[4, 2]"


In [163]:
# Rate every image of every test article with a one-shot prompt: one labelled
# article serves as the worked example and the model rates the remaining ones.
load_dotenv()
api_key = os.getenv('api_key')
client = OpenAI(api_key=api_key)


def _format_image_lines(images):
    """Return the prompt lines ('URL' then 'Alt Text') for each image, numbered from 1."""
    return ''.join(
        f"Image {number} URL: {image_info['image_url']}\n"
        f"Image {number} Alt Text: {image_info['image_alt']}\n"
        for number, image_info in enumerate(images, start=1)
    )


def _parse_ratings(raw_reply):
    """Convert a model reply such as '[4, 3, 0]', '4 3 0' or '4\\n3\\n0' into a list of ints.

    Raises:
        ValueError: if the reply is empty or contains anything other than
            integers separated by commas and/or whitespace. The raw reply is
            included in the message to make the failure easy to diagnose.
    """
    unbracketed = (raw_reply or '').replace('[', '').replace(']', '').strip()
    # Split on commas and any whitespace so newline-separated replies and
    # trailing commas do not break parsing.
    tokens = [token for token in re.split(r'[,\s]+', unbracketed) if token]
    if not tokens:
        raise ValueError(f"Model reply contains no ratings: {raw_reply!r}")
    try:
        return [int(token) for token in tokens]
    except ValueError as exc:
        raise ValueError(f"Could not parse ratings from model reply: {raw_reply!r}") from exc


# Hold out one labelled article as the worked example; select and drop it by
# the same position so both stay in sync whatever the index labels are.
few_shot_position = 5
few_shot_example = merged_df.iloc[few_shot_position]
test_df = merged_df.drop(index=merged_df.index[few_shot_position]).reset_index(drop=True)

few_shot_prompt = (
    f"Imagine you are an editor evaluating the contribution of images to an article. "
    f"Your task is to rate each image based on how much it enhances the reader's understanding and engagement with the article. "
    f"Consider the following criteria for rating each image:\n"
    f"- **0: Not Important**: The image does not add any value to the article, is redundant, or is irrelevant.\n"
    f"- **1: Slightly Important**: The image adds minimal value, such as illustrating a minor point or providing a generic visual.\n"
    f"- **2: Somewhat Important**: The image supports the content but is not crucial for understanding the article.\n"
    f"- **3: Moderately Important**: The image is helpful in conveying key ideas or themes, making the article more engaging.\n"
    f"- **4: Very Important**: The image is essential, significantly enhancing the article by illustrating a central point, setting context, or evoking strong emotions.\n\n"
    f"Please consider the following factors when rating each image:\n"
    f"- **Relevance**: How directly does the image relate to the article's main content?\n"
    f"- **Visual Impact**: How much does the image contribute to the reader's emotional engagement or understanding of the article?\n"
    f"- **Context**: Is the image contextually appropriate, and does it fit well within the article's narrative?\n"
    f"- **Redundancy**: Avoid giving high ratings to images that repeat information or perspectives already provided by previous images.\n\n"
    f"Here is an example to guide your ratings:\n\n"
    f"**Article Heading**: {few_shot_example['heading']}\n"
    f"**Article URL**: {few_shot_example['article_url']}\n"
    f"**Content**: {few_shot_example['content']}\n\n"
    "### Images and their Alt Texts:\n"
)
few_shot_prompt += _format_image_lines(few_shot_example['images'])
few_shot_prompt += f"\nTrue Labels: {few_shot_example['true_labels']}\n"

predicted_labels = []

for index, row in test_df.iterrows():
    prompt = (
        f"{few_shot_prompt}\n\n"
        f"Now, please rate the images for the following article in the same way (ONLY respond with the ratings - don't add any words):\n"
        f"Article Heading: {row['heading']}\n"
        f"Article URL: {row['article_url']}\n"
        f"Content: {row['content']}\n\n"
        "Images and their Alt Texts:\n"
    )
    prompt += _format_image_lines(row['images'])
    prompt += "\nPlease provide your ratings:"

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are an assistant that rates image importance in news articles."},
            {"role": "user", "content": prompt}
        ]
    )

    # `content` can be None (e.g. when the model returns no text); the parser
    # turns that into a descriptive ValueError instead of an AttributeError.
    predicted_labels.append(_parse_ratings(response.choices[0].message.content))


# Assign through an explicit object Series so each row receives its own list;
# NOTE(review): a nested list handed to `.loc[:, col]` can be interpreted as
# 2-D data when all inner lists share a length -- this form avoids that.
test_df['pred_labels'] = pd.Series(predicted_labels, index=test_df.index, dtype=object)

test_df.head()

Unnamed: 0,article_url,heading,images,content,true_labels,pred_labels
0,https://www.theguardian.com/music/article/2024...,‘Harry would approve’: rail firm protects Ches...,[{'image_url': 'https://i.guim.co.uk/img/media...,It has survived two world wars and nearly 200 ...,"[4, 3, 0, 0]","[4, 4, 3, 3]"
1,https://www.theguardian.com/us-news/article/20...,"Kamala Harris’s much-hyped, first big intervie...",[{'image_url': 'https://i.guim.co.uk/img/media...,Donald Trump spent Thursday in Michigan raving...,"[0, 2]","[0, 1]"
2,https://www.theguardian.com/global-development...,"‘When life gets hard, you must be harder’: run...",[{'image_url': 'https://i.guim.co.uk/img/media...,The town of Qaim lies on the border of Iraq an...,"[3, 2, 1]","[4, 4, 2]"
3,https://www.theguardian.com/society/article/20...,‘I don’t see how it’s enforceable’: pubgoers r...,[{'image_url': 'https://i.guim.co.uk/img/media...,"On an average weekend in Moseley, a suburb of ...","[3, 2, 0, 0]","[2, 3, 2, 2]"
4,https://www.theguardian.com/film/article/2024/...,"‘You laugh the hardest in grief’: And Mrs, the...",[{'image_url': 'https://i.guim.co.uk/img/media...,Finding the funny side of bereavement may be t...,"[4, 2]","[4, 4]"


In [133]:
def compute_rmse(df, tolerance=0, max_rating=4.0):
    """Tolerance-adjusted RMSE between true and predicted ratings, scaled by the rating range.

    Every (true, predicted) rating pair across all rows is pooled. For each
    pair the absolute difference is reduced by ``tolerance`` (floored at 0)
    before squaring, so predictions within ``tolerance`` of the truth count as
    exact. The root of the mean of those squared errors is then divided by
    ``max_rating``.

    Args:
        df: DataFrame with 'true_labels' and 'pred_labels' columns, each cell
            holding a sequence of numeric ratings of equal length per row.
        tolerance: Absolute difference that is forgiven per rating.
        max_rating: Normalisation divisor; defaults to 4.0, the value that was
            previously hard-coded.

    Returns:
        The normalised RMSE as a float, or NaN when ``df`` contains no ratings.

    Raises:
        ValueError: if a row's two label sequences differ in length, or if
            ``max_rating`` is not positive.
    """
    if max_rating <= 0:
        raise ValueError("max_rating must be positive.")

    pooled_squared_errors = []

    for index, row in df.iterrows():
        true_labels = row['true_labels']
        pred_labels = row['pred_labels']

        if len(true_labels) != len(pred_labels):
            raise ValueError(f"Length of true_labels and pred_labels do not match for index {index}.")

        pooled_squared_errors.extend(
            max(0, abs(predicted - actual) - tolerance) ** 2
            for actual, predicted in zip(true_labels, pred_labels)
        )

    # Nothing to average: return NaN explicitly rather than relying on
    # np.mean([]) (which also yields NaN but emits a RuntimeWarning).
    if not pooled_squared_errors:
        return np.nan

    return np.sqrt(np.mean(pooled_squared_errors)) / max_rating


In [164]:
# Score the model's ratings; a prediction within one point of the true label
# contributes no error.
compute_rmse(df=test_df, tolerance=1)

0.227429413073671

#### Categorization and Web Scraping (Official)