In [12]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from openai import OpenAI
import base64
from tqdm import tqdm
import time
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
import re
from dotenv import load_dotenv
from io import BytesIO
from PIL import Image
import numpy as np
import pandas as pd
import csv

### Web scraping to download images per category in any given news website

In [None]:
def get_news_categories(url, timeout=10):
    """Collect the category links exposed in a page's ``<nav>`` elements.

    The page at ``url`` is fetched and every ``<a>`` nested inside a ``<nav>``
    that has both visible text and an ``href`` is treated as a category link.

    Args:
        url: Address of the page whose navigation should be scanned.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        A list of ``(category_text, absolute_url)`` tuples in document order
        with exact duplicates removed, or ``[]`` if the page could not be
        fetched (the error is printed, not raised).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except Exception as e:
        print(f"Error fetching the URL: {e}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    categories = []
    seen = set()
    for nav in soup.find_all('nav'):
        for link in nav.find_all('a'):
            category = link.get_text(strip=True)
            category_url = link.get('href')
            if not (category and category_url):
                continue
            entry = (category, urljoin(url, category_url))
            # The same link can appear in several (or nested) <nav> elements;
            # returning it twice would make callers process the category twice.
            if entry not in seen:
                seen.add(entry)
                categories.append(entry)

    return categories

def create_directories(base_url, categories):
    """Create the ``outputs/<site host>/<category>`` directory tree.

    Args:
        base_url: URL of the site; only its network location (host) is used
            as the name of the per-site directory.
        categories: Iterable of ``(category_name, category_url)`` pairs; one
            sub-directory is created per category name.

    Returns:
        The path of the per-site directory, ``outputs/<host>``.
    """
    base_dir = os.path.join("outputs", urlparse(base_url).netloc)
    # exist_ok=True replaces the exists()-then-makedirs() pattern, which can
    # fail if the directory appears between the check and the creation.
    os.makedirs(base_dir, exist_ok=True)

    for category, _ in categories:
        os.makedirs(os.path.join(base_dir, category), exist_ok=True)

    return base_dir

In [None]:
def download_image(img_url, save_dir, img_name, timeout=10, min_size=100):
    """Download one image and save it if it is large enough.

    The download is best-effort: any ordinary failure (network error, HTTP
    error status, payload that is not a decodable image, unwritable path) is
    reported with ``print`` and the image is skipped.

    Args:
        img_url: Absolute URL of the image. ``data:`` URLs are ignored.
        save_dir: Directory the file is written to.
        img_name: File name to use inside ``save_dir``.
        timeout: Seconds to wait for the server before giving up.
        min_size: Minimum width and height, in pixels, for the image to be
            kept.

    Returns:
        None.
    """
    try:
        if img_url.startswith('data:'):
            return

        response = requests.get(img_url, timeout=timeout)
        response.raise_for_status()
        img_data = response.content

        # Open the payload only to read its dimensions; the context manager
        # releases the image resources afterwards.
        with Image.open(BytesIO(img_data)) as img:
            width, height = img.size

        # Only save images of at least min_size x min_size pixels
        if width >= min_size and height >= min_size:
            with open(os.path.join(save_dir, img_name), 'wb') as img_file:
                img_file.write(img_data)
    except Exception as e:
        # `except Exception` (not a bare `except`) lets KeyboardInterrupt and
        # SystemExit propagate, so a long scrape can still be interrupted.
        print(f"Skipping image {img_url}: {e}")

def download_images(category_url, save_dir, timeout=10, max_workers=10):
    """Download every ``<img src=...>`` image found on a category page.

    Args:
        category_url: URL of the page to scan for images.
        save_dir: Directory the images are written to; created if missing.
        timeout: Seconds to wait for the category page before giving up.
        max_workers: Number of threads used to download images concurrently.

    Returns:
        None. If the page cannot be fetched the error is printed and nothing
        is downloaded.
    """
    try:
        response = requests.get(category_url, timeout=timeout)
        response.raise_for_status()
    except Exception as e:
        print(f'Error fetching the category URL: {e}')
        return

    soup = BeautifulSoup(response.content, 'html.parser')
    images = soup.select('img[src]')

    # Make sure the destination exists so downloads do not fail one by one.
    os.makedirs(save_dir, exist_ok=True)

    # Downloads are submitted to a thread pool so they run concurrently.
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = []
        for i, img in enumerate(images):
            img_url = img.get('src')
            if img_url and not img_url.startswith('data:'):
                img_url = urljoin(category_url, img_url)
                # The index of the <img> tag on the page is used as the file
                # name; every file gets a .jpg suffix whatever its format.
                img_name = f'image_{i}.jpg'
                futures.append(executor.submit(download_image, img_url, save_dir, img_name))

        # result() re-raises any exception that escaped a worker.
        for future in as_completed(futures):
            future.result()


Change the base url to any news website for which you want to download images (for every category) 

Tried for: theguardian.com, time.com, tribune.pk

In [None]:
# Site to scrape; change this to any news website.
base_url = 'https://www.washingtonpost.com/' #change this
# List of (category name, absolute URL) pairs taken from the site's <nav> links.
categories = get_news_categories(base_url)

# NOTE: this only prints a message; execution continues below, where the base
# output directory is still created and the download loop runs zero times.
if not categories:
    print("No categories found.")
    
# Create outputs/<host>/<category> folders, then fill each one with the images
# found on the corresponding category page.
base_dir = create_directories(base_url, categories)
for category, category_url in tqdm(categories, desc='Downloading images for every category'):
    category_dir = os.path.join(base_dir, category)
    download_images(category_url, category_dir)

### Feeding images to GPT-4o (chat completion module) for their similarity scores

Here we first supply the 'training' similarity scores in the form of a matrix for the images given in the train folder<br>The matrix for n images is an n x n *symmetric* matrix with (nC2) comparisons
e.g,
|       | img0  | img1  | 
|-------|-------|-------|
| **img0** | 1.0   | x  | 
| **img1** | x  | 1.0   | 


This similarity matrix serves as the few shot example training for the LLM<br>Feel free to change/add to the training images in the train folder and redefine the training similarity matrix

**Note:** API calls are charged per usage: roughly 0.5 USD for every 5 images compared (a 5x5 similarity matrix, i.e. 10 pairwise comparisons).

In [16]:
def extract_number(filename):
    """Return the integer following ``image_`` in ``filename``.

    Used as a sort key: names without an ``image_<digits>`` part map to
    ``float('inf')`` so that they sort after all numbered images.
    """
    found = re.search(r'image_(\d+)', filename)
    if found is None:
        return float('inf')
    return int(found.group(1))

# Paths of the few-shot training images in the 'train' folder, ordered by the
# number in their 'image_<n>' file name (files without one sort last).
train_images = sorted(
    [os.path.join('train', f) for f in os.listdir('train') if f.endswith(('jpg', 'jpeg', 'png'))],
    key=extract_number
)

# define your similarity score labels for training
# Entry [i][j] is the label for the pair (train_images[i], train_images[j]);
# only entries with i < j are read when the few-shot examples are built.
similarity_scores = [
    [1.0, 0.7, 0.3, 0.3],
    [0.7, 1.0, 0.4, 0.4],
    [0.3, 0.4, 1.0, 0.9],
    [0.3, 0.4, 0.9, 1.0]
]

In [17]:
# Load variables from a .env file (if present) into the process environment.
load_dotenv()
# The key is read from the environment variable named 'api_key';
# os.getenv returns None when that variable is not set.
openai_api_key = os.getenv('api_key')
api_key = openai_api_key

In [41]:
# You can find the API key on the Slack channel, or use your own API key.
# The client below is shared by every chat-completion request in this notebook.
client = OpenAI(api_key=api_key)

def load_image_as_base64(image_path):
    """Read the file at ``image_path`` and return its bytes as base64 text."""
    with open(image_path, 'rb') as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')

few_shot_examples = []

# Encode every training image once up front; encoding inside the pair loop
# would re-read and re-encode each file for every pair it takes part in.
train_images_base64 = [load_image_as_base64(path) for path in train_images]

# create few shot training with sample question answering
# For every unordered pair (i < j): the two images followed by a text part
# holding the question and the labelled score similarity_scores[i][j].
# NOTE(review): the data URLs always declare image/jpeg although .png files are
# also collected into train_images — confirm the API tolerates the mismatch.
for i in range(len(train_images)):
    for j in range(i + 1, len(train_images)):
        base64_img1 = train_images_base64[i]
        base64_img2 = train_images_base64[j]

        few_shot_examples.extend([
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{base64_img1}"}
            },
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{base64_img2}"}
            },
            {
                "type": "text",
                "text": f"Question: Compute semantic similarity score for the above pair of images.\nAnswer: {similarity_scores[i][j]}"
            }
        ])

# system role to specify the answer content - feel free to prompt engineer here
# This two-message prefix (system instructions + one user message carrying all
# few-shot examples) is prepended to every comparison request in compare_images.
system_message = [
    {"role": "system", "content": "You need to assign similarity scores between 0 and 1 to pairs of images based on the main content and context of the image focusing on actions, emotions, and overall meaning and NOT on specific visual details such as colors or specific objects. Your response should ONLY contain the similairty score number (no words/phrases). Follow the examples below:"},
    {"role": "user", "content": few_shot_examples}
]

In [58]:
def compare_images(image_paths):
    """Ask the chat model for a similarity score for every pair of images.

    Pairs are visited in the order (0, 1), (0, 2), ..., (1, 2), ...; one
    chat-completion request is sent per pair, prefixed with the module-level
    ``system_message``.

    Args:
        image_paths: Paths of the images to compare.

    Returns:
        A list of score strings, one per pair in the order above. Each entry
        is the first number found in the model's reply.

    Raises:
        ValueError: If a reply contains no number. The message names the
            offending image pair and quotes the reply.
    """
    # Encode each image once instead of once per pair it takes part in.
    encoded_images = [load_image_as_base64(path) for path in image_paths]
    pairs = [
        (i, j)
        for i in range(len(image_paths))
        for j in range(i + 1, len(image_paths))
    ]

    responses = []
    for i, j in tqdm(pairs, desc="Processing image pairs"):
        # Build the request for one pair at a time so that only one pair's
        # payload is held in memory rather than all of them at once.
        message = {
            "role": "user",
            "content": [
                {"type": "text", "text": "Compare these two images for semantic similarity."},
                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encoded_images[i]}"}},
                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encoded_images[j]}"}}
            ]
        }
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=system_message + [message],
            max_tokens=300
        )
        # Pause for a second between consecutive requests.
        time.sleep(1)

        # The reply content may be None; treat that as an empty reply.
        res = response.choices[0].message.content or ""
        numbers = re.findall(r'-?\d*\.?\d+', res)
        if not numbers:
            raise ValueError(
                f"No similarity score found in model response {res!r} "
                f"for images {image_paths[i]!r} and {image_paths[j]!r}"
            )
        score = numbers[0]
        print(score)
        responses.append(score)

    return responses

In [59]:
def make_matrix(image_paths, similarity_scores):
    """Expand a flat list of pairwise scores into a symmetric matrix.

    ``similarity_scores`` lists the scores for the pairs (0, 1), (0, 2), ...,
    (1, 2), ... in that order. Each score is converted with ``float`` and
    mirrored across the diagonal; the diagonal itself is 1.0.

    Returns:
        A list of ``len(image_paths)`` rows, each a list of floats.
    """
    size = len(image_paths)
    # Start from the identity matrix: 1.0 on the diagonal, 0.0 elsewhere.
    matrix = [[float(row == col) for col in range(size)] for row in range(size)]

    cursor = 0
    for row in range(size - 1):
        for col in range(row + 1, size):
            matrix[row][col] = float(similarity_scores[cursor])
            matrix[col][row] = float(similarity_scores[cursor])
            cursor += 1

    return matrix

def process_categories(base_dir, output_dir='pred_labels/with_fewshot'):
    """Score every category folder under ``base_dir`` and save the matrices.

    For each sub-directory holding at least two ``jpg``/``jpeg``/``png``
    files, all image pairs are scored with ``compare_images`` and the
    resulting symmetric matrix is written to ``<output_dir>/<category>.csv``.

    Args:
        base_dir: Directory whose sub-directories are the categories.
        output_dir: Directory the CSV files are written to; created if
            missing.

    Returns:
        None.
    """
    category_folders = [d for d in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, d))]

    # Create the destination before any scoring so that a missing folder
    # cannot make the final write fail after the comparisons have been made.
    os.makedirs(output_dir, exist_ok=True)

    for category_dir in category_folders:
        category_path = os.path.join(base_dir, category_dir)
        image_paths = sorted(
            [os.path.join(category_path, f) for f in os.listdir(category_path) if f.endswith(('jpg', 'jpeg', 'png'))],
            key=extract_number
        )
        # A similarity matrix needs at least one pair of images.
        if len(image_paths) < 2:
            continue

        similarity_scores = compare_images(image_paths)
        similarity_matrix = make_matrix(image_paths, similarity_scores)

        output_path = os.path.join(output_dir, f'{category_dir}.csv')
        with open(output_path, 'w', newline='') as csv_file:
            writer = csv.writer(csv_file)
            writer.writerows(similarity_matrix)

In [67]:
def read_matrix_from_csv(filepath):
    """Load a CSV file of numbers and return it as a NumPy array.

    Every cell of every row is converted with ``float``.
    """
    rows = []
    with open(filepath, 'r') as csv_file:
        for record in csv.reader(csv_file):
            rows.append([float(cell) for cell in record])
    return np.array(rows)

def compute_rmse(pred_folder, labels_folder, tolerance=0.1):
    """Compute a tolerance-adjusted RMSE between predicted and label matrices.

    Every ``.csv`` file in ``pred_folder`` is compared with the file of the
    same name in ``labels_folder``. For each matrix entry the absolute
    difference is reduced by ``tolerance`` (clipped at zero) before it is
    squared, so differences within the tolerance count as no error. All
    entries of all matrices are pooled into one mean.

    Args:
        pred_folder: Directory holding the predicted matrices.
        labels_folder: Directory holding the label matrices.
        tolerance: Absolute difference that is not penalized.

    Returns:
        The root of the mean squared (tolerance-adjusted) error.

    Raises:
        ValueError: If ``pred_folder`` has no CSV files, the two folders hold
            a different number of CSV files, a prediction has no label file of
            the same name, or two matching matrices differ in shape.
    """
    test_files = sorted(f for f in os.listdir(pred_folder) if f.endswith('.csv'))
    labels_files = sorted(f for f in os.listdir(labels_folder) if f.endswith('.csv'))

    if len(test_files) != len(labels_files):
        raise ValueError(
            f"The number of CSV files in pred_folder ({len(test_files)}) and "
            f"labels_folder ({len(labels_files)}) does not match."
        )
    if not test_files:
        # A mean over zero entries would be NaN; report the real problem.
        raise ValueError("No CSV files found in pred_folder; nothing to evaluate.")

    total_squared_errors = []

    for test_file in test_files:
        test_path = os.path.join(pred_folder, test_file)
        label_path = os.path.join(labels_folder, test_file)

        if not os.path.isfile(test_path) or not os.path.isfile(label_path):
            raise ValueError(f"Corresponding file for '{test_file}' is missing in one of the folders.")

        test_matrix = read_matrix_from_csv(test_path)
        label_matrix = read_matrix_from_csv(label_path)

        if test_matrix.shape != label_matrix.shape:
            raise ValueError(f"Matrix shapes for '{test_file}' do not match.")

        # Vectorized form of max(0, |pred - label| - tolerance) ** 2.
        # np.where (rather than np.maximum) keeps the built-in max() behavior
        # of mapping a NaN difference to zero error.
        excess = np.abs(test_matrix - label_matrix) - tolerance
        squared_errors = np.where(excess > 0, excess, 0.0) ** 2
        total_squared_errors.extend(squared_errors.ravel())

    mse = np.mean(total_squared_errors)
    rmse = np.sqrt(mse)
    return rmse


In [None]:
def write_matrix_to_csv(output_file, avg_matrix):
    """Write ``avg_matrix`` to ``output_file`` as CSV, one matrix row per line."""
    with open(output_file, 'w', newline='') as handle:
        row_writer = csv.writer(handle)
        for matrix_row in avg_matrix:
            row_writer.writerow(matrix_row)

def compute_average_matrix(matrix1, matrix2):
    """Return the element-wise mean of two matrices, rounded to one decimal."""
    summed = matrix1 + matrix2
    halved = summed / 2
    return np.round(halved, 1)

def extract_base_filename(filename):
    """Drop the extension and any trailing digits from ``filename``."""
    stem = os.path.splitext(filename)[0]
    trailing_digits = re.compile(r'\d+$')
    return trailing_digits.sub('', stem)

def process_labels(base_dir, output_dir='true_labels'):
    """Average paired label matrices and write one CSV per pair.

    In every sub-directory of ``base_dir`` the ``.csv`` files are sorted by
    name and taken two at a time (1st with 2nd, 3rd with 4th, ...); a trailing
    unpaired file is ignored. Each pair is averaged with
    ``compute_average_matrix`` and written to
    ``<output_dir>/<category>_<base name of the first file>.csv``.

    Args:
        base_dir: Directory whose sub-directories hold the label CSVs.
        output_dir: Directory the averaged matrices are written to; created
            on the first write if missing.

    Returns:
        None.

    Raises:
        ValueError: If the two matrices of a pair differ in shape.
    """
    for category in os.listdir(base_dir):
        category_path = os.path.join(base_dir, category)

        # Plain files directly under base_dir are skipped.
        if not os.path.isdir(category_path):
            continue

        files = sorted(f for f in os.listdir(category_path) if f.endswith('.csv'))

        # zip over the even/odd slices yields (files[0], files[1]),
        # (files[2], files[3]), ... and drops a trailing unpaired file.
        for file1, file2 in zip(files[0::2], files[1::2]):
            matrix1 = read_matrix_from_csv(os.path.join(category_path, file1))
            matrix2 = read_matrix_from_csv(os.path.join(category_path, file2))

            # Without this check NumPy could broadcast mismatched shapes and
            # produce an average that looks valid but is meaningless.
            if matrix1.shape != matrix2.shape:
                raise ValueError(
                    f"Matrix shapes for '{file1}' and '{file2}' in '{category_path}' do not match."
                )

            avg_matrix = compute_average_matrix(matrix1, matrix2)

            os.makedirs(output_dir, exist_ok=True)
            output_file = os.path.join(output_dir, f'{category}_{extract_base_filename(file1)}.csv')

            write_matrix_to_csv(output_file, avg_matrix)

    print('Average similarity matrices(labels) written to files')

# Average the paired label matrices found in each sub-folder of 'true_labels'.
process_labels('true_labels')

Computing the similarity matrix for each category in the test folder and saving the results to test.json<br>
Feel free to add to / remove from the test folder - **I first generate images using the web scraping module above and then only pick 2-3 categories with 4-5 images each to test because of the cost of api usage**

**Please only run the cell below for *new* images added to the test folder: previous results have already been computed and kept in the test.json file, so rerunning for the same images will only cost more without any benefit.**

In [None]:
# Folder whose sub-directories are scored. Every pair of images in a category
# triggers one chat-completion request (see compare_images).
test_dir = 'test'
process_categories(test_dir)

In [70]:
# Compare the predicted matrices with the label matrices; differences of at
# most `tolerance` are not counted as error (see compute_rmse).
rmse = compute_rmse(pred_folder = 'pred_labels/with_fewshot', labels_folder = 'true_labels', tolerance=0.1)
print(f'Total RMSE: {rmse}')

Total RMSE: 0.10996211468804457


In [21]:
import pingouin as pg

def flatten_upper_triangle(matrix):
    """Return the entries strictly above the diagonal as a 1-D array.

    Entries are taken row by row, so a symmetric similarity matrix yields
    each pair exactly once.
    """
    row_idx, col_idx = np.triu_indices_from(matrix, k=1)
    return matrix[row_idx, col_idx]

def icc(flat_matrices_by_rater):
    """Compute an intraclass correlation across raters.

    Args:
        flat_matrices_by_rater: Mapping of rater label to a sequence of
            ratings; position ``k`` in every sequence is treated as the same
            target.

    Returns:
        The ``ICC`` value from the first row of the table returned by
        ``pg.intraclass_corr``, rounded to two decimals.
    """
    wide = pd.DataFrame(flat_matrices_by_rater)
    # The row index identifies the rated target.
    wide['Target'] = wide.index

    # Reshape to long format: one row per (target, rater) rating.
    long_form = wide.melt(id_vars='Target', var_name='Rater', value_name='Similarity')

    table = pg.intraclass_corr(
        data=long_form,
        targets='Target',
        raters='Rater',
        ratings='Similarity',
    )
    first_row = table.iloc[0]
    return round(first_row['ICC'], 2)

def find_icc_per_category(base_dir):
    """Print the inter-rater ICC for every category folder under ``base_dir``.

    Within a category, the CSV files are grouped by rater, where the rater id
    is the last character of the file name (extension removed). Each rater's
    matrices are flattened to their upper triangles and concatenated in sorted
    file order. Categories with fewer than two raters get a "not enough
    matrices" message instead of a score.
    """
    for category in os.listdir(base_dir):
        category_path = os.path.join(base_dir, category)

        if os.path.isdir(category_path):
            csv_names = [name for name in os.listdir(category_path) if name.endswith('.csv')]
            csv_names.sort()

            # rater id -> list of that rater's matrices, in sorted file order
            grouped = {}
            for csv_name in csv_names:
                stem = os.path.splitext(csv_name)[0]
                rater_id = stem[-1]
                loaded = read_matrix_from_csv(os.path.join(category_path, csv_name))
                grouped.setdefault(rater_id, []).append(loaded)

            flat_by_rater = {
                f'Rater_{rater_id}': np.concatenate([flatten_upper_triangle(m) for m in rater_matrices])
                for rater_id, rater_matrices in grouped.items()
            }

            if len(flat_by_rater) <= 1:
                print(f'Not enough matrices to compute ICC for category: {category}')
            else:
                inter_rater_compatibility = icc(flat_by_rater)
                print(f'Category: {category}')
                print(f'Inter-rater reliability (ICC): {inter_rater_compatibility}')

# Report rater agreement for every category sub-folder of 'true_labels'.
find_icc_per_category('true_labels')

Category: washington
Inter-rater reliability (ICC): 0.38
Category: bbc
Inter-rater reliability (ICC): 0.46


#### Value Score

In [None]:
# WORK IN PROGRESS

def sanitize_filename(filename):
    """Keep only alphanumerics, spaces, dots and underscores.

    Trailing whitespace is stripped from the result.
    """
    allowed_extras = (' ', '.', '_')
    kept = [ch for ch in filename if ch.isalnum() or ch in allowed_extras]
    return "".join(kept).rstrip()

def get_article_links(page_url, timeout=10):
    """Collect links on a page that look like article URLs.

    A link is kept when its ``href`` contains ``'article'`` or ``'/news/'``.

    Args:
        page_url: URL of the page to scan.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        A list of absolute URLs in document order with duplicates removed, or
        ``[]`` if the page could not be fetched (the error is printed).
    """
    try:
        response = requests.get(page_url, timeout=timeout)
        response.raise_for_status()
    except Exception as e:
        print(f"Error fetching page URL: {e}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')
    article_links = []
    seen = set()

    for link in soup.find_all('a', href=True):
        href = link['href']
        if 'article' in href or '/news/' in href:
            absolute = urljoin(page_url, href)
            # The same article can be linked several times on one page;
            # return each URL once so callers do not fetch it repeatedly.
            if absolute not in seen:
                seen.add(absolute)
                article_links.append(absolute)

    return article_links

def download_article_content(article_url, save_dir, timeout=10, max_workers=10):
    """Save an article's title and images into a folder named after the title.

    The folder is ``<save_dir>/<sanitized title>``. It receives a
    ``title.txt`` file with the page title and the images referenced by
    ``<img src=...>`` tags, downloaded through ``download_image``.

    Args:
        article_url: URL of the article page.
        save_dir: Parent directory for the per-article folder.
        timeout: Seconds to wait for the article page before giving up.
        max_workers: Number of threads used to download images concurrently.

    Returns:
        None. If the page cannot be fetched the error is printed and nothing
        is written.
    """
    try:
        response = requests.get(article_url, timeout=timeout)
        response.raise_for_status()
    except Exception as e:
        print(f"Error fetching article URL: {e}")
        return

    soup = BeautifulSoup(response.content, 'html.parser')
    title_tag = soup.find('title')
    title = title_tag.get_text(strip=True) if title_tag else 'Untitled'
    title_sanitized = sanitize_filename(title)
    # The title comes from the remote page. A name that is empty or made only
    # of dots/spaces ('', '.', '..') would resolve to save_dir itself or to its
    # parent, so fall back to a fixed folder name in that case.
    if not title_sanitized.strip('. '):
        title_sanitized = 'Untitled'
    article_dir = os.path.join(save_dir, title_sanitized)

    os.makedirs(article_dir, exist_ok=True)

    with open(os.path.join(article_dir, 'title.txt'), 'w', encoding='utf-8') as f:
        f.write(title)

    images = soup.select('img[src]')
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = []
        for i, img in enumerate(images):
            img_url = img.get('src')
            if img_url:
                img_url = urljoin(article_url, img_url)
                # Files are numbered from 1 by the position of the <img> tag.
                img_name = f'image_{i + 1}.jpg'
                futures.append(executor.submit(download_image, img_url, article_dir, img_name))

        # result() re-raises any exception that escaped a worker.
        for future in as_completed(futures):
            future.result()
