In [29]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from openai import OpenAI
import base64
from tqdm import tqdm
import time
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
import re
from dotenv import load_dotenv
from io import BytesIO
from PIL import Image

### Web scraping to download images per category in any given news website

In [30]:
def get_news_categories(url, timeout=10):
    """Collect (category name, absolute URL) pairs from the page's <nav> links.

    Every <a> inside every <nav> element that has both visible text and an
    href is treated as a category. Relative hrefs are resolved against `url`.
    Duplicates are not removed.

    Args:
        url: Address of the page whose navigation should be scanned.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        A list of (category_text, absolute_url) tuples in document order, or
        an empty list if the page could not be fetched.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching the URL: {e}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    categories = []
    for nav in soup.find_all('nav'):
        for link in nav.find_all('a'):
            category = link.get_text(strip=True)
            category_url = link.get('href')
            # Skip icon-only links (no text) and anchors without a target.
            if category and category_url:
                categories.append((category, urljoin(url, category_url)))

    return categories

def create_directories(base_url, categories):
    """Create the folder layout outputs/<site host>/<category> on disk.

    Args:
        base_url: Site URL; its network location names the site folder.
        categories: Iterable of (category name, category URL) pairs. Only the
            name is used here.

    Returns:
        The path of the site folder (outputs/<site host>).
    """
    def _ensure_dir(path):
        # Create the folder (and any missing parents) only when absent.
        if not os.path.exists(path):
            os.makedirs(path)

    site_root = os.path.join("outputs", urlparse(base_url).netloc)
    _ensure_dir(site_root)

    for name, _ in categories:
        _ensure_dir(os.path.join(site_root, name))

    return site_root

In [31]:
def download_image(img_url, save_dir, img_name, timeout=10):
    """Fetch one image and save it if it is at least 100x100 pixels.

    The download is best-effort: any failure (network error, HTTP error
    status, payload that is not a readable image, unwritable destination)
    is absorbed so that one bad image never aborts a batch.

    Args:
        img_url: Absolute URL of the image. Inline `data:` URIs are skipped.
        save_dir: Existing directory to write the file into.
        img_name: File name to use inside `save_dir`.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block a worker thread indefinitely.

    Returns:
        True if the image was written to disk, False if it was skipped or
        the download failed.
    """
    try:
        if img_url.startswith('data:'):
            return False

        response = requests.get(img_url, timeout=timeout)
        # Do not try to decode error pages as images.
        response.raise_for_status()
        img_data = response.content

        # Only the dimensions are needed; release the decoder right away.
        with Image.open(BytesIO(img_data)) as img:
            width, height = img.size

        # Only save images larger than 100x100 pixels
        if width < 100 or height < 100:
            return False

        with open(os.path.join(save_dir, img_name), 'wb') as img_file:
            img_file.write(img_data)
        return True
    except Exception:
        # Deliberately quiet: pages contain many tracking pixels and
        # non-image resources; the boolean result reports the outcome.
        return False

def download_images(category_url, save_dir, timeout=10):
    """Download every <img src=...> found on a category page into `save_dir`.

    Files are named image_<i>.jpg, where <i> is the position of the <img>
    tag on the page, so numbering can have gaps when images are skipped.

    Args:
        category_url: Page to scan for images.
        save_dir: Directory that receives the files; created if missing.
        timeout: Seconds to wait for the category page before giving up.

    Returns:
        None. If the page cannot be fetched, the error is printed and
        nothing is downloaded.
    """
    try:
        response = requests.get(category_url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f'Error fetching the category URL: {e}')
        return

    soup = BeautifulSoup(response.content, 'html.parser')
    images = soup.select('img[src]')

    # Without the target folder every single download would fail silently.
    os.makedirs(save_dir, exist_ok=True)

    # parallelising the downloads to make it faster
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = []
        for i, img in enumerate(images):
            img_url = img.get('src')
            # Inline data: URIs carry no remote resource to fetch.
            if img_url and not img_url.startswith('data:'):
                img_url = urljoin(category_url, img_url)
                img_name = f'image_{i}.jpg'
                futures.append(executor.submit(download_image, img_url, save_dir, img_name))

        # Wait for all workers; re-raises anything a worker let escape.
        for future in as_completed(futures):
            future.result()


Change the base url to any news website for which you want to download images (for every category) 

Tried for: theguardian.com, time.com, tribune.pk

In [32]:
# Driver cell: discover the site's categories, create one folder per category
# and download the images found on each category page.
base_url = 'https://www.bbc.com/' # change this to the news site you want to scrape
categories = get_news_categories(base_url)

# Only a notice: execution continues, so the site folder is still created
# below and the download loop simply has nothing to iterate over.
if not categories:
    print("No categories found.")
    
base_dir = create_directories(base_url, categories)
for category, category_url in tqdm(categories, desc='Downloading images for every category'):
    # Must mirror the path that create_directories built for this category.
    category_dir = os.path.join(base_dir, category)
    download_images(category_url, category_dir)

Downloading images for every category:  71%|███████▏  | 30/42 [00:27<00:11,  1.03it/s]

Error downloading the image: cannot identify image file <_io.BytesIO object at 0x1061bf740>
Error downloading the image: cannot identify image file <_io.BytesIO object at 0x10619b5b0>
Error downloading the image: cannot identify image file <_io.BytesIO object at 0x11083f3d0>
Error downloading the image: cannot identify image file <_io.BytesIO object at 0x1061bf330>
Error downloading the image: cannot identify image file <_io.BytesIO object at 0x10619b560>
Error downloading the image: cannot identify image file <_io.BytesIO object at 0x10619a110>
Error downloading the image: cannot identify image file <_io.BytesIO object at 0x10619a200>
Error downloading the image: cannot identify image file <_io.BytesIO object at 0x1061bff60>
Error downloading the image: cannot identify image file <_io.BytesIO object at 0x106581d50>
Error downloading the image: cannot identify image file <_io.BytesIO object at 0x106583510>
Error downloading the image: cannot identify image file <_io.BytesIO object at 0

Downloading images for every category:  76%|███████▌  | 32/42 [00:28<00:07,  1.34it/s]

Error downloading the image: cannot identify image file <_io.BytesIO object at 0x11082bdd0>
Error downloading the image: cannot identify image file <_io.BytesIO object at 0x107c5aac0>
Error downloading the image: cannot identify image file <_io.BytesIO object at 0x107c59cb0>
Error downloading the image: cannot identify image file <_io.BytesIO object at 0x105f60f40>
Error downloading the image: cannot identify image file <_io.BytesIO object at 0x107ced990>
Error downloading the image: cannot identify image file <_io.BytesIO object at 0x11082b7e0>
Error downloading the image: cannot identify image file <_io.BytesIO object at 0x105f61990>
Error downloading the image: cannot identify image file <_io.BytesIO object at 0x11082bba0>
Error downloading the image: cannot identify image file <_io.BytesIO object at 0x11082b650>
Error downloading the image: cannot identify image file <_io.BytesIO object at 0x10619b560>
Error downloading the image: cannot identify image file <_io.BytesIO object at 0

Downloading images for every category:  95%|█████████▌| 40/42 [00:31<00:00,  3.54it/s]

Error downloading the image: cannot identify image file <_io.BytesIO object at 0x1113e29d0>


Downloading images for every category: 100%|██████████| 42/42 [00:31<00:00,  1.32it/s]


### Feeding images to GPT-4o (chat completion module) for their similarity scores

Here we first supply the 'training' similarity scores in the form of a matrix for the images given in the train folder<br>The matrix for n images is an n x n *symmetric* matrix with (nC2) comparisons
e.g.,
|       | img0  | img1  | 
|-------|-------|-------|
| **img0** | 1.0   | x  | 
| **img1** | x  | 1.0   | 


This similarity matrix serves as the few shot example training for the LLM<br>Feel free to change/add to the training images in the train folder and redefine the training similarity matrix

**Note:** API calls are charged per usage: roughly 0.5 USD for comparing 5 images (a 5x5 similarity matrix, i.e. 10 pairwise comparisons).

In [19]:
def extract_number(filename):
    """Sort key that orders files by the number following 'image_'.

    Returns the integer from the first 'image_<digits>' occurrence in
    `filename`, or infinity when there is none so such files sort last.
    """
    found = re.search(r'image_(\d+)', filename)
    if found is None:
        return float('inf')
    return int(found.group(1))

# Paths of the few-shot training images in ./train, ordered by the number in
# their 'image_<n>' file name (files without such a number sort last).
train_images = sorted(
    [os.path.join('train', f) for f in os.listdir('train') if f.endswith(('jpg', 'jpeg', 'png'))],
    key=extract_number
)

# define your similarity score labels for training
# similarity_scores[i][j] is the label for the pair (train_images[i], train_images[j]),
# so the matrix needs at least one row and one column per training image and
# must follow the same ordering as train_images.
similarity_scores = [
    [1.0, 0.75, 0.3, 0.3],
    [0.75, 1.0, 0.4, 0.4],
    [0.3, 0.4, 1.0, 0.95],
    [0.3, 0.4, 0.95, 1.0]
]

In [4]:
# Pull settings from a .env file (python-dotenv) into the environment, then
# read the OpenAI key from the variable named `api_key`.
load_dotenv()
# os.getenv yields None when the variable is not set.
openai_api_key = os.getenv('api_key')
api_key = openai_api_key

In [None]:
##### You can find the API key on the Slack channel or use your own API key
# Shared OpenAI client used for the chat-completion requests in compare_images.
client = OpenAI(api_key=api_key)

def load_image_as_base64(image_path):
    """Return the file's raw bytes as a base64-encoded text string."""
    with open(image_path, 'rb') as img_file:
        raw_bytes = img_file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')

# Flat list of content parts: for every training pair, image 1, image 2 and
# then a text part holding the question and its labelled answer.
few_shot_examples = []

# Fail early with a clear message instead of an IndexError halfway through
# the loop when the label matrix is smaller than the training set.
n_train = len(train_images)
if len(similarity_scores) < n_train or any(len(row) < n_train for row in similarity_scores[:n_train]):
    raise ValueError(
        f"similarity_scores must be at least {n_train}x{n_train} to label "
        f"the {n_train} images found in the train folder"
    )

# Read and encode each training image once; every image takes part in
# several pairs, so encoding inside the pair loop repeats the same file I/O.
train_images_base64 = [load_image_as_base64(path) for path in train_images]

# create few shot training with sample question answering
for i in range(n_train):
    for j in range(i + 1, n_train):
        base64_img1 = train_images_base64[i]
        base64_img2 = train_images_base64[j]

        few_shot_examples.extend([
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{base64_img1}"}
            },
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{base64_img2}"}
            },
            {
                "type": "text",
                "text": f"Question: Compute semantic similarity score for the above pair of images.\nAnswer: {similarity_scores[i][j]}"
            }
        ])

# Conversation prefix sent with every comparison request: a system
# instruction describing the scoring task, followed by one user message that
# carries all few-shot example pairs. Feel free to prompt engineer here.
system_message = [
    {"role": "system", "content": "You need to assign similarity scores to pairs of images based on the main content and context of the image focusing on actions, emotions, and overall meaning and NOT on specific visual details such as colors or specific objects. When answering, ONLY give a numeric similarity score. Follow these examples:"},
    {"role": "user", "content": few_shot_examples}
]

In [22]:
def compare_images(image_paths):
    """Ask the chat model for a similarity score for every pair of images.

    Pairs are visited in the order (0,1), (0,2), ..., (1,2), ..., i.e. the
    upper triangle of the similarity matrix row by row. One request is sent
    per pair, each prefixed with the module-level `system_message`, with a
    one-second pause after every request.

    Args:
        image_paths: Paths of the images to compare with each other.

    Returns:
        A list with the model's raw reply text for each pair, in pair order.
        Fewer than two images yields an empty list.
    """
    # Encode every image exactly once, before any request is sent. Each image
    # appears in n-1 pairs, so encoding per pair would repeat the file reads
    # and keep n*(n-1) base64 payloads alive at the same time.
    encoded_images = [load_image_as_base64(path) for path in image_paths]

    n_images = len(image_paths)
    pair_indices = [(i, j) for i in range(n_images) for j in range(i + 1, n_images)]

    responses = []
    for i, j in tqdm(pair_indices, desc="Processing image pairs"):
        message = {
            "role": "user",
            "content": [
                {"type": "text", "text": "Compare these two images for semantic similarity."},
                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encoded_images[i]}"}},
                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encoded_images[j]}"}}
            ]
        }
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=system_message + [message],
            max_tokens=300
        )
        # Pause between consecutive requests.
        time.sleep(1)
        responses.append(response.choices[0].message.content)

    return responses

In [23]:
def _parse_similarity_score(raw_score):
    """Turn one score (number or model reply text) into a float when possible.

    Replies that are not a bare number (e.g. 'Similarity score: 0.8') fall
    back to the first number found in the text. Values that contain no
    number at all are returned unchanged so nothing is silently lost.
    """
    if isinstance(raw_score, (int, float)):
        return float(raw_score)
    if isinstance(raw_score, str):
        text = raw_score.strip()
        try:
            return float(text)
        except ValueError:
            found = re.search(r'[-+]?(?:\d+\.?\d*|\.\d+)', text)
            if found:
                return float(found.group(0))
    return raw_score


def make_matrix(image_paths, similarity_scores):
    """Build the symmetric n x n similarity matrix from pairwise scores.

    Args:
        image_paths: The n images that were compared; only the count is used.
        similarity_scores: Scores for the pairs (0,1), (0,2), ..., (1,2), ...
            in upper-triangle row order. Entries may be numbers or the raw
            text replies of the model; they are converted to float so the
            matrix has a single numeric type.

    Returns:
        A list of n lists with 1.0 on the diagonal and the pair score mirrored
        at [i][j] and [j][i].

    Raises:
        IndexError: If fewer than n*(n-1)/2 scores are supplied.
    """
    n = len(image_paths)
    similarity_matrix = [[1.0 if i == j else 0.0 for j in range(n)] for i in range(n)]

    idx = 0
    for i in range(n):
        for j in range(i + 1, n):
            score = _parse_similarity_score(similarity_scores[idx])
            similarity_matrix[i][j] = score
            similarity_matrix[j][i] = score
            idx += 1

    return similarity_matrix

def process_categories(base_dir):
    """Compute a similarity matrix per category folder and save them as JSON.

    Every sub-directory of `base_dir` is treated as one category. Its
    jpg/jpeg/png files are ordered by image number and compared pairwise;
    categories with fewer than two images are skipped. The result, a mapping
    from category folder path to matrix, is written to '<base_dir>.json'.
    """
    image_suffixes = ('jpg', 'jpeg', 'png')

    # Collect the category folders first.
    category_folders = []
    for entry in os.listdir(base_dir):
        candidate = os.path.join(base_dir, entry)
        if os.path.isdir(candidate):
            category_folders.append(candidate)

    matrices = {}
    for category_dir in category_folders:
        image_paths = [
            os.path.join(category_dir, name)
            for name in os.listdir(category_dir)
            if name.endswith(image_suffixes)
        ]
        image_paths.sort(key=extract_number)

        # A single image has nothing to be compared with.
        if len(image_paths) >= 2:
            pair_scores = compare_images(image_paths)
            matrices[category_dir] = make_matrix(image_paths, pair_scores)

    with open(f'{base_dir}.json', 'w') as json_file:
        json.dump(matrices, json_file)

Computing the similarity matrix for each category in the test folder and saving the results to test.json<br>
Feel free to add to / remove from the test folder - **I first generate images using the web scraping module above and then only pick 2-3 categories with 4-5 images each to test because of the cost of api usage**

**Please only run the cell below for *new* images added to the test folder.** Previous results have already been computed and stored in test.json; rerunning for the same images only adds cost without any benefit.

In [None]:
# Folder whose sub-directories (one per category) hold the images to score.
# Results are written next to it as '<folder_to_test>.json'. Each run sends
# one paid API request per image pair.
folder_to_test = 'test'
process_categories(folder_to_test)