**Imports**

In [1]:
import requests
from json import loads, dumps
from time import time, sleep
from tqdm import tqdm
import os
import re
import pandas as pd
import numpy as np
import shutil
from jinja2 import Template
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import concurrent.futures

Loading websites from dataset

In [None]:
# Root of the dataset, relative to the directory the notebook starts in.
base_dir = '../../../data'
# Make the dataset root the working directory: the relative paths used by the
# functions in this notebook are resolved against it.
os.chdir(base_dir)
# Every entry found in the dataset root is treated as a website.
websites = os.listdir('.')

## Prompts

In [3]:
# Baseline prompt: a Jinja2 template that asks for a 0-4 replaceability rating of
# two images. The {{ ... }} placeholders are filled in when the template is
# rendered with the description/context dictionary built for each image pair.
base_prompt = """
You are tasked with evaluating the replaceability of two images from different articles within the same category of a news website.
Consider how well the two images align with each other in terms of their content and context.

Use the following rating scale:

0: Not replaceable
1: Somewhat replaceable
2: Moderately replaceable
3: Very replaceable
4: Completely replaceable

Image Descriptions and Associated Context:
<image_a_description>
    {{ image_a_description }}
</image_a_description>
<image_a_context>
    {{ image_a_context }}
</image_a_context>

<image_b_description>
    {{ image_b_description }}
</image_b_description>
<image_b_context>
    {{ image_b_context }}
</image_b_context>
"""

# More detailed prompt variant: same rating scale and placeholders as the
# baseline prompt, plus an explicit list of factors the model should weigh.
prompt1 = """You are tasked with evaluating the semantic replaceability of two images (Image A and Image B) from different articles within the same category of a news website.
Your goal is to determine how interchangeable these images are based on their contexts and semantic similarity of the images, which include the article headings and alt text (where available).

Use the following rating scale for replaceability:
0: Not replaceable
1: Somewhat replaceable
2: Moderately replaceable
3: Very replaceable
4: Completely replaceable

Here are the contexts for the two images:

<image_a_description>
    {{ image_a_description }}
</image_a_description>
<image_a_context>
    {{ image_a_context }}
</image_a_context>

<image_b_description>
    {{ image_b_description }}
</image_b_description>
<image_b_context>
    {{ image_b_context }}
</image_b_context>

Consider the following factors when evaluating their semantic replaceability:
1. Similarity of topics
2. Specificity of information conveyed (e.g, specific people, places etc)
3. Emotional tone or impact
4. Potential for misinterpretation if swapped
"""

#########################################################################################

# Chain-of-thought suffix that is concatenated onto a prompt before rendering.
# It asks for step-by-step reasoning and fixes the <rating>...</rating> answer
# format that the score-extraction regex in this notebook looks for.
cot = """
Using chain of thought prompting, analyze these two images and rate their replaceability. 
Break down your thought process step by step. 
Write your answer in the following format:

<rating>
[Your rating (0-4)]
</rating>
<justification>
Explanation: [Brief explanation for your rating, synthesizing your analysis of all factors]
</justification>
"""

# Rating-only answer format (no justification section).
# NOTE(review): this name shadows the builtin `format`, and nothing in the
# visible part of the notebook reads it - consider renaming if it is unused.
format = """
Write your answer in the following format:

<rating>
[Your rating (0-4)]
</rating>
"""

In [21]:
def send_prompt(prompt, max_retries=None, retry_delay=1.0, timeout=30):
    """Submit *prompt* to the hosted Llama 3.1 405B endpoint and return the prediction id.

    The request is retried until the response contains an ``id``. Only request
    and response-decoding errors trigger a retry, so ``KeyboardInterrupt`` and
    programming errors are no longer swallowed.

    Args:
        prompt: Fully rendered prompt text.
        max_retries: Maximum number of attempts. ``None`` (the default) retries
            indefinitely, which is the historical behaviour.
        retry_delay: Seconds to wait between attempts so that a failing
            endpoint is not hammered in a tight loop.
        timeout: Per-request timeout in seconds passed to ``requests.post``.

    Returns:
        The prediction id from the response, or ``None`` when ``max_retries``
        attempts were made without success.
    """
    data = {
        "input": {
            "top_p": 0.9,
            "prompt": prompt,
            "max_tokens": 1024*4,
            "min_tokens": 0,
            "temperature": 0.8,
            "system_prompt": "You are a helpful visual assistant, and the descriptions will solely be used for research purposes. There's no intention to harm or hurt any specific group of people.",
            "presence_penalty": 0,
            "frequency_penalty": 0
        }
    }
    url = 'https://replicate.com/api/models/meta/meta-llama-3.1-405b-instruct/predictions'

    attempt = 0
    while max_retries is None or attempt < max_retries:
        attempt += 1
        try:
            resp_post = requests.post(url, json=data, timeout=timeout)
            return resp_post.json()['id']
        except (requests.RequestException, ValueError, KeyError, TypeError):
            # Network failure, non-JSON body, or a JSON body without an 'id'
            # (e.g. an error payload): wait briefly and try again.
            sleep(retry_delay)

    return None

def read_response(token_id, timeout=25, poll_interval=0.5):
    """Poll a prediction until it finishes and return its text output.

    Args:
        token_id: Prediction id previously returned by ``send_prompt``.
        timeout: Overall polling budget in seconds; also used as the
            per-request timeout, so the total wait can exceed it by at most
            one request.
        poll_interval: Seconds to sleep between polls instead of issuing
            requests back-to-back.

    Returns:
        The concatenated ``output`` chunks once the status is ``'succeeded'``.
        The sentinel string ``'Timeout'`` is returned when the budget runs out,
        when the prediction reports ``'failed'``/``'canceled'``, or when a
        request or the response payload cannot be processed.
    """
    url = f'https://replicate.com/api/predictions/{token_id}'
    deadline = time() + timeout

    try:
        while True:
            payload = requests.get(url, timeout=timeout).json()
            status = payload['status']

            if status == 'succeeded':
                return ''.join(payload['output'])

            # NOTE(review): 'failed' and 'canceled' are assumed to be the
            # terminal non-success statuses of this endpoint - confirm.
            # Waiting longer cannot help in either case.
            if status in ('failed', 'canceled') or time() >= deadline:
                return 'Timeout'

            sleep(poll_interval)
    except (requests.RequestException, ValueError, KeyError, TypeError):
        # Network error, non-JSON body, missing keys, or a null 'output'.
        return 'Timeout'

In [5]:
def extract_info(row):
    """Return the ``(description, article_heading, alt)`` values of *row*, in that order."""
    wanted = ('description', 'article_heading', 'alt')
    return tuple(row[field] for field in wanted)

Prompt Generation

In [86]:
def prepare_model_prompts(websites, prompt, label):
    """Render one prompt per cross-article image pair and save them per category.

    For every category directory of every website, the category's
    ``image_descriptions.csv`` is read (its first column, used as the index,
    holds the image names). Each unordered pair of images whose article numbers
    differ gets one rendered prompt. The article number is the first run of
    digits in the image name that is followed by ``_`` or the end of the name.

    Args:
        websites: Website directory names, relative to the working directory.
        prompt: Jinja2 template text; the module-level ``cot`` suffix is
            appended to it before rendering.
        label: Name of the output sub-directory for this prompt variant.

    Side effects:
        Writes one CSV per category to
        ``../src/description to output/descriptions/temp/{website}/{label}/{category}``
        with an empty ``response`` column.
    """
    columns = ['headline 1', 'alt 1', 'image 1', 'image 2', 'headline 2', 'alt 2', 'prompt', 'response']

    # The template text and the article-number pattern are identical for every
    # pair, so they are compiled once instead of once per pair.
    template = Template(prompt + cot)
    article_pattern = re.compile(r'\d+(?=_|$)')

    for website in websites:
        dir_path = f'../src/description to output/descriptions/temp/{website}/{label}'
        os.makedirs(dir_path, exist_ok=True)

        for category in os.listdir(website):
            df = pd.read_csv(f'{website}/{category}/image_descriptions.csv', index_col=0)

            images = df.index.tolist()
            rows = []

            for i in range(len(images)):
                for j in range(i + 1, len(images)):
                    article_1 = int(article_pattern.search(images[i]).group())
                    article_2 = int(article_pattern.search(images[j]).group())
                    if article_1 == article_2:
                        # Images from the same article are never compared.
                        continue

                    description_1, headline_1, alt_1 = extract_info(df.iloc[i])
                    description_2, headline_2, alt_2 = extract_info(df.iloc[j])
                    data = {
                        "image_a_description": f"Description: {description_1}",
                        "image_a_context": f"Alt Text: {alt_1}, Heading: {headline_1}",
                        "image_b_description": f"Description: {description_2}",
                        "image_b_context": f"Alt Text: {alt_2}, Heading: {headline_2}"
                    }
                    rendered_text = template.render(data)

                    rows.append([headline_1, alt_1, images[i], images[j], headline_2, alt_2, rendered_text, ''])

            pd.DataFrame(rows, columns=columns).to_csv(f'{dir_path}/{category}', index=False)

Parallelizing Llama Responses with concurrent threads

In [87]:
def process_prompts(prompts, website='', category=''):
    """Submit every prompt concurrently and return the prediction ids in prompt order.

    ``website`` and ``category`` are only used in the progress-bar caption.
    A position stays ``None`` when ``send_prompt`` returned ``None`` for it.
    """
    token_ids = [None] * len(prompts)
    caption = f"Sending Prompts website: {website} category: {category}"

    with concurrent.futures.ThreadPoolExecutor(max_workers=20) as pool:
        # Remember which prompt position each future belongs to, because
        # as_completed yields them in completion order.
        position_of = {}
        for position, text in enumerate(prompts):
            position_of[pool.submit(send_prompt, text)] = position

        finished = concurrent.futures.as_completed(position_of)
        for done in tqdm(finished, total=len(position_of), desc=caption):
            result = done.result()
            if result is None:
                continue
            token_ids[position_of[done]] = result

    return token_ids


def read_all_responses(outputs, website='', category=''):
    """Fetch the responses for a list of prediction ids concurrently.

    The result has the same length and order as *outputs*; positions whose id
    is ``None`` keep ``None``. ``website`` and ``category`` only label the
    progress bar.
    """
    responses = [None] * len(outputs)

    # (position, id) for every entry that actually has an id to poll.
    pending = [(position, token_id) for position, token_id in enumerate(outputs) if token_id is not None]
    if not pending:
        return responses

    caption = f"Reading Responses website: {website} category: {category}"
    with concurrent.futures.ThreadPoolExecutor(max_workers=15) as pool:
        slot_of = {}
        for position, token_id in pending:
            slot_of[pool.submit(read_response, token_id)] = position

        finished = concurrent.futures.as_completed(slot_of)
        for done in tqdm(finished, total=len(slot_of), desc=caption):
            responses[slot_of[done]] = done.result()

    return responses


In [None]:
def get_dir_path(website, label):
    """Return the directory that holds the prompt/response CSVs of *website* for *label*."""
    temp_root = '../src/description to output/descriptions/temp'
    return f'{temp_root}/{website}/{label}'

def run_llama(websites, label):
    """Send every prepared prompt to the model, then collect and score the responses.

    Works in two passes so that all predictions are already queued before any
    response is polled:

    1. For each prompt CSV under ``get_dir_path(website, label)``, submit the
       ``prompt`` column and remember the returned prediction ids.
    2. Read the responses, store them in the ``response`` column, parse the
       ``<rating>`` tag into a ``score`` column and overwrite the CSV.

    A score of ``9`` marks a response without a parsable rating (including
    prompts for which no response was obtained).

    Args:
        websites: Website names whose prompt CSVs should be processed.
        label: Prompt-variant sub-directory name.
    """
    rating_pattern = re.compile(r'<rating>\s*[\[\s]*(\d+)[\]\s]*</rating>')

    # Token IDs per website and per category file.
    all_token_ids = {}

    # Step 1: iterate over all websites and categories to collect token IDs.
    for website in websites:
        all_token_ids[website] = {}
        dir_path = get_dir_path(website, label)

        for category in os.listdir(dir_path):
            df = pd.read_csv(f'{dir_path}/{category}')
            prompts = df['prompt'].tolist()

            token_ids = process_prompts(prompts, website, category.replace('.csv', ''))
            all_token_ids[website][category] = token_ids
        print()

    # Step 2: read the responses for all collected token IDs.
    for website, categories in all_token_ids.items():
        dir_path = get_dir_path(website, label)

        for category, token_ids in categories.items():
            responses = read_all_responses(token_ids, website, category.replace('.csv', ''))

            df = pd.read_csv(f'{dir_path}/{category}')
            df['response'] = responses

            scores = []
            for resp in responses:
                # A response is None when no token id was obtained for the
                # prompt; treat it like any other unparsable response instead
                # of letting the regex search raise TypeError.
                match = rating_pattern.search(resp) if isinstance(resp, str) else None
                scores.append(int(match.group(1)) if match else 9)

            df['score'] = scores
            df.to_csv(f'{dir_path}/{category}', index=False)
        print()


Verifying that all responses were fetched

In [93]:
def resolve_descripancies(websites, label, runs=3):
    """Re-send prompts whose score is the sentinel ``9`` and update the CSVs in place.

    Each round scans every CSV under ``get_dir_path(website, label)``,
    re-submits the prompts of rows with ``score == 9``, reads the new
    responses, and rewrites the ``response`` and ``score`` cells of those rows.

    Args:
        websites: Website names to check.
        label: Prompt-variant sub-directory name.
        runs: Maximum number of retry rounds. The loop stops earlier once no
            row is left to retry.
    """
    rating_pattern = re.compile(r'<rating>\s*[\[\s]*(\d+)[\]\s]*</rating>')

    while runs > 0:
        re_run = []

        for website in websites:
            dir_path = get_dir_path(website, label)
            for category in os.listdir(dir_path):
                file = f'{dir_path}/{category}'
                df = pd.read_csv(file)
                timeout = df[df['score'] == 9]
                if timeout.shape[0] > 0:
                    token_ids = [send_prompt(prompt) for prompt in timeout['prompt'].tolist()]
                    re_run.append([file, timeout.index.tolist(), token_ids])

        if not re_run:
            # Nothing left to fix; further rounds would only re-read the files.
            break

        for file, index_list, token_ids in re_run:
            df = pd.read_csv(file)

            for index, token_id in zip(index_list, token_ids):
                response = read_response(token_id)
                df.loc[index, 'response'] = response

                match = rating_pattern.search(response)
                # 9 keeps the row flagged for the next round.
                df.loc[index, 'score'] = int(match.group(1)) if match else 9

            df.to_csv(file, index=False)

        runs -= 1


### Creating similarity matrices

In [None]:
def sort_key(image_name):
    """Parse a name starting with ``image_<x>_<y>`` into the integer tuple ``(x, y)``.

    Returns ``None`` when *image_name* does not start with that pattern.
    """
    parsed = re.match(r'image_(\d+)_(\d+)', image_name)
    if parsed is None:
        return None
    first, second = parsed.groups()
    return (int(first), int(second))

def compute_similarity_matrices(websites, label):
    """Turn the per-pair scores of each category into a symmetric image-by-image matrix.

    For every category of every website the scored pair CSV is loaded, a square
    matrix over all images named in it is created (4 on the diagonal, 0
    elsewhere), and every scored pair is written into both ``[a, b]`` and
    ``[b, a]``. Rows and columns are named ``image_<x>_<y>`` and ordered
    numerically by ``(x, y)``.

    Args:
        websites: Website directory names, relative to the working directory.
        label: Prompt-variant sub-directory name. It also selects the output
            file suffix: ``'base'`` -> ``_0``, ``'few-shot'`` -> ``_fewshot``,
            anything else -> no suffix.

    Side effects:
        Writes ``{website}/{category}/llama_pred_labels{suffix}.csv``. When
        anything fails for a website, the error is printed and the remaining
        categories of that website are skipped.
    """
    # Computed outside the f-string: nesting the same quote character inside an
    # f-string is a syntax error before Python 3.12.
    if label == 'base':
        suffix = '_0'
    elif label == 'few-shot':
        suffix = '_fewshot'
    else:
        suffix = ''

    for website in websites:
        dir_path = get_dir_path(website, label)
        try:
            for category in tqdm(os.listdir(website), desc=f'Processing {website}'):
                # The prompt-preparation functions save the pair file as
                # '{dir_path}/{category}'; the other spellings are kept as
                # fallbacks for files laid out differently.
                candidates = [
                    f'{dir_path}/{category}',
                    f'{dir_path}/{category}.csv',
                    f'{dir_path}/{website}/{category}.csv',
                ]
                score_file = next((path for path in candidates if os.path.isfile(path)), None)
                if score_file is None:
                    raise FileNotFoundError(f'no score file for category {category!r} in {dir_path}')

                df = pd.read_csv(score_file)
                keys_1 = [sort_key(img) for img in df['image 1'].tolist()]
                keys_2 = [sort_key(img) for img in df['image 2'].tolist()]

                image_num = sorted(set(keys_1) | set(keys_2))
                image_list = [f'image_{x}_{y}' for x, y in image_num]
                n = len(image_list)
                scores = [[4 if i == j else 0 for j in range(n)] for i in range(n)]
                matrix = pd.DataFrame(scores, columns=image_list, index=image_list)

                # Place each score by the image names of its own row rather than
                # by a running counter, so the result does not depend on the
                # row order of the pair file.
                for (x1, y1), (x2, y2), score in zip(keys_1, keys_2, df['score'].tolist()):
                    name_1 = f'image_{x1}_{y1}'
                    name_2 = f'image_{x2}_{y2}'
                    matrix.at[name_1, name_2] = score
                    matrix.at[name_2, name_1] = score

                matrix.to_csv(f'{website}/{category}/llama_pred_labels{suffix}.csv')
        except Exception as exc:
            # Best effort per website, but report the reason instead of
            # silently dropping it.
            print(f'Skipping remaining categories of {website}: {exc!r}')
            continue

Base Prompt

In [None]:
websites = [] # Add sample

# Pipeline for the baseline prompt: build prompts, query the model, retry
# unparsed responses, then build the similarity matrices.
prepare_model_prompts(websites=websites, prompt=base_prompt, label='base')
run_llama(websites=websites, label='base')
# The keyword is `label`; `labels` is not a parameter and raises TypeError.
resolve_descripancies(websites=websites, label='base') # can also adjust runs if network issues
compute_similarity_matrices(websites=websites, label='base')

Prompt 1

In [None]:
websites = [] # Add sample

# Pipeline for the detailed prompt. Every step uses the same label so that all
# of them read and write the same 'prompt 1' directory.
# NOTE(review): this cell previously passed base_prompt; prompt1 is assumed to
# be the intended template for the 'prompt 1' run - confirm.
prepare_model_prompts(websites=websites, prompt=prompt1, label='prompt 1')
run_llama(websites=websites, label='prompt 1')
# The keyword is `label`; `labels` is not a parameter and raises TypeError.
resolve_descripancies(websites=websites, label='prompt 1')
compute_similarity_matrices(websites=websites, label='prompt 1')

## Few-shot

In [None]:
# Sentence-embedding model shared by generate_embeddings and find_most_similar.
model = SentenceTransformer('all-MiniLM-L6-v2')

def generate_embeddings(x):
    """Encode *x* with the module-level sentence-embedding model and return the result."""
    embeddings = model.encode(x)
    return embeddings

def find_most_similar(new_value, stored_values, stored_embeddings):
    """Return the index and cosine similarity of the stored embedding closest to *new_value*.

    *new_value* is embedded with the module-level model and compared against
    every row of *stored_embeddings*. *stored_values* is accepted but not read.
    """
    query_embedding = model.encode([new_value])
    scores = cosine_similarity(query_embedding, stored_embeddings)
    best = np.argmax(scores)
    best_score = scores[0][best]
    return best, best_score

In [None]:
# Task description used for the few-shot prompts: rating scale, placeholders for
# the two images, and the factors to consider. Rendered as a Jinja2 template.
prompt2 = """You are tasked with evaluating the semantic replaceability of two images (Image A and Image B) from different articles within the same category of a news website.
Your goal is to determine how interchangeable these images are based on their contexts and semantic similarity of the images, which include the article headings and alt text (where available).

Use the following rating scale for replaceability:
0: Not replaceable
1: Somewhat replaceable
2: Moderately replaceable
3: Very replaceable
4: Completely replaceable

Here are the contexts for the two images:

<image_a_description>
    {{ image_a_description }}
</image_a_description>
<image_a_context>
    {{ image_a_context }}
</image_a_context>

<image_b_description>
    {{ image_b_description }}
</image_b_description>
<image_b_context>
    {{ image_b_context }}
</image_b_context>

Consider the following factors when evaluating their semantic replaceability:
1. Similarity of topics
2. Specificity of information conveyed (e.g, specific people, places etc)
3. Emotional tone or impact
4. Potential for misinterpretation if swapped
"""

# Few-shot version of the chain-of-thought suffix; it refers to the examples
# placed before the task.
# NOTE: this rebinds the global name `cot`. Functions that read `cot` when they
# are called will use this text once this cell has been executed.
cot = """
Now, based on the examples provided above, and using chain of thought prompting, analyze these two images and rate their similarity.
Break down your thought process step by step.
Write your answer in the following format:

<rating>
[Your rating (0-4)]
</rating>
<justification>
Explanation: [Brief explanation for your rating, synthesizing your analysis of all factors]
</justification>
"""

# Jinja2 template for a single worked example: an example number, the two image
# descriptions/contexts, and the rating given to that pair.
few_shot_example = """
<example_number>
    {{ example_number }}
</example_number>

<image_a_description>
    {{ image_a_description }}
</image_a_description>
<image_a_context>
    {{ image_a_context }}
</image_a_context>

<image_b_description>
    {{ image_b_description }}
</image_b_description>
<image_b_context>
    {{ image_b_context }}
</image_b_context>

<rating>
    {{ rating }}
</rating>
"""

In [None]:
def generate_few_shot_examples(category, image, descriptions):
    """Render every labelled training pair that involves *image* as few-shot examples.

    Args:
        category: Name of the training file under
            ``../src/description to output/prompting/train`` that is read with
            ``pd.read_csv``.
        image: Image identifier; rows whose ``image 1`` or ``image 2`` equals
            it are used as examples.
        descriptions: Sequence of descriptions, looked up with
            ``article number - 1`` of each image in the pair.
            # NOTE(review): assumes descriptions are ordered by article number - confirm.

    Returns:
        The rendered examples, each followed by a newline, concatenated into
        one string (empty when no row matches).
    """
    df = pd.read_csv(f'../src/description to output/prompting/train/{category}')
    indices = list(set(df[df['image 1'] == image].index.tolist()) | set(df[df['image 2'] == image].index.tolist()))

    # Compile the example template and the article-number pattern once rather
    # than once per example.
    template = Template(few_shot_example)
    article_pattern = re.compile(r'\d+(?=_|$)')

    prompts = []
    for i, idx in enumerate(indices):
        a_1 = int(article_pattern.search(df.loc[idx, 'image 1']).group())
        a_2 = int(article_pattern.search(df.loc[idx, 'image 2']).group())
        data = {
            "example_number": f"Example {i+1}",
            "image_a_description": f"Description: {descriptions[a_1-1]}",
            "image_a_context": f"Alt Text: {df.loc[idx, 'alt 1']}, Heading: {df.loc[idx, 'headline 1']}",
            "image_b_description": f"Description: {descriptions[a_2-1]}",
            "image_b_context": f"Alt Text: {df.loc[idx, 'alt 2']}, Heading: {df.loc[idx, 'headline 2']}",
            "rating": f"{df.loc[idx, 'label']}"
        }
        prompts.append(template.render(data))

    return "".join(rendered + "\n" for rendered in prompts)

Prompt Generation for Dynamic Few-shot

In [None]:
def prepare_few_shot_prompts(websites, prompt, label='few-shot'):
    """Build dynamic few-shot prompts for every cross-article image pair.

    For each category, the training category with the most similar name is
    selected. For each image pair, the training description most similar to
    image A's description picks the training image whose labelled pairs are
    rendered as examples and placed in front of the task prompt.

    Args:
        websites: Website directory names, relative to the working directory.
        prompt: Accepted for interface compatibility but not used: the task
            text always comes from the module-level ``prompt2`` and ``cot``.
        label: Name of the output sub-directory.

    Side effects:
        Writes one CSV per category to
        ``../src/description to output/descriptions/temp/{website}/{label}/{category}``
        with an empty ``response`` column.
    """
    train_dir = '../src/description to output/prompting/train'

    few_shot_categories = os.listdir(train_dir)
    few_shot_embeddings = generate_embeddings(few_shot_categories)

    columns = ['headline 1', 'alt 1', 'image 1', 'image 2', 'headline 2', 'alt 2', 'prompt', 'response']

    # The task template and the article-number pattern never change, so they
    # are compiled once.
    template = Template(prompt2 + cot)
    article_pattern = re.compile(r'\d+(?=_|$)')

    for website in websites:
        dir_path = f'../src/description to output/descriptions/temp/{website}/{label}'
        os.makedirs(dir_path, exist_ok=True)

        for category in os.listdir(website):
            df = pd.read_csv(f'{website}/{category}/image_descriptions.csv', index_col=0)

            index, _ = find_most_similar(category.replace(".csv", ""), few_shot_categories, few_shot_embeddings)
            select_category = few_shot_categories[index]
            images = df.index.tolist()
            few_shot_df = pd.read_csv(f'{train_dir}/{select_category}/descriptions.csv')
            desc = few_shot_df['description'].tolist()
            desc_embeddings = generate_embeddings(desc)

            rows = []

            for i in range(len(images)):
                # The examples depend only on image A's description, so they are
                # built at most once per image (on its first cross-article
                # partner) instead of once per pair.
                few_shot_examples = None

                for j in range(i + 1, len(images)):
                    article_1 = int(article_pattern.search(images[i]).group())
                    article_2 = int(article_pattern.search(images[j]).group())
                    if article_1 == article_2:
                        # Images from the same article are never compared.
                        continue

                    description_1, headline_1, alt_1 = extract_info(df.iloc[i])
                    description_2, headline_2, alt_2 = extract_info(df.iloc[j])
                    data = {
                        "image_a_description": f"Description: {description_1}",
                        "image_a_context": f"Alt Text: {alt_1}, Heading: {headline_1}",
                        "image_b_description": f"Description: {description_2}",
                        "image_b_context": f"Alt Text: {alt_2}, Heading: {headline_2}"
                    }
                    rendered_text = template.render(data)

                    if few_shot_examples is None:
                        desc_index, _ = find_most_similar(description_1, desc, desc_embeddings)
                        few_shot_examples = generate_few_shot_examples(select_category, few_shot_df.loc[desc_index, 'image number'], desc)

                    # A separate name keeps the `prompt` parameter from being overwritten.
                    full_prompt = few_shot_examples + rendered_text
                    rows.append([headline_1, alt_1, images[i], images[j], headline_2, alt_2, full_prompt, ''])

            pd.DataFrame(rows, columns=columns).to_csv(f'{dir_path}/{category}', index=False)

Dynamic Few-shot

In [None]:
websites = [] # Add sample

# Pipeline for the dynamic few-shot prompts.
# NOTE(review): prepare_few_shot_prompts builds its task text from prompt2, so
# the `prompt` argument passed here does not affect the generated prompts.
prepare_few_shot_prompts(websites=websites, prompt=base_prompt)
run_llama(websites=websites, label='few-shot')
resolve_descripancies(websites=websites, label='few-shot')
compute_similarity_matrices(websites=websites, label='few-shot')

## Error testing

In [None]:
# Directory with the error-testing samples, laid out as {website}/{category}/image_data.csv;
# the test output JSON is written here as well.
test_error_dir = '../src/description to output/prompting/test_error'

def generate_error_testing_prompts():
    """Build 20 identical prompts per error-testing sample, keyed by the sample's score.

    Every ``{test_error_dir}/{website}/{category}/image_data.csv`` supplies one
    image pair (rows 0 and 1). The pair is rendered with ``prompt1 + cot``.

    Returns:
        Dict mapping the ``score`` value of row 0 to a list of 20 copies of the
        rendered prompt. Samples that share a score overwrite one another.
    """
    prompts = {}
    # Same template for every sample, so compile it once.
    template = Template(prompt1 + cot)

    for website in os.listdir(test_error_dir):
        for category in os.listdir(f'{test_error_dir}/{website}'):
            df = pd.read_csv(f'{test_error_dir}/{website}/{category}/image_data.csv')
            description_1, headline_1, alt_1 = extract_info(df.iloc[0])
            description_2, headline_2, alt_2 = extract_info(df.iloc[1])
            data = {
                "image_a_description": f"Description: {description_1}",
                "image_a_context": f"Alt Text: {alt_1}, Heading: {headline_1}",
                "image_b_description": f"Description: {description_2}",
                "image_b_context": f"Alt Text: {alt_2}, Heading: {headline_2}"
            }
            prompt = template.render(data)

            score = df.loc[0, 'score']
            # pandas returns numpy scalars, which json.dumps rejects as dict
            # keys; convert to the equivalent builtin type.
            if hasattr(score, 'item'):
                score = score.item()
            prompts[score] = [prompt] * 20

    # The dictionary used to be built and then dropped; return it so it can be
    # handed to the test runner.
    return prompts

In [None]:
def run_tests(prompts):
    """Run every prompt list through the model and save the parsed ratings as JSON.

    Args:
        prompts: Dict mapping a key (the expected score) to a list of prompts.

    Side effects:
        Sleeps 60 seconds between submitting the prompts and reading the
        responses, then writes ``{test_error_dir}/output.json`` mapping each key
        to the list of parsed ratings. ``9`` marks a response without a parsable
        rating, including prompts for which no response was obtained.
    """
    rating_pattern = re.compile(r'<rating>\s*[\[\s]*(\d+)[\]\s]*</rating>')

    token_ids = {key: process_prompts(prompt_list) for key, prompt_list in prompts.items()}

    # Give the queued predictions time to finish before polling them.
    sleep(60)

    labels = {}
    for key, ids in token_ids.items():
        scores = []
        for resp in read_all_responses(ids):
            # A response is None when no token id was obtained for the prompt;
            # searching None would raise TypeError.
            match = rating_pattern.search(resp) if isinstance(resp, str) else None
            scores.append(int(match.group(1)) if match else 9)
        labels[key] = scores

    with open(f'{test_error_dir}/output.json', 'w') as f:
        f.write(dumps(labels))