In [174]:
import pandas as pd
import numpy as np
import re
import os
import itertools
from json import dumps, loads

In [118]:
# Optional Google Colab setup: mount Drive and switch to the shared project
# folder so the relative 'output/...' paths used below resolve.
try:
    from google.colab import drive
except ImportError:
    # Not running on Colab: keep the current working directory.
    drive = None

if drive is not None:
    try:
        drive.mount('/content/drive')
        os.chdir('/content/drive/MyDrive/Shared_Caching')
    except Exception as exc:
        # Stay best-effort (do not abort the notebook), but make the failure
        # visible: later cells read relative paths and would otherwise fail
        # with a confusing error.
        print(f'Colab Drive setup failed, staying in {os.getcwd()}: {exc!r}')

## Converts Danish and Ayain's labels into matrix form

In [144]:
def sort_key(image_name):
    """Return a numeric sort key for an image name of the form ``image_<x>_<y>...``.

    Args:
        image_name: Image file name; the pattern is matched at the start of
            the string.

    Returns:
        ``(x, y)`` as integers when the name matches. Names that do not match
        get ``(inf, inf)`` so they sort after every matching name instead of
        yielding ``None``, which would make ``sorted`` raise ``TypeError``.
    """
    match = re.match(r'image_(\d+)_(\d+)', image_name)
    if match is None:
        # Sort unrecognised names last; sorted() is stable, so they keep
        # their relative input order.
        return (float('inf'), float('inf'))
    return (int(match.group(1)), int(match.group(2)))

In [121]:
# images_list[website][category] maps every ordered pair of image names
# (taken from that category's labels.csv) to a default score:
# 4 when both positions are the same, 0 otherwise.
# NOTE(review): 4 is presumably the top of the labelling scale - confirm.
images_list = {}

for website in os.listdir('output/images/'):
    category_pairs = {}
    for category in os.listdir(f'output/images/{website}'):
        labels_df = pd.read_csv(f'output/images/{website}/{category}/labels.csv')
        names = labels_df['image number'].tolist()
        # Insertion order is row-major over `names`; compute_matrix relies on
        # that order when it reshapes the dict values into a square matrix.
        category_pairs[category] = {
            (first, second): (4 if row == col else 0)
            for row, first in enumerate(names)
            for col, second in enumerate(names)
        }
    images_list[website] = category_pairs


In [146]:
# For every website/category, read the annotators' pair scores, write them
# into the default pair->score tables held in images_list, and record the
# sorted list of image names that appear in the labelled pairs.
image_labels = {}

for website, categories in images_list.items():
    data = {}
    for category, pair_scores in categories.items():
        raw_labels = pd.read_csv(f'output/labels/{website}/{category}.csv', names=['Pair 1', 'Image 1', 'Image 2', 'Pair 2', 'Score'], index_col=0)
        # Keep only scored rows whose first entry is a URL. na=False treats a
        # missing 'Pair 1' as "not a URL" instead of producing an invalid mask.
        scored = raw_labels.dropna(subset=['Score'])
        scored = scored[scored['Pair 1'].str.startswith('https', na=False)]
        # Reduce each URL to its last path component (the image file name).
        image_1 = scored['Pair 1'].str.split('/').str[-1]
        image_2 = scored['Pair 2'].str.split('/').str[-1]
        data[category] = sorted(set(image_1) | set(image_2), key=sort_key)
        # Scores are stored for the (Pair 1, Pair 2) orientation only; the
        # mirrored key keeps whatever value it already had.
        for img_1, img_2, score in zip(image_1, image_2, scored['Score']):
            pair_scores[(img_1, img_2)] = score
    image_labels[website] = data

In [150]:
def compute_matrix(scores):
    """Reshape a flat, row-major sequence of scores into a square matrix.

    Args:
        scores: Indexable sequence of values accepted by ``int()``.

    Returns:
        A list of ``N`` lists of ``N`` ints, where
        ``N = int(len(scores) ** 0.5)``. Elements beyond the first ``N * N``
        are ignored.
    """
    side = int(len(scores)**0.5)
    return [
        [int(scores[row * side + col]) for col in range(side)]
        for row in range(side)
    ]


In [151]:
# similarity_matrices[website][category] is the square score matrix built
# from that category's pair->score table, in the table's insertion order.
similarity_matrices = {}

for website, categories in images_list.items():
    similarity_matrices[website] = {
        category: compute_matrix(list(pair_scores.values()))
        for category, pair_scores in categories.items()
    }

In [152]:
# Build one labelled matrix (a single website/category) as a DataFrame for inspection.
preview_site, preview_category = 'english.elpais.com', 'U.S'
preview_labels = image_labels[preview_site][preview_category]
df = pd.DataFrame(
    similarity_matrices[preview_site][preview_category],
    columns=preview_labels,
    index=preview_labels,
)

The generated matrices are saved as `output/similarity_matrices/{website}/{category}.csv`.

## Save your labeled matrices as `output/similarity_matrices/{website}/{category}.csv`, then run the cells below

In [158]:
# Wrap every similarity matrix in a DataFrame labelled with the category's
# image names and write it to output/similarity_matrices/{website}/{category}.csv.
# Note: to_csv overwrites any file that already exists at that path.
data_frames = {}

for website, categories in images_list.items():
    frame = {}
    base_dir = f'output/similarity_matrices/{website}'
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(base_dir, exist_ok=True)
    for category in categories:
        labels = image_labels[website][category]
        frame[category] = pd.DataFrame(similarity_matrices[website][category], columns=labels, index=labels)
        frame[category].to_csv(f'{base_dir}/{category}.csv')
    data_frames[website] = frame

In [189]:
# images_scores[website][category] is the list of upper-triangle scores
# (row index < column index) for image pairs whose labels carry different
# first numbers in 'image_<a>_<b>'.
# NOTE(review): the first number is presumably the article id - confirm.
images_scores = {}

for website in os.listdir('output/similarity_matrices'):
    data = {}
    for filename in os.listdir(f'output/similarity_matrices/{website}'):
        matrix_df = pd.read_csv(f'output/similarity_matrices/{website}/{filename}', index_col=0)
        labels = matrix_df.columns.tolist()
        # Extract the first number once per label rather than once per pair.
        article_ids = [re.search(r'image_(\d+)_\d+', label).group(1) for label in labels]
        # Strip only a trailing '.csv'; str.replace would also remove the
        # substring from the middle of a category name.
        category = filename[:-len('.csv')] if filename.endswith('.csv') else filename
        # combinations(range(n), 2) yields (i, j) with i < j in the same
        # order as the nested index loops it replaces.
        data[category] = [
            matrix_df.iloc[i, j]
            for i, j in itertools.combinations(range(len(labels)), 2)
            if article_ids[i] != article_ids[j]
        ]
    images_scores[website] = data