# Dataset Analysis

In [12]:
import pandas as pd
import numpy as np
import os
import re
import copy

In [11]:
# Root folder of the dataset; each entry directly inside it is listed as a
# website, and each entry inside a website folder is listed as a category.
base_dir = '../../data'

# Skeleton mapping: website -> {category -> empty list}. Later cells deep-copy
# this structure once per metric and fill in the per-category values.
website_categories = {
    site: {section: [] for section in os.listdir(os.path.join(base_dir, site))}
    for site in os.listdir(base_dir)
}

In [26]:
# One independent website -> category container per metric. deepcopy keeps the
# per-category lists from being shared between metrics.
dataset_insights = {
    metric: copy.deepcopy(website_categories)
    for metric in ('num_images', 'num_comparisons', 'image_size')
}

In [30]:
def sort_key(image_name):
    """Parse the two integers from a name beginning with ``image_<x>_<y>``.

    Args:
        image_name: String to parse, e.g. ``'image_3_12.jpg'``.

    Returns:
        A tuple ``(x, y)`` of ints usable as a sort key, or ``None`` when the
        name does not start with the ``image_<digits>_<digits>`` pattern.
    """
    parsed = re.match(r'image_(\d+)_(\d+)', image_name)
    if parsed is None:
        return None
    first, second = parsed.groups()
    return int(first), int(second)

In [46]:
# Fill dataset_insights for every website/category folder under base_dir:
#   num_images      -> list with the number of CSV rows per article
#   image_size      -> list with the size in bytes of every .jpg in the folder
#   num_comparisons -> number of image pairs whose article numbers differ
for website in os.listdir(base_dir):
    for category in os.listdir(os.path.join(base_dir, website)):
        category_dir = os.path.join(base_dir, website, category)

        descriptions = pd.read_csv(os.path.join(category_dir, 'image_descriptions.csv'))
        # Drop rows without an image name *before* deriving the new column.
        # Dropping them only inside the list comprehension yields a list that
        # is shorter than the frame, and the column assignment raises ValueError.
        descriptions = descriptions.dropna(subset=['image number']).copy()
        # sort_key returns (x, y); the first number is used as the article id.
        # NOTE(review): a name not matching image_<x>_<y> still raises TypeError
        # here (sort_key returns None) - confirm whether such rows can occur.
        descriptions['article_number'] = [
            sort_key(image_name)[0] for image_name in descriptions['image number']
        ]

        images_per_article = descriptions['article_number'].value_counts().astype(int).tolist()
        dataset_insights['num_images'][website][category] = images_per_article

        dataset_insights['image_size'][website][category] = [
            os.path.getsize(os.path.join(category_dir, file_name))
            for file_name in os.listdir(category_dir)
            if file_name.endswith('.jpg')
        ]

        # Count the (i, j) row pairs with article_number_i < article_number_j.
        # Of the n*n ordered pairs, sum(c*c) have equal article numbers and the
        # rest split evenly between "<" and ">", so the count is
        # (n*n - sum(c*c)) / 2. This is the same value the explicit double loop
        # produces, without building an O(n^2) list.
        total_rows = sum(images_per_article)
        same_article_pairs = sum(count * count for count in images_per_article)
        dataset_insights['num_comparisons'][website][category] = (
            total_rows * total_rows - same_article_pairs
        ) // 2

**Total no. of images**

In [47]:
# Grand total of images: add up every per-article count of every category.
num_images = sum(
    sum(per_article_counts)
    for per_category in dataset_insights['num_images'].values()
    for per_article_counts in per_category.values()
)

print(f'Total no. of images: {num_images}')

Total no. of images: 4264


**Average no. of images per article**

In [48]:
# Flatten the per-category lists into one list holding the image count of
# every article, then average over it.
avg_num_image = [
    count
    for per_category in dataset_insights['num_images'].values()
    for per_article_counts in per_category.values()
    for count in per_article_counts
]

print(f'Average no. of images per article: {np.mean(avg_num_image)}')

Average no. of images per article: 1.7193548387096773


**Average image size**

In [55]:
# Average size in bytes over the image files that were actually measured.
# The denominator is the number of recorded sizes, not a count taken from the
# description CSVs, so the mean stays correct when the two disagree and this
# cell no longer depends on a variable computed in another cell.
total_image_bytes = 0
num_image_files = 0

for per_category in dataset_insights['image_size'].values():
    for sizes in per_category.values():
        total_image_bytes += sum(sizes)
        num_image_files += len(sizes)

# Guard the empty dataset instead of failing with ZeroDivisionError.
avg_img_size = total_image_bytes / num_image_files if num_image_files else float('nan')

print(f'Average image size: {avg_img_size} bytes')

Average image size: 199157.14493433395 bytes


**Total no. of image pair comparisons**

In [57]:
# Grand total of image-pair comparisons: one stored count per category.
num_comparisons = sum(
    pair_count
    for per_category in dataset_insights['num_comparisons'].values()
    for pair_count in per_category.values()
)

print(f'Total no. of comparisons: {num_comparisons}')

Total no. of comparisons: 41031
