In [2]:
import os
import json
import pandas as pd

# input folder
path = "/Users/ipekgezer/dataverse_files"

# Count/ratio columns shared by both output tables, in CSV column order.
_STAT_COLUMNS = [
    "n_images_all",
    "n_images_people",
    "n_male",
    "n_female",
    "n_people",
    "female_ratio_totals",
    "female_ratio_meanimg",
]
_COUNTRY_COLUMNS = ["country", "year_min", "year_max"] + _STAT_COLUMNS
_COUNTRY_YEAR_COLUMNS = ["country", "year"] + _STAT_COLUMNS


def _load_news_years(news_path):
    """Return a dict mapping news "id" -> year read from a news.jsonl file.

    The year is the integer made of the first four characters of the "date"
    field. Lines that are not valid JSON, that lack an id or a date, or whose
    date does not start with an integer are skipped.
    """
    id_to_year = {}
    with open(news_path, "r", encoding="utf-8") as nf:
        for line in nf:
            try:
                news_obj = json.loads(line)
            except json.JSONDecodeError:
                continue

            n_id = news_obj.get("id")
            date_str = news_obj.get("date")
            # The original tested an undefined name (`nid`) here, so records
            # without an id were never filtered out (or the cell crashed).
            if not n_id or not date_str:
                continue

            try:
                year = int(str(date_str)[:4])
            except ValueError:
                continue

            id_to_year[n_id] = year
    return id_to_year


def _empty_stats():
    """Return a fresh zeroed accumulator (used per country and per year)."""
    return {
        "n_images_all": 0,
        "n_images_people": 0,
        "n_male": 0,
        "n_female": 0,
        "n_people": 0,
        "image_ratios": [],
    }


def _aggregate_images(img_path, id_to_year):
    """Accumulate image counts from an images.jsonl file.

    Only images whose "news-id" is a key of ``id_to_year`` are counted.
    Returns ``(totals, year_stats)``: ``totals`` is one accumulator for the
    whole file and ``year_stats`` maps year -> accumulator.
    """
    totals = _empty_stats()
    year_stats = {}

    with open(img_path, "r", encoding="utf-8") as imf:
        for line in imf:
            try:
                img_obj = json.loads(line)
            except json.JSONDecodeError:
                continue

            year = id_to_year.get(img_obj.get("news-id"))
            if year is None:
                continue

            # `or 0` turns an explicit null count into zero.
            male = img_obj.get("male-count", 0) or 0
            female = img_obj.get("female-count", 0) or 0
            total = male + female

            if year not in year_stats:
                year_stats[year] = _empty_stats()

            # Every matched image updates both the country-wide and the
            # per-year accumulator in the same way.
            buckets = (totals, year_stats[year])
            for bucket in buckets:
                bucket["n_images_all"] += 1

            if total == 0:
                # No detected person: counted as an image, but not in ratios.
                continue

            ratio = female / total
            for bucket in buckets:
                bucket["n_images_people"] += 1
                bucket["n_male"] += male
                bucket["n_female"] += female
                bucket["n_people"] += total
                bucket["image_ratios"].append(ratio)

    return totals, year_stats


def _stat_columns(stats):
    """Turn an accumulator into the count/ratio columns of an output row.

    ``female_ratio_totals`` is female / all people pooled over images;
    ``female_ratio_meanimg`` is the mean of the per-image female ratios.
    Both are None when there is nothing to divide by.
    """
    ratios = stats["image_ratios"]
    n_people = stats["n_people"]
    return {
        "n_images_all": stats["n_images_all"],
        "n_images_people": stats["n_images_people"],
        "n_male": stats["n_male"],
        "n_female": stats["n_female"],
        "n_people": n_people,
        "female_ratio_totals": (stats["n_female"] / n_people) if n_people > 0 else None,
        "female_ratio_meanimg": (sum(ratios) / len(ratios)) if ratios else None,
    }


def _process_country(full_path, country):
    """Build the output rows for one country folder.

    Returns ``(summary_row, year_rows)``, or None when images.jsonl or
    news.jsonl is missing or empty in ``full_path``.
    """
    img_path = os.path.join(full_path, "images.jsonl")
    news_path = os.path.join(full_path, "news.jsonl")

    for required in (img_path, news_path):
        if not os.path.exists(required) or os.path.getsize(required) == 0:
            return None

    totals, year_stats = _aggregate_images(img_path, _load_news_years(news_path))

    # year interval (only years that have at least one matched image)
    years = sorted(year_stats)
    summary_row = {
        "country": country,
        "year_min": years[0] if years else None,
        "year_max": years[-1] if years else None,
        **_stat_columns(totals),
    }
    year_rows = [
        {"country": country, "year": y, **_stat_columns(year_stats[y])}
        for y in years
    ]
    return summary_row, year_rows


def collect_rows(root):
    """Scan every sub-folder of ``root`` and build the two row lists.

    The first three letters of a folder name are used as the country code.
    Returns ``(country_summary_rows, country_year_rows)``.
    """
    summary_rows = []
    year_rows = []

    for folder in os.listdir(root):
        full_path = os.path.join(root, folder)
        if not os.path.isdir(full_path):
            continue

        result = _process_country(full_path, folder[:3])
        if result is None:
            continue

        summary_rows.append(result[0])
        year_rows.extend(result[1])

    return summary_rows, year_rows


def rows_to_frames(summary_rows, year_rows):
    """Return ``(df_country, df_country_year)`` sorted for output.

    Columns are passed explicitly so that an empty row list still yields a
    frame with the expected header instead of a KeyError in ``sort_values``.
    """
    country_frame = pd.DataFrame(summary_rows, columns=_COUNTRY_COLUMNS)
    panel_frame = pd.DataFrame(year_rows, columns=_COUNTRY_YEAR_COLUMNS)
    return (
        country_frame.sort_values(["country"]),
        panel_frame.sort_values(["country", "year"]),
    )


# True both in a notebook kernel and when run as a script, so the cell still
# executes end-to-end there; it only stays idle when imported as a module.
if __name__ == "__main__":
    # output folder
    os.makedirs("data", exist_ok=True)

    country_summary_rows, country_year_rows = collect_rows(path)

    # saving
    df_country, df_country_year = rows_to_frames(country_summary_rows, country_year_rows)

    df_country.to_csv("data/country_summary_with_year_range.csv", index=False)
    df_country_year.to_csv("data/country_year_panel.csv", index=False)

    print("Saved:")
    print(" - data/country_summary_with_year_range.csv")
    print(" - data/country_year_panel.csv")


Saved:
 - data/country_summary_with_year_range.csv
 - data/country_year_panel.csv
