In [1]:
import json
import subprocess
from functools import reduce
from operator import add
from pathlib import Path
from statistics import mean

In [2]:
# Years to process: 2003 through 2023 inclusive (the range end is exclusive).
years = list(range(2003, 2024))

In [3]:
def extract_year(y):
    """Load one year's processed ARWU data by running its script with ``bun``.

    Runs ``./arwu/processed/arwu_processed_{y}.js`` and parses whatever the
    script writes to stdout as JSON. The parsed value is expected to be a
    list of lists, which is flattened by one level.
    (What each inner list represents is not visible here -- presumably one
    page/chunk of the ranking; verify against the processing scripts.)

    Args:
        y: Year used to build the script file name.

    Returns:
        A single flat list containing the items of every inner list, in order.

    Raises:
        subprocess.CalledProcessError: if ``bun`` exits with a non-zero
            status (the captured stderr is attached to the exception).
        json.JSONDecodeError: if stdout is not valid JSON.
    """
    # check=True surfaces a failing script right here instead of letting an
    # empty/partial stdout produce a confusing JSON parsing error below.
    proc = subprocess.run(
        ["bun", f"./arwu/processed/arwu_processed_{y}.js"],
        capture_output=True,
        check=True,
    )
    groups = json.loads(proc.stdout.decode())
    # Flatten one level in linear time; repeated list concatenation would
    # copy the accumulated list on every step.
    return [item for group in groups for item in group]

In [4]:
# Lookup table from the region name found in the ranking data to a
# three-letter country code (the codes appear to follow ISO 3166-1 alpha-3).
# NOTE(review): two entries look like deliberate deviations -- confirm:
#   * "China-Taiwan" -> "TAP" (the ISO alpha-3 code for Taiwan is "TWN")
#   * "Puerto Rico"  -> "USA" (folded into the United States; ISO has "PRI")
mapping = {
    "Israel": "ISR",
    "Lithuania": "LTU",
    "Australia": "AUS",
    "China-Taiwan": "TAP",
    "Romania": "ROU",
    "South Korea": "KOR",
    "South Africa": "ZAF",
    "France": "FRA",
    "India": "IND",
    "Uganda": "UGA",
    "Ghana": "GHA",
    "United Arab Emirates": "ARE",
    "Czech Republic": "CZE",
    "Jordan": "JOR",
    "Austria": "AUT",
    "Costa Rica": "CRI",
    "New Zealand": "NZL",
    "Italy": "ITA",
    "Greece": "GRC",
    "Egypt": "EGY",
    "Japan": "JPN",
    "Singapore": "SGP",
    "Colombia": "COL",
    "Mexico": "MEX",
    "Nigeria": "NGA",
    "Slovenia": "SVN",
    "Luxembourg": "LUX",
    "Oman": "OMN",
    "Tunisia": "TUN",
    "Cyprus": "CYP",
    "Ethiopia": "ETH",
    "United Kingdom": "GBR",
    "Turkey": "TUR",
    "Poland": "POL",
    "Sweden": "SWE",
    "Puerto Rico": "USA",
    "Bulgaria": "BGR",
    "Estonia": "EST",
    "Norway": "NOR",
    "United States": "USA",
    "Hungary": "HUN",
    "Brazil": "BRA",
    "Spain": "ESP",
    "Malaysia": "MYS",
    "Argentina": "ARG",
    "Thailand": "THA",
    "Ireland": "IRL",
    "Vietnam": "VNM",
    "China-Hong Kong": "HKG",
    "China": "CHN",
    "Belgium": "BEL",
    "Saudi Arabia": "SAU",
    "Croatia": "HRV",
    "Germany": "DEU",
    "Netherlands": "NLD",
    "Denmark": "DNK",
    "Iceland": "ISL",
    "Slovakia": "SVK",
    "Switzerland": "CHE",
    "Finland": "FIN",
    "Qatar": "QAT",
    "Uruguay": "URY",
    "Portugal": "PRT",
    "Russia": "RUS",
    "Canada": "CAN",
    "Serbia": "SRB",
    "Chile": "CHL",
    "Iran": "IRN",
    "China-Macau": "MAC",
    "Lebanon": "LBN",
    "Pakistan": "PAK",
}


In [5]:
def transform(year):
    """Build a converter that turns one raw ranking item into a flat record.

    The returned function reads the "ranking", "univNameEn", "region" and
    "score" keys of an item and produces a dict with the keys "rank",
    "univ_name", "country", "score" and "year" (the latter fixed to `year`).

    A ranking containing "-" (a band such as "101-150") is reduced to the
    mean of its dash-separated integer parts, truncated to an int; any other
    ranking is converted with int() directly. The region is translated
    through the module-level `mapping`, so an unmapped region raises KeyError.
    (Assumes "ranking" is always a string -- TODO confirm against the data.)
    """
    def convert(item):
        raw_rank = item["ranking"]
        if "-" not in raw_rank:
            parsed_rank = int(raw_rank)
        else:
            # Collapse a rank band to its (truncated) midpoint.
            bounds = [int(part) for part in raw_rank.split("-")]
            parsed_rank = int(mean(bounds))
        return {
            "rank": parsed_rank,
            "univ_name": item["univNameEn"],
            "country": mapping[item["region"]],
            "score": item["score"],
            "year": year,
        }

    return convert

In [6]:
# Accumulator for the per-year record dicts produced via transform(); it is
# turned into a DataFrame once every year has been collected.
results = []

In [7]:
# Start from an empty list inside this cell so that re-running it does not
# append a second copy of every year's records to `results`.
results = []
for y in years:
    # extract_year() yields the raw items for the year; transform(y) tags
    # each one with the year and normalises its fields.
    results.extend(map(transform(y), extract_year(y)))

In [8]:
import pandas as pd

In [9]:
# One row per record in `results`; the columns come from the record keys
# (rank, univ_name, country, score, year).
df = pd.DataFrame(results)

In [10]:
# Quick sanity check: summary statistics of the numeric columns.
df.describe()

Unnamed: 0,rank,score,year
count,13816.0,2104.0,13816.0
mean,367.615518,36.65846,2014.708382
std,252.799107,13.314495,6.179778
min,1.0,23.5,2003.0
25%,175.0,27.8,2009.0
50%,350.0,31.9,2016.0
75%,456.0,40.9,2020.0
max,950.0,100.0,2023.0


In [15]:
# Persist the combined table. The target directory is created first so the
# write also succeeds on a fresh checkout where "data/" does not exist yet.
output_path = Path("data/arwu_2003-2023.parquet")
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_parquet(output_path)