# Wikipedia pageviews Silver
Executes by run date.
Aggregates and ranks page views at a daily level.

In [None]:
# Notebook parameters.
# NOTE(review): run_date is not referenced anywhere else in the visible code —
# confirm whether the job is supposed to filter on it.
run_date = "2025-03-11"
# Prefix of the Spark application name; get_spark_session appends a timestamp to it.
spark_name_base = "wikipedia_pageviews_silver"

## Standard stuff that should be a package

In [None]:
import datetime
import os

import pyspark
import requests  # type: ignore
from delta import configure_spark_with_delta_pip
from pyspark.sql import SparkSession

# Point Spark at the same configuration directory a containerized run would use,
# so the notebook reads its config from the same location.
os.environ["SPARK_CONF_DIR"] = "/opt/tfds/spark/conf"


def get_config(config_name: str, timeout: float = 10.0):
    """Get config from tfds-config server.

    The server base URL is read from the ``TFDS_CONFIG_URL`` environment
    variable and defaults to ``http://tfds-config:8005/api/configs``.

    Args:
        config_name: Name of the config to fetch, e.g. ``"s3"`` or ``"spark"``.
        timeout: Seconds to wait for the config server before giving up.
            Without a timeout an unresponsive server would block forever.

    Returns:
        The value stored under the ``"config"`` key of the JSON response.
        For ``"s3"`` the ``"url"`` entry is overridden by ``TFDS_S3_URL`` when
        that variable is set; for ``"spark"`` the ``"master_url"`` entry is
        overridden by ``TFDS_SPARK_MASTER_URL``.

    Raises:
        requests.exceptions.RequestException: On connection errors, timeouts
            or a non-success HTTP status.
        ValueError: If the response body is empty, is not a JSON object, or
            has no ``"config"`` key.
    """
    config_server_url = os.environ.get("TFDS_CONFIG_URL", "http://tfds-config:8005/api/configs")

    # Tolerate a trailing slash on the configured base URL.
    config_url = config_server_url.rstrip("/") + "/" + config_name

    print(f"retrieving {config_name} config from {config_url}")
    response = requests.get(config_url, timeout=timeout)
    response.raise_for_status()

    # Parse the body once and reuse it for every check below.
    payload = response.json()
    if payload is None:
        raise ValueError(f"Config '{config_name}' not found. config server response: {response.text}")
    if not isinstance(payload, dict):
        raise ValueError(
            f"Config '{config_name}' response is not a JSON object. Config server response: {response.text}"
        )
    cfg = payload.get("config")
    if cfg is None:
        raise ValueError(
            f"Config '{config_name}' does not have a 'config' key. Config server response: {response.text}"
        )

    # Environment variables take precedence over the values served by the config server.
    if config_name == "s3" and "TFDS_S3_URL" in os.environ:
        cfg["url"] = os.environ["TFDS_S3_URL"]
    if config_name == "spark" and "TFDS_SPARK_MASTER_URL" in os.environ:
        cfg["master_url"] = os.environ["TFDS_SPARK_MASTER_URL"]
    return cfg


def get_spark_session(app_name_base: str) -> SparkSession:
    """Create (or reuse) a Delta-enabled Spark session carrying the S3 credentials.

    Args:
        app_name_base: Prefix of the Spark application name; the current local
            time is appended as ``-YYYYMMDD-HHMMSS``.

    Returns:
        The session returned by ``getOrCreate()`` on a builder that was passed
        through ``configure_spark_with_delta_pip``.
    """
    s3_settings = get_config("s3")

    timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

    spark_conf = pyspark.conf.SparkConf()
    spark_conf.setAppName(f"{app_name_base}-{timestamp}")
    # s3 secrets, taken from the "s3" config
    spark_conf.set("spark.hadoop.fs.s3a.access.key", s3_settings["access_key"])
    spark_conf.set("spark.hadoop.fs.s3a.secret.key", s3_settings["secret_key"])
    spark_conf.set("spark.task.maxFailures", "1")
    # The master is not set here (a local[*] override was left disabled).

    session_builder = SparkSession.builder.config(conf=spark_conf)
    return configure_spark_with_delta_pip(session_builder).getOrCreate()


def show_cfg(spark_session):
    """Print out the spark config.

    Settings known to hold comma-separated lists are printed as the key
    followed by one indented line per entry; every other setting is printed
    as ``key = value``.

    Args:
        spark_session: Session whose ``sparkContext.getConf().getAll()``
            yields ``(key, value)`` pairs.
    """
    # Every key needs its own comma-terminated entry: adjacent string literals
    # are silently concatenated into a single key that never matches.
    list_valued_keys = frozenset(
        {
            "spark.submit.pyFiles",
            "spark.driver.extraJavaOptions",
            "spark.executor.extraJavaOptions",
            "spark.app.initial.jar.urls",
            "spark.app.initial.file.urls",
            "spark.files",
            "spark.repl.local.jars",
        }
    )
    for key, value in spark_session.sparkContext.getConf().getAll():
        if key in list_valued_keys:
            print(key)
            for item in value.split(","):
                print("    " + str(item))
        else:
            print(f"{key} = {value}")


def print_spark_info(sc: SparkSession):
    """Print a short summary of the Spark session.

    Prints the application name, the master, the warehouse directory and the
    S3 endpoint from the session's configuration, followed by every database
    in the catalog with its tables indented underneath.

    Args:
        sc: The Spark session to describe.
    """
    conf: pyspark.SparkConf = sc.sparkContext.getConf()
    print(f'==== spark app: {conf.get("spark.app.name")} ====')
    for label, setting in (
        ("Spark master", "spark.master"),
        ("Delta lake location", "spark.sql.warehouse.dir"),
        ("S3 endpoint", "spark.hadoop.fs.s3a.endpoint"),
    ):
        print(f"{label}: {conf.get(setting)}")

    databases = sc.catalog.listDatabases()
    print("Databases:")
    for database in databases:
        print(database.name)
        for table in sc.catalog.listTables(database.name):
            print(f"    {table.name}")

In [None]:
from pyspark.sql.functions import col, row_number
from pyspark.sql.functions import sum as _sum
from pyspark.sql.window import Window

spark = get_spark_session(spark_name_base)
try:
    # NOTE(review): the whole bronze table is re-aggregated and the silver table
    # overwritten on every run; run_date is not applied here — confirm this is intended.

    # Total views per (date, page, country).
    aggregated_df = (
        spark.table("bronze.wikipedia_page_reads")
        .groupBy("date", "page_title", "country_code")
        .agg(
            _sum(col("count_views")).alias("total_count_views"),
        )
    )

    # Rank pages within each (date, country) by descending views. row_number gives
    # every row a distinct rank, so pages with equal views are ordered arbitrarily.
    national_win = Window.partitionBy("date", "country_code").orderBy(col("total_count_views").desc())

    ranked_df = aggregated_df.withColumn("national_rank", row_number().over(national_win))

    # Keep the top 100 pages per (date, country).
    final_df = ranked_df.filter(col("national_rank") <= 100).orderBy("date", "national_rank", "country_code")

    spark.sparkContext.setLogLevel("ERROR")
    spark.sql("CREATE DATABASE IF NOT EXISTS silver")
    table_name = "silver.wikipedia_page_ranks_100"
    (
        final_df.write.mode("overwrite")  # Options: 'overwrite', 'append', 'ignore', 'error' (default)
        .option("mergeSchema", "true")
        .format("delta")  # Options: 'parquet', 'csv', 'json', 'orc', etc.
        .partitionBy("date")
        .saveAsTable(table_name)
    )
    print_spark_info(spark)
finally:
    # Always release the Spark application, even when the job fails part-way.
    spark.stop()
print("All done")