# Wikipedia pageviews Silver
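Executes per run date (driven by `execution_date`).
Aggregates pageviews per date, page title and country code, ranks pages within each date and country, and keeps the top 100.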

In [None]:
import datetime as dt

# Run parameters for this notebook.
bronze_db = "bronze"
silver_db = "silver"
full_refresh = False
execution_date = "2025-05-15T01:00:00+00:00"

# Identifier for this run: a fixed prefix followed by the current date and
# hour as reported by datetime.now(); the trailing "0000" is literal text in
# the format spec, so minutes and seconds are always written as zeros.
execution_id = f"wikipedia_pageviews_silver-{dt.datetime.now():%Y%m%d-%H0000}"

print(f"Starting notebook execution: {execution_id}")

In [None]:
import os

from freeds.spark import get_spark_session
from freeds.utils import date_range
from pyspark.sql.functions import col, row_number
from pyspark.sql.functions import sum as _sum
from pyspark.sql.window import Window

# Use the configured Spark conf directory when it is set to a non-empty value;
# otherwise (unset or empty) fall back to the default location.
os.environ["SPARK_CONF_DIR"] = os.environ.get("SPARK_CONF_DIR") or "/opt/tfds/spark/conf"


# Source (bronze) and target (silver) tables for this notebook.
source_table_name = f"{bronze_db}.wikipedia_page_reads"
target_table_name = f"{silver_db}.wikipedia_page_ranks_100"

spark = get_spark_session(execution_id)

# Everything that uses the session runs inside try/finally so the session is
# stopped even when reading, transforming or writing fails.
try:
    # Dates selected for an incremental run, derived from execution_date.
    partition_list = date_range(execution_date=execution_date, length=2)

    source_df = spark.table(source_table_name)
    if full_refresh:
        # A full refresh overwrites the whole target table (static overwrite
        # below). Filtering to partition_list here would replace the table
        # with only those dates and silently drop every other partition, so
        # the full source is read instead.
        # NOTE(review): confirm a full refresh is meant to rebuild from all
        # source dates.
        filtered_df = source_df
    else:
        filtered_df = source_df.filter(col("date").isin(partition_list))

    # Total views per date, page and country.
    aggregated_df = filtered_df.groupBy("date", "page_title", "country_code").agg(
        _sum(col("count_views")).alias("total_count_views"),
    )

    # Rank pages within each (date, country_code) by total views, highest
    # first. After the aggregation above page_title is unique within such a
    # group, so using it as a secondary sort key makes row_number()
    # deterministic when view totals tie; without it tied pages get arbitrary
    # ranks and the top-100 cut-off can differ between reruns.
    national_win = Window.partitionBy("date", "country_code").orderBy(
        col("total_count_views").desc(),
        col("page_title").asc(),
    )

    ranked_df = aggregated_df.withColumn("national_rank", row_number().over(national_win))

    # Keep the 100 highest-ranked pages per date and country.
    final_df = ranked_df.filter(col("national_rank") <= 100).orderBy("date", "national_rank", "country_code")

    spark.sparkContext.setLogLevel("ERROR")
    spark.sql(f"CREATE DATABASE IF NOT EXISTS {silver_db}")

    # Incremental runs replace only the date partitions present in final_df
    # (dynamic); a full refresh replaces the entire table (static). The mode
    # is set explicitly in both cases so the result does not depend on the
    # cluster's default for this setting.
    spark.conf.set(
        "spark.sql.sources.partitionOverwriteMode",
        "static" if full_refresh else "dynamic",
    )
    (
        final_df.write.mode("overwrite")  # Options: 'overwrite', 'append', 'ignore', 'error' (default)
        .option("mergeSchema", "true")
        .format("delta")  # Options: 'parquet', 'csv', 'json', 'orc', etc.
        .partitionBy("date")
        .saveAsTable(target_table_name)
    )
finally:
    spark.stop()

# Reached only when the write succeeded; failures propagate out of the try.
print(f"All done: {execution_id}")