# Extract: Wikipedia pageviews

Takes the Airflow execution date and scans the Wikipedia data download site for files from the last <overlap_hours> hours. All files not already retrieved will be downloaded and unpacked.

Files will be put in a folder named: `output_dir/year/month/day/`

Files are named `yyyymmdd_hhmm_type.csv`

#### Parameters:

`execution_date_str` = the Airflow execution date

`output_root` = path to directory where this pipeline will create the yearly folders with sublevels 

`overlap_hours` = how many hours back in time we should look for files; sometimes it takes a while for the data to be created in the source.

In [None]:
import datetime as dt

# Airflow execution date, given as an ISO-8601 string with a UTC offset.
execution_date = "2025-06-01T01:00:00+00:00"

# Destination bucket and the key prefix under which the files are stored.
output_bucket = "data"
output_root_prefix = "wikipedia_pageviews"

# Number of hourly files to look at, counting back from the execution hour.
overlap_hours = 3

# When True, files are fetched and uploaded even if the key already exists.
force_reupload = False

# Identifier for this notebook run: wall-clock date and hour, with the
# minutes and seconds fixed to "0000".
execution_id = "wikipedia_pageviews_bronze-" + dt.datetime.now().strftime("%Y%m%d-%H0000")
print(f"Starting notebook execution: {execution_id}")

In [None]:
import datetime
import os
from urllib import request

import requests  # type: ignore[import-untyped]
from freeds.s3 import file_exists, put_file
from freeds.utils import parse_execution_date

# Echo the run parameters so they show up in the executed notebook output.
print(f"execution_date: {execution_date}")
print(f"output_bucket: {output_bucket}")
print(f"output_root_prefix: {output_root_prefix}")
print(f"overlap_hours: {overlap_hours}")

# NOTE(review): parse_execution_date presumably returns a datetime-like value
# (it is used below with timedelta arithmetic and .year/.month/.day/.hour).
execution_hour = parse_execution_date(execution_date)

print(f"execution_hour: {execution_hour}")

# Upper bound (seconds) for the HEAD probe, so an unresponsive server cannot
# hang the run indefinitely.
http_timeout_seconds = 30

# NOTE(review): this scratch path is fixed, so two runs on the same host at the
# same time would overwrite each other's download - consider a unique temp file.
tmp_file_name = "/tmp/tmp.gz"

# Walk backwards from the execution hour: the execution hour itself plus the
# (overlap_hours - 1) hours before it.
for retrieve_hour in (execution_hour - datetime.timedelta(hours=i) for i in range(overlap_hours)):
    # The "year/year-month" path segment and the dump file name are shared by
    # the source URL and the destination key, so build each of them once.
    month_dir = f"{retrieve_hour.year}/{retrieve_hour.year}-{retrieve_hour.month:0>2}"
    dump_name = (
        f"pageviews-{retrieve_hour.year}{retrieve_hour.month:0>2}{retrieve_hour.day:0>2}-{retrieve_hour.hour:0>2}0000"
        ".gz"
    )

    output_prefix = f"{output_root_prefix}/{month_dir}/{retrieve_hour.day:0>2}"
    output_key = f"{output_prefix}/{dump_name}"

    # Skip hours that are already stored, unless a re-upload is forced.
    if not force_reupload and file_exists(bucket_name=output_bucket, file_name=output_key):
        print(f"file already downloaded: {output_key}")
        continue

    url = f"https://dumps.wikimedia.org/other/pageviews/{month_dir}/{dump_name}"

    # Probe with HEAD first: any status other than 200 is treated as
    # "file not available" for this hour.
    response = requests.head(url, timeout=http_timeout_seconds)
    if response.status_code != 200:
        print(f"no file found for: {retrieve_hour} at {url}")
        continue

    remote_file_size = response.headers.get("Content-Length")

    print(f"downloading: {url}")
    print(f"remote_file_size: {remote_file_size}")
    request.urlretrieve(url, tmp_file_name)

    downloaded_file_size = os.path.getsize(tmp_file_name)
    print(f"downloaded_file_size: {downloaded_file_size}")
    if remote_file_size and downloaded_file_size != int(remote_file_size):
        print(f"Error: downloaded file size {downloaded_file_size} does not match expected size {remote_file_size}")
        # Do not upload a file of the wrong size: once the key exists, later
        # runs would see it via file_exists() and never replace it. Leaving the
        # key absent lets a later run in the overlap window retry this hour.
        print(f"skipping upload of incomplete download for key:{output_key}")
        continue

    print(f"uploading wikipedia data in tmp: {tmp_file_name} to bucket:{output_bucket} key:{output_key}")
    put_file(local_path=tmp_file_name, bucket=output_bucket, file_name=output_key)
print(f"Notebook execution complete: {execution_id}")