# Extract: Wikipedia pageviews

Takes the Airflow execution date and scans the Wikipedia pageviews dumps for files from the last `overlap_hours` hours. All files not already retrieved are downloaded and uploaded, still gzip-compressed, to the S3 bucket.

Files are stored under the key prefix: `output_root_prefix/year/year-month/day/`

Files are named `pageviews-yyyymmdd-hh0000.gz`

#### Parameters:

`execution_hour_str` = the Airflow execution date

`output_root_prefix` = key prefix in the output bucket under which this pipeline creates the yearly folders with sublevels

`overlap_hours` = how many hours back in time we should look for files; sometimes it takes a while for the data to be created in the source.

In [1]:
# Airflow execution date. Accepted formats (see the parsing cell below):
# "YYYY-MM-DD", "YYYY-MM-DD HH:MM:SS" or "YYYY-MM-DDTHH:MM:SS".
execution_hour_str = "2025-03-12"
# S3 bucket the downloaded pageview files are uploaded to.
output_bucket = "data"
# Key prefix inside the bucket; year/month/day sub-prefixes are appended to it.
output_root_prefix = "wikipedia_pageviews"
# Number of hourly files to check, counting backwards from (and including)
# the execution hour.
overlap_hours = 3
# When True, files are downloaded and uploaded again even if the key already
# exists in the bucket.
force_reupload = False

In [2]:
import os

import boto3
import requests  # type: ignore[import-untyped]
from botocore.exceptions import ClientError

_s3_client = None


def get_s3_client():
    """Return a cached boto3 S3 client configured from the TFDS config server.

    The S3 settings are fetched from ``<TFDS_CONFIG_URL>/s3``; when the
    ``TFDS_CONFIG_URL`` environment variable is not set,
    ``http://tfds-config:8005/api/configs`` is used. The JSON response must
    contain a ``config`` object with the keys ``access_key``, ``secret_key``
    and ``url``. If the ``TFDS_S3NINJA_URL`` environment variable is set, its
    value replaces the endpoint URL from the config server.

    The client is created on the first call and stored in the module-level
    ``_s3_client`` so later calls reuse it.

    Raises:
        ValueError: if the config server response has no ``config`` entry.
        requests.RequestException: if the config server cannot be reached or
            does not answer within the timeout.
    """
    global _s3_client

    if _s3_client is not None:
        return _s3_client

    tfds_config_url = os.environ.get("TFDS_CONFIG_URL")
    if tfds_config_url is None:
        tfds_config_url = "http://tfds-config:8005/api/configs"
        print(f"TFDS_CONFIG_URL not set, using default: {tfds_config_url}")
    else:
        print(f"using TFDS_CONFIG_URL: {tfds_config_url}")
    tfds_config_url += "/s3"
    print(f"retrieving s3 config from {tfds_config_url}")
    # Without a timeout requests waits forever on an unresponsive config
    # server, which would hang the whole notebook run.
    response = requests.get(tfds_config_url, timeout=30)
    cfg = response.json().get("config")
    if cfg is None:
        raise ValueError(f"No config found in response from config server: {response.text}")
    # Presence check (not truthiness) so an explicitly set value always wins.
    if "TFDS_S3NINJA_URL" in os.environ:
        cfg["url"] = os.environ["TFDS_S3NINJA_URL"]

    print(f"using s3 endpoint: {cfg['url']}")
    _s3_client = boto3.client(
        service_name="s3",
        aws_access_key_id=cfg["access_key"],
        aws_secret_access_key=cfg["secret_key"],
        endpoint_url=cfg["url"],
    )
    return _s3_client


def check_file_exists(bucket_name, file_key):
    """Tell whether ``file_key`` is present in the S3 bucket ``bucket_name``.

    Issues a HEAD request for the object. Returns True when the request
    succeeds and False when S3 answers with error code "404". Any other
    ``ClientError`` is printed and re-raised.
    """
    try:
        s3 = get_s3_client()
        s3.head_object(Bucket=bucket_name, Key=file_key)
    except ClientError as err:
        error_code = err.response["Error"]["Code"]
        if error_code != "404":
            # Anything other than "not found" is unexpected.
            print(f"Error checking file: {err}")
            raise
        return False
    return True

In [3]:
import datetime
from urllib import request

# Echo the notebook parameters so they are visible in the executed notebook's output.
print(f"execution_hour_str: {execution_hour_str}")
print(f"output_bucket: {output_bucket}")
print(f"output_root_prefix: {output_root_prefix}")
print(f"overlap_hours: {overlap_hours}")


# Choose the strptime pattern from the shape of the string: date only,
# "date time" (space separated) or "dateTtime".
pattern = "%Y-%m-%d"
if ":" in execution_hour_str:
    pattern = "%Y-%m-%d %H:%M:%S"
if "T" in execution_hour_str:
    pattern = "%Y-%m-%dT%H:%M:%S"
print(f"using pattern: {pattern} to parse execution_hour_str: {execution_hour_str}")
execution_hour = datetime.datetime.strptime(execution_hour_str, pattern)

print(f"execution_hour: {execution_hour}")
tmp_file_name = os.path.abspath("./tmp.gz")
# Walk backwards hour by hour, starting at the execution hour itself.
for retrieve_hour in (execution_hour - datetime.timedelta(hours=i) for i in range(overlap_hours)):
    # The month directory and file name are shared by the S3 key and the source URL.
    month_dir = f"{retrieve_hour.year}-{retrieve_hour.month:0>2}"
    file_name = (
        f"pageviews-{retrieve_hour.year}{retrieve_hour.month:0>2}{retrieve_hour.day:0>2}-{retrieve_hour.hour:0>2}0000"
        ".gz"
    )
    output_prefix = f"{output_root_prefix}/{retrieve_hour.year}/{month_dir}/{retrieve_hour.day:0>2}"
    output_key = f"{output_prefix}/{file_name}"

    if not force_reupload and check_file_exists(output_bucket, output_key):
        print(f"file already downloaded: {output_key}")
        continue

    url = f"https://dumps.wikimedia.org/other/pageviews/{retrieve_hour.year}/{month_dir}/{file_name}"

    # HEAD first: the hourly file may not have been published yet. The timeout
    # keeps an unresponsive server from hanging the run.
    response = requests.head(url, timeout=30)
    if response.status_code != 200:
        print(f"no file found for: {retrieve_hour} at {url}")
        continue

    remote_file_size = response.headers.get("Content-Length")

    print(f"downloading: {url}")
    print(f"remote_file_size: {remote_file_size}")
    try:
        request.urlretrieve(url, tmp_file_name)

        downloaded_file_size = os.path.getsize(tmp_file_name)
        print(f"downloaded_file_size: {downloaded_file_size}")
        if remote_file_size and downloaded_file_size != int(remote_file_size):
            print(f"Error: downloaded file size {downloaded_file_size} does not match expected size {remote_file_size}")
            # Do not upload an incomplete file: once the key exists, later runs
            # treat it as already downloaded and never fetch it again.
            print(f"skipping upload of incomplete file: {output_key}")
            continue

        with open(tmp_file_name, "rb") as f:
            print(f"uploading wikipedia data in tmp: {tmp_file_name} to bucket:{output_bucket} key:{output_key}")
            get_s3_client().upload_fileobj(f, output_bucket, output_key)
    finally:
        # Always remove the scratch file so nothing is left behind, also on errors.
        if os.path.exists(tmp_file_name):
            os.remove(tmp_file_name)
print("Execution complete")

execution_hour_str: 2025-03-12
output_bucket: data
output_root_prefix: wikipedia_pageviews
overlap_hours: 3
using pattern: %Y-%m-%d to parse execution_hour_str: 2025-03-12
execution_hour: 2025-03-12 00:00:00
TFDS_CONFIG_URL not set, using default: http://tfds-config:8005/api/configs
retrieving s3 config from http://tfds-config:8005/api/configs/s3
using s3 endpoint: http://s3-minio:9000
downloading: https://dumps.wikimedia.org/other/pageviews/2025/2025-03/pageviews-20250312-000000.gz
remote_file_size: 59277391
downloaded_file_size: 59277391
uploading wikipedia data in tmp: /Users/jens/src/pipe-dreams/notebooks/tmp.gz to bucket:data key:wikipedia_pageviews/2025/2025-03/12/pageviews-20250312-000000.gz
downloading: https://dumps.wikimedia.org/other/pageviews/2025/2025-03/pageviews-20250311-230000.gz
remote_file_size: 64235316
downloaded_file_size: 64235316
uploading wikipedia data in tmp: /Users/jens/src/pipe-dreams/notebooks/tmp.gz to bucket:data key:wikipedia_pageviews/2025/2025-03/11/pa