§# Extract: Wikipedia pageviews

Takes the Airflow execution date and scans the Wikipedia data downloads for files from the last 24 hours. All files not already retrieved will be downloaded and unpacked.

Files will be put in a folder named: `output_dir/year/month/day/`

Files are named `yyyymmdd_hhmm_type.csv`

#### Parameters:

`execution_hour_str` = the Airflow execution date, formatted as `YYYY-MM-DD HH:MM:SS`

`output_root` = path to directory where this pipeline will create the yearly folders with sublevels 

`overlap_hours` = how many hours back in time we should look for files; sometimes it takes a while for the data to be created in the source.

In [3]:
# Notebook parameters; the values below are defaults for a manual run.
#
# Airflow execution date, as a "YYYY-MM-DD HH:MM:SS" string (parsed with
# "%Y-%m-%d %H:%M:%S" by the download cell).
execution_hour_str = "2025-03-10 10:00:00"
# Root directory under which the download cell creates
# <year>/<year>-<month>/<day>/ folders.
output_root = "/tmp/airflow/airflow_output"
# NOTE(review): not read by the download cell visible in this notebook, which
# always looks back a fixed 24 hours -- confirm whether this should drive it.
overlap_hours = 3

In [None]:
import datetime
import os
from urllib import request

import requests  # type: ignore[import-untyped]

# Download the hourly Wikipedia pageviews dumps for the 24 hours ending at the
# execution hour, skipping hours whose file is already present on disk.
#
# Reads `execution_hour_str` and `output_root` from the parameters cell.
# NOTE(review): `overlap_hours` from the parameters cell is not used here; the
# look-back window is fixed at 24 hours -- confirm that this is intended.
print(f"execution_hour_str: {execution_hour_str}")
print(f"output_root: {output_root}")

execution_hour = datetime.datetime.strptime(execution_hour_str, "%Y-%m-%d %H:%M:%S")

print(f"execution_hour: {execution_hour}")


for retrieve_hour in (execution_hour - datetime.timedelta(hours=i) for i in range(24)):
    # The same file name is used for the local copy and the remote URL.
    file_name = (
        f"pageviews-{retrieve_hour.year}{retrieve_hour.month:0>2}{retrieve_hour.day:0>2}-{retrieve_hour.hour:0>2}0000"
        f".gz"
    )

    output_dir = (
        f"{output_root}/{retrieve_hour.year}/{retrieve_hour.year}-{retrieve_hour.month:0>2}/{retrieve_hour.day:0>2}"
    )
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    output_file = f"{output_dir}/{file_name}"
    if os.path.isfile(output_file):
        print(f"file already downloaded: {output_file}")
        continue

    url = (
        "https://dumps.wikimedia.org/other/pageviews/"
        f"{retrieve_hour.year}/{retrieve_hour.year}-{retrieve_hour.month:0>2}/"
        f"{file_name}"
    )

    # A timeout keeps an unresponsive server from hanging the whole run.
    response = requests.head(url, timeout=30)
    if response.status_code != 200:
        print(f"no file find for: {retrieve_hour} at {url}")
        continue
    file_size = response.headers.get("Content-Length")

    print(f"downloading: {url}")
    print(f"file_size: {file_size}")

    # Download to a temporary name and only move it to the final name once it
    # is complete. Otherwise an interrupted or truncated download would be
    # left at `output_file` and treated as "already downloaded" by later runs.
    partial_file = f"{output_file}.part"
    try:
        request.urlretrieve(url, partial_file)

        downloaded_file_size = os.path.getsize(partial_file)
        print(f"downloaded_file_size: {downloaded_file_size}")
        if file_size and downloaded_file_size != int(file_size):
            print(f"Error: downloaded file size {downloaded_file_size} does not match expected size {file_size}")
            # Discard the bad download (see `finally`) so a later run retries it.
            continue

        os.replace(partial_file, output_file)
        print(f"downloaded_file: {output_file}")
    finally:
        # Runs on errors and KeyboardInterrupt too; after a successful
        # os.replace the temporary file no longer exists and this is a no-op.
        if os.path.exists(partial_file):
            os.remove(partial_file)

execution_hour_str: 2025-03-10 10:00:00
output_root: /tmp/airflow/airflow_output
execution_hour: 2025-03-10 10:00:00
file already downloaded: /tmp/airflow/airflow_output/2025/2025-03/10/pageviews-20250310-100000.gz
file already downloaded: /tmp/airflow/airflow_output/2025/2025-03/10/pageviews-20250310-090000.gz
file already downloaded: /tmp/airflow/airflow_output/2025/2025-03/10/pageviews-20250310-080000.gz
file already downloaded: /tmp/airflow/airflow_output/2025/2025-03/10/pageviews-20250310-070000.gz
file already downloaded: /tmp/airflow/airflow_output/2025/2025-03/10/pageviews-20250310-060000.gz
file already downloaded: /tmp/airflow/airflow_output/2025/2025-03/10/pageviews-20250310-050000.gz
file already downloaded: /tmp/airflow/airflow_output/2025/2025-03/10/pageviews-20250310-040000.gz
file already downloaded: /tmp/airflow/airflow_output/2025/2025-03/10/pageviews-20250310-030000.gz
downloading: https://dumps.wikimedia.org/other/pageviews/2025/2025-03/pageviews-20250310-020000.gz


KeyboardInterrupt: 