## Imports

In [None]:
import datetime

import datamate
import requests
from tqdm.notebook import tqdm

## Name For This Data

In [None]:
# Dataset name; passed to datamate.config.get_key_for_name() in the
# configuration cell to look up this dataset's key.
name = "price-paid-data/raw"

## Load Necessary Configuration

In [None]:
# Look up the datamate key registered for this dataset name.
key = datamate.config.get_key_for_name(name=name)

# Both the data path and the logger are obtained from the key.
# NOTE(review): path_data is used later via .joinpath()/.write_text(), so it
# is presumably a pathlib.Path to an existing directory -- confirm in datamate.
path_data = datamate.config.get_path_data_for_key(key=key)
logger = datamate.logging.get_logger(key=key)

# Record the resolved configuration at the start of the run.
logger.info(f"{key=} loaded for {name=}")
logger.info(f"{path_data=}")

## Data Information

- Primary source of information is https://www.gov.uk/guidance/about-the-price-paid-data

In [None]:
# First year requested by the download loop.
year_earliest = 1995

# Per-year file name; the "{year}" placeholder is filled in with str.format().
filename = "pp-{year}.csv"
url_base = "http://prod.publicdata.landregistry.gov.uk.s3-website-eu-west-1.amazonaws.com"

# Base URL and file name joined with "/"; "{year}" is still unexpanded here
# and is substituted per year by the caller.
url_template = "/".join((url_base, filename))

# Column names, in order; these are what gets written to headers.csv.
headers = [
    "Transaction unique identifier", "Price", "Date of Transfer",
    "Postcode", "Property Type", "Old/New", "Duration",
    "PAON", "SAON", "Street", "Locality",
    "Town/City", "District", "County",
    "PPD Category Type", "Record Status - monthly file only",
]

## Year

In [None]:
# Current calendar year in local time; the download loop uses it as the
# exclusive upper bound of the year range.
year_today = datetime.date.today().year
logger.info(f"{year_today=}")

## Historical Data Pull

In [None]:
# Write the column names to their own one-line CSV file, each name
# double-quoted and comma-separated.
path_headers = path_data.joinpath("headers.csv")
path_headers.write_text(",".join(f'"{header}"' for header in headers) + "\n")
logger.info(f"Headers dumped to {path_headers=}")

# (connect, read) timeouts in seconds. Without a timeout requests waits
# forever on a stalled connection, which would hang the whole run.
request_timeout = (10, 300)

# range() excludes its end value, so the current (incomplete) year is not
# downloaded by this loop.
for year in tqdm(range(year_earliest, year_today)):
    url = url_template.format(year=year)
    logger.info(f"Reading {url=}")
    response = requests.get(url, timeout=request_timeout)
    # Fail loudly on 4xx/5xx instead of saving an error page as CSV data.
    response.raise_for_status()

    path_file = path_data.joinpath(filename.format(year=year))
    logger.info(f"Dumping to {path_file=}")
    # Store the bytes exactly as served. Going through response.text and
    # write_text() would decode with a guessed charset and re-encode with the
    # platform default, which can silently alter non-ASCII characters.
    path_file.write_bytes(response.content)