## Imports

In [None]:
from pathlib import Path

import datamate
import pandas as pd
from tqdm.notebook import tqdm

## Name For This Data

In [None]:
# Names identifying the two datasets this notebook works with: the raw
# dataset that is read from, and the derived dataset that is written.
name_raw = "price-paid-data/raw"
name = "price-paid-data/derived"

## Load Necessary Configuration

In [None]:
# Resolve the config key for the derived dataset, then look up its data
# path and a logger from that key.
key = datamate.config.get_key_for_name(name=name)

path_data = datamate.config.get_path_data_for_key(key=key)
logger = datamate.logging.get_logger(key=key)

logger.info(f"{key=} loaded for {name=}")
logger.info(f"{path_data=}")

# Resolve the key and data path of the raw dataset the same way. No separate
# logger is created for it; messages keep going through the logger above.
key_raw = datamate.config.get_key_for_name(name=name_raw)
path_data_raw = datamate.config.get_path_data_for_key(key=key_raw)

logger.info(f"Using {path_data_raw=} for {key_raw=}")

## Data Information

- Primary source of information is https://www.gov.uk/guidance/about-the-price-paid-data

|Data item | Explanation (where appropriate)|
| -- | -- |
|Transaction unique identifier | A reference number which is generated automatically recording each published sale. The number is unique and will change each time a sale is recorded.|
| Price | Sale price stated on the transfer deed.|
| Date of Transfer| Date when the sale was completed, as stated on the transfer deed.|
| Postcode | This is the postcode used at the time of the original transaction. Note that postcodes can be reallocated and these changes are not reflected in the Price Paid Dataset.|
| Property Type | D = Detached, S = Semi-Detached, T = Terraced, F = Flats/Maisonettes, O = Other|
|Old/New | Indicates the age of the property and applies to all price paid transactions, residential and non-residential. Y = a newly built property, N = an established residential building |
|Duration | Relates to the tenure: F = Freehold, L = Leasehold etc.|
|PAON | Primary Addressable Object Name. Typically the house number or name. |
|SAON | Secondary Addressable Object Name. Where a property has been divided into separate units (for example, flats), the PAON (above) will identify the building and a SAON will be specified that identifies the separate unit/flat. |
|Street | |
|Locality | |
|Town/City | |
|District | |
|County | |
| PPD Category Type | Indicates the type of Price Paid transaction. A = Standard Price Paid entry, includes single residential property sold for value. B = Additional Price Paid entry including transfers under a power of sale/repossessions, buy-to-lets (where they can be identified by a Mortgage), transfers to non-private individuals and sales where the property type is classed as ‘Other’.|
|Record Status - monthly file only | Indicates additions, changes and deletions to the records (see guide below). A = Addition, C = Change, D = Delete|


In [1]:
# Raw column heading -> column name used in the cleaned data.
# Key order is significant: the keys are also handed to `pd.read_csv` as the
# positional column names (`names=`) when the raw files are loaded.
map_headers = {
    # Transaction details.
    "Transaction unique identifier": "idTransaction",
    "Price": "price",
    "Date of Transfer": "date",
    "Postcode": "postcode",
    # Single-letter coded attributes, decoded by the maps further down.
    "Property Type": "propertyType",
    "Old/New": "buildType",
    "Duration": "ownershipType",
    # Address components.
    "PAON": "primaryAddressableObjectName",
    "SAON": "secondaryAddressableObjectName",
    "Street": "street",
    "Locality": "locality",
    "Town/City": "townCity",
    "District": "district",
    "County": "county",
    # Single-letter coded record metadata.
    "PPD Category Type": "transactionType",
    "Record Status - monthly file only": "recordStatus",
}

# Code -> readable label for each coded column.
map_property_type = {
    "D": "detached",
    "S": "semi-detached",
    "T": "terraced",
    "F": "flat-or-maisonette",
    "O": "other",
}
map_build_type = {"Y": "new", "N": "old"}
map_ownership_type = {"L": "leasehold", "F": "freehold", "U": "other"}
map_transaction_type = {"A": "standard", "B": "non-standard"}
map_record_status = {"A": "addition", "C": "change", "D": "deletion"}

In [None]:
# Read the column headings stored in headers.csv next to the raw data.
path_headers_raw = path_data_raw.joinpath("headers.csv")
headers = pd.read_csv(path_headers_raw).columns.tolist()

# The keys of `map_headers` are applied to the raw files positionally (as
# `names=` in `pd.read_csv`), so the headings must agree in order as well as
# in name; comparing sets would let a reordered file slip through and
# mislabel columns. Raise explicitly rather than `assert`, which is skipped
# when Python runs with -O.
expected_headers = list(map_headers)
if headers != expected_headers:
    raise ValueError(
        f"Headers in {path_headers_raw} do not match map_headers "
        "(names and order must agree):\n"
        f"found:    {headers}\n"
        f"expected: {expected_headers}"
    )

In [None]:
def clean_data(
    path: Path,
    map_headers: dict[str, str],
    map_property_type: dict[str, str],
    map_build_type: dict[str, str],
    map_ownership_type: dict[str, str],
    map_transaction_type: dict[str, str],
    map_record_status: dict[str, str],
) -> pd.DataFrame:
    """Read one raw price-paid CSV and return it with clean names and dtypes.

    The file is read with the keys of ``map_headers`` as its column names
    (in order), the columns are renamed to the mapped values, and each
    column is converted: ``price`` to float, ``date`` to datetime, free-text
    columns to the pandas ``string`` dtype, and the remaining columns to
    categoricals. Coded categoricals have their categories renamed through
    the matching map; codes missing from a map are left unchanged.

    Args:
        path: CSV file to read.
        map_headers: Raw column heading -> cleaned column name. Key order
            must match the column order of the file.
        map_property_type: Code -> label for ``propertyType``.
        map_build_type: Code -> label for ``buildType``.
        map_ownership_type: Code -> label for ``ownershipType``.
        map_transaction_type: Code -> label for ``transactionType``.
        map_record_status: Unused. ``recordStatus`` is dropped from the
            result, so recoding it first would be wasted work; the parameter
            is kept so existing calls keep working.

    Returns:
        The cleaned frame, with every renamed column except ``recordStatus``.
    """

    def cast(column: str, dtype: str):
        """Build an ``assign`` callable that converts ``column`` to ``dtype``."""
        return lambda frame: frame[column].astype(dtype)

    def recode(column: str, mapping: dict[str, str]):
        """Build an ``assign`` callable that decodes a coded column."""
        return lambda frame: (
            frame[column].astype("category").cat.rename_categories(mapping)
        )

    return (
        pd.read_csv(path, names=list(map_headers))
        .rename(columns=map_headers)
        # Dropped up front so no conversion is spent on a discarded column.
        .drop(columns=["recordStatus"])
        .assign(
            price=lambda frame: pd.to_numeric(frame["price"]).astype(float),
            date=lambda frame: pd.to_datetime(frame["date"]),
            postcode=cast("postcode", "string"),
            propertyType=recode("propertyType", map_property_type),
            buildType=recode("buildType", map_build_type),
            ownershipType=recode("ownershipType", map_ownership_type),
            primaryAddressableObjectName=cast(
                "primaryAddressableObjectName", "string"
            ),
            secondaryAddressableObjectName=cast(
                "secondaryAddressableObjectName", "string"
            ),
            street=cast("street", "string"),
            locality=cast("locality", "string"),
            townCity=cast("townCity", "category"),
            district=cast("district", "category"),
            county=cast("county", "category"),
            transactionType=recode("transactionType", map_transaction_type),
        )
    )

# # Testing function:
# df = clean_data(
#     path=path_data_raw.joinpath("pp-2019.csv"), 
#     map_headers=map_headers,
#     map_property_type=map_property_type,
#     map_build_type=map_build_type,
#     map_ownership_type=map_ownership_type,
#     map_transaction_type=map_transaction_type,
#     map_record_status=map_record_status,
# )

In [None]:
# Collect the raw files to clean, skipping the headers file. `iterdir()`
# yields entries in arbitrary order, so the paths are sorted to make the row
# order of the saved dataset reproducible; sub-directories are ignored
# because `clean_data` can only read files.
paths_raw = sorted(
    path
    for path in path_data_raw.iterdir()
    if path.is_file() and path != path_headers_raw
)

list_df = []
for path in tqdm(paths_raw):
    logger.info(f"Cleaning data for {path=}")
    list_df.append(
        clean_data(
            path=path,
            map_headers=map_headers,
            map_property_type=map_property_type,
            map_build_type=map_build_type,
            map_ownership_type=map_ownership_type,
            map_transaction_type=map_transaction_type,
            map_record_status=map_record_status,
        )
    )

logger.info("Concatenating all the data together")
df = datamate.pandas.concat_preserving_categorical(list_df, ignore_index=True)
# Drop the reference to the per-file frames so they can be garbage-collected.
list_df = None

# Persist the combined dataset under the derived dataset's data path.
path_derived = path_data.joinpath("data.parquet")
logger.info(f"Saving data to {path_derived=}")
df.to_parquet(path=path_derived)

In [None]:
# Log a summary of the combined dataset: its columns, the summed result of
# `memory_usage()` (called with default arguments), and descriptive
# statistics, first for the default column selection and then for the
# categorical columns.
logger.info(f"{df.columns=}")
logger.info(f"{df.memory_usage().sum()=:,}")
# The line break before the closing brace is kept by the `=` debug
# specifier, so the table starts on its own line in the log message.
logger.info(f"""{df.describe()=
}""")
logger.info(f"""{df.describe(include='category')=
}""")