In [1]:
import pathlib

# Root directory that holds every raw dataset used by this notebook.
data_folderpath = pathlib.Path("./data")

# Individual dataset locations, all expressed relative to the data root.
ppd_folderpath = data_folderpath.joinpath("uk-ppd")
inflation_filepath = data_folderpath.joinpath("uk-ons/ons-inflation-1989-2022.csv")
interest_filepath = data_folderpath.joinpath("uk-boe/boe-interest-1975-2022.csv")


In [2]:
from tqdm import tqdm

# Hook tqdm into pandas so the `progress_*` method variants (e.g. `progress_apply`)
# become available on pandas objects for the rest of the notebook.
tqdm.pandas()


In [12]:
import pandas as pd

def compile_missing(df: pd.DataFrame) -> pd.DataFrame:
    """Summarise the columns of ``df`` that contain missing values.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to inspect.

    Returns
    -------
    pd.DataFrame
        Indexed by the names of the columns of ``df`` that have at least one
        missing cell, with two columns:

        * ``missing`` -- number of NA cells in that column;
        * ``%`` -- that count as a share of ``len(df)``, rounded to two
          decimals and rendered as a string such as ``"1.2%"``.

        The result is empty when no column has missing values, which includes
        the case of a frame with zero rows.
    """
    n_rows = len(df)

    def as_percent(count) -> str:
        # A frame with no rows has nothing to take a share of; report 0 instead
        # of dividing by zero.
        share = count * 100.0 / n_rows if n_rows else 0.0
        return str(round(share, 2)) + "%"

    df_missing = df.isna().sum().to_frame(name="missing")
    df_missing["%"] = df_missing["missing"].map(as_percent)
    # Only columns that actually have gaps are of interest.
    return df_missing[df_missing["missing"] > 0]


In [4]:
def compile_uniques(df: pd.DataFrame) -> pd.DataFrame:
    """Summarise how many distinct values each column of ``df`` holds.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to inspect.

    Returns
    -------
    pd.DataFrame
        Indexed by the names of the columns of ``df`` that hold more than one
        distinct (non-NA) value, with two columns:

        * ``uniques`` -- number of distinct values in that column;
        * ``%`` -- that count as a share of ``len(df)``, rounded to two
          decimals and rendered as a string such as ``"22.72%"``.

        The result is empty for a frame with zero rows.
    """
    n_rows = len(df)

    def as_percent(count) -> str:
        # A frame with no rows has nothing to take a share of; report 0 instead
        # of dividing by zero.
        share = count * 100.0 / n_rows if n_rows else 0.0
        return str(round(share, 2)) + "%"

    # nunique() counts distinct values per column, so dropping duplicate rows
    # beforehand cannot change the result; skipping it avoids a full-frame
    # de-duplication pass on large inputs.
    df_uniques = df.nunique().to_frame(name="uniques")
    df_uniques["%"] = df_uniques["uniques"].map(as_percent)
    # Constant (or all-NA) columns carry no information and are left out.
    return df_uniques[df_uniques["uniques"] > 1]

# Price Paid Data

In [5]:
import pandas as pd

from typing import Callable


def build_count_properties_sold(ix: pd.DataFrame, n_days: int) -> Callable[[pd.DataFrame], int]:
    """Build a per-record counter of sales in the ``n_days`` leading up to a record's date.

    Parameters
    ----------
    ix
        Lookup object whose ``.loc`` accepts
        ``[start_date:end_date, postgroup, property_type]`` and whose selection
        supports ``.sum()``.
        NOTE(review): presumably a pandas object carrying sale counts on a
        sorted (date, postgroup, property_type) MultiIndex -- confirm against
        the caller.
    n_days
        Length of the look-back window in days. The window is a ``.loc`` label
        slice from ``date - n_days`` to ``date``.

    Returns
    -------
    Callable
        Function taking one record that exposes ``date``, ``postgroup`` and
        ``property_type`` attributes. It returns ``.sum()`` of the matching
        slice of ``ix``, or ``None`` when any of the three fields is missing
        (NaN / NaT / NA) or otherwise falsy.
    """
    # The window length is the same for every record, so build it once.
    window = pd.Timedelta(days=n_days)

    def count_properties_sold(row: pd.DataFrame) -> int:
        key_fields = (row.date, row.postgroup, row.property_type)
        # A bare truthiness test is not enough: float NaN is truthy and pd.NA
        # refuses truth-testing, so missing values are checked explicitly.
        if any(pd.isna(field) or not field for field in key_fields):
            return None
        sold_in_window = ix.loc[row.date - window : row.date, row.postgroup, row.property_type]
        return sold_in_window.sum()

    return count_properties_sold


# https://www.gov.uk/guidance/about-the-price-paid-data
# Code -> label lookups for the categorical Price Paid Data columns.
# NOTE(review): the uniques summary of the loaded data reports 5 distinct
# `property_type` codes while only 4 are mapped here -- confirm whether the
# fifth code should be added or is deliberately left unmapped.
ppd_property_type = {"D": "detached", "S": "semi-detached", "T": "terraced", "F": "flat/maisonettes"}
ppd_duration = {"F": "freehold", "L": "leasehold"}
ppd_old_or_new = {"Y": "new", "N": "old"}

# Column labels applied, in order, to the source files when they are read.
ppd_cols_src = [
    "id",
    "price",
    "date",
    "postcode",
    "property_type",
    "old_or_new",
    "duration",
    "paon",
    "saon",
    "street",
    "locality",
    "town_city",
    "district",
    "county",
    "ppd_category_type",
    "record_status",
]

# Column subset kept for the processed frame.
# NOTE(review): this name is prefixed `ppdf_` unlike its `ppd_` siblings; it may
# be a typo, but it is kept unchanged because other cells may reference it.
ppdf_cols_dst = [
    "date",
    "postgroup",
    "property_type",
    "old_or_new",
    "duration",
    "price",
]

# glob() yields paths in arbitrary, filesystem-dependent order; sorting makes the
# row order of the concatenated frame reproducible across machines and runs.
ppd_filepaths = sorted(ppd_folderpath.glob("*.zip"))

# Each archive is read with explicit column names (assumes the files carry no
# header row -- TODO confirm) and the parts are stacked into one frame.
ppd_df = pd.concat([pd.read_csv(fp, names=ppd_cols_src) for fp in tqdm(ppd_filepaths)])

100%|██████████| 5/5 [00:21<00:00,  4.36s/it]


In [7]:
# Which raw price-paid columns contain missing values, and what share of rows they affect.
compile_missing(df=ppd_df)

Unnamed: 0,missing,%
postcode,18471,0.4%
saon,4013617,87.29%
street,86816,1.89%
locality,2846155,61.9%


In [8]:
# Distinct-value count per raw price-paid column (columns with a single value are omitted).
compile_uniques(df=ppd_df)

Unnamed: 0,uniques,%
id,4597894,100.0%
price,96000,2.09%
date,1758,0.04%
postcode,1044446,22.72%
property_type,5,0.0%
old_or_new,2,0.0%
duration,2,0.0%
paon,243571,5.3%
saon,30493,0.66%
street,278907,6.07%


# Inflation Rate Data

In [13]:
import string
from typing import Optional

# Characters allowed in a rate once an optional leading sign has been removed.
inflation_acceptable_numeric_chars = string.digits + ".,"


def extract_inflation_rate(x: str) -> Optional[float]:
    """Parse a raw cell into a float rate, or return ``None`` if it is not numeric.

    Parameters
    ----------
    x
        Raw cell value; it is converted with ``str()`` first, so non-string
        inputs (e.g. NaN from an empty CSV cell) are accepted.

    Returns
    -------
    float or None
        The parsed number when the text consists only of digits, ``.`` and
        ``,`` with an optional leading ``+``/``-``. Commas are treated as
        thousands separators and removed before conversion. ``None`` is
        returned for anything else, including empty text and text that passes
        the character check but is still not a valid number (e.g. ``"1.2.3"``).
    """
    text = str(x)
    # Allow a single leading sign so that negative rates are not discarded.
    unsigned = text[1:] if text.startswith(("+", "-")) else text
    if not unsigned or any(ch not in inflation_acceptable_numeric_chars for ch in unsigned):
        return None
    try:
        # float() does not understand "," so separators must be stripped first.
        return float(text.replace(",", ""))
    except ValueError:
        return None

# Read the inflation CSV and keep just two derived columns: the period label
# (taken from "Title") and the CPIH annual rate parsed to a float.
inflation_df = pd.read_csv(inflation_filepath).assign(
    date=lambda frame: frame["Title"],
    rate=lambda frame: (
        frame["CPIH ANNUAL RATE 00: ALL ITEMS 2015=100"]
        .map(extract_inflation_rate)
        .astype("float", errors="ignore")
    ),
)[["date", "rate"]]

In [19]:
# Print dtypes and non-null counts, then render the frame itself as the cell output.
inflation_df.info()
inflation_df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 582 entries, 0 to 581
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   date    582 non-null    object 
 1   rate    575 non-null    float64
dtypes: float64(1), object(1)
memory usage: 9.2+ KB


Unnamed: 0,date,rate
0,CDID,
1,Source dataset ID,
2,PreUnit,
3,Unit,
4,Release date,
...,...,...
577,2022 JUL,8.8
578,2022 AUG,8.6
579,2022 SEP,8.8
580,2022 OCT,9.6


In [14]:
# Which inflation columns contain missing values, and what share of rows they affect.
compile_missing(df=inflation_df)

Unnamed: 0,missing,%
rate,7,1.2%


In [16]:
# Distinct-value count per inflation column (columns with a single value are omitted).
compile_uniques(df=inflation_df)

Unnamed: 0,uniques,%
date,582,100.0%
rate,76,13.06%
