In [31]:
from pathlib import Path
import os
import datamate
import pandas as pd
from tqdm.notebook import tqdm

## Name For This Data

In [2]:
name = "postcode-data/derived"
name_raw = "postcode-data/raw"

## Load Necessary Configuration

In [6]:
key = datamate.config.get_key_for_name(name=name)

path_data = datamate.config.get_path_data_for_key(key=key)
logger = datamate.logging.get_logger(key=key)

logger.info(f"{key=} loaded for {name=}")
logger.info(f"{path_data=}")

key_raw = datamate.config.get_key_for_name(name=name_raw)
path_data_raw = datamate.config.get_path_data_for_key(key=key_raw)

logger.info(f"Using {path_data_raw=} for {key_raw=}")

2023-03-01 19:14:58.621 INFO 1372638304 - <module>: key='postcode-data/derived' loaded for name='postcode-data/derived'
2023-03-01 19:14:58.623 INFO 1372638304 - <module>: path_data=PosixPath('/media/josie/hdd/data/house-data/postcode-data/derived')
2023-03-01 19:14:58.626 INFO 1372638304 - <module>: Using path_data_raw=PosixPath('/media/josie/hdd/data/house-data/postcode-data/raw') for key_raw='postcode-data/raw'


## Data Information

- Primary source of information is https://beta.ordnancesurvey.co.uk/products/code-point-open#technical

- The coordinates for postcodes in Great Britain (England, Wales, and Scotland) are provided in British National Grid (BNG). BNG uses the OSGB36 (EPSG 27700) geodetic datum and a single Transverse Mercator projection for the whole of Great Britain. Positions on this projection are described using Easting and Northing coordinates in units of metres. The BNG is a horizontal spatial reference system only; it does not specify a vertical (height) reference system.

|Data item | Explanation (where appropriate)|
| -- | -- |
|Postcode | A UK Postcode |
| Positional_quality_indicator | The accuracy of each postcode unit coordinate pair is defined by the Positional Quality indicator (PQI), which provides a quality statement for the position of that Code-Point Open record. There are seven PQI values for the positional quality. A lower positional quality indicator will be allocated to postcode units awaiting a surveyed position, or which relate to addresses that do not have a surveyed position. Full details of the PQI can be found in the product's Technical Specification, which is available from the Code-Point Open Product Support page on the OS website (https://www.ordnancesurvey.co.uk/businessgovernment/tools-support/code-point-open-support). |
| Eastings | Distance in metres east of National Grid origin.  |
| Northings | Distance in metres north of National Grid origin. |
| Country_code | Code used by the Office for National Statistics (ONS) to identify the country in which  the Code-Point Open georeferenced coordinates lies. |
| NHS_regional_HA_code | English Pan Strategic Health Authority in which Code-Point Location Coordinate (CPLC) falls. |
| NHS_HA_code | (Health Authority) English Strategic Health Authority or Scottish Health Board in which CPLC falls. |
| Admin_county_code | County in which CPLC falls. |
| Admin_district_code | Unitary Authority, Metropolitan and Non-Metropolitan District, London Borough, or Scottish Council Area in which CPLC falls. |
| Admin_ward_code | Electoral Ward or Division in which CPLC falls. |



In [8]:
csv_filenames = os.listdir(path_data_raw.joinpath("unzipped/data/CSV"))
categories = [x.split('.')[0] for x in csv_filenames]

In [9]:
headers = pd.read_csv(path_data_raw.joinpath("unzipped/Doc/Code-Point_Open_Column_Headers.csv")).iloc[0].to_list()

In [10]:
df_list = [pd.read_csv(path_data_raw.joinpath(f"unzipped/data/CSV/{x}"), names=headers) for x in csv_filenames]

In [27]:
for index, df in enumerate(df_list):
    df['File'] = categories[index]
    df['File'] = pd.Categorical(df['File'], categories=categories)

In [28]:
postcodes = pd.concat(df_list)

In [32]:
postcodes.to_csv(path_data.joinpath("derived.csv"))

In [33]:
logger.info(f"{postcodes.columns=}")
logger.info(f"{postcodes.memory_usage().sum()=:,}")
logger.info(f"""{postcodes.describe()=
}""")
logger.info(f"""{postcodes.describe(include='category')=
}""")

2023-03-01 20:01:49.872 INFO 1280416684 - <module>: postcodes.columns=Index(['Postcode', 'Positional_quality_indicator', 'Eastings', 'Northings',
       'Country_code', 'NHS_regional_HA_code', 'NHS_HA_code',
       'Admin_county_code', 'Admin_district_code', 'Admin_ward_code', 'File'],
      dtype='object')
2023-03-01 20:01:49.875 INFO 1280416684 - <module>: postcodes.memory_usage().sum()=154,152,149
2023-03-01 20:01:50.048 INFO 1280416684 - <module>: postcodes.describe()=
       Positional_quality_indicator      Eastings     Northings
count                  1.731989e+06  1.731989e+06  1.731989e+06
mean                   1.014623e+01  4.281732e+05  3.130745e+05
std                    2.693992e+00  1.003995e+05  1.761264e+05
min                    1.000000e+01  0.000000e+00  0.000000e+00
25%                    1.000000e+01  3.586950e+05  1.786830e+05
50%                    1.000000e+01  4.298610e+05  2.787650e+05
75%                    1.000000e+01  5.135090e+05  4.072740e+05
max       