# Read 2016-2020 PROMs data

## imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

import requests
from zipfile import ZipFile
from io import BytesIO
import itertools

import warnings
import re

from utils.load_data import dir_digger, read_csv_zip, structure_name, get_meta, clean_data, downcast, download_extract_zip
from utils.data_dictionary import methods, column_meta

In [None]:
# Download links for the yearly PROMs CSV data packs, keyed by reporting year.
# Source page:
# https://digital.nhs.uk/data-and-information/data-tools-and-services/data-services/patient-reported-outcome-measures-proms
URL_PROMS_DATA = {
    # finalised releases
    "2016_17": "https://files.digital.nhs.uk/6C/A1D581/CSV%20Data%20Pack%202016-17%20Finalised.zip",
    "2017_18": "https://files.digital.nhs.uk/70/5176AA/CSV%20Data%20Pack%20Final%201718.zip",
    "2018_19": "https://files.digital.nhs.uk/52/A8FF7F/PROMs%20CSV%20Data%20Pack%20Finalised%202018-19.zip",
    # provisional release
    "2019_20": "https://files.digital.nhs.uk/1F/51FEDE/PROMs%20CSV%20Data%20Pack%20Provisional%201920.zip",
}

## read online data

In [None]:
# Download every yearly zip archive listed in URL_PROMS_DATA and read each CSV
# member into df_dict, keyed by the member's file name without ".csv".
CSV_SUFFIX = ".csv"
df_dict = dict()
for URL_year, URL in URL_PROMS_DATA.items():
    for file_name, file in download_extract_zip(URL):
        if not file_name.endswith(CSV_SUFFIX):
            continue
        # Slice off the exact suffix. str.strip(".csv") would instead remove
        # any of the characters ".", "c", "s", "v" from BOTH ends of the name
        # (e.g. "... Providers.csv" -> "... Provider").
        df_name = file_name[: -len(CSV_SUFFIX)]
        # "*" cells are read as missing values; the project helper `downcast`
        # is then applied to every column.
        df_dict[df_name] = pd.read_csv(file, na_values=["*"]).apply(downcast)

In [None]:
# print the names of the dfs in memory, in sorted order
# (a plain loop instead of a list comprehension used only for its side effect)
for name in sorted(df_dict):
    print(name)

## collect the right dataframes by name

In [None]:
# Show the column names of one example dataframe (hip replacements, provider
# level, 2016/17 data pack) to see the raw naming scheme before renaming.
df_dict["Hip Replacement Provider 1617"].columns

In [None]:
# For every joint / organisation-type combination, stack the matching yearly
# dataframes from df_dict and write them to one parquet file.
for part, org in itertools.product(["Hip", "Knee"], ["CCG", "Provider"]):
    # Matches names such as "Hip Replacement Provider 1617"; the optional "s"
    # and re.I absorb plural/singular and letter-case differences in the names.
    pattern = fr"^{part} replacements? {org} [\d]{{4}}$"
    name_matcher = re.compile(pattern, flags=re.I)
    dfs = [df for name, df in df_dict.items() if name_matcher.match(name)]
    if not dfs:
        # pd.concat([]) would fail with the uninformative
        # "No objects to concatenate"; say which combination is missing.
        raise ValueError(
            f"no dataframe in df_dict matches {pattern!r} ({part} / {org})"
        )
    df = pd.concat(dfs)
    # unfortunately nobody seems to care about good name structures
    # the following is done to get in line with the original parquet files
    # the function structure name is doing the final structuring
    # regex=False: every pattern is a literal string, and the default value of
    # `regex` differs between pandas versions.
    df.columns = (
        df.columns.str.replace("Pre-Op Q", "t0", regex=False)
        .str.replace("Post-Op Q", "t1", regex=False)
        .str.replace("Knee Replacement", "oks", regex=False)
        .str.replace("Hip Replacement", "ohs", regex=False)
        .str.replace("-", "_", regex=False)
        .str.replace(" ", "_", regex=False)
        .str.lower()
    )
    df.to_parquet(f"../data/new_parquets/{part}-{org}.parquet".lower())

# some comments

* NHS uses different URL structures for the download links over the years (see URLs)
* NHS uses different file names inside the zip archives over the years (see the printed list: plural/singular, letter case, abbreviations vs. full words)
* NHS has typos in CSV column names, e.g. "Heath Gain Mean" (shouldn't this be "Health Gain Mean"?) in "Equality Hip and Knee Replacement 1617"
* NHS uses different structures for the same type of column: "Hip Replacement Post-Op Q Score" vs "Hip Replacement OHS Post-Op Q Predicted"

* The naming structure that was used to create the original parquet files is also inconsistent (placement of t0/t1)