In [7]:
import glob

import pandas as pd

import re

# Downloading Data

In [None]:
import json
from urllib.request import urlopen

In [None]:
data = []
offset = 0
limit = 10000
resource_id = "PCA_202108"
total = 1


while offset < total:
    # Form the url using offset, limit, resource_id
    url = f"https://opendata.nhsbsa.net/api/3/action/datastore_search?resource_id={resource_id}&limit={limit}&offset={offset}"
    # Openning the url and reading the data
    fileobj = urlopen(url)
    data_json = fileobj.read()
    # Parsing the data to convert from json to python dictionary
    data_dict = json.loads(data_json)
    # Appending the data from this url read to the "data" list
    data += data_dict["result"]["records"]

    offset += limit

    if total == 1:
        total = data_dict["result"]["total"]

In [None]:
len(data)

In [None]:
pd.DataFrame.from_records(data_dict["result"]["records"])

# Processing Data

## Monthly Filtered Data

Idea: filter raw data with the file format `./data/raw_data/pca_YYYYMM.csv` and save into `./data/filtered_data/pca_YYYYMM.csv`

In [2]:
def filter_raw_data(raw_df):
    return raw_df[["YEAR_MONTH", "BNF_PRESENTATION_CODE"]]

In [18]:
re.split("/|\\\\",raw_data_paths[0])

['.', 'data', 'raw_data', 'pca_202107.csv']

In [20]:
filtered_data_paths = glob.glob("./data/filtered_data/*.csv")
filtered_data_filenames = set(
    [re.split("/|\\\\",filtered_data_path)[-1] for filtered_data_path in filtered_data_paths]
)

raw_data_paths = glob.glob("./data/raw_data/*.csv")

for raw_data_path in raw_data_paths:
    raw_data_filename =re.split("/|\\\\",raw_data_path)[-1]
    if raw_data_filename not in filtered_data_filenames:
        print(f"Processing {raw_data_filename}")
        raw_df = pd.read_csv(raw_data_path)
        filtered_df = filter_raw_data(raw_df)
        output_path = f"./data/filtered_data/{raw_data_filename}"
        filtered_df.to_csv(output_path, index=False)

Processing pca_202107.csv
Processing pca_202108.csv


## Product Info

In [26]:
def process_product_data(monthly_raw_data):
    # Selecting a subset of columns
    filtered_df = monthly_raw_data[
        [
            "BNF_PRESENTATION_CODE",
            "BNF_PRESENTATION_NAME",
            "SNOMED_CODE",
            "GENERIC_BNF_EQUIVALENT_CODE",
            "GENERIC_BNF_EQUIVALENT_NAME",
            "BNF_CHEMICAL_SUBSTANCE_CODE",
            "BNF_CHEMICAL_SUBSTANCE",
            "BNF_PARAGRAPH_CODE",
            "BNF_PARAGRAPH",
            "BNF_SECTION_CODE",
            "BNF_SECTION",
            "BNF_CHAPTER_CODE",
            "BNF_CHAPTER",
            "PREP_CLASS",
            "PRESCRIBED_PREP_CLASS",
        ]
    ]

    # Drop duplicates by BNF_PRESENTATION_CODE
    monthly_product_info_df = filtered_df.drop_duplicates(
        subset=["BNF_PRESENTATION_CODE"]
    )
    return monthly_product_info_df

In [22]:
product_info_paths = glob.glob("./data/product_info/*.csv")
product_info_filenames = set(
    [re.split("/|\\\\", product_info_path)[-1] for product_info_path in product_info_paths]
)

raw_data_paths = glob.glob("./data/raw_data/*.csv")

In [28]:
for raw_data_path in raw_data_paths:
    raw_data_filename = re.split("/|\\\\", raw_data_path)[-1]
    if raw_data_filename not in product_info_filenames:
        print(f"Processing {raw_data_filename}")
        monthly_raw_df = pd.read_csv(raw_data_path)
        monthly_product_info_df = process_product_data(monthly_raw_df)
        output_path = f"./data/product_info/{raw_data_filename}"
        monthly_product_info_df.to_csv(output_path, index=False)

Processing pca_202107.csv
Processing pca_202108.csv


In [29]:
# Merge all monthly product info dataframes
product_info_paths = glob.glob("./data/product_info/*.csv")

all_product_info_df = pd.concat(
    [pd.read_csv(product_info_path) for product_info_path in product_info_paths]
)
all_product_info_df = all_product_info_df.drop_duplicates(
    subset=["BNF_PRESENTATION_CODE"]
)

all_product_info_df.to_csv("./data/product_info_concatenated.csv", index=False)

In [30]:
all_product_info_df.head()

Unnamed: 0,BNF_PRESENTATION_CODE,BNF_PRESENTATION_NAME,SNOMED_CODE,GENERIC_BNF_EQUIVALENT_CODE,GENERIC_BNF_EQUIVALENT_NAME,BNF_CHEMICAL_SUBSTANCE_CODE,BNF_CHEMICAL_SUBSTANCE,BNF_PARAGRAPH_CODE,BNF_PARAGRAPH,BNF_SECTION_CODE,BNF_SECTION,BNF_CHAPTER_CODE,BNF_CHAPTER,PREP_CLASS,PRESCRIBED_PREP_CLASS
0,190700000BBCRA0,Thick & Easy Clear powder,2.909391e+16,190700000BBCRA0,Thick & Easy Clear powder,190700000,Other base/diluent/suspending agent/stabiliser...,190700,"Base,diluent, suspending agents and stabilisers",1907,"Base,diluent, suspending agents and stabilisers",19,Other Drugs and Preparations,3,3
1,0409010K0BBANAF,Madopar CR capsules,27911000000000.0,0409010K0AAAFAF,Co-beneldopa 25mg/100mg modified-release capsules,0409010K0,Co-beneldopa (Benserazide/levodopa),40901,Dopaminergic drugs used in parkinsonism,409,Drugs used in parkinsonism and related disorders,4,Central Nervous System,3,3
2,23803048001,Medihoney barrier cream sachets,1.977871e+16,23803048001,Medihoney barrier cream sachets,2380,Skin Fillers And Protectives,2380,Skin Fillers And Protectives,2380,Skin Fillers And Protectives,23,Stoma Appliances,4,4
3,0105010B0BCAFAT,Pentasa 2g modified-release granules sachets,1.370111e+16,0105010B0AAATAT,Mesalazine 2g modified-release granules sachet...,0105010B0,Mesalazine (Systemic),10501,Aminosalicylates,105,Chronic bowel disorders,1,Gastro-Intestinal System,3,2
4,090402000BBNZA0,Aptamil Pepti 1 powder,9325111000000000.0,090402000BBNZA0,Aptamil Pepti 1 powder,090402000,Enteral nutrition,90402,Enteral nutrition,904,Oral nutrition,9,Nutrition and Blood,3,3


One problem with this approach is that it gets slower every month as the raw data increases
    => Change the algorithm such that if there exists an existing product_info.csv, simply use that and the latest monthly raw_data to get the updated product_info.csv