### 01 - Download CMS Medicare Part D data files (CSV format) to Lakehouse

- [CMS Medicare Part D Prescribers - by Provider and Drug](https://data.cms.gov/provider-summary-by-type-of-service/medicare-part-d-prescribers/medicare-part-d-prescribers-by-provider-and-drug) dataset is available for download from CMS Website. 
- This notebook uses the [Public API Open Data Catalog](https://data.cms.gov/data.json) metadata JSON file published by CMS to identify the dataset files and download them to the Lakehouse.
- The dataset contains one file per year. The Title field of each file's entry in the metadata JSON is used to identify the year. Example: the title "Medicare Part D Prescribers - by Provider and Drug : 2016-12-31" indicates that the file is for the year 2016.

In [None]:
%run Utils

StatementMeta(, a42d77ff-8079-4f32-9fde-39570fed60a7, 12, Finished, Available, Finished)

In [None]:
# Load the shared configuration dictionary. get_config_dict() is not defined in this
# notebook; presumably it comes from the Utils notebook pulled in by the %run cell - confirm.
# Later cells read the 'workspace_id', 'lakehouse_id' and 'lakehouse_name' keys from it.
config_dict = get_config_dict()
# Disabled alternative: resolve the lakehouse id from its name at runtime instead of reading it from the config.
#config_dict["lakehouse_id"] = notebookutils.lakehouse.get(config_dict["lakehouse_name"])['id']

StatementMeta(, a42d77ff-8079-4f32-9fde-39570fed60a7, 13, Finished, Available, Finished)

In [None]:
#create the sub-directory in Files folder where the CSV files will be downloaded
#lakehouse_dir is the Lakehouse-relative folder; it is reused later to build the mount point
lakehouse_dir = "Files/cms_raw"
#get_full_abfss_path is defined outside this notebook; judging by the printed value it builds an
#abfss:// OneLake URI from the workspace id, lakehouse id and relative folder (confirm in Utils)
lakehouse_dir_abfss_full_path = get_full_abfss_path(config_dict['workspace_id'], config_dict['lakehouse_id'], lakehouse_dir)

#make sure the target folder exists before it is mounted and written to
notebookutils.fs.mkdirs(lakehouse_dir_abfss_full_path)

#echo both paths so the run output records where the files will be written
print(f"lakehouse_dir: {lakehouse_dir}")
print(f"lakehouse_dir_abfss_full_path: {lakehouse_dir_abfss_full_path}")

StatementMeta(, a42d77ff-8079-4f32-9fde-39570fed60a7, 14, Finished, Available, Finished)

lakehouse_dir: Files/cms_raw
lakehouse_dir_abfss_full_path: abfss://c8f59358-83d0-4711-8b57-ebc3c414b0a1@onelake.dfs.fabric.microsoft.com/93c691f7-1409-44ef-ab45-fa3a3fb74289/Files/cms_raw


In [None]:
#plain python file APIs (open/write) need a local filesystem path, so the Lakehouse
#folder is mounted and the resulting local path is used by the download code
mount_point = f"/mnt/lakehouse/{config_dict['lakehouse_name']}/{lakehouse_dir}"
lakehouse_dir_local_mount_path = mount_path_return_local_path(
    lakehouse_dir_abfss_full_path,
    mount_point,
)

#record the mount point and the local path it resolves to
print(f'mount point: {mount_point}')
print(f"lakehouse_dir_local_path: {lakehouse_dir_local_mount_path}")

StatementMeta(, a42d77ff-8079-4f32-9fde-39570fed60a7, 15, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 84fa7108-056e-4a99-aa81-aba6c561fe0a)

mount point: /mnt/lakehouse/cms_lakehouse2/Files/cms_raw
lakehouse_dir_local_path: /synfs/notebook/a42d77ff-8079-4f32-9fde-39570fed60a7/mnt/lakehouse/cms_lakehouse2/Files/cms_raw


In [None]:
#full_path = "abfss://c8f59358-83d0-4711-8b57-ebc3c414b0a1@onelake.dfs.fabric.microsoft.com/93c691f7-1409-44ef-ab45-fa3a3fb74289/Files/temp"
#mount_path = "/mnt/cms_lakehouse2/tmp"
#notebookutils.fs.mount(full_path, mount_path)

StatementMeta(, a42d77ff-8079-4f32-9fde-39570fed60a7, 16, Finished, Available, Finished)

In [None]:
# Define the file name and content
#local_path = notebookutils.fs.getMountPath(mount_path)
#print(local_path)
#file_name = local_path +  "/test_file.txt"
#content = "This is a test file.\nIt contains sample text written using Python."

# Write the content to the file
#with open(file_name, "w") as file:
#    file.write(content)

#print(f"File '{file_name}' has been created with sample content.")


StatementMeta(, a42d77ff-8079-4f32-9fde-39570fed60a7, 17, Finished, Available, Finished)

In [None]:
# Documentation provided at the following location  https://data.cms.gov/sites/default/files/2024-05/39b98adf-b5e0-4487-a19e-4dc5c1503d41/API%20Guide%20Formatted%201_5.pdf
# was used as the basis for the following code, which parses the Public API Open Data Catalog JSON file to identify the dataset files

import requests
# Locate every CSV distribution of the target dataset in the CMS open data catalog.
# Result: csv_distros, a list of distribution dicts (one per downloadable CSV file).
url = "https://data.cms.gov/data.json"
title= "Medicare Part D Prescribers - by Provider and Drug"
csv_distros =[]

# A timeout keeps the cell from hanging forever if the CMS endpoint stops responding;
# requests applies no timeout unless one is passed explicitly.
response = requests.get(url, timeout=60)

if response.ok:
    # Parsed catalog gets its own name so `response` keeps referring to the HTTP response.
    catalog = response.json()
    dataset = catalog['dataset']
    # The loop variable is `entry`, not `set`: binding `set` at notebook level would shadow
    # the builtin for every cell that runs afterwards in the same session.
    for entry in dataset:
        if entry.get('title') != title:
            continue
        for distro in entry.get('distribution', []):
            # distributions without a mediaType are skipped, same as non-CSV ones
            if distro.get('mediaType') == "text/csv":
                csv_distros.append(distro)

    # Fail here with a clear message rather than later, when an empty list is handed to Spark.
    if not csv_distros:
        raise ValueError(f"No text/csv distributions found for dataset '{title}' in {url}")
else:
    error_message = f"An error occrred in downloading the files from CMS Website: {response}"
    print(error_message)
    notebookutils.notebook.exit(error_message, 1)

#print(csv_distros)

StatementMeta(, a42d77ff-8079-4f32-9fde-39570fed60a7, 18, Finished, Available, Finished)

In [None]:
#build a spark dataframe holding one row per CSV distribution found for the dataset
#only the downloadURL and title fields of each distribution are carried over as columns
selected_dataset = [
    {field: distro[field] for field in ("downloadURL", "title")}
    for distro in csv_distros
]
df = spark.createDataFrame(selected_dataset)
display(df)

StatementMeta(, a42d77ff-8079-4f32-9fde-39570fed60a7, 19, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, a8133124-d4b5-4730-b36f-0e3a07fd2a9d)

In [None]:
from pyspark.sql.functions import regexp_extract

#identify Year value from the Title and add that as a column to dataframe
#the pattern captures the first run of four digits in the title (e.g. "2016" from "... : 2016-12-31");
#regexp_extract yields an empty string when the title contains no such run
#the year stays a string column; it is later concatenated into the output file name
df = df.withColumn("year", regexp_extract("title", r"(\d{4})", 1))
display(df)

StatementMeta(, a42d77ff-8079-4f32-9fde-39570fed60a7, 20, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 22b0e535-76f8-4dee-afa1-4898dc154950)

In [None]:
#todo: test purpose limiting to download of just 1 file
# NOTE(review): this limit is still active, so only a single file is downloaded by the cells below.
# No ordering is applied before limit(1), so which year is kept is not defined by this code.
# Remove (or parameterize) this cell before using the notebook to load the full dataset.
df = df.limit(1)
display(df)

StatementMeta(, a42d77ff-8079-4f32-9fde-39570fed60a7, 21, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 90c46ba5-62fc-4531-b341-7dc8b0885d67)

In [None]:
import random, time


#function to download the file from URL
def download_file(url, filename, retries = 3, interval = 30, timeout = (10, 300)):
    """Download ``url`` to ``filename``, retrying on HTTP and network errors.

    The response body is streamed to disk in chunks instead of being held in
    memory as a whole, so memory use does not grow with the file size.

    Args:
        url: Address of the file to download.
        filename: Local path to write to; an existing file is overwritten.
        retries: Maximum number of attempts. Must be at least 1.
        interval: Nominal wait in seconds between attempts. The actual wait is
            drawn at random from 50% to 150% of this value (15-45 seconds for
            the default of 30) so parallel downloads do not retry in lockstep.
        timeout: Passed to ``requests.get``; a ``(connect, read)`` pair in
            seconds, or a single number applied to both.

    Raises:
        ValueError: If ``retries`` is less than 1.
        RuntimeError: If every attempt failed. The last ``requests`` error is
            attached as the cause.
    """
    # function-scope import: the import line at the top of this cell only brings in random and time
    import os

    if retries < 1:
        raise ValueError(f"retries must be at least 1, got {retries}")

    chunk_size = 8 * 1024 * 1024
    base_wait = max(0, int(interval))

    #usually APIs are rate limited so good idea to have retry pattern implemented for downloads
    for attempt in range(1, retries + 1):
        file_opened = False
        try:
            # stream=True defers fetching the body until iter_content is consumed
            with requests.get(url, stream=True, timeout=timeout) as response:
                print(f"Status Code: {response.status_code}")  # Print the status code
                response.raise_for_status()  # Check if the request was successful

                with open(filename, 'wb') as file:
                    file_opened = True
                    for chunk in response.iter_content(chunk_size=chunk_size):
                        file.write(chunk)

            #file downloaded succesfully so no further attempts are needed
            return
        except requests.exceptions.RequestException as e:
            print(f"Attempt {attempt} failed: {e}")

            # A failure in the middle of the body leaves a truncated file behind;
            # remove it so it cannot be mistaken for a complete download.
            if file_opened:
                try:
                    os.remove(filename)
                except OSError as cleanup_error:
                    print(f"Could not remove partial file {filename}: {cleanup_error}")

            if attempt == retries:
                print("All attempts failed. Download unsuccessful.")
                raise RuntimeError("Failed to download file after multiple attempts") from e

            # Pick the wait first so the message reports the time actually slept.
            wait_seconds = random.randint(base_wait // 2, base_wait + base_wait // 2)
            print(f"Retrying in {wait_seconds} seconds...")
            time.sleep(wait_seconds)

#function to process each DataFrame Row which corresponds to single file in the dataset
#downloaded file is named based on the year value associated with data
def process_partition(partition):
    """Download the file described by each row of one partition.

    Args:
        partition: Iterable of rows, each providing a 'year' and a
            'downloadURL' value.

    Each file is written as ``<year>.csv`` under the notebook-level
    ``lakehouse_dir_local_mount_path``. Rows sharing the same year value
    therefore write to the same file name. Errors raised by
    ``download_file`` are not caught here.
    """
    for row in partition:
        year_value = row['year']
        #files go to the explicitly mounted folder, not the default lakehouse path (/lakehouse/default/...)
        output_file = lakehouse_dir_local_mount_path + "/" + year_value + ".csv"
        download_file(row['downloadURL'], output_file)

StatementMeta(, a42d77ff-8079-4f32-9fde-39570fed60a7, 22, Finished, Available, Finished)

In [None]:
#process the dataframe where each row represents a file to be downloaded from CMS file
#foreachPartition calls process_partition once per partition with an iterator over that partition's rows
# NOTE(review): the function writes to lakehouse_dir_local_mount_path, so this assumes the mounted
# path is reachable wherever the partitions are processed - confirm for multi-node pools
df.rdd.foreachPartition(process_partition)

StatementMeta(, a42d77ff-8079-4f32-9fde-39570fed60a7, 23, Finished, Available, Finished)

In [None]:
#release the mount created earlier for the download folder now that the files are written
#(the recorded run shows the call returning True)
notebookutils.fs.unmount(mount_point)

StatementMeta(, a42d77ff-8079-4f32-9fde-39570fed60a7, 24, Finished, Available, Finished)

True