
#### Run the cell below to install the required packages for Copilot


## Load CMS Medicare Part D data to Lakehouse

1. Download the CMS dataset files (one CSV per year) to your local machine from the CMS website - [Medicare Part D Prescribers - by Provider and Drug](https://data.cms.gov/provider-summary-by-type-of-service/medicare-part-d-prescribers/medicare-part-d-prescribers-by-provider-and-drug)
2. Upload the files to a folder named `cmsdrugprescriptions` in the Files section of the Lakehouse (the code below reads from `Files/cmsdrugprescriptions/`), using the web interface or Azure Storage Explorer as described in the instructions [here](https://learn.microsoft.com/en-us/fabric/onelake/onelake-azure-storage-explorer)



In [1]:
# Remove the cms_provider_drug_costs table if it already exists; the load
# cell further down writes the data to Tables/cms_provider_drug_costs again.
spark.sql("DROP TABLE IF EXISTS cms_provider_drug_costs")

StatementMeta(, bc27a890-6295-46fc-a09c-ac494d9fa4b4, 3, Finished, Available)

DataFrame[]

In [2]:
# Lakehouse-relative folder that holds the uploaded CMS CSV files.
CMS_FILES_DIR = "Files/cmsdrugprescriptions"

# Maps each data year to the path of its CSV file. Entries are listed in
# ascending year order; the load loop overwrites the table with the first
# entry and appends every later one.
file_dict = {
    year: f"{CMS_FILES_DIR}/{file_name}"
    for year, file_name in (
        (2013, "MUP_DPR_RY21_P04_V10_DY13_NPIBN_4.csv"),
        (2014, "MUP_DPR_RY21_P04_V10_DY14_NPIBN_1.csv"),
        (2015, "MUP_DPR_RY21_P04_V10_DY15_NPIBN_1.csv"),
        (2016, "MUP_DPR_RY21_P04_V10_DY16_NPIBN_0.csv"),
        (2017, "MUP_DPR_RY21_P04_V10_DY17_NPIBN_1.csv"),
        (2018, "MUP_DPR_RY21_P04_V10_DY18_NPIBN_0.csv"),
        (2019, "MUP_DPR_RY21_P04_V10_DY19_NPIBN_1.csv"),
        (2020, "MUP_DPR_RY22_P04_V10_DY20_NPIBN_0.csv"),
        (2021, "MUP_DPR_RY23_P04_V10_DY21_NPIBN.csv"),
    )
}

StatementMeta(, bc27a890-6295-46fc-a09c-ac494d9fa4b4, 4, Finished, Available)

In [3]:
from pyspark.sql.types import LongType, DecimalType
from pyspark.sql.functions import lit, col, concat, concat_ws

# Load every yearly CSV listed in `file_dict`, normalise the numeric column
# types, add the derived columns (Year, Prscrbr_City_State, Prscrbr_Full_Name)
# and write the result to the Delta table at Tables/cms_provider_drug_costs.
# The first file overwrites the table; every later file is appended.

# Columns cast to fixed types so every year is written with the same schema,
# independent of what schema inference picked for a particular file.
decimal_columns = [
    "Tot_Drug_Cst",
    "Tot_30day_Fills",
    "GE65_Tot_30day_Fills",
    "GE65_Tot_Drug_Cst",
]
long_columns = [
    "Tot_Clms",
    "Tot_Day_Suply",
    "Tot_Benes",
    "GE65_Tot_Clms",
    "GE65_Tot_Benes",
    "GE65_Tot_Day_Suply",
]

first_file = True

for key, v in file_dict.items():
    print(f"Key: {key}, Value: {v}")

    df = spark.read.format("csv").option("header","true").option("inferschema","true").load(v)

    if first_file:
        mode = "overwrite"
        first_file = False  # every subsequent file is appended
        df.printSchema()
    else:
        mode = "append"

    # Tag each row with the data year it came from.
    df = df.withColumn("Year", lit(key))

    for column_name in decimal_columns:
        df = df.withColumn(column_name, col(column_name).cast(DecimalType(10,2)))

    df = df.withColumn(
        "Prscrbr_City_State",
        concat(col("Prscrbr_City"), lit(", "), col("Prscrbr_State_Abrvtn")),
    )

    # concat() yields NULL as soon as one input is NULL, which would blank the
    # full name of every row without a first name (presumably organisation
    # prescribers - verify against the CMS data dictionary). concat_ws() skips
    # NULL inputs, so those rows keep their last/organisation name.
    df = df.withColumn(
        "Prscrbr_Full_Name",
        concat_ws(", ", col("Prscrbr_Last_Org_Name"), col("Prscrbr_First_Name")),
    )

    for column_name in long_columns:
        df = df.withColumn(column_name, col(column_name).cast(LongType()))

    display(df)

    # NOTE: count() runs a separate Spark job over the CSV before the write.
    print(f'Writing {key} data to table - {df.count()} records')
    df.write.mode(mode).format('delta').save("Tables/cms_provider_drug_costs")


StatementMeta(, bc27a890-6295-46fc-a09c-ac494d9fa4b4, 5, Finished, Available)

Key: 2013, Value: Files/cmsdrugprescriptions/MUP_DPR_RY21_P04_V10_DY13_NPIBN_4.csv
root
 |-- Prscrbr_NPI: integer (nullable = true)
 |-- Prscrbr_Last_Org_Name: string (nullable = true)
 |-- Prscrbr_First_Name: string (nullable = true)
 |-- Prscrbr_City: string (nullable = true)
 |-- Prscrbr_State_Abrvtn: string (nullable = true)
 |-- Prscrbr_State_FIPS: string (nullable = true)
 |-- Prscrbr_Type: string (nullable = true)
 |-- Prscrbr_Type_Src: string (nullable = true)
 |-- Brnd_Name: string (nullable = true)
 |-- Gnrc_Name: string (nullable = true)
 |-- Tot_Clms: integer (nullable = true)
 |-- Tot_30day_Fills: double (nullable = true)
 |-- Tot_Day_Suply: integer (nullable = true)
 |-- Tot_Drug_Cst: double (nullable = true)
 |-- Tot_Benes: integer (nullable = true)
 |-- GE65_Sprsn_Flag: string (nullable = true)
 |-- GE65_Tot_Clms: integer (nullable = true)
 |-- GE65_Tot_30day_Fills: double (nullable = true)
 |-- GE65_Tot_Drug_Cst: double (nullable = true)
 |-- GE65_Tot_Day_Suply: intege

SynapseWidget(Synapse.DataFrame, 915918d9-00de-4a96-805d-3a56297aefaf)

Writing 2013 data to table - 23645873 records
Key: 2014, Value: Files/cmsdrugprescriptions/MUP_DPR_RY21_P04_V10_DY14_NPIBN_1.csv


SynapseWidget(Synapse.DataFrame, b1771d13-4aa0-4d2e-85d5-53dc3cc1dc63)

Writing 2014 data to table - 24120618 records
Key: 2015, Value: Files/cmsdrugprescriptions/MUP_DPR_RY21_P04_V10_DY15_NPIBN_1.csv


SynapseWidget(Synapse.DataFrame, 0002db96-9a0f-4891-bf6f-d27834e638be)

Writing 2015 data to table - 24524894 records
Key: 2016, Value: Files/cmsdrugprescriptions/MUP_DPR_RY21_P04_V10_DY16_NPIBN_0.csv


SynapseWidget(Synapse.DataFrame, 83085762-f776-49e3-9c76-81be2368a320)

Writing 2016 data to table - 24964300 records
Key: 2017, Value: Files/cmsdrugprescriptions/MUP_DPR_RY21_P04_V10_DY17_NPIBN_1.csv


SynapseWidget(Synapse.DataFrame, d2ec7a4f-7be0-47d1-8310-2ba56ef7ff13)

Writing 2017 data to table - 25209130 records
Key: 2018, Value: Files/cmsdrugprescriptions/MUP_DPR_RY21_P04_V10_DY18_NPIBN_0.csv


SynapseWidget(Synapse.DataFrame, a626c67f-27df-4a90-b22f-5e8000e5b0e8)

Writing 2018 data to table - 25311600 records
Key: 2019, Value: Files/cmsdrugprescriptions/MUP_DPR_RY21_P04_V10_DY19_NPIBN_1.csv


SynapseWidget(Synapse.DataFrame, bfb66b4e-f3e9-4dc2-a899-585d94db3349)

Writing 2019 data to table - 25401870 records
Key: 2020, Value: Files/cmsdrugprescriptions/MUP_DPR_RY22_P04_V10_DY20_NPIBN_0.csv


SynapseWidget(Synapse.DataFrame, db7b14cb-6b96-49c7-8d03-43d865099a54)

Writing 2020 data to table - 25209729 records
Key: 2021, Value: Files/cmsdrugprescriptions/MUP_DPR_RY23_P04_V10_DY21_NPIBN.csv


SynapseWidget(Synapse.DataFrame, bb62bf9e-f298-4df9-909e-28e88792c3a6)

Writing 2021 data to table - 25231862 records


In [4]:
# Read the cms_provider_drug_costs table back by name and preview it,
# replacing the per-file `df` left over from the load loop.
df = spark.read.table("cms_provider_drug_costs")

display(df)

StatementMeta(, bc27a890-6295-46fc-a09c-ac494d9fa4b4, 6, Finished, Available)

SynapseWidget(Synapse.DataFrame, 6bc3cb3f-88a0-4fef-9290-d7ca88b6aee8)

In [5]:
# Print the total number of rows in `df`.
print(df.count())

StatementMeta(, bc27a890-6295-46fc-a09c-ac494d9fa4b4, 7, Finished, Available)

223619876


In [6]:
# Print the column names and types of `df` to check the casts applied during the load.
df.printSchema()

StatementMeta(, bc27a890-6295-46fc-a09c-ac494d9fa4b4, 8, Finished, Available)

root
 |-- Prscrbr_NPI: integer (nullable = true)
 |-- Prscrbr_Last_Org_Name: string (nullable = true)
 |-- Prscrbr_First_Name: string (nullable = true)
 |-- Prscrbr_City: string (nullable = true)
 |-- Prscrbr_State_Abrvtn: string (nullable = true)
 |-- Prscrbr_State_FIPS: string (nullable = true)
 |-- Prscrbr_Type: string (nullable = true)
 |-- Prscrbr_Type_Src: string (nullable = true)
 |-- Brnd_Name: string (nullable = true)
 |-- Gnrc_Name: string (nullable = true)
 |-- Tot_Clms: long (nullable = true)
 |-- Tot_30day_Fills: decimal(10,2) (nullable = true)
 |-- Tot_Day_Suply: long (nullable = true)
 |-- Tot_Drug_Cst: decimal(10,2) (nullable = true)
 |-- Tot_Benes: long (nullable = true)
 |-- GE65_Sprsn_Flag: string (nullable = true)
 |-- GE65_Tot_Clms: long (nullable = true)
 |-- GE65_Tot_30day_Fills: decimal(10,2) (nullable = true)
 |-- GE65_Tot_Drug_Cst: decimal(10,2) (nullable = true)
 |-- GE65_Tot_Day_Suply: long (nullable = true)
 |-- GE65_Bene_Sprsn_Flag: string (nullable = true

In [None]:
%%sql

-- Sum of the Tot_Day_Suply column over every row of the table.
select sum(Tot_Day_Suply) from cms_provider_drug_costs

In [None]:
%%sql

-- Show table-level detail for cms_provider_drug_costs.
DESCRIBE DETAIL cms_provider_drug_costs

StatementMeta(, , , Cancelled, )