## Load CMS Medicare Part D data to Lakehouse

1. Download the CMS dataset files to your local machine from the CMS website - [Medicare Part D Prescribers - by Provider and Drug](https://data.cms.gov/provider-summary-by-type-of-service/medicare-part-d-prescribers/medicare-part-d-prescribers-by-provider-and-drug)
2. Upload the files to the Files section of the Lakehouse using the web interface or Azure Storage Explorer, following the instructions [here](https://learn.microsoft.com/en-us/fabric/onelake/onelake-azure-storage-explorer)



In [13]:
# Drop the cms_provider_drug_costs table if it already exists (no error when it
# is missing); presumably so the load further down starts from a clean table.
spark.sql("DROP TABLE IF EXISTS cms_provider_drug_costs")

StatementMeta(, 6d4e45c9-5c85-449c-8cfc-4df4f4839f52, 15, Finished, Available)

DataFrame[]

In [14]:
# Maps each year (2013-2021) to the path of that year's CSV file under Files/.
# The load loop iterates this dict in insertion order and stores the key in the
# "Year" column of the rows read from the corresponding file.
file_dict = {
    2013: "Files/MUP_DPR_RY21_P04_V10_DY13_NPIBN_4.csv",
    2014: "Files/MUP_DPR_RY21_P04_V10_DY14_NPIBN_1.csv",
    2015: "Files/MUP_DPR_RY21_P04_V10_DY15_NPIBN_1.csv",
    2016: "Files/MUP_DPR_RY21_P04_V10_DY16_NPIBN_0.csv",
    2017: "Files/MUP_DPR_RY21_P04_V10_DY17_NPIBN_1.csv",
    2018: "Files/MUP_DPR_RY21_P04_V10_DY18_NPIBN_0.csv",
    2019: "Files/MUP_DPR_RY21_P04_V10_DY19_NPIBN_1.csv",
    2020: "Files/MUP_DPR_RY22_P04_V10_DY20_NPIBN_0.csv",
    2021: "Files/MUP_DPR_RY23_P04_V10_DY21_NPIBN.csv",
}

StatementMeta(, 6d4e45c9-5c85-449c-8cfc-4df4f4839f52, 16, Finished, Available)

In [18]:
from pyspark.sql.functions import col, concat, concat_ws, lit
from pyspark.sql.types import DecimalType, LongType

# Load every CSV listed in file_dict into a single Delta table stored at
# Tables/cms_provider_drug_costs. The first file is written with "overwrite"
# and every later file with "append", so re-running this cell rebuilds the
# table instead of adding duplicate rows.
first_file = True

for key, v in file_dict.items():
    print(f"Key: {key}, Value: {v}")

    # Column types are inferred separately for each file; the numeric measure
    # columns are cast explicitly below so they get the same type for every year.
    df = spark.read.format("csv").option("header", "true").option("inferschema", "true").load(v)

    if first_file:
        mode = "overwrite"
        first_file = False  # every file after the first one is appended
        df.printSchema()  # show the inferred schema once, for the first file only
    else:
        mode = "append"

    df = (
        df.withColumn("Year", lit(key))
        .withColumn("Tot_Drug_Cst", df.Tot_Drug_Cst.cast(DecimalType(10, 2)))
        .withColumn("Tot_30day_Fills", df.Tot_30day_Fills.cast(DecimalType(10, 2)))
        .withColumn("GE65_Tot_30day_Fills", df.GE65_Tot_30day_Fills.cast(DecimalType(10, 2)))
        .withColumn("GE65_Tot_Drug_Cst", df.GE65_Tot_Drug_Cst.cast(DecimalType(10, 2)))
        # concat() returns NULL as soon as any input is NULL, which would blank
        # the whole derived value for rows with a missing part (for example a
        # row without Prscrbr_First_Name - presumably organizations; confirm
        # against the data). concat_ws() skips NULL inputs and keeps the rest.
        # Note: if every input is NULL the result is an empty string, not NULL.
        .withColumn("Prscrbr_City_State", concat_ws(", ", df.Prscrbr_City, df.Prscrbr_State_Abrvtn))
        .withColumn("Prscrbr_Full_Name", concat_ws(", ", df.Prscrbr_Last_Org_Name, df.Prscrbr_First_Name))
        .withColumn("Tot_Clms", df.Tot_Clms.cast(LongType()))
        .withColumn("Tot_Day_Suply", df.Tot_Day_Suply.cast(LongType()))
        .withColumn("Tot_Benes", df.Tot_Benes.cast(LongType()))
        .withColumn("GE65_Tot_Clms", df.GE65_Tot_Clms.cast(LongType()))
        .withColumn("GE65_Tot_Benes", df.GE65_Tot_Benes.cast(LongType()))
        .withColumn("GE65_Tot_Day_Suply", df.GE65_Tot_Day_Suply.cast(LongType()))
    )

    display(df)

    # df.count() runs its own Spark job over the file, in addition to the write.
    print(f'Writing {key} data to table - {df.count()} records')
    df.write.mode(mode).format('delta').save("Tables/cms_provider_drug_costs")


StatementMeta(, 6d4e45c9-5c85-449c-8cfc-4df4f4839f52, 20, Finished, Available)

Key: 2013, Value: Files/MUP_DPR_RY21_P04_V10_DY13_NPIBN_4.csv
root
 |-- Prscrbr_NPI: integer (nullable = true)
 |-- Prscrbr_Last_Org_Name: string (nullable = true)
 |-- Prscrbr_First_Name: string (nullable = true)
 |-- Prscrbr_City: string (nullable = true)
 |-- Prscrbr_State_Abrvtn: string (nullable = true)
 |-- Prscrbr_State_FIPS: string (nullable = true)
 |-- Prscrbr_Type: string (nullable = true)
 |-- Prscrbr_Type_Src: string (nullable = true)
 |-- Brnd_Name: string (nullable = true)
 |-- Gnrc_Name: string (nullable = true)
 |-- Tot_Clms: integer (nullable = true)
 |-- Tot_30day_Fills: double (nullable = true)
 |-- Tot_Day_Suply: integer (nullable = true)
 |-- Tot_Drug_Cst: double (nullable = true)
 |-- Tot_Benes: integer (nullable = true)
 |-- GE65_Sprsn_Flag: string (nullable = true)
 |-- GE65_Tot_Clms: integer (nullable = true)
 |-- GE65_Tot_30day_Fills: double (nullable = true)
 |-- GE65_Tot_Drug_Cst: double (nullable = true)
 |-- GE65_Tot_Day_Suply: integer (nullable = true)
 

SynapseWidget(Synapse.DataFrame, 803c98cc-7378-4ae7-b351-669c293b4f29)

Writing 2013 data to table - 23645873 records
Key: 2014, Value: Files/MUP_DPR_RY21_P04_V10_DY14_NPIBN_1.csv


SynapseWidget(Synapse.DataFrame, 435c5e77-51f9-41b5-bbd3-af7a0dca57e8)

Writing 2014 data to table - 24120618 records
Key: 2015, Value: Files/MUP_DPR_RY21_P04_V10_DY15_NPIBN_1.csv


SynapseWidget(Synapse.DataFrame, 3682cff7-6ffb-4636-901c-33da050be4af)

Writing 2015 data to table - 24524894 records
Key: 2016, Value: Files/MUP_DPR_RY21_P04_V10_DY16_NPIBN_0.csv


SynapseWidget(Synapse.DataFrame, 916fa09e-ee20-4721-b532-237220598727)

Writing 2016 data to table - 24964300 records
Key: 2017, Value: Files/MUP_DPR_RY21_P04_V10_DY17_NPIBN_1.csv


SynapseWidget(Synapse.DataFrame, 2ced1352-6ad1-458f-9a7c-ff5d97640fc9)

Writing 2017 data to table - 25209130 records
Key: 2018, Value: Files/MUP_DPR_RY21_P04_V10_DY18_NPIBN_0.csv


SynapseWidget(Synapse.DataFrame, 79fd37c5-4a90-4528-bba5-8690297faf83)

Writing 2018 data to table - 25311600 records
Key: 2019, Value: Files/MUP_DPR_RY21_P04_V10_DY19_NPIBN_1.csv


SynapseWidget(Synapse.DataFrame, 6acefa25-f750-43dd-b4f7-d90ff5bd1c38)

Writing 2019 data to table - 25401870 records
Key: 2020, Value: Files/MUP_DPR_RY22_P04_V10_DY20_NPIBN_0.csv


SynapseWidget(Synapse.DataFrame, c356928e-95ab-46bc-ab41-be7c3e10a991)

Writing 2020 data to table - 25209729 records
Key: 2021, Value: Files/MUP_DPR_RY23_P04_V10_DY21_NPIBN.csv


SynapseWidget(Synapse.DataFrame, d1ca7d71-415f-44fb-a8b8-e17ff6bacbc0)

Writing 2021 data to table - 25231862 records


In [1]:
# Read the cms_provider_drug_costs table into a DataFrame and show a preview.
# `df` is reused by the following cells.
df = spark.read.table("cms_provider_drug_costs")

display(df)

StatementMeta(, ff521db1-330e-473c-bf1e-5aaf49ca4be2, 3, Finished, Available)

SynapseWidget(Synapse.DataFrame, 40c4eb10-3a23-43ea-95ee-7ffb92fcb9a5)

In [2]:
# Print the number of rows in df (the table read in the previous cell).
print(df.count())

StatementMeta(, ff521db1-330e-473c-bf1e-5aaf49ca4be2, 4, Finished, Available)

223619876


In [None]:
# Print the column names and data types of df.
df.printSchema()

StatementMeta(, , , Cancelled, )

In [None]:
%%sql

-- Sum the Tot_Day_Suply column over every row of the table.
select sum(Tot_Day_Suply) from cms_provider_drug_costs

StatementMeta(, , , Cancelled, )

In [None]:
%%sql

-- Show table-level metadata for the cms_provider_drug_costs Delta table.
DESCRIBE DETAIL cms_provider_drug_costs

StatementMeta(, , , Cancelled, )