
#### Run the cell below to install the required packages for Copilot


## Load CMS Medicare Part D data to Lakehouse

1. Download the CMS dataset files to your local machine from the CMS website - [Medicare Part D Prescribers - by Provider and Drug](https://data.cms.gov/provider-summary-by-type-of-service/medicare-part-d-prescribers/medicare-part-d-prescribers-by-provider-and-drug)
2. Upload the files to a `cmsfiles` folder in the Files section of the Lakehouse (the load cell below reads from `Files/cmsfiles/`), using the web interface or Azure Storage Explorer as per the instructions [here](https://learn.microsoft.com/en-us/fabric/onelake/onelake-azure-storage-explorer)



In [None]:
# Remove any previous copy of the combined table before reloading it.
drop_statement = "DROP TABLE IF EXISTS cms_provider_drug_costs"
spark.sql(drop_statement)

In [None]:
# Maps each data year to the Lakehouse-relative path of that year's CMS
# "Medicare Part D Prescribers - by Provider and Drug" CSV extract.
file_dict = {
    2013: "Files/cmsfiles/MUP_DPR_RY21_P04_V10_DY13_NPIBN_4.csv",
    2014: "Files/cmsfiles/MUP_DPR_RY21_P04_V10_DY14_NPIBN_1.csv",
    2015: "Files/cmsfiles/MUP_DPR_RY21_P04_V10_DY15_NPIBN_1.csv",
    2016: "Files/cmsfiles/MUP_DPR_RY21_P04_V10_DY16_NPIBN_0.csv",
    2017: "Files/cmsfiles/MUP_DPR_RY21_P04_V10_DY17_NPIBN_1.csv",
    2018: "Files/cmsfiles/MUP_DPR_RY21_P04_V10_DY18_NPIBN_0.csv",
    2019: "Files/cmsfiles/MUP_DPR_RY21_P04_V10_DY19_NPIBN_1.csv",
    2020: "Files/cmsfiles/MUP_DPR_RY22_P04_V10_DY20_NPIBN_0.csv",
    2021: "Files/cmsfiles/MUP_DPR_RY23_P04_V10_DY21_NPIBN.csv",
}

In [None]:
from pyspark.sql.functions import col, concat, concat_ws, lit
from pyspark.sql.types import DecimalType, LongType

# Load each yearly CMS extract listed in `file_dict`, normalise its column
# types, add the derived columns, and append it to the combined Delta table
# stored at Tables/cms_provider_drug_costs.

# Columns cast to DECIMAL(10, 2).
decimal_columns = (
    "Tot_Drug_Cst",
    "Tot_30day_Fills",
    "GE65_Tot_30day_Fills",
    "GE65_Tot_Drug_Cst",
)

# Columns cast to LONG.
long_columns = (
    "Tot_Clms",
    "Tot_Day_Suply",
    "Tot_Benes",
    "GE65_Tot_Clms",
    "GE65_Tot_Benes",
    "GE65_Tot_Day_Suply",
)

for key, v in file_dict.items():
    print(f"Key: {key}, Value: {v}")

    df = spark.read.format("csv").option("header", "true").option("inferschema", "true").load(v)

    # Tag every row with the data year taken from the dictionary key.
    df = df.withColumn("Year", lit(key))

    # withColumn on an existing name replaces the column in place, so the
    # casts below leave the column order unchanged.
    for column_name in decimal_columns:
        df = df.withColumn(column_name, col(column_name).cast(DecimalType(10, 2)))

    # concat_ws skips NULL inputs, whereas concat returns NULL as soon as any
    # input is NULL. With concat, a row with no first name got a NULL
    # Prscrbr_Full_Name instead of the last/organisation name alone.
    df = df.withColumn(
        "Prscrbr_City_State",
        concat_ws(", ", col("Prscrbr_City"), col("Prscrbr_State_Abrvtn")),
    ).withColumn(
        "Prscrbr_Full_Name",
        concat_ws(", ", col("Prscrbr_Last_Org_Name"), col("Prscrbr_First_Name")),
    )

    for column_name in long_columns:
        df = df.withColumn(column_name, col(column_name).cast(LongType()))

    display(df)

    print(f'Writing {key} data to table - {df.count()} records')
    df.write.mode("append").format('delta').save("Tables/cms_provider_drug_costs")


In [None]:
# Read the combined table back into `df` and preview it.
combined_table = "cms_provider_drug_costs"
df = spark.read.table(combined_table)
display(df)

In [None]:
# Total number of rows in `df`.
row_count = df.count()
print(row_count)

In [None]:
# Print the schema (column names and data types) of `df`.
df.printSchema()

In [None]:
%%sql

-- Sum of Tot_Day_Suply over every row of the combined table.
SELECT SUM(Tot_Day_Suply)
FROM cms_provider_drug_costs

In [None]:
%%sql

-- Show the table-level details reported for the combined table.
DESCRIBE DETAIL cms_provider_drug_costs