##

## Read CMS Provider Drug Cost flat table (Silver Layer) and populate Star Schema Tables (Gold Layer)

This notebook is the second step in the sample. It reads the *cms_provider_drug_costs* Delta Parquet table created in the first step as input and generates star schema tables as output.

In [1]:
# Year dimension: one row per distinct Year in the flat table, plus a
# 'YYYY-01-01' string (Year_Date_Key) built from that year.
dim_year_df = spark.sql(
    "SELECT DISTINCT Year, CONCAT(CAST(Year AS String), '-01-01') AS Year_Date_Key FROM cms_provider_drug_costs"
)
display(dim_year_df)

StatementMeta(, 9c48404d-64e4-4ad1-9b6e-449bb831685a, 3, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 0156fa55-4447-486c-8b01-5dfc858bc9cc)

In [2]:
from pyspark.sql.types import DateType
from pyspark.sql.functions import col


# Replace the string Year_Date_Key with the same value cast to a Spark DateType.
dim_year_df = dim_year_df.withColumn(
    "Year_Date_Key",
    col("Year_Date_Key").cast(DateType()),
)
display(dim_year_df)

StatementMeta(, 9c48404d-64e4-4ad1-9b6e-449bb831685a, 4, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, a655536f-3cd9-4712-afef-b92eaab99d07)

In [3]:
# Persist the year dimension, replacing any previous version of the table.
# The format is stated explicitly (matching the fact-table write in this notebook)
# instead of relying on the session's default table format.
(
    dim_year_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("cms_provider_dim_year")
)

StatementMeta(, 9c48404d-64e4-4ad1-9b6e-449bb831685a, 5, Finished, Available, Finished)

In [4]:
# Geography dimension: one row per distinct combination of
# (Prscrbr_City, Prscrbr_City_State, Prscrbr_State_Abrvtn, Prscrbr_State_FIPS)
# found in the flat table, with the first (Min_Year) and last (Max_Year) year seen.
#
# geo_key is a surrogate key produced by row_number(). The window ORDER BY lists
# every GROUP BY column (state and city/state first, then city and FIPS as
# tie-breakers) so the ordering is total and key assignment is deterministic for a
# given input; ordering on only a subset of the grain leaves ties that Spark may
# number differently from run to run.
dim_geo_df = spark.sql('''SELECT Prscrbr_City, Prscrbr_City_State, Prscrbr_State_Abrvtn, Prscrbr_State_FIPS, MAX(Year) AS Max_Year, MIN(Year) AS Min_Year,
    row_number() OVER (ORDER BY Prscrbr_State_Abrvtn, Prscrbr_City_State, Prscrbr_City, Prscrbr_State_FIPS ASC) AS geo_key
    FROM cms_provider_drug_costs
    GROUP BY Prscrbr_City,Prscrbr_City_State,Prscrbr_State_Abrvtn,Prscrbr_State_FIPS ''')

display(dim_geo_df)

StatementMeta(, 9c48404d-64e4-4ad1-9b6e-449bb831685a, 6, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, d361e746-b940-4a66-a055-07b8306d1861)

In [5]:
# Persist the geography dimension, replacing any previous version of the table.
# The format is stated explicitly (matching the fact-table write in this notebook)
# instead of relying on the session's default table format.
(
    dim_geo_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("cms_provider_dim_geography")
)

StatementMeta(, 9c48404d-64e4-4ad1-9b6e-449bb831685a, 7, Finished, Available, Finished)

In [6]:
# Provider dimension: one row per distinct combination of the six prescriber
# attributes below, with the first (Min_Year) and last (Max_Year) year seen.
#
# provider_key is a surrogate key produced by row_number(). The window ORDER BY
# keeps the original leading sort columns and appends the remaining GROUP BY
# columns (Prscrbr_First_Name, Prscrbr_Last_Org_Name) as tie-breakers so the
# ordering is total and key assignment is deterministic for a given input.
dim_provider_df = spark.sql('''SELECT Prscrbr_First_Name
,Prscrbr_Full_Name
,Prscrbr_Last_Org_Name
,Prscrbr_NPI
,Prscrbr_Type
,Prscrbr_Type_Src
,MAX(Year) AS Max_Year
,MIN(Year) AS Min_Year
,row_number() OVER (ORDER BY Prscrbr_Full_Name,Prscrbr_NPI,Prscrbr_Type,Prscrbr_Type_Src,Prscrbr_First_Name,Prscrbr_Last_Org_Name ASC) AS provider_key
FROM cms_provider_drug_costs
GROUP BY Prscrbr_First_Name,Prscrbr_Full_Name,Prscrbr_Last_Org_Name,Prscrbr_NPI,Prscrbr_Type,Prscrbr_Type_Src''')

StatementMeta(, 9c48404d-64e4-4ad1-9b6e-449bb831685a, 8, Finished, Available, Finished)

In [7]:
# Persist the provider dimension, replacing any previous version of the table.
# The format is stated explicitly (matching the fact-table write in this notebook)
# instead of relying on the session's default table format.
(
    dim_provider_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("cms_provider_dim_provider")
)

StatementMeta(, 9c48404d-64e4-4ad1-9b6e-449bb831685a, 9, Finished, Available, Finished)

In [8]:
# Drug dimension: one row per distinct (Brnd_Name, Gnrc_Name) pair in the flat
# table, with the first (Min_Year) and last (Max_Year) year seen and a
# row_number()-based surrogate key (drug_key) ordered by brand then generic name.
dim_drug_df = spark.sql(
    "SELECT Brnd_Name\n"
    ",Gnrc_Name\n"
    ",MAX(Year) AS Max_Year\n"
    ",MIN(Year) AS Min_Year\n"
    ",row_number() OVER (ORDER BY Brnd_Name,Gnrc_Name ASC) AS drug_key\n"
    "FROM cms_provider_drug_costs\n"
    "GROUP BY Brnd_Name, Gnrc_Name"
)

StatementMeta(, 9c48404d-64e4-4ad1-9b6e-449bb831685a, 10, Finished, Available, Finished)

In [9]:
# Persist the drug dimension, replacing any previous version of the table.
# The format is stated explicitly (matching the fact-table write in this notebook)
# instead of relying on the session's default table format.
(
    dim_drug_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("cms_provider_dim_drug")
)

StatementMeta(, 9c48404d-64e4-4ad1-9b6e-449bb831685a, 11, Finished, Available, Finished)

In [10]:
# Turn on Spark SQL's case-sensitive identifier setting for this Spark session.
# NOTE(review): nothing in this notebook shows why case sensitivity is required
# from this point on — confirm it is still needed, and if so consider moving it
# to the top so every cell runs under the same setting.
spark.conf.set("spark.sql.caseSensitive", "true")

StatementMeta(, 9c48404d-64e4-4ad1-9b6e-449bb831685a, 12, Finished, Available, Finished)

In [11]:
# Sanity check: show up to 1000 rows of the saved provider dimension.
df = spark.sql(
    "SELECT * FROM cms_lakehouse.dbo.cms_provider_dim_provider LIMIT 1000"
)
display(df)

StatementMeta(, 9c48404d-64e4-4ad1-9b6e-449bb831685a, 13, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 459d8ad6-5056-4265-9b36-0d737649f45f)

In [12]:
# Sanity check: show 10 rows of prescriber NPI and city from the flat source table.
df = spark.sql(
    "SELECT Prscrbr_NPI,\n"
    "Prscrbr_City FROM cms_lakehouse.dbo.cms_provider_drug_costs LIMIT 10"
)
display(df)

StatementMeta(, 9c48404d-64e4-4ad1-9b6e-449bb831685a, 14, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 34fe90aa-2391-48bd-9029-94aea3b34e89)

In [1]:
# Fact table: every row of the flat table (alias a) keeps its measures and Year and
# gains the surrogate keys of the drug (b), geography (c) and provider (d)
# dimensions saved earlier in this notebook.
#
# Join rules:
#   * Each dimension is joined on ALL of the columns it was grouped by. Joining on
#     only part of that grain can match several dimension rows for one source row
#     and silently duplicate fact rows.
#   * Every comparison uses IS NOT DISTINCT FROM (null-safe equality). The
#     dimensions were built with GROUP BY, which keeps NULL attribute values as
#     their own rows; a plain '=' never matches NULL and would leave those fact
#     rows with a NULL key.
#   * Source columns are qualified with the alias `a` so the select list stays
#     unambiguous if a dimension ever gains a column of the same name.
# LEFT OUTER JOIN is kept so a source row is never dropped.
drug_costs_star_df = spark.sql("""SELECT a.GE65_Bene_Sprsn_Flag
,a.GE65_Sprsn_Flag
,a.GE65_Tot_30day_Fills
,a.GE65_Tot_Benes
,a.GE65_Tot_Clms
,a.GE65_Tot_Day_Suply
,a.GE65_Tot_Drug_Cst
,a.Tot_30day_Fills
,a.Tot_Benes
,a.Tot_Clms
,a.Tot_Day_Suply
,a.Tot_Drug_Cst
,a.Year
,b.drug_key
,c.geo_key
,d.provider_key
FROM cms_provider_drug_costs a
LEFT OUTER JOIN cms_provider_dim_drug b
    ON a.Brnd_Name IS NOT DISTINCT FROM b.Brnd_Name
    AND a.Gnrc_Name IS NOT DISTINCT FROM b.Gnrc_Name
LEFT OUTER JOIN cms_provider_dim_geography c
    ON a.Prscrbr_City IS NOT DISTINCT FROM c.Prscrbr_City
    AND a.Prscrbr_City_State IS NOT DISTINCT FROM c.Prscrbr_City_State
    AND a.Prscrbr_State_Abrvtn IS NOT DISTINCT FROM c.Prscrbr_State_Abrvtn
    AND a.Prscrbr_State_FIPS IS NOT DISTINCT FROM c.Prscrbr_State_FIPS
LEFT OUTER JOIN cms_provider_dim_provider d
    ON a.Prscrbr_First_Name IS NOT DISTINCT FROM d.Prscrbr_First_Name
    AND a.Prscrbr_Full_Name IS NOT DISTINCT FROM d.Prscrbr_Full_Name
    AND a.Prscrbr_Last_Org_Name IS NOT DISTINCT FROM d.Prscrbr_Last_Org_Name
    AND a.Prscrbr_NPI IS NOT DISTINCT FROM d.Prscrbr_NPI
    AND a.Prscrbr_Type IS NOT DISTINCT FROM d.Prscrbr_Type
    AND a.Prscrbr_Type_Src IS NOT DISTINCT FROM d.Prscrbr_Type_Src""")


StatementMeta(, 4eafe4c5-93a3-46dd-8557-de013a70f757, 3, Finished, Available, Finished)

In [2]:
# Sanity check: show up to 1000 rows of the flat source table.
df = spark.sql(
    "SELECT * FROM cms_lakehouse.dbo.cms_provider_drug_costs LIMIT 1000"
)
display(df)

StatementMeta(, 4eafe4c5-93a3-46dd-8557-de013a70f757, 4, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 32de1cca-d9cc-43b6-a880-86e724475473)

In [3]:
# Persist the fact table as a Delta table, replacing any previous version.
(
    drug_costs_star_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("cms_provider_drug_costs_star")
)

StatementMeta(, 4eafe4c5-93a3-46dd-8557-de013a70f757, 5, Finished, Available, Finished)