## 3- Data preparation

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Location of the input Parquet dataset (the "bronze" directory in the path).
# The variable is reused further down for the output location.
destination_path = '/FileStore/tables/Covid-USA/bronze/daily_cases_bronze'

# getOrCreate() returns the already-running session when there is one,
# otherwise it starts a new session under this application name.
spark = (
    SparkSession.builder
    .appName("COVID Tracking Data - Data prep")
    .getOrCreate()
)

# Load the Parquet files into a DataFrame.
df = spark.read.parquet(destination_path)

# Two-letter state code -> full state name for the 50 US states.
# There is no entry for DC or the territories, so those codes map to NULL below.
state_dict = {
    'AL': 'Alabama', 'AK': 'Alaska', 'AZ': 'Arizona', 'AR': 'Arkansas', 'CA': 'California', 'CO': 'Colorado',
    'CT': 'Connecticut', 'DE': 'Delaware', 'FL': 'Florida', 'GA': 'Georgia', 'HI': 'Hawaii', 'ID': 'Idaho',
    'IL': 'Illinois', 'IN': 'Indiana', 'IA': 'Iowa', 'KS': 'Kansas', 'KY': 'Kentucky', 'LA': 'Louisiana',
    'ME': 'Maine', 'MD': 'Maryland', 'MA': 'Massachusetts', 'MI': 'Michigan', 'MN': 'Minnesota',
    'MS': 'Mississippi', 'MO': 'Missouri', 'MT': 'Montana', 'NE': 'Nebraska', 'NV': 'Nevada',
    'NH': 'New Hampshire', 'NJ': 'New Jersey', 'NM': 'New Mexico', 'NY': 'New York', 'NC': 'North Carolina',
    'ND': 'North Dakota', 'OH': 'Ohio', 'OK': 'Oklahoma', 'OR': 'Oregon', 'PA': 'Pennsylvania',
    'RI': 'Rhode Island', 'SC': 'South Carolina', 'SD': 'South Dakota', 'TN': 'Tennessee', 'TX': 'Texas',
    'UT': 'Utah', 'VT': 'Vermont', 'VA': 'Virginia', 'WA': 'Washington', 'WV': 'West Virginia',
    'WI': 'Wisconsin', 'WY': 'Wyoming',
}

# One conditional expression per state: it evaluates to the full name when the
# row's code matches and to NULL otherwise (when() without otherwise() yields NULL).
state_cond = [when(col('state') == code, name) for code, name in state_dict.items()]

# coalesce() returns the first non-NULL expression, i.e. the single matching
# state name, or NULL when the code is not in state_dict.
#
# The previous version materialised each expression as its own column (named
# after the full state name) and then tried to drop them by *abbreviation*,
# which silently dropped nothing and left 50 helper columns in the DataFrame.
# Computing the name in a single expression avoids creating them at all.
df = df.select(
    'date',
    'positive',
    'death',
    'totalTestsViral',
    coalesce(*state_cond).alias('state'),  # replaces the two-letter code with the full name
)

# Derive the calendar year of each record from its date.
df = df.withColumn("year", year(col("date")))

# Keep only rows that have a state value.
df_filtered = df.where(col("state").isNotNull())

# Per (state, year): add up the `positive` and `death` values of all rows.
# NOTE(review): if `positive` / `death` are running (cumulative) totals rather
# than per-day increments, summing them overstates the yearly figures - confirm
# against the source feed's column definitions.
yearly_totals = [
    sum(col("positive")).alias("total_positive_cases"),
    sum(col("death")).alias("total_deaths"),
]
df_grouped = df_filtered.groupBy("state", "year").agg(*yearly_totals)

# Sort by state name, then year.
df_ordered = df_grouped.sort("state", "year")

# Store both totals as 32-bit integers.
# NOTE(review): presumably the sums come back as bigint; a value above
# 2,147,483,647 would not fit in "integer" - verify the expected magnitude.
for total_col in ("total_positive_cases", "total_deaths"):
    df_ordered = df_ordered.withColumn(total_col, col(total_col).cast("integer"))

# Output location for the aggregated per-state, per-year totals.
destination_path = '/FileStore/tables/Covid-USA/silver/daily_cases_silver'

# Write as Parquet, replacing whatever a previous run left at this path.
df_ordered.write.mode('overwrite').format("parquet").save(destination_path)

# Print the first 10 rows as a quick visual check of the result.
df_ordered.show(10)

# Shut the session down. PySpark's SparkSession has no close() method (calling
# it raises AttributeError); stop() is the Python API for this.
# NOTE(review): on a shared or managed cluster, stopping the session may affect
# other notebooks attached to it - confirm this is intended.
spark.stop()