In [None]:
from pyspark.sql.functions import col, max as max_,min as min_,datediff,expr,count, when, lower as lower_,date_format,avg,lit,row_number
from pyspark.sql import SparkSession,Row,Window

In [None]:
# Build (or reuse) the Spark session for this job: YARN master,
# two executors with two cores and 1024M of memory each.
spark = (
    SparkSession.builder
    .master("yarn")
    .appName("Task3")
    .config("spark.executor.instances", "2")
    .config("spark.executor.cores", "2")
    .config("spark.executor.memory", "1024M")
    .getOrCreate()
)
# Keep a handle on the underlying SparkContext.
sc = spark.sparkContext


In [None]:
# Lower-cased keywords (top US politicians and institutions in 2020)
# used to select matching tweets.
names = [
    keyword.lower()
    for keyword in (
        "Biden",
        "Trump",
        "Pelosi",
        "McConnell",
        "Schumer",
        "McCarthy",
        "President",
        "Congress",
    )
]

In [None]:
# Read the JSON tweet dataset from HDFS into a DataFrame.
df = spark.read.json(path="hdfs:/datasets/covid")

In [None]:
# Keep only the rows whose country_code is "US".
df2 = df.where(df["country_code"] == "US")

In [None]:
# Keep tweets whose lower-cased text matches at least one entry of `names`.
# The pattern is an alternation of one capture group per name, e.g.
# "(biden)|(trump)|...", so any substring occurrence counts as a match.
df2 = df2.filter(
    lower_(col("text")).rlike("|".join(f"({name})" for name in names))
)

In [None]:
# Reduce each tweet to its creation day (yyyy-MM-dd), then count tweets per day.
df2 = (
    df2
    .select(date_format('created_at', 'yyyy-MM-dd').alias('date'))
    .groupBy('date')
    .count()
)

In [None]:
#Creates a window to calculate the moving average over the count column
def days(i):
    """Return the number of seconds contained in `i` days."""
    seconds_per_day = 86400
    return seconds_per_day * i

# Turn the 'yyyy-MM-dd' string back into a timestamp so it can be ordered
# numerically (as epoch seconds) by the range window below.
df2 = df2.withColumn('date', df2.date.cast('timestamp'))

# Range frame over epoch seconds: the current day plus everything up to
# 6 days (6 * 86400 s) before it, i.e. a trailing 7-day window.
# NOTE(review): this assumes consecutive days are exactly 86400 s apart,
# which holds when the session time zone has no DST -- confirm.
W = Window.orderBy(col("date").cast('long')).rangeBetween(-days(6), 0)

df2 = df2.withColumn('7daysMA', avg("count").over(W))

# Number the rows chronologically. The numbering must be ordered by date:
# ordering by a constant (lit(1)) leaves the row order undefined, so the
# mask below would not be guaranteed to hit the seven earliest days.
w = Window.orderBy(col("date"))

# Set the moving average of the first 7 days to 0.
df2 = df2.withColumn("row_id", row_number().over(w))
df2 = df2.withColumn("7daysMA", when(col("row_id") <= 7, 0).otherwise(col("7daysMA")))

# Drop the helper columns and format the date back to 'yyyy-MM-dd'.
df2 = df2.select(date_format(df2.date, "yyyy-MM-dd").alias("date"), "7daysMA")

In [None]:
# Persist the (date, 7daysMA) rows as comma-separated CSV without a header row.
(
    df2.write
    .option("header", False)
    .option("delimiter", ",")
    .csv("/user/julioferreira/task3")
)

In [None]:
# Shut down the SparkContext now that the output has been written.
sc.stop()