# Event Processing - Part I

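This notebook processes events produced by an application: it reads the raw JSON events, adds derived columns (the final event type and the load date with its year, month and day), casts the `data` structure to a string, and saves the result as Parquet files in a temporary directory.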

In [None]:
from datetime import datetime

from pyspark.sql import SparkSession
from pyspark.sql import functions as SF
from pyspark.sql.window import Window

In [None]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("EventProcessingPartI")
    .getOrCreate()
)

In [None]:
# Evaluate the session object so the notebook shows it as the cell output.
spark

## Reading Base

In [None]:
# Job parameters: source JSON file, destination directory, and Spark save mode.
input_file = "events.json"
output_temp_dir = "temp"
write_mode = "overwrite"

In [None]:
# Load the raw events from the JSON file into a DataFrame.
df = spark.read.format("json").load(input_file)

# Print the schema Spark produced for the loaded events.
df.printSchema()

## Creating New Columns

### Final Event Type

In [None]:
# Build the final event type as "<domain>_<event_type>".
# NOTE: concat() returns NULL when any of its inputs is NULL.
df = df.withColumn(
    "final_event_type",
    SF.concat("domain", SF.lit("_"), "event_type"),
)

### Event Load Date

In [None]:
# Capture the load timestamp once; it is attached to every row as a literal below.
# NOTE(review): datetime.now() is a naive local-time value — confirm the intended timezone.
load_date = datetime.now()

In [None]:
# Stamp every row with the load timestamp and derive the date parts that are
# later used as partition columns.
df = df.withColumn("load_date", SF.lit(load_date))
df = df.withColumn("year", SF.year("load_date"))
df = df.withColumn("month", SF.month("load_date"))
# dayofmonth() yields an integer column, consistent with year() and month();
# date_format(..., "d") produced the same value as a string column instead.
df = df.withColumn("day", SF.dayofmonth("load_date"))

In [None]:
# Print the schema to check the newly added load-date columns.
df.printSchema()

## Casting Data From Structure to String

In [None]:
# Replace the `data` column with its string representation.
# NOTE(review): casting a struct to string presumably yields Spark's own text
# format rather than JSON — confirm this is what the downstream step expects.
df = df.withColumn("data", df["data"].cast("string"))

In [None]:
# Print the schema to check the type of the `data` column after the cast.
df.printSchema()

## Save temp dirs

Save the data partitioned by final_event_type, year, month and day, where year, month and day are based on the load date. Each final_event_type partition will be processed later by the second part of the code.

In [None]:
# Write the events as Parquet into the temp directory, partitioned by
# final event type and by the year/month/day of the load date.
(
    df.write
    .mode(write_mode)
    .partitionBy("final_event_type", "year", "month", "day")
    .parquet(output_temp_dir)
)