In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, avg, countDistinct, to_date, first, last, lead, lag, lit, element_at, split, max
from pyspark.sql.types import StringType, StructField, IntegerType, StructType, TimestampType
from pyspark.sql.window import Window

# Initialize Spark session

In [2]:
# Create (or reuse) the Spark session for this notebook.
# master "local[*]" runs Spark in-process on this machine.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("User Click Data ETL")
    .getOrCreate()
)

# Define the input file path

In [3]:
# Location of the raw click-event JSON input (relative to the working directory).
file_path = "user_click_data1.json"

In [4]:
# Explicit schema for the click-event JSON, so Spark does not have to infer types.
# Each entry is (column name, Spark type, nullable flag).
# user_id is read as a string here; it is converted to an integer in a later cell.
# NOTE(review): Spark may treat columns as nullable when reading files even if
# declared non-nullable here -- confirm if the non-null guarantee matters.
schema_df = StructType([
    StructField(field_name, field_type, is_nullable)
    for field_name, field_type, is_nullable in (
        ("browser", StringType(), True),
        ("city", StringType(), True),
        ("click_event_id", IntegerType(), False),
        ("country", StringType(), True),
        ("device", StringType(), True),
        ("ip_address", StringType(), True),
        ("timestamp", TimestampType(), True),
        ("url", StringType(), True),
        ("user_id", StringType(), False),
    )
])

# Read the click data from the input file using the explicit schema

In [5]:
# Load the click events using the explicit schema defined above.
try:
    user_df = spark.read.json(file_path, schema=schema_df)
except Exception as e:
    # Report which input failed, then re-raise. Swallowing the error here would
    # leave `user_df` undefined (or stale from an earlier run), so every later
    # cell would fail with an unrelated NameError or silently use old data.
    print(f"Could not read {file_path!r}: {e}")
    raise

# Correct data types

In [6]:
# Derive the calendar date of each click and convert user_id from string to integer.
user_df = (
    user_df
    .withColumn("event_date", to_date(col("timestamp")))
    .withColumn("user_id", col("user_id").cast(IntegerType()))
)

In [7]:
# Window over one user's clicks within a single country and calendar day,
# ordered chronologically. Used to look at the click that follows each click.
# (`Window` is imported with the other imports in the first cell.)
spec = Window.partitionBy("country", "event_date", "user_id").orderBy("timestamp")

In [8]:
# Despite its name, "last_click" holds the timestamp of the *next* click in the
# same window partition (lead with offset 1); it is null when no later click exists.
user_df = user_df.withColumn("last_click", lead("timestamp", 1).over(spec))


# Time spent in minutes: the gap between a click and the same user's next click

In [9]:
# Casting a timestamp to long gives whole seconds; the difference divided by 60
# is the elapsed time in minutes between this click and the one in "last_click".
# Rows with a null "last_click" get a null "time_spent".
following_click_seconds = col("last_click").cast("long")
current_click_seconds = col("timestamp").cast("long")
user_df = user_df.withColumn(
    "time_spent",
    (following_click_seconds - current_click_seconds) / 60,
)

# Group by URL, country, and date and aggregate

In [10]:
# One output row per (url, country, event_date) with three metrics:
#   average_minutes_spent - mean of time_spent (null values are skipped by avg)
#   unique_users_count    - number of distinct user_id values
#   click_count           - number of rows with a non-null click_event_id
clicks_by_url_country_day = user_df.groupBy("url", "country", "event_date")
agg_df = clicks_by_url_country_day.agg(
    avg(col("time_spent")).alias("average_minutes_spent"),
    countDistinct(col("user_id")).alias("unique_users_count"),
    count(col("click_event_id")).alias("click_count"),
)

In [11]:
# Persist the aggregated metrics as CSV, replacing any previous output.
# Spark writes "final_result.csv" as a directory of part files, not a single file.
agg_df.write.mode("overwrite").csv("final_result.csv")

# Stop the Spark session

In [12]:
# Shut down the Spark session now that the output has been written.
# Any cell run after this one must recreate the session first.
spark.stop()