In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, hour, to_timestamp, date_format

# Obtain the Spark session for this analysis: getOrCreate() returns an
# already-active session if one exists, otherwise it builds a new one.
spark = (
    SparkSession.builder
    .appName("Ticket Data Analysis")
    .getOrCreate()
)

# Input CSV files, one per month (June-September 2024).
# Forward slashes are used deliberately: in an ordinary string literal the
# sequence "\f" is a form-feed escape, so "data\filtered_..." would not name
# the intended file.
file_paths = [
    "data/filtered_pred_June2024.csv",
    "data/filtered_pred_July2024.csv",
    # NOTE(review): extension corrected from ".csvv" - confirm the name on disk.
    "data/filtered_pred_August2024.csv",
    "data/filtered_pred_September2024.csv",
]

# Everything that uses the session runs inside try/finally so the session is
# stopped even if reading or aggregating fails part-way through.
try:
    # Load all listed files into a single DataFrame; the first row of each
    # file is treated as the header. No schema is supplied or inferred, so
    # columns are read as strings.
    df = spark.read.option("header", "true").csv(file_paths)

    # Convert the 'DATE_HOUR' column to a timestamp type.
    # NOTE(review): assumes values look like "2024-06-01 13" - confirm against the data.
    df = df.withColumn("DATE_HOUR", to_timestamp(col("DATE_HOUR"), "yyyy-MM-dd HH"))

    # Extract the date and the two-digit hour as formatted strings
    df = df.withColumn("DATE", date_format(col("DATE_HOUR"), "yyyy-MM-dd"))
    df = df.withColumn("HOUR", date_format(col("DATE_HOUR"), "HH"))

    # Group by 'DATE' and 'HOUR' and sum the adult and child counts.
    # NOTE(review): the summed columns were read as strings, so this relies on
    # Spark's implicit numeric cast - verify the totals look as expected.
    df_grouped = df.groupBy("DATE", "HOUR").agg(
        {"NO_OF_ADULT": "sum", "NO_OF_CHILD": "sum"}
    )

    # Give the aggregated columns readable names
    df_grouped = (
        df_grouped
        .withColumnRenamed("sum(NO_OF_ADULT)", "TOTAL_ADULTS")
        .withColumnRenamed("sum(NO_OF_CHILD)", "TOTAL_CHILDREN")
    )

    # Show the result (show() prints the first 20 rows by default)
    df_grouped.show()
finally:
    # Stop the Spark session when done, whether or not the steps above succeeded
    spark.stop()
