In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import mean, stddev, skewness, kurtosis

# Step 1: Start Spark Session
spark = SparkSession.builder \
    .appName("Summary Statistics for Traffic Volume") \
    .getOrCreate()

# Step 2: Load the dataset
file_path =  r'C:\Users\thinu\Desktop\KENULA\top-up\data\cleaned_transformed_dataset2.csv'

# Ensure the file exists and is accessible
data = spark.read.csv(file_path, header=True, inferSchema=True)

# Step 3: Create 'reduced_df' with non-null 'traffic_volume'
reduced_df = data.select("traffic_volume").filter(data["traffic_volume"].isNotNull())

# Step 4: Compute summary statistics
summary_df = reduced_df.select(
    mean("traffic_volume").alias("Mean"),
    stddev("traffic_volume").alias("StdDev"),
    skewness("traffic_volume").alias("Skewness"),
    kurtosis("traffic_volume").alias("Kurtosis")
)

# Step 5: Show the results
summary_df.show()

# Step 6: Stop Spark Session
spark.stop()



+-----------------+------------------+--------------------+-------------------+
|             Mean|            StdDev|            Skewness|           Kurtosis|
+-----------------+------------------+--------------------+-------------------+
|3259.618133521489|1986.9544648251451|-0.08905846571674847|-1.3091559302917255|
+-----------------+------------------+--------------------+-------------------+



In [13]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import mean, stddev, skewness, kurtosis

# Step 1: Start Spark Session
spark = SparkSession.builder \
    .appName("Summary Statistics for temp") \
    .getOrCreate()

# Step 2: Load the dataset
file_path =  r'C:\Users\thinu\Desktop\KENULA\top-up\data\cleaned_transformed_dataset2.csv'

# Ensure the file exists and is accessible
data = spark.read.csv(file_path, header=True, inferSchema=True)

# Step 3: Create 'reduced_df' with non-null 'traffic_volume'
reduced_df = data.select("temp").filter(data["temp"].isNotNull())

# Step 4: Compute summary statistics
summary_df = reduced_df.select(
    mean("temp").alias("Mean"),
    stddev("temp").alias("StdDev"),
    skewness("temp").alias("Skewness"),
    kurtosis("temp").alias("Kurtosis")
)

# Step 5: Show the results
summary_df.show()

# Step 6: Stop Spark Session
spark.stop()


+-----------------+-----------------+-------------------+-----------------+
|             Mean|           StdDev|           Skewness|         Kurtosis|
+-----------------+-----------------+-------------------+-----------------+
|281.2049945005903|13.33873796577709|-2.2474097668758093|39.91830115843366|
+-----------------+-----------------+-------------------+-----------------+



In [12]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import mean, stddev, skewness, kurtosis

# Step 1: Start Spark Session
spark = SparkSession.builder \
    .appName("Summary Statistics for snow_1h") \
    .getOrCreate()

# Step 2: Load the dataset
file_path =  r'C:\Users\thinu\Desktop\KENULA\top-up\data\cleaned_transformed_dataset2.csv'

# Ensure the file exists and is accessible
data = spark.read.csv(file_path, header=True, inferSchema=True)

# Step 3: Create 'reduced_df' with non-null 'traffic_volume'
reduced_df = data.select("snow_1h").filter(data["snow_1h"].isNotNull())

# Step 4: Compute summary statistics
summary_df = reduced_df.select(
    mean("snow_1h").alias("Mean"),
    stddev("snow_1h").alias("StdDev"),
    skewness("snow_1h").alias("Skewness"),
    kurtosis("snow_1h").alias("Kurtosis")
)

# Step 5: Show the results
summary_df.show()

# Step 6: Stop Spark Session
spark.stop()


+--------------------+-------------------+------------------+-----------------+
|                Mean|             StdDev|          Skewness|         Kurtosis|
+--------------------+-------------------+------------------+-----------------+
|2.224666403801857...|0.00816905077483839|48.357440005447636|2619.268300755711|
+--------------------+-------------------+------------------+-----------------+



In [11]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import mean, stddev, skewness, kurtosis

# Step 1: Start Spark Session
spark = SparkSession.builder \
    .appName("Summary Statistics for rain_1h") \
    .getOrCreate()

# Step 2: Load the dataset
file_path =  r'C:\Users\thinu\Desktop\KENULA\top-up\data\cleaned_transformed_dataset2.csv'

# Ensure the file exists and is accessible
data = spark.read.csv(file_path, header=True, inferSchema=True)

# Step 3: Create 'reduced_df' with non-null 'traffic_volume'
reduced_df = data.select("rain_1h").filter(data["rain_1h"].isNotNull())

# Step 4: Compute summary statistics
summary_df = reduced_df.select(
    mean("rain_1h").alias("Mean"),
    stddev("rain_1h").alias("StdDev"),
    skewness("rain_1h").alias("Skewness"),
    kurtosis("rain_1h").alias("Kurtosis")
)

# Step 5: Show the results
summary_df.show()

# Step 6: Stop Spark Session
spark.stop()


+------------------+-----------------+------------------+-----------------+
|              Mean|           StdDev|          Skewness|         Kurtosis|
+------------------+-----------------+------------------+-----------------+
|0.3343818872309983|44.79703269348369|219.34351945262026|48133.64919028338|
+------------------+-----------------+------------------+-----------------+



In [14]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import mean, stddev, skewness, kurtosis

# Step 1: Start Spark Session
spark = SparkSession.builder \
    .appName("Summary Statistics for clouds_all") \
    .getOrCreate()

# Step 2: Load the dataset
file_path =  r'C:\Users\thinu\Desktop\KENULA\top-up\data\cleaned_transformed_dataset2.csv'

# Ensure the file exists and is accessible
data = spark.read.csv(file_path, header=True, inferSchema=True)

# Step 3: Create 'reduced_df' with non-null 'traffic_volume'
reduced_df = data.select("clouds_all").filter(data["clouds_all"].isNotNull())

# Step 4: Compute summary statistics
summary_df = reduced_df.select(
    mean("clouds_all").alias("Mean"),
    stddev("clouds_all").alias("StdDev"),
    skewness("clouds_all").alias("Skewness"),
    kurtosis("clouds_all").alias("Kurtosis")
)

# Step 5: Show the results
summary_df.show()

# Step 6: Stop Spark Session
spark.stop()


+-----------------+------------------+--------------------+-------------------+
|             Mean|            StdDev|            Skewness|           Kurtosis|
+-----------------+------------------+--------------------+-------------------+
|49.36545126278872|39.015212752422116|-0.19738024536278448|-1.7421532908058215|
+-----------------+------------------+--------------------+-------------------+

