In [None]:
# Importing necessary libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, isnan, when, count
import matplotlib.pyplot as plt
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import Imputer
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml import Pipeline
from pyspark.sql.functions import col
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import layers, models



In [None]:
# Obtain the SparkSession for this notebook (getOrCreate reuses an active session if one exists)
spark = (
    SparkSession.builder
    .appName("TrafficVolumeAnalysis")
    .getOrCreate()
)

In [None]:
# Read the CSV dataset into a PySpark DataFrame, taking column names from the first row.
# NOTE: no schema is supplied and schema inference is not enabled, so every column
# is read as a string; cast numeric columns before doing arithmetic or plotting.
file_path = "file:///home/hduser/Desktop/SEM2_CA1/da_sem2_ca1_rep/dataset.csv"
df = spark.read.option("header", True).csv(file_path)

In [None]:
# Displaying schema: prints the DataFrame's column names and data types as a tree
df.printSchema()

In [None]:
# Preview the first 5 rows of the freshly loaded DataFrame
df.show(n=5)

In [None]:
# Summary statistics: describe() computes the basic per-column statistics
# (count, mean, stddev, min, max) and show() prints the resulting table
df.describe().show()

In [None]:
# Remove the columns listed below from the DataFrame
columns_to_drop = [
    'STATISTICS',
    'TLIST(A1)',
    'C02875V03459',
    'UNIT',
]
df = df.drop(*columns_to_drop)

In [None]:
# Preview the first 5 rows again, now without the dropped columns
df.show(n=5)

In [None]:
# Summary statistics for the remaining columns (after the drop above):
# describe() computes count, mean, stddev, min and max per column
df.describe().show()

In [None]:
# Keep only the rows whose 'Statistic Label' is exactly "Traffic Volume"
df = df.where(col("Statistic Label") == "Traffic Volume")

In [None]:
# Display the filtered rows (20 is show()'s default row count, spelled out here)
df.show(n=20)

In [None]:
# Checking for missing values
#
# isnan() is only meaningful for floating-point columns. Calling it on a string
# column relies on an implicit string -> double cast, which Spark rejects for
# string columns when ANSI mode is enabled. The check is therefore chosen per
# column type:
#   * every column  : NULL counts as missing
#   * float/double  : NaN also counts as missing
#   * string        : text that the implicit cast would have read as NaN
#                     (e.g. "NaN", " nan ") also counts as missing
def _is_missing(name, dtype):
    """Return a boolean Column that is true where column `name` holds a missing value.

    `name` is the column name and `dtype` its Spark type string, as reported by
    `df.dtypes` (for example "string" or "double").
    """
    missing = col(name).isNull()
    if dtype in ("float", "double"):
        missing = missing | isnan(col(name))
    elif dtype == "string":
        # (?i) = case-insensitive; surrounding whitespace is tolerated.
        missing = missing | col(name).rlike(r"(?i)^\s*nan\s*$")
    return missing


# One output column per input column, holding that column's count of missing values.
missing_counts = df.select(
    [count(when(_is_missing(name, dtype), name)).alias(name) for name, dtype in df.dtypes]
)
missing_counts.show()

In [None]:
# Extracting values for plotting
#
# The columns are cast to numeric types first: if they are collected as strings,
# matplotlib treats them as categories, which makes a logarithmic Y axis meaningless.
# Rows where either value is null after the cast are dropped so they cannot break
# the plot. Both columns are collected in a single pass so that years[i] and
# traffic_volumes[i] always come from the same row, ordered by year so the line
# runs chronologically.
plot_rows = (
    df.select(
        col("Year").cast("int").alias("Year"),
        col("VALUE").cast("double").alias("VALUE"),
    )
    .dropna()
    .orderBy("Year")
    .collect()
)
years = [row["Year"] for row in plot_rows]
traffic_volumes = [row["VALUE"] for row in plot_rows]

In [None]:
# Line chart of traffic volume per year; the Y axis uses a logarithmic scale
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(years, traffic_volumes, marker='o', linestyle='-')
ax.set_yscale('log')  # logarithmic Y axis
ax.set_title("Traffic Volume Over the Years")
ax.set_xlabel("Year")
ax.set_ylabel("Traffic Volume")
ax.grid(True)
plt.show()