In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, count, isnan
from pyspark.ml.feature import StringIndexer

# Create (or reuse) the Spark session for this notebook.
spark = (
    SparkSession.builder
    .appName("Categorical Transformation and Data Cleaning")
    .getOrCreate()
)

# Load the dataset. The first CSV row is used as the header and Spark is asked
# to infer the column types from the data.
file_path = r'C:\Users\thinu\Desktop\KENULA\top-up\data\Metro_Interstate_Traffic_Volume.csv'
data = spark.read.csv(file_path, header=True, inferSchema=True)

# Print the number of rows per distinct value for each categorical column,
# one table per column, in this order.
for column_name in ("holiday", "weather_main", "weather_description"):
    data.groupBy(column_name).count().show()


+--------------------+-----+
|             holiday|count|
+--------------------+-----+
|                None|48143|
|    Thanksgiving Day|    6|
|        Veterans Day|    5|
|    Independence Day|    5|
|          State Fair|    5|
|        Columbus Day|    5|
|        Memorial Day|    5|
|       New Years Day|    6|
|Martin Luther Kin...|    6|
|Washingtons Birthday|    5|
|       Christmas Day|    6|
|           Labor Day|    7|
+--------------------+-----+

+------------+-----+
|weather_main|count|
+------------+-----+
|Thunderstorm| 1034|
|     Drizzle| 1821|
|         Fog|  912|
|       Clear|13391|
|       Smoke|   20|
|      Squall|    4|
|        Mist| 5950|
|      Clouds|15164|
|        Rain| 5672|
|        Snow| 2876|
|        Haze| 1360|
+------------+-----+

+--------------------+-----+
| weather_description|count|
+--------------------+-----+
|                 fog|  912|
|             drizzle|  651|
|     very heavy rain|   18|
|      shower drizzle|    6|
|proximity showe

In [7]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer

# Step 1: Obtain the Spark session used by the rest of this cell
spark = (
    SparkSession.builder
    .appName("Categorical Transformation and Data Cleaning")
    .getOrCreate()
)

# Step 2: Read the CSV file; the first row is treated as the header and the
# column types are inferred by Spark
file_path = r'C:\Users\thinu\Desktop\KENULA\top-up\data\Metro_Interstate_Traffic_Volume.csv'
data = spark.read.csv(
    file_path,
    header=True,
    inferSchema=True,
)

# Step 3: Names of the categorical columns to be index-encoded below
categorical_columns = [
    'weather_main',
    'holiday',
    'weather_description',
]

# Function to transform and show mapping
def transform_and_show_mapping(data, column):
    """Index one column with ``StringIndexer`` and print its label mapping.

    Fits a ``StringIndexer`` on ``data`` for ``column``, prints a separator
    line, a heading, and one ``index = label`` line per fitted label, then
    applies the fitted model to ``data``.

    Args:
        data: DataFrame that contains ``column``.
        column: Name of the column to index; the output column is named
            ``column + "_index"``.

    Returns:
        The result of the fitted model's ``transform(data)``.
    """
    index_column = column + "_index"
    fitted = StringIndexer(inputCol=column, outputCol=index_column).fit(data)

    # Each label is printed next to its position in the fitted `labels` list.
    print("--------------------------------------")
    print(f"Mapping for column '{column}':")
    for position, label in enumerate(fitted.labels):
        print(f"  {position} = {label}")

    return fitted.transform(data)

# Transform all categorical columns. Each pass prints the column's mapping and
# rebinds `data` to the DataFrame returned by transform_and_show_mapping.
for column in categorical_columns:
    data = transform_and_show_mapping(data, column)

# Step 4: Show a sample of the transformed data, each original column next to
# its index column, without truncating long values
data.select(
    'holiday', 'holiday_index',
    'weather_main', 'weather_main_index',
    'weather_description', 'weather_description_index'
).show(truncate=False)

# Step 5: Export the transformed dataset to CSV
import os  # only needed here, to prepare the export folder

output_path = r"C:\Users\thinu\Desktop\big_data_visualization\dataset\categories_dataset.csv"

try:
    # to_csv does not create missing folders, so make sure the target folder
    # exists first. An empty dirname means "current directory": nothing to do.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    # toPandas() collects the whole DataFrame to the driver before writing.
    data.toPandas().to_csv(output_path, index=False)
except Exception as e:
    # Best-effort export: report the failure and still stop the session below.
    print(f"Error saving dataset: {e}")
else:
    print(f"Cleaned and transformed dataset saved successfully at {output_path}")

# Step 6: Stop the Spark session
spark.stop()


--------------------------------------
Mapping for column 'weather_main':
  0 = Clouds
  1 = Clear
  2 = Mist
  3 = Rain
  4 = Snow
  5 = Drizzle
  6 = Haze
  7 = Thunderstorm
  8 = Fog
  9 = Smoke
  10 = Squall
--------------------------------------
Mapping for column 'holiday':
  0 = None
  1 = Labor Day
  2 = Christmas Day
  3 = Martin Luther King Jr Day
  4 = New Years Day
  5 = Thanksgiving Day
  6 = Columbus Day
  7 = Independence Day
  8 = Memorial Day
  9 = State Fair
  10 = Veterans Day
  11 = Washingtons Birthday
--------------------------------------
Mapping for column 'weather_description':
  0 = sky is clear
  1 = mist
  2 = overcast clouds
  3 = broken clouds
  4 = scattered clouds
  5 = light rain
  6 = few clouds
  7 = light snow
  8 = Sky is Clear
  9 = moderate rain
  10 = haze
  11 = light intensity drizzle
  12 = fog
  13 = proximity thunderstorm
  14 = drizzle
  15 = heavy snow
  16 = heavy intensity rain
  17 = snow
  18 = proximity shower rain
  19 = thunderstorm