In [0]:
import pandas  # Import the pandas library for data manipulation and analysis (not used in current Spark workflow)

In [0]:
# Location the source CSV data is read from; this value is passed to spark.read...csv() in the cells below.
# NOTE(review): the path has no file name or extension — presumably a directory/volume root rather than a single file; confirm.
sourceFileUrl = "/Volumes/workspace/default/lakehouse"

# Intended output location for the processed data in JSON format.
# NOTE(review): this path is nested under sourceFileUrl and is not referenced by any of the read/display cells shown here.
targetFileUrl = "/Volumes/workspace/default/lakehouse/Json"

In [0]:
# Load the CSV data at sourceFileUrl into a Spark DataFrame without supplying a schema.
# header=True makes the reader take column names from the first row of the file
# rather than generating positional names.
sourceFileDF = spark.read.csv(sourceFileUrl, header=True)

**`option("header", "true")`** tells PySpark that the CSV file has a header row, so it uses the actual column names from the file instead of assigning default names like `_c0`, `_c1`, etc.

In [0]:
# Render the DataFrame that was read with only the header option (no explicit schema) as a table for inspection.
# NOTE(review): `display` is neither defined nor imported in this notebook — presumably the notebook environment's built-in; confirm.
display(sourceFileDF)

### Define CSV File Schema
This cell defines the schema for the CSV file using `StructType` and `StructField` from `pyspark.sql.types`.

In [0]:
# Explicit schema for the source CSV, assembled from (column name, Spark type) pairs.
# NOTE(review): the wildcard import is kept so that any type names other cells rely on
# stay in scope; prefer explicit imports once the notebook's full usage is known.
from pyspark.sql.types import *

# Every column is declared nullable (third StructField argument is True).
# The pricing date and the three price columns are kept as strings; only ROW_ID
# and ARRIVAL_IN_TONNES are given numeric types.
sourceFileSchema = StructType([
    StructField(columnName, columnType, True)
    for columnName, columnType in [
        ("DATE_OF_PRICING", StringType()),
        ("ROW_ID", IntegerType()),
        ("STATE_NAME", StringType()),
        ("MARKET_NAME", StringType()),
        ("PRODUCTGROUP_NAME", StringType()),
        ("PRODUCT_NAME", StringType()),
        ("VARIETY", StringType()),
        ("ORIGIN", StringType()),
        ("ARRIVAL_IN_TONNES", DecimalType(10, 2)),  # decimal with precision 10, scale 2
        ("MINIMUM_PRICE", StringType()),
        ("MAXIMUM_PRICE", StringType()),
        ("MODAL_PRICE", StringType()),
    ]
])

# sourceCSVFileSchema is an alias for the very same schema object.
sourceCSVFileSchema = sourceFileSchema

### Read CSV File into DataFrame
This cell reads the CSV file into a Spark DataFrame using the defined schema and displays the DataFrame.


In [0]:
# Read the CSV data at sourceFileUrl again, this time applying sourceCSVFileSchema so the
# column names and data types come from the explicit schema; header=True marks the
# first row of the file as a header row.
sourceCSVFileDF = spark.read.csv(
    sourceFileUrl,
    schema=sourceCSVFileSchema,
    header=True,
)

# Render the schema-applied DataFrame as a table
display(sourceCSVFileDF)

### Print DataFrame Schema
This cell prints the schema of the DataFrame to verify the structure.

In [0]:
# Print the schema of sourceCSVFileDF so the column names and data types can be checked
# against the explicit schema that was passed to .schema() when the DataFrame was read.
sourceCSVFileDF.printSchema()