In [0]:
import pandas

In [0]:
# Volume directory that holds the source CSV file(s) read by the cells below.
sourceFileUrl = "/Volumes/workspace/default/lakehouse"

# Directory under the same volume, named for JSON output.
# NOTE(review): not referenced in the visible cells -- presumably used by a later write step; confirm.
targetFileUrl = "/Volumes/workspace/default/lakehouse/Json"

In [0]:
# First pass over the CSV data: no explicit schema is supplied, only the
# header option, so column names are taken from the file's header row.
sourceFileDF = (
    spark.read
    .option("header", "true")
    .csv(sourceFileUrl)
)


**`option("header", "true")`** tells PySpark that the CSV file has a header row, so it uses the column names from that row instead of assigning default names such as `_c0`, `_c1`, etc.

In [0]:
# Render the header-only read (no explicit schema) so its column names and values can be inspected.
display(sourceFileDF)

### Define CSV File Schema
This cell defines the schema for the CSV file using `StructType` and `StructField` from `pyspark.sql.types`.

In [0]:
# NOTE(review): the wildcard import is kept as-is because other cells may rely
# on type names it brings into scope; prefer explicit imports once that is confirmed.
from pyspark.sql.types import *

# Explicit schema for the CSV file, built from (column name, Spark type) pairs.
# Every column is declared nullable. The price columns and DATE_OF_PRICING are
# declared as strings; only ROW_ID and ARRIVAL_IN_TONNES are typed numerically.
sourceFileSchema = StructType([
    StructField(columnName, columnType, True)
    for columnName, columnType in [
        ("DATE_OF_PRICING", StringType()),
        ("ROW_ID", IntegerType()),
        ("STATE_NAME", StringType()),
        ("MARKET_NAME", StringType()),
        ("PRODUCTGROUP_NAME", StringType()),
        ("PRODUCT_NAME", StringType()),
        ("VARIETY", StringType()),
        ("ORIGIN", StringType()),
        ("ARRIVAL_IN_TONNES", DecimalType(10, 2)),
        ("MINIMUM_PRICE", StringType()),
        ("MAXIMUM_PRICE", StringType()),
        ("MODAL_PRICE", StringType()),
    ]
])

# Both names refer to the same schema object, as in the original chained assignment.
sourceCSVFileSchema = sourceFileSchema

### Read CSV File into DataFrame
This cell reads the CSV file into a Spark DataFrame using the defined schema and displays the DataFrame.


In [0]:
# Second pass over the same CSV location, this time applying the explicit
# schema so each column gets its declared type.
sourceCSVFileDF = (
    spark.read
    .schema(sourceCSVFileSchema)
    .option("header", "true")
    .csv(sourceFileUrl)
)

# Show the schema-applied DataFrame.
display(sourceCSVFileDF)

### Print DataFrame Schema
This cell prints the schema of the DataFrame to verify the structure.

In [0]:
# Print the schema of the schema-applied DataFrame to confirm the declared column types took effect.
sourceCSVFileDF.printSchema()