In [0]:
# Root of the lakehouse volume; the JSON source lives in its "Json" sub-folder.
sourceFileUrl = "/Volumes/workspace/default/lakehouse"
sourceJSONUrl = f"{sourceFileUrl}/Json"

### Load JSON Data Without Schema
 _Read the source JSON file without specifying a schema_

In [0]:
# No schema is supplied to the reader here, so column types are left for Spark to work out.
sourceJsonfiledf = spark.read.json(sourceJSONUrl)

### Define Schema for JSON Data

In [0]:
# DDL-format schema for the source JSON file, written one "NAME type" field per line
# and joined into a single comma-separated string.
# NOTE: the "Schena" spelling in the variable name is kept as-is because the cell that
# loads the JSON with a schema refers to it by this exact name.
sourceJsonfileSchenaDDL = ", ".join(
    [
        "ARRIVAL_IN_TONNES Decimal(10,2)",
        "DATE_OF_PRICING string",
        "MARKET_NAME string",
        "MAXIMUM_PRICE string",
        "MINIMUM_PRICE string",
        "MODAL_PRICE string",
        "ORIGIN string",
        "PRODUCTGROUP_NAME string",
        "PRODUCT_NAME string",
        "ROW_ID Integer",
        "STATE_NAME string",
        "VARIETY string",
    ]
)

### Load JSON Data With Defined Schema

In [0]:
# Load the JSON source again, this time with the DDL schema applied to the reader
# before the files are read.
sourceJSONFileDf = spark.read.schema(sourceJsonfileSchenaDDL).json(sourceJSONUrl)

In [0]:
# Project the DataFrame down to the single PRODUCT_NAME column.
(
    sourceJSONFileDf
    .select("PRODUCT_NAME")
)

### Drop the ROW_ID Column from DataFrame

In [0]:
# Evaluates to a DataFrame without the 'ROW_ID' column. The result is not assigned
# to any name, so sourceJSONFileDf itself is not rebound by this cell.
(
    sourceJSONFileDf
    .drop("ROW_ID")
)

### Filter and Sort Product Arrivals by State and Quantity

The following cells demonstrate how to filter products with arrivals greater than 100 tonnes, restrict the results to the state "Andhra Pradesh", and sort the results by arrival quantity. Each cell builds on the previous one by adding a filter, adding a sort, or using a different sorting function.

In [0]:
# Keep the product name and arrival quantity, then retain only arrivals above 100
(
    sourceJSONFileDf
    .select("PRODUCT_NAME", "ARRIVAL_IN_TONNES")
    .filter("arrival_in_tonnes > 100")
)

In [0]:
# Product name and arrival quantity for arrivals > 100, restricted to "Andhra Pradesh"
from pyspark.sql.functions import col

# NOTE(review): STATE_NAME is referenced after select() has narrowed the columns to two;
# filtering before select() would make that dependency explicit — confirm intent.
(
    sourceJSONFileDf
    .select("PRODUCT_NAME", "ARRIVAL_IN_TONNES")
    .filter("arrival_in_tonnes > 100")
    .where(col("STATE_NAME") == "Andhra Pradesh")
)

In [0]:
# Arrivals > 100 for "Andhra Pradesh", ordered by "ARRIVAL_IN_TONNES"
# (sort() given a bare column name, i.e. ascending order)
(
    sourceJSONFileDf
    .select("PRODUCT_NAME", "ARRIVAL_IN_TONNES")
    .filter("arrival_in_tonnes > 100")
    .where(col("STATE_NAME") == "Andhra Pradesh")
    .sort("ARRIVAL_IN_TONNES")
)

In [0]:
# Arrivals > 100 for "Andhra Pradesh", largest "ARRIVAL_IN_TONNES" first via desc(),
# printed with show()
from pyspark.sql.functions import desc

(
    sourceJSONFileDf
    .select("PRODUCT_NAME", "ARRIVAL_IN_TONNES")
    .filter("arrival_in_tonnes > 100")
    .where(col("STATE_NAME") == "Andhra Pradesh")
    .sort(desc("ARRIVAL_IN_TONNES"))
    .show()
)

In [0]:
# Unique state names in sorted order:
#   select("STATE_NAME") -> keep only that column
#   distinct()           -> drop duplicate names
#   sort("STATE_NAME")   -> order the names
# The result is handed to display() for rendering.
display(sourceJSONFileDf.select("STATE_NAME").distinct().sort("STATE_NAME"))

In [0]:
# Count the rows of 'sourceJSONFileDf' and pass the resulting number to display().
display(
    sourceJSONFileDf.count()
)

In [0]:
# Inspect 'sourceJSONFileDf' in two steps: printSchema() prints its column names and
# data types, then display() renders the DataFrame itself.
sourceJSONFileDf.printSchema()
display(sourceJSONFileDf)

In [0]:
# Fetch the first record of 'sourceJSONFileDf' with first(); as the cell's last
# expression, the returned row becomes the cell output.
sourceJSONFileDf.first()

In [0]:
# Fetch the first record of 'sourceJSONFileDf' with head(); as the cell's last
# expression, the returned row becomes the cell output.
sourceJSONFileDf.head()

In [0]:
# Pull the first 10 records of 'sourceJSONFileDf' with take(10) and hand them to display()
# as a quick sample of the data.
display(sourceJSONFileDf.take(10))

In [0]:
# Show the DataFrame as loaded, then show it again with one extra column appended.
display(sourceJSONFileDf)

from pyspark.sql.functions import format_number

# withColumn() is given the name "ARRIVAL_IN_TONNES Decimal(10,2)", which differs from the
# existing "ARRIVAL_IN_TONNES" column, so the formatted values land in an additional column
# and the source column stays in place. format_number(..., 2) formats to two decimal places.
display(
    sourceJSONFileDf.withColumn(
        "ARRIVAL_IN_TONNES Decimal(10,2)",
        format_number("ARRIVAL_IN_TONNES", 2),
    )
)