# Read data

In [0]:
# Point a DataFrame at the Databricks sample table "samples.tpch.customer".
# Spark evaluates lazily: this only resolves the table's metadata,
# no rows are read at this point.
df = spark.table("samples.tpch.customer")

# Print column names, types and nullability.
# printSchema only needs the schema, so no rows are scanned here either.
df.printSchema()


root
 |-- c_custkey: long (nullable = true)
 |-- c_name: string (nullable = true)
 |-- c_address: string (nullable = true)
 |-- c_nationkey: long (nullable = true)
 |-- c_phone: string (nullable = true)
 |-- c_acctbal: decimal(18,2) (nullable = true)
 |-- c_mktsegment: string (nullable = true)
 |-- c_comment: string (nullable = true)



# TRANSFORMATION


# Select

In [0]:
# TRANSFORMATION: select
# Projects df down to the listed columns and returns a new DataFrame;
# df itself is left untouched. c_comment is the only source column left out.
# Nothing is executed until an action is called on df_select.
df_select = df.select(
    [
        "c_custkey",
        "c_name",
        "c_nationkey",
        "c_address",
        "c_phone",
        "c_acctbal",
        "c_mktsegment",
    ]
)


# Filter

In [0]:
# TRANSFORMATION: filter
# Keeps only customers whose account balance (c_acctbal) is strictly
# greater than 3000. Rows with a null balance are dropped as well, because
# the comparison does not evaluate to true for them.
df_filter = df.filter(df["c_acctbal"] > 3000)

# Where

In [0]:
# TRANSFORMATION: where
# where() is an alias of filter(); here the predicate is given as a SQL
# expression string instead of a Column expression.
# Inside the string, columns are referenced by their bare name: the Python
# variable name "df" is not a table alias Spark knows about, so writing
# "df.c_acctbal" cannot be resolved and raises an AnalysisException.
# Keeps customers whose account balance is at least 5000.

df_where = df.where("c_acctbal >= 5000")

# WithColumn

In [0]:
from pyspark.sql.functions import when

# TRANSFORMATION: withColumn
# Adds a derived string column named review_quality:
#   "Good"    when c_acctbal >= 4
#   "Average" otherwise (this branch also catches a null c_acctbal)
# NOTE(review): the threshold 4 and the column name look carried over from a
# review-ratings example; confirm they are intended for an account balance.
df_with_column = df.withColumn(
    "review_quality",
    when(df["c_acctbal"] >= 4, "Good").otherwise("Average"),
)

# GroupBy

In [0]:
from pyspark.sql.functions import avg, count

# TRANSFORMATION: groupBy
# One output row per market segment (c_mktsegment) with two aggregates:
#   avg_rating   - mean of c_acctbal within the segment
#   review_count - number of non-null c_custkey values within the segment
# NOTE(review): the alias names refer to ratings/reviews although the inputs
# are account balances and customer keys; confirm before renaming, since
# downstream consumers may depend on them.
df_groupby = (
    df.groupBy("c_mktsegment")
    .agg(
        avg("c_acctbal").alias("avg_rating"),
        count("c_custkey").alias("review_count"),
    )
)


# join – Combine with Another Dataset

In [0]:
# Imported in this cell so it runs on its own, independent of earlier cells.
from pyspark.sql.functions import when

# Small lookup DataFrame: sentiment label -> human-readable description.
sentiment_data = [
    ("positive", "High Satisfaction"),
    ("neutral", "Moderate Satisfaction"),
    ("negative", "Low Satisfaction")
]

sentiment_df = spark.createDataFrame(
    sentiment_data,
    ["sentiment", "sentiment_description"]
)

# The customer table has no "sentiment" column, so joining df directly on
# "sentiment" fails because the join key cannot be resolved on the left side.
# Derive the key from the account balance first. A null balance matches no
# branch and yields a null sentiment, which the left join keeps with a null
# description.
# NOTE(review): the balance thresholds below are illustrative; confirm the
# intended mapping from balance to sentiment.
df_with_sentiment = df.withColumn(
    "sentiment",
    when(df["c_acctbal"] >= 5000, "positive")
    .when(df["c_acctbal"] >= 0, "neutral")
    .when(df["c_acctbal"] < 0, "negative"),
)

# TRANSFORMATION: join
# Left join: every customer row is kept and enriched with
# sentiment_description where the sentiment label matches the lookup.
df_join = df_with_sentiment.join(
    sentiment_df,
    on="sentiment",
    how="left"
)


# ACTIONS

# show – Display Data

In [0]:
# ACTION: show
# Runs the select pipeline and prints the first 5 rows of df_select.
# truncate=False prints full cell values instead of shortening long strings.
df_select.show(n=5, truncate=False)


+---------+------------------+-----------+------------------------------------+---------------+---------+------------+
|c_custkey|c_name            |c_nationkey|c_address                           |c_phone        |c_acctbal|c_mktsegment|
+---------+------------------+-----------+------------------------------------+---------------+---------+------------+
|412445   |Customer#000412445|21         |0QAB3OjYnbP6mA0B,kgf                |31-421-403-4333|5358.33  |BUILDING    |
|412446   |Customer#000412446|20         |5u8MSbyiC7J,7PuY4Ivaq1JRbTCMKeNVqg  |30-487-949-7942|9441.59  |MACHINERY   |
|412447   |Customer#000412447|7          |HC4ZT62gKPgrjr ceoaZgFOunlUogr7GO   |17-797-466-6308|7868.75  |AUTOMOBILE  |
|412448   |Customer#000412448|6          |hJok1MMrDgH                         |16-541-510-4964|6060.98  |MACHINERY   |
|412449   |Customer#000412449|14         |zAt1nZNG01gOhIqgyDtDa S,Y0VSofZJs1dd|24-710-983-5536|4973.84  |HOUSEHOLD   |
+---------+------------------+-----------+------------------------------------+---------------+---------+------------+

# count – Count Rows

In [0]:
# ACTION: count
# Runs the filter pipeline and returns the number of rows in df_filter,
# i.e. customers with c_acctbal > 3000.
# NOTE(review): the variable name and printed label talk about "reviews",
# but the rows counted are customers; confirm the wording before changing it.
total_reviews = df_filter.count()

print(f"Total high-rated reviews: {total_reviews}")


Total high-rated reviews: 477237


# collect – Bring Data to Driver

In [0]:
# ACTION: collect
# collect() returns the result rows to the driver as a Python list, which is
# dangerous on large datasets because everything must fit in driver memory.
# limit(5) caps the result at five rows here, so this call stays cheap.
sample_rows = (
    df_where
    .limit(5)
    .collect()
)
print(sample_rows)


# write – Persist Data to Delta Table

In [0]:
# ACTION: write
# Executes the withColumn pipeline and saves the result as the Delta table
# workspace.default.processed_customer_reviews.
# mode("overwrite") replaces the table's existing contents on every run.
(
    df_with_column.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("workspace.default.processed_customer_reviews")
)
