In [0]:
# Register the storage-account access key so Spark can reach the ADLS Gen2 account.
# The value below is a placeholder (the real key was stripped so the notebook can be
# committed to a public GitHub repo) - substitute the actual key before running.
# NOTE(review): consider loading the key from a secret store instead of pasting it here.
account_key_setting = "fs.azure.account.key.goodreadsreviews60104758.dfs.core.windows.net"
spark.conf.set(account_key_setting, "<Account-key>")


In [0]:
# Load the reviews from the RAW JSON landing folder (not the processed Parquet copy).
raw_reviews_path = "abfss://lakehouse@goodreadsreviews60104758.dfs.core.windows.net/raw/reviews/"
raw_reviews = spark.read.json(raw_reviews_path)

# Working alias for the raw frame.
df = raw_reviews


In [0]:
# Silver-layer locations of the books and authors datasets.
books_path = "abfss://lakehouse@goodreadsreviews60104758.dfs.core.windows.net/processed/books/"
authors_path = "abfss://lakehouse@goodreadsreviews60104758.dfs.core.windows.net/processed/authors/"

books = spark.read.parquet(books_path)
authors = spark.read.parquet(authors_path)

# Sanity check: preview five rows of each dataset first ...
for frame in (books, authors):
    frame.show(5)
# ... then print both schemas to verify column names and data types.
for frame in (books, authors):
    frame.printSchema()

+----------+------------------+------------+-------------+----+--------+--------------+-----------+--------------------+--------------------+--------------------+------------------+---------+---------------+-------------+-----------------+-------------------+----------------+--------------------+--------------------+--------+-------------+--------+--------------------+--------------------+
|      isbn|text_reviews_count|country_code|language_code|asin|is_ebook|average_rating|kindle_asin|         description|              format|                link|         publisher|num_pages|publication_day|       isbn13|publication_month|edition_information|publication_year|                 url|           image_url| book_id|ratings_count| work_id|               title|title_without_series|
+----------+------------------+------------+-------------+----+--------+--------------+-----------+--------------------+--------------------+--------------------+------------------+---------+---------------+-------

In [0]:
from pyspark.sql.functions import col, length, trim, count, when
# Load the reviews currently stored at the processed (silver) path, before cleaning.
reviews = spark.read.parquet("abfss://lakehouse@goodreadsreviews60104758.dfs.core.windows.net/processed/reviews/")
# Peek at rows and schema
reviews.show(5, truncate=False)
reviews.printSchema()
# Basic profiling: counts and potential issues.
# All metrics are computed in ONE aggregation (a single scan of the dataset) instead of
# one .count() action per metric. count(when(cond, 1)) counts only the rows where cond is
# true, because when() yields NULL otherwise and count() skips NULLs.
profile = reviews.agg(
    count("*").alias("total_rows"),
    count(when(col("review_id").isNull(), 1)).alias("null_review_id"),
    count(when(col("book_id").isNull(), 1)).alias("null_book_id"),
    count(when(col("user_id").isNull(), 1)).alias("null_user_id"),
    count(when(col("rating").isNull(), 1)).alias("null_rating"),
    count(
        when(col("review_text").isNull() | (trim(col("review_text")) == ""), 1)
    ).alias("empty_text"),
).first()
# Expose the same variable names the rest of the notebook may rely on.
total_rows = profile["total_rows"]
null_review_id = profile["null_review_id"]
null_book_id = profile["null_book_id"]
null_user_id = profile["null_user_id"]
null_rating = profile["null_rating"]
empty_text = profile["empty_text"]
print(f"Total rows: {total_rows}")
print(f"NULL review_id: {null_review_id}, NULL book_id: {null_book_id}, NULL user_id:{null_user_id}, NULL rating: {null_rating}")
print(f"Empty/NULL review_text: {empty_text}")

+--------------------------------+--------+--------------------------------+------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [0]:
from pyspark.sql.functions import col, trim, length
# Clean starting from the RAW JSON frame (raw_reviews, loaded near the top of the
# notebook) rather than from the Parquet frame read off processed/reviews/.
# Why: the cleaned result is written back to processed/reviews/ with mode("overwrite").
# Spark evaluates lazily, so a frame derived from that same path would still be reading
# its input files while the overwrite replaces them, which can fail or lose data, and
# would make the cleaning depend on the output of a previous run.
# NOTE(review): assumes the raw JSON carries review_id, book_id, user_id, rating,
# review_text, n_votes and date_added - confirm against raw_reviews.printSchema().
df = (
    raw_reviews
    # 1) Drop rows missing critical keys
    .filter(
        col("review_id").isNotNull()
        & col("book_id").isNotNull()
        & col("user_id").isNotNull()
    )
    # 2) Enforce rating to be integer in [1..5]; values that fail the cast become NULL
    #    and are removed by the explicit NULL check below.
    .withColumn("rating_int", col("rating").cast("int"))
    .filter(col("rating_int").isNotNull() & col("rating_int").between(1, 5))
    # 3) Normalize text; drop empty or ultra-short reviews (<10 chars after trim)
    .withColumn("review_text", trim(col("review_text")))
    .filter(col("review_text").isNotNull() & (length(col("review_text")) >= 10))
    # 4) De-duplicate by review_id (keeps an arbitrary row per id; refine with a
    #    timestamp-based ordering if a deterministic choice is needed)
    .dropDuplicates(["review_id"])
)
# 5) Select final shape - INCLUDING n_votes and date_added
reviews_clean = df.select(
    "review_id",
    "book_id",
    "user_id",
    col("rating_int").alias("rating"),
    "review_text",
    "n_votes",
    "date_added",
)

In [0]:
# Persist the cleaned reviews to the silver layer, replacing whatever is stored there.
# NOTE(review): if reviews_clean is derived from a DataFrame that was itself read from
# this same path, overwriting it in place is unsafe (the input files are replaced while
# they are still being read) - verify the lineage of reviews_clean before running.
reviews_clean.write.parquet(
    "abfss://lakehouse@goodreadsreviews60104758.dfs.core.windows.net/processed/reviews/",
    mode="overwrite",
)


In [0]:
# Read the silver reviews back from storage to confirm what was actually written.
verified_path = "abfss://lakehouse@goodreadsreviews60104758.dfs.core.windows.net/processed/reviews/"
reviews_verified = spark.read.parquet(verified_path)

# Schema first, then a five-row untruncated sample, then the total row count.
reviews_verified.printSchema()
reviews_verified.show(5, truncate=False)
written_rows = reviews_verified.count()
print(f"Written cleaned rows: {written_rows}")


root
 |-- review_id: string (nullable = true)
 |-- book_id: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- rating: integer (nullable = true)
 |-- review_text: string (nullable = true)
 |-- n_votes: long (nullable = true)
 |-- date_added: string (nullable = true)

+--------------------------------+--------+--------------------------------+------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

Homework Part 1

In [0]:
# List the column names available on the books DataFrame (header line, then the list).
print("\n=== Books columns ===", books.columns, sep="\n")


=== Books columns ===
['isbn', 'text_reviews_count', 'country_code', 'language_code', 'asin', 'is_ebook', 'average_rating', 'kindle_asin', 'description', 'format', 'link', 'publisher', 'num_pages', 'publication_day', 'isbn13', 'publication_month', 'edition_information', 'publication_year', 'url', 'image_url', 'book_id', 'ratings_count', 'work_id', 'title', 'title_without_series']


In [0]:
# Build a two-column lookup (book_id, author_id) from the books table by relabelling its
# work_id column as author_id.
# NOTE(review): work_id looks like an identifier of the book's "work", not an author key;
# the books column list printed above contains no author column at all. Joining authors
# on this value would pair titles with unrelated author names (the sample query output
# further down appears to show exactly that) - confirm the correct book-to-author
# mapping source before relying on author_id / name downstream.
book_authors = books.select("book_id", "work_id").withColumnRenamed("work_id", "author_id")


In [0]:
# Narrow each lookup table to just the columns the curated dataset needs.
book_info = books.select("book_id", "title", "language_code")
author_names = authors.select("author_id", "name")

# Final column order of the curated (gold) dataset.
gold_columns = [
    "review_id",
    "book_id",
    "title",
    "author_id",
    "name",
    "user_id",
    "rating",
    "review_text",
    "language_code",
    "n_votes",
    "date_added",
]

# Inner joins throughout: a review is kept only when its book, the book's author_id
# mapping, and the matching author record all exist.
df_gold = (
    reviews_clean
    .join(book_info, on="book_id", how="inner")
    .join(book_authors, on="book_id", how="inner")
    .join(author_names, on="author_id", how="inner")
    .select(*gold_columns)
)


In [0]:
# Persist df_gold to the Gold zone in Delta format, replacing both the existing data
# and the existing schema at that location.
gold_path = "abfss://lakehouse@goodreadsreviews60104758.dfs.core.windows.net/gold/curated_reviews"

df_gold.write.save(
    gold_path,
    format="delta",
    mode="overwrite",
    overwriteSchema="true",
)



In [0]:
# Register the Delta files at gold_path as the table `curated_reviews`
# (no-op when a table with that name already exists).
create_table_stmt = f"CREATE TABLE IF NOT EXISTS curated_reviews USING DELTA LOCATION '{gold_path}'"
spark.sql(create_table_stmt)


DataFrame[]

In [0]:
# Quick check of the registered table: total row count, then a five-row sample.
row_count_df = spark.sql("SELECT COUNT(*) FROM curated_reviews")
row_count_df.show()

sample_rows_df = spark.sql("SELECT * FROM curated_reviews LIMIT 5")
sample_rows_df.show(truncate=False)


+--------+
|count(1)|
+--------+
|  788770|
+--------+

+--------------------------------+--------+--------------------------------------------------+---------+---------------+--------------------------------+------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [0]:
# Run, in order: a table-existence check, a row count, and an untruncated five-row
# sample. Each entry pairs the SQL text with the keyword arguments passed to .show().
verification_queries = (
    ("SHOW TABLES LIKE 'curated_reviews'", {}),
    ("SELECT COUNT(*) FROM curated_reviews", {}),
    ("SELECT * FROM curated_reviews LIMIT 5", {"truncate": False}),
)
for statement, show_options in verification_queries:
    spark.sql(statement).show(**show_options)


+--------+---------------+-----------+
|database|      tableName|isTemporary|
+--------+---------------+-----------+
| default|curated_reviews|      false|
+--------+---------------+-----------+

+--------+
|count(1)|
+--------+
|  788770|
+--------+

+--------------------------------+--------+--------------------------------------------------+---------+---------------+--------------------------------+------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [0]:
# Count the rows of the in-memory df_gold DataFrame (this re-evaluates its query plan).
gold_row_count = df_gold.count()
print("Gold table row count:", gold_row_count)

Gold table row count: 788770


In [0]:
# Preview ten rows of the Delta table through Spark SQL, keeping only the identifier,
# title, author and rating columns (review_text is not selected here).
preview_query = """
    SELECT
        review_id,
        book_id,
        title,
        author_id,
        name,
        user_id,
        rating
    FROM curated_reviews
    LIMIT 10
"""
preview_df = spark.sql(preview_query)
# Cap every displayed string value at 50 characters so long titles stay readable.
preview_df.show(truncate=50)

+--------------------------------+--------+--------------------------------------------------+---------+----------------+--------------------------------+------+
|                       review_id| book_id|                                             title|author_id|            name|                         user_id|rating|
+--------------------------------+--------+--------------------------------------------------+---------+----------------+--------------------------------+------+
|3818908d5a98733ba8e0d9340aaa01d3|17282103|                                  هكذا تكلم زرادشت|   196327|      Jody Rosen|b72b9ef6b43e415ee373a737ff3c43d1|     3|
|a04023de2a39a2cabfb956e56cb752e6|   35350|                           What the Body Remembers|    35285| Jimmy McDonough|b9c1edf6bcc9b18198bbedf212c1cee5|     3|
|092beaa2de4ea5a773fbeef45b710a2c|   92146|                    Carved in Bone (Body Farm, #1)|    88874|     Cris Freddi|6e1d7ac0a6e738b23864736a30957eaf|     5|
|ba230623304ed18eed0e93b96ac