In [0]:
# The real storage-account key is intentionally replaced by a placeholder.
# NOTE(review): consider reading the key from a secret store rather than
# pasting it into the notebook — confirm what the workspace provides.
adls_key_setting = "fs.azure.account.key.goodreadsreviews60104758.dfs.core.windows.net"
spark.conf.set(adls_key_setting, "<access-key>")

Loading the train data

In [0]:
# Read the training split of the features_v2 Gold table (stored as Delta).
train_path = "abfss://lakehouse@goodreadsreviews60104758.dfs.core.windows.net/Gold_layer/features_v2/train"
train_df = spark.read.load(train_path, format="delta")

# Sanity check: preview the first 5 review texts, cut at 100 characters.
train_df.select("review_text").show(n=5, truncate=100)



+----------------------------------------------------------------------------------------------------+
|                                                                                         review_text|
+----------------------------------------------------------------------------------------------------+
|this takes place after ink exchange also. it focuses on niall's and irial's relationship. niall i...|
|love melissa marrs wicked lovely books and this novella did not let me down! very interesting and...|
|                                            review to come after i get some new year's eve dinner ;)|
|again... this was very entertaining to read. gives you adding insight into the side characters li...|
|                                                              irial and niall... what else is there?|
+----------------------------------------------------------------------------------------------------+
only showing top 5 rows


Preprocessing the `review_text` column

In [0]:
from pyspark.sql.functions import col, lower, trim, length, regexp_replace
# Lower-case the review text column. Per the original note this was already
# done in the previous notebook (lab 3), so here it acts as a safeguard.
review_text_lower = lower(col("review_text"))
train_df = train_df.withColumn("review_text", review_text_lower)
train_df.select("review_text").show(n=3, truncate=80)

+--------------------------------------------------------------------------------+
|                                                                     review_text|
+--------------------------------------------------------------------------------+
|this takes place after ink exchange also. it focuses on niall's and irial's r...|
|love melissa marrs wicked lovely books and this novella did not let me down! ...|
|                        review to come after i get some new year's eve dinner ;)|
+--------------------------------------------------------------------------------+
only showing top 3 rows


In [0]:
from pyspark.sql.functions import regexp_replace, trim, col

# Replace every run of characters that is neither a word character nor
# whitespace with a single space, then trim the ends.
# NOTE(review): this runs before the URL and emoji replacement cells further
# down, so "://", "." and emoji characters are presumably already gone by the
# time those cells execute — confirm the intended order of the cleaning steps.
punct_stripped = regexp_replace(col("review_text"), r"[^\w\s]+", " ")
train_df = train_df.withColumn("review_text", trim(punct_stripped))

# Collapse each run of whitespace into one space.
whitespace_collapsed = regexp_replace(col("review_text"), r"\s+", " ")
train_df = train_df.withColumn("review_text", whitespace_collapsed)

train_df.select("review_text").show(n=5, truncate=100)


+----------------------------------------------------------------------------------------------------+
|                                                                                         review_text|
+----------------------------------------------------------------------------------------------------+
|this takes place after ink exchange also it focuses on niall s and irial s relationship niall is ...|
|love melissa marrs wicked lovely books and this novella did not let me down very interesting and ...|
|                                               review to come after i get some new year s eve dinner|
|again this was very entertaining to read gives you adding insight into the side characters lives ...|
|                                                                  irial and niall what else is there|
+----------------------------------------------------------------------------------------------------+
only showing top 5 rows


In [0]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
import re

# Swap URLs and digit runs for placeholder tokens
def replace_urls_numbers(text):
    """Return ``text`` with URLs and numbers replaced by placeholder tokens.

    Any token starting with ``http`` or ``www`` (followed by at least one
    non-whitespace character) becomes ``<URL>``; afterwards every run of
    digits becomes ``<NUM>``. A ``None`` input yields an empty string.
    """
    if text is None:
        return ""
    # URLs go first so that digits inside a URL are not turned into <NUM>.
    without_urls = re.sub(r"http\S+|www\S+|https\S+", "<URL>", text, flags=re.MULTILINE)
    return re.sub(r"\d+", "<NUM>", without_urls)

# Wrap the cleaner as a string-returning Spark UDF and overwrite review_text
# with its output.
url_num_udf = udf(replace_urls_numbers, returnType=StringType())
cleaned_review_text = url_num_udf(col("review_text"))
train_df = train_df.withColumn("review_text", cleaned_review_text)
train_df.select("review_text").show(n=5, truncate=100)

+----------------------------------------------------------------------------------------------------+
|                                                                                         review_text|
+----------------------------------------------------------------------------------------------------+
|this takes place after ink exchange also it focuses on niall s and irial s relationship niall is ...|
|love melissa marrs wicked lovely books and this novella did not let me down very interesting and ...|
|                                               review to come after i get some new year s eve dinner|
|again this was very entertaining to read gives you adding insight into the side characters lives ...|
|                                                                  irial and niall what else is there|
+----------------------------------------------------------------------------------------------------+
only showing top 5 rows


In [0]:
import subprocess, sys
# Install the `emoji` package with pip into the interpreter running this notebook.
# NOTE(review): whether Spark workers also receive the package is not visible
# here — confirm, since the emoji UDF executes on the workers.
emoji_install_cmd = [sys.executable, "-m", "pip", "install", "emoji", "-q"]
subprocess.check_call(emoji_install_cmd)


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


0

In [0]:
import emoji

# Swap emoji characters for a placeholder token
def replace_emoji(text):
    """Return ``text`` with every emoji replaced by ``<EMOJI>``.

    A ``None`` input yields an empty string; everything else is delegated
    to ``emoji.replace_emoji``.
    """
    return "" if text is None else emoji.replace_emoji(text, replace="<EMOJI>")

# Wrap the emoji cleaner as a string-returning Spark UDF and overwrite
# review_text with its output.
emoji_udf = udf(replace_emoji, returnType=StringType())
emoji_free_review_text = emoji_udf(col("review_text"))
train_df = train_df.withColumn("review_text", emoji_free_review_text)
train_df.select("review_text").show(n=5, truncate=100)

+----------------------------------------------------------------------------------------------------+
|                                                                                         review_text|
+----------------------------------------------------------------------------------------------------+
|this takes place after ink exchange also it focuses on niall s and irial s relationship niall is ...|
|love melissa marrs wicked lovely books and this novella did not let me down very interesting and ...|
|                                               review to come after i get some new year s eve dinner|
|again this was very entertaining to read gives you adding insight into the side characters lives ...|
|                                                                  irial and niall what else is there|
+----------------------------------------------------------------------------------------------------+
only showing top 5 rows


In [0]:
# Strip leading and trailing spaces (per the original note, lab 3 already did this).
trimmed_review_text = trim(col("review_text"))
train_df = train_df.withColumn("review_text", trimmed_review_text)
train_df.select("review_text").show(n=5, truncate=100)

+----------------------------------------------------------------------------------------------------+
|                                                                                         review_text|
+----------------------------------------------------------------------------------------------------+
|this takes place after ink exchange also it focuses on niall s and irial s relationship niall is ...|
|love melissa marrs wicked lovely books and this novella did not let me down very interesting and ...|
|                                               review to come after i get some new year s eve dinner|
|again this was very entertaining to read gives you adding insight into the side characters lives ...|
|                                                                  irial and niall what else is there|
+----------------------------------------------------------------------------------------------------+
only showing top 5 rows


Feature extraction

1. basic text features

In [0]:
from pyspark.sql.functions import size, split
# Basic size features: character count and number of space-separated tokens.
train_df = (
    train_df
    .withColumn("review_length_chars", length(col("review_text")))
    .withColumn("review_length_words", size(split(col("review_text"), " ")))
)

# Keep only reviews that are at least 10 characters long.
train_df = train_df.where(col("review_length_chars") >= 10)
train_df.select("review_text", "review_length_chars", "review_length_words").show(n=5, truncate=100)

+----------------------------------------------------------------------------------------------------+-------------------+-------------------+
|                                                                                         review_text|review_length_chars|review_length_words|
+----------------------------------------------------------------------------------------------------+-------------------+-------------------+
|                                                  cant wait to read book <NUM> will harry potter die|                 50|                 10|
|this series has now moved into the category of one i have re read so many times i ve lost count i...|                245|                 55|
|                                                                        loved the harry potter books|                 28|                  5|
|                                          this right here is my childhood i owe those books big time|                 58|                 12|

In [0]:
# Install nltk with pip into the interpreter running this notebook.
# NOTE(review): whether Spark workers also receive the package is not visible
# here — confirm before relying on it inside UDFs.
nltk_install_cmd = [sys.executable, "-m", "pip", "install", "nltk", "-q"]
subprocess.check_call(nltk_install_cmd)
import nltk


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


2. Sentiment Features extraction

In [0]:
# Download the VADER lexicon resource used by the sentiment analyzer imported
# below; quiet=True keeps the download log out of the cell output.
nltk.download('vader_lexicon', quiet=True)

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from pyspark.sql.functions import pandas_udf, struct, col
from pyspark.sql.types import StructType, StructField, FloatType
import pandas as pd


In [0]:
# One VADER analyzer instance; the sentiment UDF refers to it by name.
sid = SentimentIntensityAnalyzer()

# Struct returned by the sentiment UDF: one nullable float per VADER score.
sentiment_schema = StructType(
    [StructField(score_name, FloatType(), True) for score_name in ("pos", "neg", "neu", "compound")]
)

@pandas_udf(sentiment_schema)
def vader_sentiment_udf(texts: pd.Series) -> pd.DataFrame:
    """Score a batch of review texts with VADER.

    Args:
        texts: Batch of review strings; null entries are scored as "".

    Returns:
        DataFrame with float columns ``pos``, ``neg``, ``neu`` and
        ``compound`` (the keys of ``sid.polarity_scores``), one row per
        input text, in input order.
    """
    # Null-safe like the other pandas UDFs in this notebook: VADER expects a
    # string, so nulls are replaced by "" before scoring.
    score_dicts = [sid.polarity_scores(text) for text in texts.fillna("")]
    # Build the frame in one pass; the explicit column list fixes both the
    # column order and the layout of an empty batch.
    return pd.DataFrame(
        score_dicts,
        columns=["pos", "neg", "neu", "compound"],
        dtype="float64",
    )

# Score every review; the UDF yields a single struct column named "sentiment".
train_df = train_df.withColumn("sentiment", vader_sentiment_udf(col("review_text")))

# Promote each struct field to a top-level sentiment_<field> column.
for score_field in ("pos", "neg", "neu", "compound"):
    train_df = train_df.withColumn(f"sentiment_{score_field}", col(f"sentiment.{score_field}"))

# The struct itself is no longer needed once its fields are flattened.
train_df = train_df.drop("sentiment")


In [0]:
# Preview the text next to its length and sentiment features.
sentiment_preview_cols = [
    "review_text",
    "review_length_words",
    "review_length_chars",
    "sentiment_pos",
    "sentiment_neg",
    "sentiment_neu",
    "sentiment_compound",
]
train_df.select(*sentiment_preview_cols).show(n=5, truncate=100)


+----------------------------------------------------------------------------------------------------+-------------------+-------------------+-------------+-------------+-------------+------------------+
|                                                                                         review_text|review_length_words|review_length_chars|sentiment_pos|sentiment_neg|sentiment_neu|sentiment_compound|
+----------------------------------------------------------------------------------------------------+-------------------+-------------------+-------------+-------------+-------------+------------------+
|fascinating imagination i felt holly thorne more as a bully jim should have been given more chara...|                120|                615|        0.131|        0.108|         0.76|            0.2782|
|this is another one of those books that i think started out well i liked the way it was unfolding...|                176|                834|        0.181|        0.093|        0.726|

3. TF-IDF 

First, testing on sample data using scikit-learn

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd
from pyspark.sql.functions import pandas_udf, col
from pyspark.sql.types import FloatType, StructType, StructField

In [0]:
# Upper bound on how many reviews are pulled to the driver for fitting
# (keeps memory use bounded).
sample_size = 100000
sample_pdf = train_df.limit(sample_size).select("review_text").toPandas()
sample_texts = sample_pdf["review_text"].fillna("").tolist()

# Unigram + bigram TF-IDF with English stop words removed and the
# vocabulary capped at 500 terms.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words='english', max_features=500)

tfidf_vectorizer.fit(sample_texts)


In [0]:
# Struct returned by the TF-IDF summary UDF: one nullable float per statistic.
tfidf_schema = StructType(
    [StructField(stat_name, FloatType(), True) for stat_name in ("tfidf_mean", "tfidf_max", "tfidf_min")]
)

# Pandas UDF that summarises each review's TF-IDF row, one batch at a time
@pandas_udf(tfidf_schema)
def tfidf_features_udf(texts: pd.Series) -> pd.DataFrame:
    """Return per-review mean, max and min of the fitted TF-IDF weights.

    Null texts are treated as "". Each statistic is taken along axis 1 of
    the matrix returned by ``tfidf_vectorizer.transform``.
    NOTE(review): the sample output shows tfidf_min as 0.0 for every row,
    presumably because the min also covers terms absent from the review —
    confirm this feature is useful as defined.
    """
    docs = texts.fillna("").tolist()
    weights = tfidf_vectorizer.transform(docs)
    row_mean = np.asarray(weights.mean(axis=1)).ravel()
    row_max = np.asarray(weights.max(axis=1).toarray()).ravel()
    row_min = np.asarray(weights.min(axis=1).toarray()).ravel()
    return pd.DataFrame({"tfidf_mean": row_mean, "tfidf_max": row_max, "tfidf_min": row_min})

# Compute the TF-IDF summary struct for every review.
train_df = train_df.withColumn("tfidf_features", tfidf_features_udf(col("review_text")))

# Flatten the struct into top-level columns, then discard it.
for stat_name in ("tfidf_mean", "tfidf_max", "tfidf_min"):
    train_df = train_df.withColumn(stat_name, col(f"tfidf_features.{stat_name}"))
train_df = train_df.drop("tfidf_features")


In [0]:
# Preview the text next to its TF-IDF summary statistics.
tfidf_preview_cols = ["review_text", "tfidf_mean", "tfidf_max", "tfidf_min"]
train_df.select(*tfidf_preview_cols).show(n=5, truncate=100)


+----------------------------------------------------------------------------------------------------+------------+----------+---------+
|                                                                                         review_text|  tfidf_mean| tfidf_max|tfidf_min|
+----------------------------------------------------------------------------------------------------+------------+----------+---------+
|i remember being surprised to learn that harry would not be attending hogwarts for his final year...|0.0088253785|  0.723066|      0.0|
|the worst of the potter books so far it s far to long in such a way that the reader can point out...| 0.008297608| 0.4296444|      0.0|
|<NUM> <NUM> heartbreaking stars rowling connects every single detail efficiently i can t quite na...|  0.01381674|0.35316837|      0.0|
|loved this one the twist and turns is this chracter good or bad wont know till you read the <NUM>...|0.0056525012| 0.5053352|      0.0|
|                                   amazi

Implementing TF-IDF on the full dataset

In [0]:
# TFIDF on full dataset
from pyspark.ml.feature import Tokenizer, HashingTF, IDF

# Stage 1: tokenize review_text into a "words" array column.
tokenizer = Tokenizer(outputCol="words", inputCol="review_text")
words_data = tokenizer.transform(train_df)

# Stage 2: hash the tokens into a term-frequency vector with 2**18 (262144) buckets.
hashingTF = HashingTF(numFeatures=1 << 18, inputCol="words", outputCol="rawFeatures")
featurized_data = hashingTF.transform(words_data)

# Stage 3: fit IDF on the term-frequency vectors and rescale them into "features".
idf = IDF(outputCol="features", inputCol="rawFeatures")
idf_model = idf.fit(featurized_data)
tfidf_data = idf_model.transform(featurized_data)


🏃 View run righteous-sloth-126 at: https://adb-2076600740548790.10.azuredatabricks.net/ml/experiments/867946936295008/runs/a21a5c24900b4d51b4b76da39dfd4263
🧪 View experiment at: https://adb-2076600740548790.10.azuredatabricks.net/ml/experiments/867946936295008


In [0]:
# Preview the IDF-weighted vectors in "features" next to their source text
# (5 rows, each cell cut at 100 characters).
tfidf_data.select("review_text", "features").show(5, truncate=100)


+----------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+
|                                                                                         review_text|                                                                                            features|
+----------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+
|                                                  cant wait to read book <NUM> will harry potter die|(262144,[27576,37371,53570,89356,150069,163059,189113,202949,243658,251130],[0.4496663873474542,5...|
|this series has now moved into the category of one i have re read so many times i ve lost count i...|(262144,[19036,19153,19263,21823,25599,29977,30950,34116,42404,53570,55639,68435,6

In [0]:
# Rename the IDF output column "features" to "tfidf_features"; tfidf_data
# itself is left untouched.
tfidf_data_renamed = tfidf_data.withColumnRenamed("features", "tfidf_features")


In [0]:
# Attach the Spark TF-IDF vectors to train_df.
# tfidf_data was produced by transforming train_df itself, so it already holds
# every train_df column plus the pipeline outputs. Dropping the intermediate
# "words" and "rawFeatures" columns therefore gives train_df + tfidf_features
# directly. This avoids two problems with the previous join: it referenced
# tfidf_features_df before any cell had defined it (NameError on a
# top-to-bottom run), and joining on review_text multiplies rows whenever
# several reviews share the same text.
train_df = tfidf_data_renamed.drop("words", "rawFeatures")

4. Semantic Embedding Features - SBERT

first on sample data - for testing

In [0]:
import subprocess, sys
# Install the packages this section needs, one quiet pip call per package.
# NOTE(review): pip targets the interpreter running this notebook; whether
# Spark workers also receive the packages is not visible here — confirm.
for pip_package in ("sentence-transformers", "scikit-learn"):
    subprocess.check_call([sys.executable, "-m", "pip", "install", pip_package, "-q"])

from sentence_transformers import SentenceTransformer
from pyspark.sql.functions import pandas_udf, col
from pyspark.sql.types import ArrayType, FloatType
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA

# Small, fast Sentence-BERT model producing 384-dimensional embeddings.
model_name = "all-MiniLM-L6-v2"
model = SentenceTransformer(model_name)

# Return type of the embedding UDF: an array of floats per review.
embedding_schema = ArrayType(FloatType())

# Pull a reproducible 1% sample of review texts to the driver.
sample_df = train_df.sample(fraction=0.01, seed=42)
sample_texts = [row[0] for row in sample_df.select("review_text").collect()]

# Embed the sample on the driver as float32.
sample_embs = np.array(
    model.encode(sample_texts, batch_size=128, show_progress_bar=True),
    dtype=np.float32,
)

# Fit PCA on the sample to reduce embeddings from 384 to 128 dimensions.
pca_model = PCA(n_components=128, random_state=42).fit(sample_embs)

@pandas_udf(embedding_schema)
def sbert_embedding_pca_udf(texts: pd.Series) -> pd.Series:
    """Embed a batch of texts with SBERT and project them with the fitted PCA.

    Null texts are replaced by "" before encoding. Each output element is a
    list of float32 values produced by ``pca_model.transform``.
    """
    docs = texts.fillna("").tolist()
    raw_embeddings = np.array(
        model.encode(docs, batch_size=128, show_progress_bar=False),
        dtype=np.float32,
    )
    projected = pca_model.transform(raw_embeddings)
    # Cast back to float32 before converting each row to a plain list.
    return pd.Series([vec.astype(np.float32).tolist() for vec in projected])

print("Applying SBERT embeddings with PCA to review_text...")
# Add the PCA-reduced embedding as a new column and preview a few rows.
reduced_embedding_col = sbert_embedding_pca_udf(col("review_text"))
train_df = train_df.withColumn("bert_embedding", reduced_embedding_col)

train_df.select("review_text", "bert_embedding").show(n=5, truncate=100)


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/815 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

🏃 View run luxuriant-goat-687 at: https://adb-2076600740548790.10.azuredatabricks.net/ml/experiments/1288574647885227/runs/fc936578cce94359824f92c73a9ff47d
🧪 View experiment at: https://adb-2076600740548790.10.azuredatabricks.net/ml/experiments/1288574647885227
Applying SBERT embeddings with PCA to review_text...
+----------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+
|                                                                                         review_text|                                                                                      bert_embedding|
+----------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+
|the mixed media art was very fun with this book it is a story of a

In [0]:
# Lookup table: one TF-IDF vector per distinct review text.
# The vector is computed from review_text alone (Tokenizer -> HashingTF -> IDF),
# so rows sharing a text share a vector and de-duplicating on the key loses
# nothing. Without it, the join multiplies rows for every repeated review text.
tfidf_features_df = (
    tfidf_data_renamed
    .select("review_text", "tfidf_features")
    .dropDuplicates(["review_text"])
)

# Join only when the column is not there yet, so re-running this cell cannot
# create a second, ambiguous "tfidf_features" column.
if "tfidf_features" not in train_df.columns:
    train_df = train_df.join(tfidf_features_df, on="review_text", how="left")

In [0]:
# Rename the IDF output before joining so it cannot clash with an existing
# "tfidf_features" column, and keep only the join key and the vector.
# The vector is computed from review_text alone, so de-duplicating on the key
# loses nothing; without it the join multiplies rows for every repeated
# review text.
tfidf_features_df = (
    tfidf_data
    .withColumnRenamed("features", "tfidf_features_new")
    .select("review_text", "tfidf_features_new")
    .dropDuplicates(["review_text"])
)

# Join with train_df (left join: every train_df row is kept).
train_df = train_df.join(tfidf_features_df, on="review_text", how="left")

# Drop any earlier "tfidf_features" column and give the fresh one that name.
train_df = train_df.drop("tfidf_features").withColumnRenamed("tfidf_features_new", "tfidf_features")

In [0]:
# Preview the joined TF-IDF vectors next to their source text
# (5 rows, each cell cut at 100 characters).
train_df.select("review_text", "tfidf_features").show(5, truncate=100)

SBERT on Full dataset using batches

In [0]:
from pyspark.sql.functions import pandas_udf, col
from pyspark.sql.types import ArrayType, FloatType
from sentence_transformers import SentenceTransformer
import pandas as pd

# Load the all-MiniLM-L6-v2 Sentence-BERT model used for the full-dataset
# embeddings; the embedding UDF defined below refers to it by name.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Pandas UDF that embeds one batch of review texts at a time
@pandas_udf(ArrayType(FloatType()))
def sbert_embedding(texts: pd.Series) -> pd.Series:
    """Embed a batch of texts with the loaded Sentence-BERT model.

    Args:
        texts: Batch of review strings; null entries are embedded as "".

    Returns:
        Series with one list of floats (the full, unreduced embedding) per
        input text, in input order.
    """
    # Null-safe like sbert_embedding_pca_udf: nulls become "" before encoding,
    # so a single missing review cannot fail the whole batch.
    docs = texts.fillna("").tolist()
    embeddings = model.encode(docs, show_progress_bar=False)
    return pd.Series(embeddings.tolist())

# Add the full SBERT embedding of every review as "sbert_features".
sbert_features_col = sbert_embedding(col("review_text"))
train_df = train_df.withColumn("sbert_features", sbert_features_col)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [0]:
# Preview the SBERT embeddings next to their source text
# (5 rows, each cell cut at 100 characters).
train_df.select("review_text", "sbert_features").show(5, truncate=100)


+----------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+
|                                                                                         review_text|                                                                                      sbert_features|
+----------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+
|fascinating imagination i felt holly thorne more as a bully jim should have been given more chara...|[-0.07158677, -0.015995646, 0.05530847, 0.037921682, 0.0056503434, 0.029912995, -0.009608623, 0.0...|
|this is another one of those books that i think started out well i liked the way it was unfolding...|[-0.06610921, -0.0182322, 0.04507604, -0.024027187, 0.032871213, 0.080541745, 0.04

combining feature set and output

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Identifier/label columns carried through unchanged.
metadata_cols = ["review_id", "book_id", "rating"]
# Engineered features: sentiment scores, length counts, SBERT and TF-IDF vectors.
feature_cols = [
    "sentiment_pos", "sentiment_neg", "sentiment_neu", "sentiment_compound",
    "review_length_words", "review_length_chars", "sbert_features", "tfidf_features"
]

# Final table = metadata columns followed by feature columns, in that order.
final_features_df = train_df.select(*metadata_cols, *feature_cols)

# Written to a new folder so the new schema cannot conflict with an older table.
# NOTE(review): the variable name ends in "_train" but the path has no
# "/train" suffix, unlike the features_v2/train input path — confirm the
# intended output location.
gold_path_v3_train = "abfss://lakehouse@goodreadsreviews60104758.dfs.core.windows.net/Gold_layer/features_v3"

final_features_df.write.save(gold_path_v3_train, format="delta", mode="overwrite")
