In [1]:
import os

# For installing requirements (uncomment if there was an error)

# !apt-get install openjdk-11-jdk-headless -qq > /dev/null
# !wget -q https://archive.apache.org/dist/spark/spark-3.3.2/spark-3.3.2-bin-hadoop3.tgz
# !tar -xzf spark-3.3.2-bin-hadoop3.tgz
# os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
# os.environ["SPARK_HOME"] = "/content/spark-3.3.2-bin-hadoop3"

In [2]:
import pkg_resources

# Check if the necessary packages are installed
installed_packages = {pkg.key for pkg in pkg_resources.working_set}

# Install pyspark
if 'pyspark' not in installed_packages:
    !pip install -q pyspark

# Install findspark
if 'findspark' not in installed_packages:
    !pip install -q findspark

# Install kaggle CLI
if 'kaggle' not in installed_packages:
    !pip install -q kaggle

In [3]:
from google.colab import userdata

# Set KAGGLE_USERNAME and KAGGLE_KEY in the Secrets section of Colab
os.environ['KAGGLE_USERNAME'] = userdata.get('KAGGLE_USERNAME')
os.environ['KAGGLE_KEY'] = userdata.get('KAGGLE_KEY')

In [4]:
# Download and unzip the dataset
if not os.path.exists('Books_rating.csv'):
    !kaggle datasets download -d mohamedbakhet/amazon-books-reviews
    !unzip -o amazon-books-reviews.zip
else:
    print("The file 'Books_rating.csv' already exists.")

Dataset URL: https://www.kaggle.com/datasets/mohamedbakhet/amazon-books-reviews
License(s): CC0-1.0
Downloading amazon-books-reviews.zip to /content
 98% 1.04G/1.06G [00:11<00:00, 93.7MB/s]
100% 1.06G/1.06G [00:11<00:00, 98.9MB/s]
Archive:  amazon-books-reviews.zip
  inflating: Books_rating.csv        
  inflating: books_data.csv          


In [5]:
!nproc

2


In [6]:
!cat /proc/cpuinfo | grep "model name"

model name	: Intel(R) Xeon(R) CPU @ 2.20GHz
model name	: Intel(R) Xeon(R) CPU @ 2.20GHz


In [7]:
!cat /proc/meminfo | grep MemTotal

MemTotal:       13289424 kB


In [8]:
import findspark

# Initialize Spark
findspark.init()

from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").appName("JaccardSimilarity").getOrCreate()

spark.sparkContext.defaultParallelism

2

In [9]:
# Load only the review/text column
df = spark.read.csv("Books_rating.csv", header=True)

# Remove null and duplicate reviews
df = df.select("Id", "Title", "User_id", "review/score", "review/summary", "review/text").dropna().dropDuplicates()
df = df.withColumnRenamed("Id", "review_id")
df = df.withColumnRenamed("Title", "book_title")
df = df.withColumnRenamed("User_id", "user_id")
df = df.withColumnRenamed("review/score", "review_score")
df = df.withColumnRenamed("review/summary", "review_summary")
df = df.withColumnRenamed("review/text", "review_text")

df.cache()

print(f"Number of records: {df.count()}\n")
df.show(5, truncate=False)

Number of records: 2418791

+----------+----------------------------------------------------------------------------------------------------------+--------------+------------+-------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [10]:
# Sample the dataset
SAMPLE_FRACTION = 0.01
df_sampled = df.sample(withReplacement=False, fraction=SAMPLE_FRACTION, seed=42)

df_sampled.cache()

print(f"Number of samples: {df_sampled.count()}\n")

Number of samples: 24361



In [11]:
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover

# Tokenize
tokenizer = RegexTokenizer(inputCol="review_text", outputCol="text_words", pattern="\\W")
df_with_words  = tokenizer.transform(df_sampled)

df_with_words.cache()
df_with_words.show(5, truncate=False)

+----------+-------------------------------------------------------------------------------------------------------------------------------+--------------+------------+--------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [12]:
from pyspark.sql.functions import size

# Remove stopwords
remover = StopWordsRemover(inputCol="text_words", outputCol="tokens")
df_tokenized = remover.transform(df_with_words)

# Only keep rows with non-empty token lists
df_tokenized = df_tokenized.filter(size("tokens") > 0)

df_tokenized.cache()
df_tokenized.show(5, truncate=False)

+----------+-------------------------------------------------------------------------------------------------------------------------------+--------------+------------+--------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [13]:
from pyspark.ml.feature import HashingTF, MinHashLSH

# Convert token sets into feature vectors
hashingTF = HashingTF(inputCol="tokens", outputCol="features", numFeatures=10000)
df_hashed = hashingTF.transform(df_tokenized)

# Fit MinHash LSH model
mh = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=5)
model = mh.fit(df_hashed)

# Find similar review pairs (Jaccard distance < 0.6)
similar_pairs = model.approxSimilarityJoin(
    df_hashed, df_hashed, threshold=0.4, distCol="JaccardDistance"
).filter("datasetA.review_id <> datasetB.review_id")  # avoid duplicate pairs

similar_pairs.cache()
similar_pairs.show(5, truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [14]:
print(f"Number of similar pairs: {similar_pairs.count()}")

Number of similar pairs: 556


In [15]:
# General view
similar_pairs.select(
    "JaccardDistance", "datasetA.*", "datasetB.*"
).orderBy("JaccardDistance").show(10)

+---------------+----------+--------------------+--------------+------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+--------------+------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|JaccardDistance| review_id|          book_title|       user_id|review_score|      review_summary|         review_text|          text_words|              tokens|            features|              hashes| review_id|          book_title|       user_id|review_score|      review_summary|         review_text|          text_words|              tokens|            features|              hashes|
+---------------+----------+--------------------+--------------+------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+------