In [1]:
import pyspark.sql.functions as func
from pyspark.sql.types import IntegerType, FloatType
from pyspark.ml.feature import StringIndexer
from pyspark.sql import SparkSession, DataFrame, Window
from typing import Optional

In [2]:
spark = (SparkSession.builder
                     .appName('Recommendations')
                     .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
                     .config("spark.kryoserializer.buffer.max", "2047m")
                     .config("spark.driver.memory", "15g")
                     .getOrCreate()
          )

# Set Hadoop configurations to use the service account JSON key
# sc = spark.sparkContext
# sc._jsc.hadoopConfiguration().set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
# sc._jsc.hadoopConfiguration().set("fs.gs.auth.service.account.enable", "true")
# sc._jsc.hadoopConfiguration().set("fs.gs.auth.service.account.json.keyfile", f"../keys/{json_key}")

your 131072x1 screen size is bogus. expect trouble
24/04/27 02:02:49 WARN Utils: Your hostname, mainpc resolves to a loopback address: 127.0.1.1; using 172.25.80.73 instead (on interface eth0)
24/04/27 02:02:49 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/04/27 02:02:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
def preprocessing_business(spark,
                           city_name:str ='Philadelphia',
                           category:str = 'restaurant',
                           min_star_rating: Optional[int]= None,
                           min_review_count:int =10):

    df = spark.read.csv('./dataset_business.csv', header=True)
    df_filter = df.filter(
        (func.col('is_open')==1) &
        (func.lower(func.col('city'))== city_name.lower()) &
        (func.lower(func.col('categories')).contains(category.lower())) &
        (func.col('review_count') >= min_review_count)
    )
    if min_star_rating is not None:
        df_filter = df_filter.filter(func.col('stars') >= min_star_rating)

    df_select = df_filter.select(
        func.col('business_id'),
        func.col('categories'),
        func.col('name'),
        func.col('review_count'),
        func.col('stars').alias('business_stars')
    )

    string_indexer = StringIndexer(inputCol='business_id', outputCol='business_id_encode')
    model = string_indexer.fit(df_select)
    city_business_num_id = model.transform(df_select)

    # Convert encoded business_id to integer type
    city_business_num_id = city_business_num_id.withColumn(
        'business_id_encode',
        func.col('business_id_encode').cast(IntegerType())
    )

    return city_business_num_id

In [4]:
def preprocess_review(spark,
                      min_review_count: int = 10,
                      min_star_rating: Optional[int] = None,
                      ) -> DataFrame:

    df_reviews = spark.read.json('./slim_review.json')

    # Group by user_id to calculate review counts per user
    df_user_review_counts = df_reviews.groupBy("user_id").agg(
        func.count("review_id").alias("user_review_count")
    ).filter(func.col("user_review_count") >= min_review_count)

    # Join back to the original reviews to filter users by their review counts
    df_filtered_reviews = df_reviews.join(
        df_user_review_counts, "user_id"
    )

    # Optionally filter by star rating
    if min_star_rating is not None:
        df_filtered_reviews = df_filtered_reviews.filter(
            func.col("stars") >= min_star_rating
        )

    # Select and rename columns to match the SQL query
    df_final = df_filtered_reviews.select(
        func.col("user_id"),
        func.col("business_id"),
        func.col("date"),
        func.col("review_id"),
        func.col("stars").alias("user_stars"),
        func.col("text").alias("user_reviews")
    )

    # Encode the user_id using StringIndexer
    string_indexer = StringIndexer(inputCol='user_id', outputCol='user_id_encode')
    model = string_indexer.fit(df_final)
    user_reviews_num_id = model.transform(df_final)

    # Convert encoded user_id to integer type
    user_reviews_num_id = user_reviews_num_id.withColumn(
        'user_id_encode',
        func.col('user_id_encode').cast(IntegerType())
    )

    return user_reviews_num_id

In [5]:
reviews = preprocess_review(spark=spark)
businesses = preprocessing_business(spark=spark)

business_user_review = reviews.join(businesses,
                                    on='business_id',
                                    how='inner')

                                                                                

In [6]:
businesses.printSchema()

root
 |-- business_id: string (nullable = true)
 |-- categories: string (nullable = true)
 |-- name: string (nullable = true)
 |-- review_count: string (nullable = true)
 |-- business_stars: string (nullable = true)
 |-- business_id_encode: integer (nullable = true)



In [7]:
reviews.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- business_id: string (nullable = true)
 |-- date: string (nullable = true)
 |-- review_id: string (nullable = true)
 |-- user_stars: long (nullable = true)
 |-- user_reviews: string (nullable = true)
 |-- user_id_encode: integer (nullable = true)



In [8]:
reviews.show(10)

[Stage 15:>                                                       (0 + 12) / 12]

+--------------------+--------------------+-------------------+--------------------+----------+--------------------+--------------+
|             user_id|         business_id|               date|           review_id|user_stars|        user_reviews|user_id_encode|
+--------------------+--------------------+-------------------+--------------------+----------+--------------------+--------------+
|-Cw2rJx6v8gHgWOBX...|6_T2xzR74JqGCTPef...|2011-08-16 00:08:57|bZBN5lFvhz1UebCmj...|         5|FRESH FRESH FRESH...|          5444|
|-Cw2rJx6v8gHgWOBX...|PP3BBaVxZLcJU54uP...|2011-08-29 15:15:25|iZGzgccv0AXwEidKw...|         4|How could you not...|          5444|
|-Cw2rJx6v8gHgWOBX...|hX_8ZUmIqWFWzjdiP...|2012-01-20 19:50:08|WlxKsYqEtaGYnAdoT...|         4|Ordered the pigno...|          5444|
|-Cw2rJx6v8gHgWOBX...|ntiIq1FNqduOyyowM...|2011-08-16 00:10:12|rYV5irskfexITtv-p...|         5|affordable and ve...|          5444|
|-Cw2rJx6v8gHgWOBX...|IWHdx0NhDKADkGOgX...|2011-08-16 01:03:23|MveqEVKQQYy-V

                                                                                

In [9]:
business_user_review.printSchema()

root
 |-- business_id: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- date: string (nullable = true)
 |-- review_id: string (nullable = true)
 |-- user_stars: long (nullable = true)
 |-- user_reviews: string (nullable = true)
 |-- user_id_encode: integer (nullable = true)
 |-- categories: string (nullable = true)
 |-- name: string (nullable = true)
 |-- review_count: string (nullable = true)
 |-- business_stars: string (nullable = true)
 |-- business_id_encode: integer (nullable = true)



In [10]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.feature import StopWordsRemover, RegexTokenizer, HashingTF, IDF, Normalizer
from pyspark.ml import Pipeline
from pyspark.ml.linalg import Vectors

In [11]:
def train_test_split(
    df_reviews: DataFrame,
    train_perc: int = 0.7) -> DataFrame:
    
    #Random shuffle
    shuffled_final = df_reviews.orderBy(func.rand(42))

    # Define window specification partitioned by the user_id column and corresponding row number
    window_spec = Window.partitionBy("user_id").orderBy("user_id")
    final_reviews_with_row_number = shuffled_final.withColumn("row_number", func.row_number().over(window_spec))

    # Calculate the total number of rows for each user
    user_counts = final_reviews_with_row_number.groupBy("user_id").count()

    # Calculate the number of rows to include in the training set for each user
    user_test_counts = user_counts.withColumn("test_count", (func.col("count") * (1-train_perc)).cast("int"))

    #Join row number data with test count
    final_with_test_counts = final_reviews_with_row_number.join(user_test_counts, "user_id", "inner")

    # Create training and testing DataFrames for each user
    train_data = final_with_test_counts.filter(func.col("row_number") <= (func.col("count") - func.col("test_count")))
    test_data = final_with_test_counts.filter(func.col("row_number") > (func.col("count") - func.col("test_count")))

    # Drop the intermediate columns
    train_data = train_data.drop("row_number", "count", "test_count")
    test_data = test_data.drop("row_number", "count", "test_count")
    
    return train_data, test_data

In [12]:
(train, test) = train_test_split(df_reviews=business_user_review)

In [13]:
# initial train test datasets size
train.count(), test.count()

                                                                                

(142445, 56510)

In [14]:
# make ALS model
# best rank: 14
# best regParam: 0.19
als = ALS(userCol='user_id_encode',
          itemCol='business_id_encode',
          ratingCol='user_stars',
          coldStartStrategy='drop',
          nonnegative=True,
          rank=14,
          regParam=0.19
          )

evaluator = RegressionEvaluator(
    metricName='rmse',
    labelCol='user_stars',
    predictionCol='prediction'
)

In [15]:
als_params = ParamGridBuilder().addGrid(als.rank, [12,13,14]) \
                               .addGrid(als.regParam, [0.17,0.18,0.19]) \
                               .build()        
cv = CrossValidator(
        estimator=als,
        estimatorParamMaps=als_params,
        evaluator=evaluator
    )

In [16]:
# model = cv.fit(train)

In [17]:
model = als.fit(train)

24/04/27 02:03:19 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS


In [18]:
predictions = model.transform(test)
rmse = evaluator.evaluate(predictions)
print(rmse)

                                                                                

1.0269023512011684


                                                                                

In [19]:
user_rec = model.recommendForAllUsers(5)
# https://github.com/apache/spark/blob/master/examples/src/main/python/ml/als_example.py

In [20]:
# user_rec.count()

In [21]:
# test.printSchema()

In [22]:
# filtered_user_rec = user_rec.withColumn(
#     "filtered_recommendations",
#     func.expr("filter(recommendations, item -> item.rating > 4.0)")
# )

# non_empty_user_rec = filtered_user_rec.filter(
#     func.size(func.col("filtered_recommendations")) > 0
# )
# # Display the filtered results
# non_empty_user_rec.select("user_id_encode", "filtered_recommendations").show(10, truncate=False)

In [23]:
# test.filter((func.col('user_id_encode')==928) & (func.col('business_id_encode')==1209)).show()

In [24]:
# preprocess_review(spark=spark).show(5)

In [25]:
# preprocessing_business(spark=spark, min_review_count=3).show(5)

In [26]:
# process text reviews by user
def review_text_processing(df_train: DataFrame,
                           df_test: DataFrame,
                           reivew_col: str = 'user_reviews',
                           rating_cut_off: float = 0) -> DataFrame:

    df_train_lower = df_train.withColumn(reivew_col, func.lower(reivew_col))
    df_test_lower = df_test.withColumn(reivew_col, func.lower(reivew_col))
    
    
    # aggregate reviews by business in train
    df_train_agg = df_train_lower.groupBy('business_id', 'name').agg(
        func.concat_ws(' ', func.collect_list('user_reviews')).alias(reivew_col),
        func.mean('business_stars').alias('business_stars'),
    ).filter(
        func.col('business_stars') >= rating_cut_off
    )
    # aggregate reviews by user
    df_test_agg = df_test_lower.groupBy('user_id').agg(
        func.concat_ws(' ', func.collect_list('user_reviews')).alias(reivew_col)
    )
    # Define the regex tokenizer
    regex_tokenizer = RegexTokenizer(
        inputCol=reivew_col,
        outputCol='words',
        pattern="\\W"  # This regex splits the text at any non-word character
    )
    # Define the stopwords remover
    stopwords_remover = StopWordsRemover(inputCol='words', outputCol='filtered_words')

    # tf-idf
    hashing_tf = HashingTF(inputCol='filtered_words',
                           outputCol='raw_features', numFeatures=100)
    idf = IDF(inputCol='raw_features', outputCol='features')
    # Define the pipeline with the stages
    pipeline = Pipeline(stages=[regex_tokenizer, stopwords_remover, hashing_tf, idf])

    # Fit the pipeline to the data and transform the data
    model = pipeline.fit(df_train_agg)
    train_transformed = model.transform(df_train_agg)
    test_transformed = model.transform(df_test_agg)

    # remove extra columns
    train_transformed = train_transformed.drop(
        'date',
        # 'user_id',
        'categories',
        'review_count',
        # 'business_id',
        'words',
        'filtered_words',
        'raw_features',
        reivew_col
    )
    test_transformed = test_transformed.drop(
        'date',
        # 'user_id',
        'categories',
        'review_count',
        # 'business_id',
        'words',
        'filtered_words',
        'raw_features',
        reivew_col
    )

    for col in train_transformed.columns:
        if col != 'features':
            train_transformed = train_transformed.withColumnRenamed(col, 'train_'+col)

    for col in test_transformed.columns:
        if col != 'features':
            test_transformed = test_transformed.withColumnRenamed(col, 'test_'+col)

    return train_transformed, test_transformed

In [27]:
# df_train.show(5)

In [28]:
def cosine_similarity(features1, features2):
    return float(float(features1.dot(features2)) / (Vectors.norm(features1, 2) * Vectors.norm(features2, 2)))

cosine_similarity_udf = func.udf(cosine_similarity, FloatType())

def cosine_recommendation(train_transformed: DataFrame,
                          test_transformed: DataFrame,
                          sim_cut_off: float = 0.5) -> DataFrame:

    normalizer = Normalizer(inputCol='features', outputCol='norm_features', p=2.0)
    train_normalized = normalizer.transform(train_transformed).withColumnRenamed("norm_features", "train_norm_features")
    test_normalized = normalizer.transform(test_transformed).withColumnRenamed("norm_features", "test_norm_features")


    # Perform a Cartesian join to calculate cosine similarity between every test and train pair
    cartesian_df = test_normalized.crossJoin(train_normalized)
    
    result_df = cartesian_df.withColumn(
        'similarity',
        cosine_similarity_udf(cartesian_df['test_norm_features'], cartesian_df['train_norm_features'])
    )
    result_df = result_df.filter(
        func.col('similarity') >= sim_cut_off
    )
    return result_df

In [29]:
def get_hit_rate(ground_truth_df, # test data
                 recommended_df, # result_df
                 user_col, # user_id
                 item_col # business_id
                 ):
    # Join ground truth and recommended items on user ID
    joined_df = ground_truth_df.join(recommended_df, on=user_col)

    # Filter to count hits (correct recommendations) for each user
    hits_df = joined_df.filter(func.col("recommended_" + item_col) == func.col(item_col)) \
                       .select(user_col).distinct()

    # Count the distinct users with at least one correct recommendation
    users_with_hits = hits_df.select(func.countDistinct(user_col)).collect()[0][0]

    # Count the total number of distinct users for whom recommendations were made
    total_users = recommended_df.select(func.countDistinct(user_col)).collect()[0][0]

    # Calculate total hit rate
    total_hit_rate = users_with_hits / total_users

    return total_hit_rate

def get_recall(ground_truth_df,
               recommended_df,
               user_col,
               item_col):
    # Join ground truth and recommended items on user ID
    joined_df = ground_truth_df.join(recommended_df, on=user_col)

    # Count the number of relevant recommended items and total relevant items for each user
    relevant_recommended_items = joined_df.filter(func.col("recommended_"+item_col) == func.col(item_col))
    total_relevant_items = ground_truth_df.groupBy(user_col).count()

    # Compute recall for each user
    recall_df = relevant_recommended_items.groupBy(user_col).count().withColumnRenamed("count", "relevant_count") \
        .join(total_relevant_items, on=user_col)
    recall_df = recall_df.withColumn("recall", func.col("relevant_count") / func.col("count"))

    # Average recall across all users
    average_recall = recall_df.selectExpr("avg(recall)").collect()[0][0]

    return average_recall

In [30]:
df_train, df_test = review_text_processing(
                        df_train=train,
                        df_test=test,
                        rating_cut_off=4)  # only 4-star or above restaurant will be recommended

                                                                                

In [31]:
df_test.count()

7725

In [32]:
result_df = cosine_recommendation(train_transformed=df_train, test_transformed=df_test, sim_cut_off=0.7)

In [33]:
result_df.printSchema()

root
 |-- test_user_id: string (nullable = true)
 |-- features: vector (nullable = true)
 |-- test_norm_features: vector (nullable = true)
 |-- train_business_id: string (nullable = true)
 |-- train_name: string (nullable = true)
 |-- train_business_stars: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- train_norm_features: vector (nullable = true)
 |-- similarity: float (nullable = true)



In [34]:
# result_df.select("test_user_id_encode").distinct().count()

In [35]:
result_df.select('test_user_id', 'train_business_id', 'similarity').show(20)

24/04/27 02:03:35 WARN ExtractPythonUDFFromJoinCondition: The join condition:(cosine_similarity(test_norm_features#1222, train_norm_features#1206)#1314 > 0.7) of the join plan contains PythonUDF only, it will be moved out and the join plan will be turned to cross join.
[Stage 357:>                                                        (0 + 1) / 1]

+--------------------+--------------------+----------+
|        test_user_id|   train_business_id|similarity|
+--------------------+--------------------+----------+
|-Cw2rJx6v8gHgWOBX...|1vPSY4EA-fTRIZYz1...|0.73268765|
|-Cw2rJx6v8gHgWOBX...|2gFPQCmKKVi1aHgGT...| 0.7082998|
|-Cw2rJx6v8gHgWOBX...|4zQV6v8TwEYMwI9Ek...|0.71370053|
|-Cw2rJx6v8gHgWOBX...|5R3-eCIk4dRBtXo0A...| 0.7614566|
|-Cw2rJx6v8gHgWOBX...|9Q_dbzylYiWdF11lH...|0.70663893|
|-Cw2rJx6v8gHgWOBX...|CpBuc0aSueBpTXce3...|0.71562934|
|-Cw2rJx6v8gHgWOBX...|EsQZIf_5Typ-plWtO...|0.71205634|
|-Cw2rJx6v8gHgWOBX...|FIFi_8eNmc-jPHZVH...| 0.7016219|
|-Cw2rJx6v8gHgWOBX...|HFuAKE0uZ-frIMjro...|0.70307285|
|-Cw2rJx6v8gHgWOBX...|JVDHxMnKjif8XdXVF...|0.70583504|
|-Cw2rJx6v8gHgWOBX...|QJOC6Uz-RCpzPB6aM...|0.71362644|
|-Cw2rJx6v8gHgWOBX...|Uky0DD3LU4C7eyNDh...|0.77991164|
|-Cw2rJx6v8gHgWOBX...|VRnJgj0IvxoksHppE...|0.70383286|
|-Cw2rJx6v8gHgWOBX...|Wfg85U89qt1OA7aqw...|0.72462535|
|-Cw2rJx6v8gHgWOBX...|Yu_QofgDAjn__QsMi...|0.72924453|
|-Cw2rJx6v

                                                                                

In [36]:
7247/14266

0.507991027618113

In [37]:
result_df.printSchema()

root
 |-- test_user_id: string (nullable = true)
 |-- features: vector (nullable = true)
 |-- test_norm_features: vector (nullable = true)
 |-- train_business_id: string (nullable = true)
 |-- train_name: string (nullable = true)
 |-- train_business_stars: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- train_norm_features: vector (nullable = true)
 |-- similarity: float (nullable = true)



In [39]:
# recall calculation

# find distinct user-restaurant pair (total restaurants user has visit)
total_visits_pair = test.select('user_id', 'business_id').distinct()
# recommend user-restaurant pair
rec_visits_pair = result_df.select('test_user_id', 'train_business_id').distinct()

# recommend user-restaurant pair in test user-restaurant pair
actual_vist = total_visits_pair.join(
                rec_visits_pair,
                (total_visits_pair['user_id'] == rec_visits_pair['test_user_id']) &
                (total_visits_pair['business_id']==rec_visits_pair['train_business_id']),
                how='inner'
            )

total_visits = total_visits_pair.count()
recall = actual_vist.count() / total_visits

print(recall)

24/04/27 02:05:19 WARN ExtractPythonUDFFromJoinCondition: The join condition:(cosine_similarity(test_norm_features#1222, train_norm_features#1206)#1314 > 0.7) of the join plan contains PythonUDF only, it will be moved out and the join plan will be turned to cross join.
[Stage 439:====>                                                  (1 + 12) / 13]

0.5453596746914247


                                                                                