In [1]:
import os
import sys

import pandas as pd
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALSModel
from pyspark.sql import Row
from pyspark.sql import SparkSession

In [2]:
# Make PySpark's driver and worker processes use the same interpreter that is
# running this kernel, so both sides see the same Python environment.
os.environ.update(
    PYSPARK_PYTHON=sys.executable,
    PYSPARK_DRIVER_PYTHON=sys.executable,
)

In [3]:
# Dynamic path settings.
# Paths are resolved relative to the kernel's current working directory:
# BASE_DIR is its parent, and the train/test CSVs live in a sibling
# "train_test_split" directory under BASE_DIR.
notebook_dir = os.path.abspath(os.curdir)
BASE_DIR = os.path.dirname(notebook_dir)
TRAIN_TEST_SPLIT_DIR = os.path.join(BASE_DIR, "train_test_split")

In [4]:
# Create (or reuse) the Spark session used by every cell below.
# Settings are passed through exactly as listed; the long network timeout and
# heartbeat interval are generous values for slow local runs.
spark = (
    SparkSession.builder
    .appName("RecommenderSystem")
    .master("local[*]")
    # Memory / CPU sizing
    .config("spark.executor.memory", "8g")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.cores", "4")
    .config("spark.driver.cores", "4")
    # Python worker handling and scratch space
    .config("spark.python.worker.reuse", "false")
    .config("spark.local.dir", "/tmp/spark-temp")
    # Timeouts
    .config("spark.network.timeout", "600s")
    .config("spark.executor.heartbeatInterval", "60s")
    .getOrCreate()
)

In [5]:
# Load the train and test interaction tables from the split directory.
# Training split
train_sales_path = os.path.join(TRAIN_TEST_SPLIT_DIR, "train_sales_data.csv")
train_df = pd.read_csv(train_sales_path)

# Test split
test_sales_path = os.path.join(TRAIN_TEST_SPLIT_DIR, "test_sales_data.csv")
test_df = pd.read_csv(test_sales_path)

In [6]:
# Sanity check: the test split must share at least one user and one product
# with the training split.
common_user_ids = set(train_df['user_id']) & set(test_df['user_id'])
common_product_ids = set(train_df['product_id']) & set(test_df['product_id'])

print(f"Common user IDs: {len(common_user_ids)}")
print(f"Common product IDs: {len(common_product_ids)}")

# An empty set is falsy, so this fires when either overlap is empty.
if not (common_user_ids and common_product_ids):
    raise ValueError("No common user IDs or product IDs between training and test sets.")

Common user IDs: 1148
Common product IDs: 1148


In [7]:
# Stop early if the test split has no rows; otherwise preview its first rows.
if test_df.empty:
    raise ValueError("Test DataFrame is empty.")

test_preview = test_df.head()
print("Test DataFrame:\n", test_preview, "\n")

Test DataFrame:
    user id                        product id Interaction type   
0      3.0  2c55cae269aebf53838484b0d7dd931a             like  \
1      7.0  40d3cd16b41970ae6872e914aecf2c8e         purchase   
2      8.0  bc178f33a04dbccefa95b165f8b56830             view   
3      9.0  cc2083338a16c3fe2f7895289d2e98fe             like   
4     14.0  82c86a4d24dce5e14303033d7b658b78             view   

            Time stamp  Unnamed: 4  user_id  product_id  interaction_type  
0  2023-10-12 08:00:00         NaN        2         488                 2  
1  2023-10-16 08:00:00         NaN        6         719                 3  
2  2023-10-17 08:00:00         NaN        7        2138                 1  
3  2023-10-18 08:00:00         NaN        8        2300                 2  
4  2023-10-23 08:00:00         NaN       13        1438                 1   



In [8]:
# Coerce interaction_type to a numeric dtype; values that cannot be parsed
# become NaN and the corresponding rows are then discarded.
numeric_interactions = pd.to_numeric(test_df['interaction_type'], errors='coerce')
test_df['interaction_type'] = numeric_interactions
test_df = test_df.dropna(subset=['interaction_type'])

In [9]:
# Guard: the numeric coercion above may have removed every row.
if test_df.empty:
    raise ValueError(
        "Test DataFrame is empty after converting 'interaction_type' to numeric and dropping NA values."
    )

In [10]:
# Convert test data to Spark DataFrame.
# Only the three columns the model needs are carried over. The columns are
# walked in parallel with zip() instead of DataFrame.iterrows(): iterrows()
# builds a Series for every row and does not preserve per-column dtypes,
# whereas zip() reads each value straight from its own column.
# int()/float() normalise the values to plain Python scalars for the Row fields.
test_data = [
    Row(user_id=int(user_id), product_id=int(product_id), interaction_type=float(interaction))
    for user_id, product_id, interaction in zip(
        test_df['user_id'], test_df['product_id'], test_df['interaction_type']
    )
]
test_df_spark = spark.createDataFrame(test_data)

In [11]:
# Show the first converted row to confirm the Spark schema and values.
first_spark_row = test_df_spark.head()
print("Test DataFrame:\n", first_spark_row, "\n")

Test DataFrame:
 Row(user_id=2, product_id=488, interaction_type=2.0) 



In [12]:
# Load the trained model using Spark's load method
# The saved ALS model is read from the "als_model" directory under BASE_DIR.
model_path = os.path.join(BASE_DIR, "als_model")
model = ALSModel.load(model_path)

In [13]:
# Making predictions on test data
# The result keeps the input columns and adds a `prediction` column
# (see the table printed by the show() cell below).
# NOTE(review): if the saved model uses ALS's "nan" cold-start strategy, users or
# products unseen during training would get NaN predictions — confirm how the
# model was configured before trusting aggregate metrics.
predictions = model.transform(test_df_spark)

In [14]:
# head(1) returns a list holding at most one Row; an empty list means the
# model produced no prediction rows at all.
if not predictions.head(1):
    raise ValueError("Predictions DataFrame is empty after model.transform(). Ensure your test data has sufficient entries.")

In [15]:
# Print the leading rows of the predictions table for a visual sanity check
# of predicted vs. observed interaction_type.
predictions.show()

+-------+----------+----------------+----------+
|user_id|product_id|interaction_type|prediction|
+-------+----------+----------------+----------+
|      2|       488|             2.0| 1.9074912|
|      6|       719|             3.0| 2.9219954|
|      7|      2138|             1.0| 0.8998625|
|      8|      2300|             2.0| 1.9074911|
|     13|      1438|             1.0|0.89986247|
|     15|      2362|             2.0| 1.9074912|
|     18|       841|             2.0|  1.907491|
|     20|      1900|             3.0| 2.9219952|
|     23|      1472|             1.0| 0.8998625|
|     25|      1201|             2.0|  1.907491|
|     26|       556|             3.0| 2.9219952|
|     28|      2854|             2.0| 1.9074912|
|     33|       411|             1.0| 0.8998625|
|     37|       812|             1.0| 0.8998625|
|     39|      2206|             3.0|  2.921995|
|     41|      1824|             1.0| 0.8998625|
|     48|       882|             2.0|  1.907491|
|     49|      1887|

In [16]:
# Evaluation metrics
# Score the predictions against the observed interaction_type with RMSE.
# RegressionEvaluator is imported in the notebook's top import cell so that
# all dependencies are declared in one place.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="interaction_type",
    predictionCol="prediction",
)
rmse = evaluator.evaluate(predictions)
print(f"Root-mean-square error = {rmse}")

Root-mean-square error = 0.09117348417152639


In [17]:
# Getting recommendations for users: ask the model for the top-N products per
# user and print the first rows of the result.
TOP_N_RECOMMENDATIONS = 10
user_recs = model.recommendForAllUsers(TOP_N_RECOMMENDATIONS)
user_recs.show()

+-------+--------------------+
|user_id|     recommendations|
+-------+--------------------+
|      0|[{859, 2.9219954}...|
|      1|[{1107, 1.4843318...|
|      2|[{889, 2.0025046}...|
|      3|[{826, 1.4612561}...|
|      4|[{1937, 2.1978188...|
|      5|[{2199, 2.1041553...|
|      6|[{719, 2.9219954}...|
|      7|[{2018, 1.5148323...|
|      8|[{2300, 1.9074911...|
|      9|[{1160, 2.9219952...|
|     10|[{1349, 2.921995}...|
|     11|[{1716, 1.3836987...|
|     12|[{2243, 2.032041}...|
|     13|[{60, 1.563861}, ...|
|     14|[{1205, 1.9274771...|
|     15|[{181, 2.2224793}...|
|     16|[{2431, 2.9219952...|
|     17|[{2723, 1.6130643...|
|     18|[{1034, 2.0166566...|
|     19|[{2850, 2.9219952...|
+-------+--------------------+
only showing top 20 rows

