In [1]:
from pyspark.ml.recommendation import ALS
from pyspark.sql import SparkSession
from pyspark.sql import Row
import pandas as pd
import os
import sys
import pickle

In [2]:
# Point both the PySpark workers and the driver at the interpreter running
# this kernel, so the two sides use the same Python installation.
os.environ.update({
    'PYSPARK_PYTHON': sys.executable,
    'PYSPARK_DRIVER_PYTHON': sys.executable,
})

In [3]:
# Dynamic path settings
# "__file__" is a plain string literal here, so abspath() resolves it against
# the current working directory. Stripping that last component yields the
# working directory itself; BASE_DIR is its parent.
working_dir = os.path.dirname(os.path.abspath("__file__"))
BASE_DIR = os.path.dirname(working_dir)
# Folder expected to hold the train/test CSV splits.
TRAIN_TEST_SPLIT_DIR = os.path.join(BASE_DIR, "train_test_split")

In [4]:
# Spark settings for the local session, applied to the builder in this order.
# NOTE(review): "spark.local.dir" is a hard-coded Windows-style path; confirm
# it exists / is appropriate on every machine that runs this notebook.
spark_settings = {
    "spark.executor.memory": "4g",
    "spark.driver.memory": "4g",
    "spark.python.worker.reuse": "false",
    "spark.local.dir": "C:/tmp/spark-temp",
    "spark.network.timeout": "600s",
    "spark.executor.heartbeatInterval": "60s",
    "spark.ui.port": "4040",
}

# Local-mode session using every available core ("local[*]").
session_builder = SparkSession.builder.appName("RecommenderSystem").master("local[*]")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

# Reuses an already-running session if one exists, otherwise starts a new one.
spark = session_builder.getOrCreate()

In [5]:
# Load training data
# The training split is read into pandas first; it is converted to a Spark
# DataFrame in a later cell.
train_sales_path = os.path.join(TRAIN_TEST_SPLIT_DIR, "train_sales_data.csv")
train_df = pd.read_csv(filepath_or_buffer=train_sales_path)

In [6]:
# Ensure there are data rows in training data
# Fail fast right after loading rather than letting later cells work on
# an empty frame.
training_is_empty = train_df.empty
if training_is_empty:
    raise ValueError("Training DataFrame is empty.")

In [7]:
# Convert interaction_type column to numeric
# Values that cannot be parsed become NaN (errors='coerce') and the rows
# holding them are then dropped from train_df in place.
rating_column = 'interaction_type'
train_df[rating_column] = pd.to_numeric(train_df[rating_column], errors='coerce')
train_df.dropna(subset=[rating_column], inplace=True)

In [8]:
# Check if the dataframe is still empty after conversion and dropping NA values
# The numeric coercion above may have removed every row, so re-validate.
nothing_left_after_cleaning = train_df.empty
if nothing_left_after_cleaning:
    raise ValueError(
        "Training DataFrame is empty after converting 'interaction_type' to numeric and dropping NA values."
    )

In [9]:
# Show the first rows of the cleaned training frame as a quick sanity check.
train_preview = train_df.head()
print("Train DataFrame:\n", train_preview, "\n")

Train DataFrame:
    user id                        product id Interaction type   
0      1.0  4c69b61db1fc16e7013b43fc926e502d         purchase  \
1      2.0  66d49bbed043f5be260fa9f7fbff5957             view   
2      3.0  2c55cae269aebf53838484b0d7dd931a             like   
3      4.0  18018b6bc416dab347b1b7db79994afa             view   
4      5.0  e04b990e95bf73bbe6a3fa09785d7cd0             like   

            Time stamp  Unnamed: 4  user_id  product_id  interaction_type  
0  2023-10-10 08:00:00         NaN        0         905                 3  
1  2023-10-11 08:00:00         NaN        1        1178                 1  
2  2023-10-12 08:00:00         NaN        2         517                 2  
3  2023-10-13 08:00:00         NaN        3         276                 1  
4  2023-10-14 08:00:00         NaN        4        2638                 2   



In [10]:
# Convert training data to Spark DataFrame
# Only the three columns ALS consumes are carried over. itertuples() walks
# plain tuples instead of building a pandas Series for every row (as
# iterrows() does), which is considerably cheaper on large frames.
# Values are cast to built-in int/float so Spark infers the column types
# from native Python objects.
als_columns = ["user_id", "product_id", "interaction_type"]
train_data = [
    Row(user_id=int(user_id), product_id=int(product_id), interaction_type=float(interaction_type))
    for user_id, product_id, interaction_type in train_df[als_columns].itertuples(index=False, name=None)
]
train_df_spark = spark.createDataFrame(train_data)

In [11]:
# ALS modeling
# Collaborative-filtering estimator over (user_id, product_id) pairs with
# interaction_type as the rating signal.
# NOTE(review): in the preview output interaction_type looks like an event
# code (view=1, like=2, purchase=3); confirm that treating it as an explicit
# rating is intended, otherwise consider implicitPrefs=True.
als = ALS(
    maxIter=10,
    regParam=0.1,
    userCol="user_id",
    itemCol="product_id",
    ratingCol="interaction_type",
    # Drop rows whose prediction is NaN (users/items unseen at fit time).
    coldStartStrategy="drop",
    # Fixed seed so the random factor initialisation does not vary from one
    # kernel run to the next.
    seed=42
)

In [12]:
# Training the model
# Fits the ALS estimator configured above on the Spark training frame and
# keeps the fitted model for saving below.
model = als.fit(dataset=train_df_spark)

In [13]:
# Saving the trained model using Spark's ML writer
model_path = os.path.join(BASE_DIR, "als_model")
# A plain model.save() raises when model_path already exists, which makes this
# cell fail on every re-run of the notebook. overwrite() replaces the
# previously saved model instead.
model.write().overwrite().save(model_path)

print(f"Model saved to {model_path}")

Model saved to C:\Users\kaank\Desktop\Test\KG-Enhanced-Recommender\als_model
