In [1]:
from pyspark.ml.recommendation import ALS
from pyspark.sql import SparkSession
from pyspark.sql import Row
import pandas as pd
import os
import sys


In [2]:
# Point both the PySpark driver and its Python workers at the interpreter
# that is running this kernel.
os.environ.update({
    'PYSPARK_PYTHON': sys.executable,
    'PYSPARK_DRIVER_PYTHON': sys.executable,
})

In [3]:
# Project directories, resolved relative to the current working directory.
# NOTE: "__file__" is a plain string here, not the __file__ variable, so
# abspath() anchors it at the working directory; stripping two path
# components therefore yields the parent of the working directory.
_placeholder_path = os.path.abspath("__file__")
_working_dir = os.path.dirname(_placeholder_path)
BASE_DIR = os.path.dirname(_working_dir)
TRAIN_TEST_SPLIT_DIR = os.path.join(BASE_DIR, "train_test_split")

In [4]:
# Settings applied to the Spark session, keyed by Spark configuration name.
spark_settings = {
    "spark.executor.memory": "4g",
    "spark.driver.memory": "4g",
    "spark.python.worker.reuse": "false",
    "spark.local.dir": "C:/tmp/spark-temp",
    "spark.network.timeout": "600s",
    "spark.executor.heartbeatInterval": "60s",
    "spark.ui.port": "4040",
}

# Build (or reuse) a session named "RecommenderSystem" with master "local[*]".
session_builder = SparkSession.builder.appName("RecommenderSystem").master("local[*]")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)
spark = session_builder.getOrCreate()

In [5]:
# Read the training split CSV into a pandas DataFrame.
train_sales_filename = "train_sales_data.csv"
train_sales_path = os.path.join(TRAIN_TEST_SPLIT_DIR, train_sales_filename)
train_df = pd.read_csv(train_sales_path)

In [6]:
# Stop immediately if the loaded training frame has no data to train on.
training_frame_is_empty = train_df.empty
if training_frame_is_empty:
    raise ValueError("Training DataFrame is empty.")

In [7]:
# Coerce interaction_type to numeric values; entries that cannot be parsed
# become NaN (errors='coerce') and the rows holding them are then removed.
interaction_col = 'interaction_type'
numeric_interactions = pd.to_numeric(train_df[interaction_col], errors='coerce')
train_df[interaction_col] = numeric_interactions
train_df.dropna(subset=[interaction_col], inplace=True)

In [8]:
# The coercion and NaN drop may have removed every row, so check again
# before handing the data to Spark.
if train_df.empty:
    empty_after_cleaning_msg = "Training DataFrame is empty after converting 'interaction_type' to numeric and dropping NA values."
    raise ValueError(empty_after_cleaning_msg)

In [9]:
# Convert training data to Spark DataFrame
# Only the three columns ALS needs are carried over. itertuples() is used
# instead of iterrows() because iterrows() packs each row into a single-dtype
# Series: integer IDs sitting next to a float column would be upcast to float
# (losing precision for very large IDs), and it is also much slower.
als_columns = ["user_id", "product_id", "interaction_type"]
train_data = [
    Row(
        user_id=int(record.user_id),
        product_id=int(record.product_id),
        interaction_type=float(record.interaction_type),
    )
    for record in train_df[als_columns].itertuples(index=False)
]
train_df_spark = spark.createDataFrame(train_data)

In [10]:
# ALS modeling
# Configure the ALS estimator to read users, items and ratings from the
# user_id, product_id and interaction_type columns respectively.
als = ALS(
    maxIter=10,
    regParam=0.1,
    userCol="user_id",
    itemCol="product_id",
    ratingCol="interaction_type",
    coldStartStrategy="drop",
    # Fixed seed so that re-running the notebook on the same data uses the
    # same random initialisation; without it results can differ between
    # kernel sessions.
    seed=42,
)

In [11]:
# Training the model
# Any failure is re-raised after logging so that later cells never run with
# an undefined `model`. Errors coming from the JVM are recognised by their
# `java_exception` attribute, so no extra import is needed here (the original
# handler named Py4JJavaError without importing it, which turned every
# training failure into a NameError).
try:
    model = als.fit(train_df_spark)
except Exception as e:
    java_exception = getattr(e, "java_exception", None)
    if java_exception is not None:
        # Print the error message reported by the JVM
        print(f"An error occurred: {java_exception.getMessage()}")
        # Print the full details of the error
        print(e)
    raise

In [12]:
# Print the fitted model's string representation as a quick check that training produced a model.
print(model)

ALSModel: uid=ALS_462b795855fd, rank=10


In [13]:
# Saving the trained model
model_path = os.path.join(BASE_DIR, "als_model")
# A plain model.save() raises when model_path already exists, which breaks
# every re-run of the notebook; write().overwrite() replaces the previous
# saved model instead.
model.write().overwrite().save(model_path)