In [52]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import when,col,sum

In [46]:
# Create or get your Spark session
spark = SparkSession.builder \
    .appName("Recommendation") \
    .config("spark.executor.memory", "16g") \
    .config("spark.driver.memory", "16g") \
    .config("spark.executor.cores", "8") \
    .config("spark.default.parallelism", "8") \
    .config("spark.sql.shuffle.partitions", "8") \
    .config("spark.memory.fraction", "0.8") \
    .config("spark.memory.storageFraction", "0.3") \
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
    .config("spark.kryoserializer.buffer.max", "1024m") \
    .getOrCreate()

In [48]:
# Load the CSV file into a DataFrame
df = spark.read.option("header", "true").csv("new_cleaned_data.csv")

In [49]:
df.printSchema()

root
 |-- Customer_ID: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Income: string (nullable = true)
 |-- Customer_Segment: string (nullable = true)
 |-- Year: string (nullable = true)
 |-- Month: string (nullable = true)
 |-- Total_Purchases: string (nullable = true)
 |-- Product_Category: string (nullable = true)
 |-- Product_Brand: string (nullable = true)
 |-- Product_Type: string (nullable = true)
 |-- Feedback: string (nullable = true)
 |-- Shipping_Method: string (nullable = true)
 |-- Payment_Method: string (nullable = true)
 |-- Ratings: string (nullable = true)
 |-- products: string (nullable = true)
 |-- Transaction_ID: string (nullable = true)



In [None]:
# List of string columns to index (modify based on your actual use case)
string_columns = [
    "Customer_ID", 
    "Country", 
    "Gender", 
    "Customer_Segment",
    "Product_Category", 
    "Product_Brand", 
    "Product_Type",
    "Shipping_Method", 
    "Payment_Method",
    "products", 
    "Transaction_ID"
]

# Create StringIndexer stages for each column
indexers = [
    StringIndexer(inputCol=col, outputCol=col + "_index", handleInvalid="keep")
    for col in string_columns
]


# Create and fit the Pipeline
pipeline = Pipeline(stages=indexers)
indexed_df = pipeline.fit(df).transform(df)

# Ürün ID → ürün adı eşlemesi
product_id_to_name = (
    indexed_df
    .select("products_index", "products")
    .distinct()
    .rdd
    .map(lambda r: (int(r["products_index"]), r["products"]))
    .collectAsMap()
)


for col_name in string_columns:
    if col_name == "products":
        # Orijinal 'products' (string) kalsın, index’i yeni sütuna taşı
        indexed_df = indexed_df.withColumnRenamed("products_index", "product_id")
    else:
        indexed_df = indexed_df.drop(col_name) \
                               .withColumnRenamed(f"{col_name}_index", col_name)


indexed_df.show(5)

+----+------+------+-------+---------------+---------+-------+-----------+-------+------+----------------+----------------+-------------+------------+---------------+--------------+--------+--------------+
| Age|Income|  Year|  Month|Total_Purchases| Feedback|Ratings|Customer_ID|Country|Gender|Customer_Segment|Product_Category|Product_Brand|Product_Type|Shipping_Method|Payment_Method|products|Transaction_ID|
+----+------+------+-------+---------------+---------+-------+-----------+-------+------+----------------+----------------+-------------+------------+---------------+--------------+--------+--------------+
|26.0|   1.8|2023.0|   July|            3.0|     Good|    4.0|    22362.0|    0.0|   0.0|             0.0|             4.0|          7.0|        27.0|            1.0|           0.0|   245.0|      218625.0|
|26.0|   1.8|2023.0|    May|            7.0|Excellent|    5.0|    22362.0|    0.0|   0.0|             0.0|             1.0|          1.0|         0.0|            0.0|          

25/04/17 00:08:07 WARN DAGScheduler: Broadcasting large task binary with size 10.6 MiB


In [53]:
indexed_df = indexed_df.withColumn(
    "Feedback_Rating",
    when(col("Feedback") == "Bad", 1.0)
    .when(col("Feedback") == "Average", 2.0)
    .when(col("Feedback") == "Good", 3.0)
    .when(col("Feedback") == "Excellent", 4.0)
    .otherwise(None)
)

indexed_df = indexed_df.drop("Feedback")
indexed_df.show(5)


+----+------+------+-------+---------------+-------+-----------+-------+------+----------------+----------------+-------------+------------+---------------+--------------+--------+--------------+---------------+
| Age|Income|  Year|  Month|Total_Purchases|Ratings|Customer_ID|Country|Gender|Customer_Segment|Product_Category|Product_Brand|Product_Type|Shipping_Method|Payment_Method|products|Transaction_ID|Feedback_Rating|
+----+------+------+-------+---------------+-------+-----------+-------+------+----------------+----------------+-------------+------------+---------------+--------------+--------+--------------+---------------+
|26.0|   1.8|2023.0|   July|            3.0|    4.0|    22362.0|    0.0|   0.0|             0.0|             4.0|          7.0|        27.0|            1.0|           0.0|   245.0|      218625.0|            3.0|
|26.0|   1.8|2023.0|    May|            7.0|    5.0|    22362.0|    0.0|   0.0|             0.0|             1.0|          1.0|         0.0|            

25/04/17 00:08:15 WARN DAGScheduler: Broadcasting large task binary with size 10.6 MiB


In [55]:
indexed_df = indexed_df.withColumn(
    "Month_Index",
    when((col("Month") == "January") | (col("Month") == "1.0"), 1)
    .when((col("Month") == "February") | (col("Month") == "2.0"), 2)
    .when((col("Month") == "March") | (col("Month") == "3.0"), 3)
    .when((col("Month") == "April") | (col("Month") == "4.0"), 4)
    .when((col("Month") == "May") | (col("Month") == "5.0"), 5)
    .when((col("Month") == "June") | (col("Month") == "6.0"), 6)
    .when((col("Month") == "July") | (col("Month") == "7.0"), 7)
    .when((col("Month") == "August") | (col("Month") == "8.0"), 8)
    .when((col("Month") == "September") | (col("Month") == "9.0"), 9)
    .when((col("Month") == "October") | (col("Month") == "10.0"), 10)
    .when((col("Month") == "November") | (col("Month") == "11.0"), 11)
    .when((col("Month") == "December") | (col("Month") == "12.0"), 12)
    .otherwise(0.0)
)
indexed_df = indexed_df.drop("Month")


In [56]:
null_counts = indexed_df.select([sum(col(c).isNull().cast("int")).alias(c) for c in indexed_df.columns])
null_counts.show()

+---+------+----+---------------+-------+-----------+-------+------+----------------+----------------+-------------+------------+---------------+--------------+--------+--------------+---------------+-----------+
|Age|Income|Year|Total_Purchases|Ratings|Customer_ID|Country|Gender|Customer_Segment|Product_Category|Product_Brand|Product_Type|Shipping_Method|Payment_Method|products|Transaction_ID|Feedback_Rating|Month_Index|
+---+------+----+---------------+-------+-----------+-------+------+----------------+----------------+-------------+------------+---------------+--------------+--------+--------------+---------------+-----------+
|  0|     0|   0|              0|      0|          0|      0|     0|               0|               0|            0|           0|              0|             0|       0|             0|              0|          0|
+---+------+----+---------------+-------+-----------+-------+------+----------------+----------------+-------------+------------+---------------+---

In [57]:
# Cast numerical columns
indexed_df = indexed_df.withColumn("Ratings", col("Ratings").cast("Double"))
indexed_df = indexed_df.withColumn("Age", col("Age").cast("Int"))
indexed_df = indexed_df.withColumn("Year", col("Year").cast("Int"))
indexed_df = indexed_df.withColumn("Income", col("Income").cast("Double"))
indexed_df = indexed_df.withColumn("Customer_ID", col("Customer_ID").cast("int"))
indexed_df = indexed_df.withColumn("product_id", col("product_id").cast("int"))

# Weight parameter
alpha = 0.5

# Create a new column "Combined_Rating" as a weighted average of Ratings and Feedback_rating.
indexed_df = indexed_df.withColumn(
    "Combined_Rating",
    alpha * col("Ratings") + (1 - alpha) * col("Feedback_Rating")
)

indexed_df.select("Ratings", "Feedback_Rating", "Combined_Rating").show(5)


+-------+---------------+---------------+
|Ratings|Feedback_Rating|Combined_Rating|
+-------+---------------+---------------+
|    4.0|            3.0|            3.5|
|    5.0|            4.0|            4.5|
|    3.0|            3.0|            3.0|
|    2.0|            2.0|            2.0|
|    5.0|            4.0|            4.5|
+-------+---------------+---------------+
only showing top 5 rows



In [58]:
reordered_cols = [
    "Customer_ID", 
    "Age", 
    "Gender", 
    "Country", 
    "Income", 
    "Customer_Segment",
    "Transaction_ID", 
    "Year", 
    "Month_Index", 
    "Payment_Method", 
    "Shipping_Method", 
    "Total_Purchases", 
    "Products",
    "Product_Category", 
    "Product_Brand", 
    "Product_Type",
    "Ratings", 
    "Feedback_Rating",
    "Combined_Rating"
]

# Reorder the columns in the DataFrame
indexed_df = indexed_df.select(reordered_cols)

# Show the result
indexed_df.show(5)

+-----------+---+------+-------+------+----------------+--------------+----+-----------+--------------+---------------+---------------+--------+----------------+-------------+------------+-------+---------------+---------------+
|Customer_ID|Age|Gender|Country|Income|Customer_Segment|Transaction_ID|Year|Month_Index|Payment_Method|Shipping_Method|Total_Purchases|Products|Product_Category|Product_Brand|Product_Type|Ratings|Feedback_Rating|Combined_Rating|
+-----------+---+------+-------+------+----------------+--------------+----+-----------+--------------+---------------+---------------+--------+----------------+-------------+------------+-------+---------------+---------------+
|      22362| 26|   0.0|    0.0|   1.8|             0.0|      218625.0|2023|        7.0|           0.0|            1.0|            3.0|     245|             4.0|          7.0|        27.0|    4.0|            3.0|            3.5|
|      22362| 26|   0.0|    0.0|   1.8|             0.0|      145963.0|2023|        

25/04/17 00:08:19 WARN DAGScheduler: Broadcasting large task binary with size 10.6 MiB


In [59]:
null_counts = df.select([sum(col(c).isNull().cast("int")).alias(c) for c in df.columns])
null_counts.show()

+-----------+-------+---+------+------+----------------+----+-----+---------------+----------------+-------------+------------+--------+---------------+--------------+-------+--------+--------------+
|Customer_ID|Country|Age|Gender|Income|Customer_Segment|Year|Month|Total_Purchases|Product_Category|Product_Brand|Product_Type|Feedback|Shipping_Method|Payment_Method|Ratings|products|Transaction_ID|
+-----------+-------+---+------+------+----------------+----+-----+---------------+----------------+-------------+------------+--------+---------------+--------------+-------+--------+--------------+
|          0|      0|  0|     0|     0|               0|   0|    0|              0|               0|            0|           0|       0|              0|             0|      0|       0|             0|
+-----------+-------+---+------+------+----------------+----+-----+---------------+----------------+-------------+------------+--------+---------------+--------------+-------+--------+--------------+


### Recommendation Phase 1

1. Generate Recommendations with Ratings

In [60]:
from pyspark.ml.evaluation import RegressionEvaluator

In [61]:
train, test = indexed_df.randomSplit([0.8, 0.2], seed=42)

# Initialize the ALS (Alternating Least Squares) model.
als = ALS(
    userCol="Customer_ID",  
    itemCol="Products",      
    ratingCol="Ratings",     
    rank=10,                  
    maxIter=10,               
    regParam=0.1,             
    coldStartStrategy="drop"  
)

In [62]:
model = als.fit(train)
predictions = model.transform(test)
userRecs = model.recommendForAllUsers(5)
userRecs.show(5, truncate=False)

25/04/17 00:08:20 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/17 00:08:20 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/17 00:08:21 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/17 00:08:22 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/17 00:08:22 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/17 00:08:23 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/17 00:08:24 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/17 00:08:24 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/17 00:08:25 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/17 00:08:26 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/17 00:08:26 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/17 00:08:27 WARN DAGScheduler: Broadc

+-----------+-----------------------------------------------------------------------------------------+
|Customer_ID|recommendations                                                                          |
+-----------+-----------------------------------------------------------------------------------------+
|12         |[{103, 4.7693915}, {281, 4.5628023}, {178, 4.5484624}, {201, 4.539313}, {210, 4.4875326}]|
|13         |[{12, 4.475977}, {84, 4.3350635}, {2, 4.3105946}, {276, 4.3086677}, {53, 4.2081847}]     |
|14         |[{120, 4.877975}, {170, 4.6793165}, {219, 4.673743}, {186, 4.604262}, {298, 4.5730214}]  |
|18         |[{203, 3.9263365}, {99, 3.922779}, {283, 3.8806248}, {174, 3.6710885}, {27, 3.6678486}]  |
|38         |[{317, 4.743094}, {313, 4.5495}, {154, 4.435698}, {45, 4.273876}, {116, 4.241738}]       |
+-----------+-----------------------------------------------------------------------------------------+
only showing top 5 rows



                                                                                

In [63]:
evaluator = RegressionEvaluator(
    metricName="rmse",            
    labelCol="Ratings",     
    predictionCol="prediction"    
)
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error of 1 (RMSE):", rmse)

25/04/17 00:08:45 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/17 00:08:46 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/17 00:08:46 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/17 00:08:47 WARN DAGScheduler: Broadcasting large task binary with size 12.6 MiB


Root-mean-square error of 1 (RMSE): 2.059274986101763


2. Generate Recommendation with Feedbacks

In [64]:
#spark.stop()

als_fb = ALS(
    userCol="Customer_ID",   
    itemCol="Products",      
    ratingCol="Feedback_Rating",   
    rank=10,                       
    maxIter=10,                    
    regParam=0.1,                  
    coldStartStrategy="drop"       
)

als_model_fb = als_fb.fit(train)
userRecs = als_model_fb.recommendForAllUsers(5)
userRecs.show(5, truncate=False)

25/04/17 00:08:48 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/17 00:08:49 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/17 00:08:50 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/17 00:08:51 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/17 00:08:52 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/17 00:08:52 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/17 00:08:53 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/17 00:08:54 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/17 00:08:54 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/17 00:08:55 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/17 00:08:55 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/17 00:08:56 WARN DAGScheduler: Broadc

+-----------+----------------------------------------------------------------------------------------+
|Customer_ID|recommendations                                                                         |
+-----------+----------------------------------------------------------------------------------------+
|12         |[{178, 3.96663}, {84, 3.9010947}, {140, 3.8964798}, {124, 3.8913236}, {201, 3.8352864}] |
|13         |[{213, 4.0697246}, {12, 3.9589896}, {276, 3.9486487}, {84, 3.924839}, {46, 3.9024801}]  |
|14         |[{120, 4.214625}, {68, 3.903142}, {91, 3.859164}, {170, 3.7949846}, {317, 3.6938565}]   |
|18         |[{203, 3.3181999}, {85, 3.2657044}, {78, 3.2066848}, {82, 3.1336923}, {27, 3.0745924}]  |
|38         |[{317, 3.579932}, {313, 3.5220187}, {234, 3.5028577}, {32, 3.3099408}, {274, 3.3040137}]|
+-----------+----------------------------------------------------------------------------------------+
only showing top 5 rows



                                                                                

In [65]:
predictions = als_model_fb.transform(test)
evaluator = RegressionEvaluator(
    metricName="rmse",            
    labelCol="Feedback_Rating",     
    predictionCol="prediction"    
)
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error of 1 (RMSE):", rmse)

25/04/17 00:09:13 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/17 00:09:13 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/17 00:09:13 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
                                                                                

Root-mean-square error of 1 (RMSE): 1.6033352299479435


25/04/17 00:09:15 WARN DAGScheduler: Broadcasting large task binary with size 12.6 MiB


3. Generate Recommendations with Combined Ratings

In [66]:
als_combined = ALS(
    userCol="Customer_ID",   
    itemCol="Products",      
    ratingCol="Combined_Rating",   
    rank=10,                       
    maxIter=10,                    
    regParam=0.1,                  
    coldStartStrategy="drop"       
)

als_model_combined = als_combined.fit(train)
userRecs = als_model_combined.recommendForAllUsers(5)
userRecs.show(5, truncate=False)

predictions = als_model_combined.transform(test)
evaluator = RegressionEvaluator(
    metricName="rmse",            
    labelCol="Combined_Rating",     
    predictionCol="prediction"    
)
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error of 1 (RMSE):", rmse)

25/04/17 00:09:15 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/17 00:09:16 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/17 00:09:17 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/17 00:09:18 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/17 00:09:18 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/17 00:09:19 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/17 00:09:20 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/17 00:09:20 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/17 00:09:21 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/17 00:09:21 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/17 00:09:22 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/17 00:09:23 WARN DAGScheduler: Broadc

+-----------+----------------------------------------------------------------------------------------+
|Customer_ID|recommendations                                                                         |
+-----------+----------------------------------------------------------------------------------------+
|12         |[{103, 4.3394437}, {178, 4.3008456}, {201, 4.219249}, {281, 4.196459}, {53, 4.1775436}] |
|13         |[{12, 4.232196}, {276, 4.1471415}, {213, 4.1373878}, {84, 4.121249}, {46, 4.0797443}]   |
|14         |[{120, 4.560766}, {170, 4.2801867}, {68, 4.178044}, {219, 4.141289}, {91, 4.0751762}]   |
|18         |[{203, 3.6305933}, {99, 3.5072925}, {283, 3.4200575}, {27, 3.387203}, {78, 3.3618288}]  |
|38         |[{317, 4.162074}, {313, 4.0089445}, {234, 3.9333107}, {154, 3.8581977}, {45, 3.7797856}]|
+-----------+----------------------------------------------------------------------------------------+
only showing top 5 rows



25/04/17 00:09:40 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/17 00:09:41 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/17 00:09:42 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/17 00:09:44 WARN DAGScheduler: Broadcasting large task binary with size 12.6 MiB


Root-mean-square error of 1 (RMSE): 1.8089126377953484


### Recommendation Phase 2

1. User Profile Vector

In [67]:
from pyspark.ml.feature import VectorAssembler

distinct_users = indexed_df.select("Customer_ID", "Age", "Gender", "Country", "Income", "Customer_Segment").distinct()

user_profile_vector = VectorAssembler(
    inputCols=["Customer_ID","Age", "Gender", "Country", "Income", "Customer_Segment"],
    outputCol="user_profile_vector"
).transform(distinct_users)


user_profile_vector.select("Customer_ID", "user_profile_vector").show(5, truncate=False)

25/04/17 00:09:44 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB


+-----------+------------------------------+
|Customer_ID|user_profile_vector           |
+-----------+------------------------------+
|38664      |[38664.0,26.0,1.0,2.0,2.0,0.0]|
|58182      |[58182.0,38.0,0.0,0.0,1.0,0.0]|
|22370      |[22370.0,31.0,1.0,4.0,1.5,0.0]|
|4925       |[4925.0,20.0,0.0,3.0,2.2,1.0] |
|58188      |[58188.0,30.0,0.0,0.0,2.0,0.0]|
+-----------+------------------------------+
only showing top 5 rows



25/04/17 00:09:45 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB


2. Product Vector

In [68]:

product_vector = VectorAssembler(
    inputCols=["Transaction_ID","Product_Category", "Product_Brand", "Product_Type","Products"],
    outputCol="product_vector"
).transform(indexed_df)


product_vector.select("Products", "product_vector").show(5, truncate=False)

+--------+-----------------------------+
|Products|product_vector               |
+--------+-----------------------------+
|245     |[218625.0,4.0,7.0,27.0,245.0]|
|10      |[145963.0,1.0,1.0,0.0,10.0]  |
|279     |[49995.0,2.0,11.0,22.0,279.0]|
|111     |[210534.0,0.0,2.0,9.0,111.0] |
|182     |[13653.0,2.0,3.0,18.0,182.0] |
+--------+-----------------------------+
only showing top 5 rows



25/04/17 00:09:45 WARN DAGScheduler: Broadcasting large task binary with size 8.3 MiB


3. Transaction Vector  

In [69]:

transaction_vector = VectorAssembler(
    inputCols=["Transaction_ID","Customer_ID","Month_Index","Year","Combined_Rating"],
    outputCol="transaction_vector"
).transform(indexed_df)


transaction_vector.select("Transaction_ID", "transaction_vector").show(5, truncate=False)

+--------------+---------------------------------+
|Transaction_ID|transaction_vector               |
+--------------+---------------------------------+
|218625.0      |[218625.0,22362.0,7.0,2023.0,3.5]|
|145963.0      |[145963.0,22362.0,5.0,2023.0,4.5]|
|49995.0       |[49995.0,22362.0,1.0,2023.0,3.0] |
|210534.0      |[210534.0,22362.0,7.0,2023.0,2.0]|
|13653.0       |[13653.0,11310.0,10.0,2023.0,4.5]|
+--------------+---------------------------------+
only showing top 5 rows



25/04/17 00:09:46 WARN DAGScheduler: Broadcasting large task binary with size 10.6 MiB


In [70]:
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.linalg import Vectors
scaler_user = MinMaxScaler(inputCol="user_profile_vector", outputCol="user_profile_scaled")
scaler_model_user = scaler_user.fit(user_profile_vector)
user_profile_scaled = scaler_model_user.transform(user_profile_vector)

user_profile_scaled.select("Customer_ID", "user_profile_scaled").show(5, truncate=False)

25/04/17 00:09:46 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB
25/04/17 00:09:46 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB
25/04/17 00:09:46 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB
25/04/17 00:09:47 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB


+-----------+--------------------------------------------------------------------------+
|Customer_ID|user_profile_scaled                                                       |
+-----------+--------------------------------------------------------------------------+
|38664      |[0.4457253527621507,0.15384615384615385,1.0,0.5,0.5,0.0]                  |
|58182      |(6,[0,1],[0.6707322696670663,0.38461538461538464])                        |
|22370      |[0.2578852716037997,0.25,1.0,1.0,0.25,0.0]                                |
|4925       |[0.05677626118232961,0.038461538461538464,0.0,0.75,0.6000000000000001,0.5]|
|58188      |[0.6708014387162224,0.23076923076923078,0.0,0.0,0.5,0.0]                  |
+-----------+--------------------------------------------------------------------------+
only showing top 5 rows



25/04/17 00:09:47 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB


In [71]:
# Then apply scaling
scaler_product = MinMaxScaler(inputCol="product_vector", outputCol="product_vector_scaled")
scaler_model_product = scaler_product.fit(product_vector)
product_scaled = scaler_model_product.transform(product_vector)

product_scaled.select("Products", "product_vector_scaled").show(5, truncate=False)

25/04/17 00:09:47 WARN DAGScheduler: Broadcasting large task binary with size 9.7 MiB
25/04/17 00:09:48 WARN DAGScheduler: Broadcasting large task binary with size 9.7 MiB


+--------+-----------------------------------------------------------------------+
|Products|product_vector_scaled                                                  |
+--------+-----------------------------------------------------------------------+
|245     |[0.7259189162267159,0.8,0.38888888888888884,0.84375,0.7728706624605678]|
|10      |[0.48465318590829104,0.2,0.05555555555555555,0.0,0.031545741324921134] |
|279     |[0.16600258989939237,0.4,0.611111111111111,0.6875,0.8801261829652997]  |
|111     |[0.6990536906066341,0.0,0.1111111111111111,0.28125,0.3501577287066246] |
|182     |[0.04533320051797988,0.4,0.16666666666666666,0.5625,0.5741324921135647]|
+--------+-----------------------------------------------------------------------+
only showing top 5 rows



25/04/17 00:09:48 WARN DAGScheduler: Broadcasting large task binary with size 8.3 MiB


In [72]:
# Then apply scaling
scaler_transaction = MinMaxScaler(inputCol="transaction_vector", outputCol="transaction_vector_scaled")
scaler_model_transaction = scaler_transaction.fit(transaction_vector)
transaction_vector_scaled = scaler_model_transaction.transform(transaction_vector)

transaction_vector_scaled.select("Transaction_ID", "transaction_vector_scaled").show(5, truncate=False)

25/04/17 00:09:49 WARN DAGScheduler: Broadcasting large task binary with size 12.4 MiB
25/04/17 00:09:50 WARN DAGScheduler: Broadcasting large task binary with size 12.4 MiB
                                                                                

+--------------+---------------------------------------------------------------------------------+
|Transaction_ID|transaction_vector_scaled                                                        |
+--------------+---------------------------------------------------------------------------------+
|218625.0      |[0.7259189162267159,0.2577930462049248,0.5454545454545454,0.0,0.7142857142857142]|
|145963.0      |[0.48465318590829104,0.2577930462049248,0.36363636363636365,0.0,1.0]             |
|49995.0       |[0.16600258989939237,0.2577930462049248,0.0,0.0,0.5714285714285714]              |
|210534.0      |[0.6990536906066341,0.2577930462049248,0.5454545454545454,0.0,0.2857142857142857]|
|13653.0       |[0.04533320051797988,0.13038365765931936,0.8181818181818182,0.0,1.0]             |
+--------------+---------------------------------------------------------------------------------+
only showing top 5 rows



25/04/17 00:09:50 WARN DAGScheduler: Broadcasting large task binary with size 10.6 MiB


In [73]:
# Make sure all the necessary columns are selected
user_profile_scaled = user_profile_scaled.select("Customer_ID", "user_profile_scaled")
product_scaled = product_scaled.select("Transaction_ID", "Products", "product_vector_scaled")
transaction_scaled = transaction_vector_scaled.select(
    "Transaction_ID", "Customer_ID", "transaction_vector_scaled", "Combined_Rating"
)# Join user and transaction data on Customer_ID
user_transaction_joined = transaction_scaled.join(user_profile_scaled, on="Customer_ID", how="inner")

# Now join with product data on Transaction_ID
hybrid_joined = user_transaction_joined.join(product_scaled, on="Transaction_ID", how="inner")

hybrid_assembler = VectorAssembler(
    inputCols=["user_profile_scaled", "transaction_vector_scaled", "product_vector_scaled"],
    outputCol="hybrid_vector"
)

hybrid_df = hybrid_assembler.transform(hybrid_joined)
hybrid_df.select("Customer_ID", "Products", "hybrid_vector", "Combined_Rating").show(5, truncate=False)

25/04/17 00:09:50 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB
25/04/17 00:09:51 WARN DAGScheduler: Broadcasting large task binary with size 8.3 MiB
25/04/17 00:09:52 WARN DAGScheduler: Broadcasting large task binary with size 10.6 MiB
25/04/17 00:09:53 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB


+-----------+--------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------+
|Customer_ID|Products|hybrid_vector                                                                                                                                                                                                                 |Combined_Rating|
+-----------+--------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------+
|38664      |29      |[0.4457253527621507,0.15384615384615385,1.0,0.5,0.5,0.0,0.8020984825845868,0.4457253527621507,0.8181818181818182,0.0,1.0,0.8020984825845868,0.6000000000000001,0.5,0.0625,0.0914826498422713]   

In [74]:
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, DoubleType
from pyspark.ml.linalg import DenseVector, VectorUDT
import numpy as np

user_vectors = hybrid_df.select("Customer_ID", "hybrid_vector").distinct()

# 1. Önce vektörleri aynı product altında toplayalım
vector_list_df = hybrid_df.groupBy("Products").agg(
    F.first("Transaction_ID").alias("Transaction_ID"),
    F.first("Customer_ID").alias("Customer_ID"),
    F.collect_list("hybrid_vector").alias("hybrid_vector_list")
)

# 2. UDF: Vektörleri ortalamak için Python fonksiyonu
def average_vectors(vectors):
    if not vectors:
        return None
    arrays = [np.array(v.toArray()) for v in vectors]
    avg_array = np.mean(arrays, axis=0)
    return avg_array.tolist()

average_vectors_udf = F.udf(average_vectors, ArrayType(DoubleType()))

# 3. Ortalamasını al
vector_list_df = vector_list_df.withColumn(
    "avg_vector_array", average_vectors_udf("hybrid_vector_list")
)

# 4. (Opsiyonel) Tekrar DenseVector'e çevir
def array_to_vector(arr):
    if arr is None:
        return None
    return DenseVector(arr)

array_to_vector_udf = F.udf(array_to_vector, VectorUDT())

# 5. Son hali
product_vectors = vector_list_df.withColumn(
    "product_vector", array_to_vector_udf("avg_vector_array")
).select("Products", "product_vector")

product_vectors.show(truncate=False)


25/04/17 00:09:53 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB
25/04/17 00:09:54 WARN DAGScheduler: Broadcasting large task binary with size 8.3 MiB
25/04/17 00:09:55 WARN DAGScheduler: Broadcasting large task binary with size 10.6 MiB
25/04/17 00:09:56 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/17 00:09:57 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
[Stage 539:>                                                        (0 + 1) / 1]

+--------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Products|product_vector                                                                                                                                                                                                                                                                                                            |
+--------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|12      |[0.359917311

                                                                                

In [75]:
from pyspark.sql.functions import udf
def cosine_similarity(vec1, vec2):
    if vec1 is None or vec2 is None:
        return None
    vec1 = np.array(vec1)
    vec2 = np.array(vec2)
    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)
    if norm1 == 0 or norm2 == 0:
        return 0.0
    return float(np.dot(vec1, vec2) / (norm1 * norm2))

cosine_sim_udf = udf(cosine_similarity, DoubleType())

In [76]:
# Cross join: every user with every product
user_product_scores = user_vectors.crossJoin(product_vectors)

# Add cosine similarity
user_product_scores = user_product_scores.withColumn(
    "similarity", cosine_sim_udf("hybrid_vector", "product_vector")
)

In [77]:
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

windowSpec = Window.partitionBy("Customer_ID").orderBy(F.desc("similarity"))

top_n_recommendations = user_product_scores.withColumn("rank", row_number().over(windowSpec)) \
    .filter("rank <= 5")

top_n_recommendations.select("Customer_ID", "Products", "similarity").show(truncate=False)


25/04/17 00:09:58 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB
25/04/17 00:09:59 WARN DAGScheduler: Broadcasting large task binary with size 8.3 MiB
25/04/17 00:09:59 WARN DAGScheduler: Broadcasting large task binary with size 8.3 MiB
25/04/17 00:10:00 WARN DAGScheduler: Broadcasting large task binary with size 10.6 MiB
25/04/17 00:10:01 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/17 00:10:01 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/17 00:10:03 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/17 00:15:39 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
[Stage 555:>                                                        (0 + 1) / 1]

+-----------+--------+------------------+
|Customer_ID|Products|similarity        |
+-----------+--------+------------------+
|12         |212     |0.8861749803080519|
|12         |213     |0.8837212971417542|
|12         |242     |0.8815541763986517|
|12         |239     |0.8814559570708499|
|12         |269     |0.881112519992375 |
|13         |269     |0.9080098769164253|
|13         |213     |0.9077187307490607|
|13         |292     |0.9048125536952083|
|13         |236     |0.9028009729000578|
|13         |275     |0.9026139564873609|
|14         |184     |0.8930218505842851|
|14         |181     |0.8905888486812419|
|14         |203     |0.8903263974207459|
|14         |205     |0.8889383055353763|
|14         |215     |0.8842558192230076|
|18         |286     |0.8985492299457141|
|18         |243     |0.8975734158204195|
|18         |315     |0.8969658103699043|
|18         |239     |0.896063901513035 |
|18         |212     |0.8958413647800829|
+-----------+--------+------------

                                                                                

In [78]:
from pyspark.ml.clustering import KMeans

kmeans = KMeans(k=5, seed=42, featuresCol="hybrid_vector", predictionCol="cluster")
kmeans_model = kmeans.fit(user_vectors)

user_clusters = kmeans_model.transform(user_vectors)
user_clusters.select("Customer_ID", "cluster").show(10)

25/04/17 00:15:40 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB
25/04/17 00:15:40 WARN DAGScheduler: Broadcasting large task binary with size 8.3 MiB
25/04/17 00:15:41 WARN DAGScheduler: Broadcasting large task binary with size 10.6 MiB
25/04/17 00:15:44 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/17 00:15:45 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB
25/04/17 00:15:46 WARN DAGScheduler: Broadcasting large task binary with size 8.3 MiB
25/04/17 00:15:46 WARN DAGScheduler: Broadcasting large task binary with size 10.6 MiB
25/04/17 00:15:48 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/17 00:15:50 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/17 00:15:51 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/17 00:15:51 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB
25/04/17 00:15:51 WARN DAGScheduler: Broadcastin

+-----------+-------+
|Customer_ID|cluster|
+-----------+-------+
|      38664|      0|
|      58182|      2|
|      22370|      4|
|       4925|      1|
|       4927|      4|
|       1962|      2|
|      38686|      2|
|      22394|      3|
|      58209|      1|
|       1964|      1|
+-----------+-------+
only showing top 10 rows



25/04/17 00:16:21 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
                                                                                

In [79]:
# Join cluster info back to hybrid_df
clustered_data = hybrid_df.join(user_clusters.select("Customer_ID", "cluster"), on="Customer_ID")

# Most popular products in each cluster (based on avg Combined_Rating or frequency)
popular_by_cluster = clustered_data.groupBy("cluster", "Products") \
    .agg(F.avg("Combined_Rating").alias("avg_rating")) \
    .orderBy("cluster", F.desc("avg_rating"))

popular_by_cluster.show(10, truncate=False)

25/04/17 00:16:21 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB
25/04/17 00:16:21 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB
25/04/17 00:16:22 WARN DAGScheduler: Broadcasting large task binary with size 8.2 MiB
25/04/17 00:16:22 WARN DAGScheduler: Broadcasting large task binary with size 8.3 MiB
25/04/17 00:16:23 WARN DAGScheduler: Broadcasting large task binary with size 10.5 MiB
25/04/17 00:16:23 WARN DAGScheduler: Broadcasting large task binary with size 10.6 MiB
25/04/17 00:16:24 WARN DAGScheduler: Broadcasting large task binary with size 12.4 MiB
25/04/17 00:16:25 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/17 00:16:26 WARN DAGScheduler: Broadcasting large task binary with size 12.6 MiB
25/04/17 00:16:28 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
                                                                                

+-------+--------+------------------+
|cluster|Products|avg_rating        |
+-------+--------+------------------+
|0      |317     |4.168067226890757 |
|0      |121     |3.8320964749536177|
|0      |123     |3.813053097345133 |
|0      |124     |3.7895277207392195|
|0      |126     |3.7858880778588806|
|0      |122     |3.772727272727273 |
|0      |128     |3.7676470588235293|
|0      |125     |3.7367816091954023|
|0      |127     |3.7314629258517034|
|0      |120     |3.675704989154013 |
+-------+--------+------------------+
only showing top 10 rows



TimeBased Deneme

In [91]:
from pyspark.sql.functions import col
from pyspark.ml.linalg import Vectors
import numpy as np
from pyspark.ml.feature import Word2Vec



window_spec = Window.partitionBy("Customer_ID").orderBy("Year", "Month_Index", "Transaction_ID")

user_sequences = (
    indexed_df
    .withColumn("ordered_products", F.collect_list("Products").over(window_spec))
    .groupBy("Customer_ID")
    .agg(F.max("ordered_products").alias("product_sequence"))
    .withColumn("seq_length", F.size("product_sequence"))
    .filter(F.col("seq_length") >= 2)
)

user_sequences = user_sequences.withColumn(
    "product_sequence_str",
    F.expr("transform(product_sequence, x -> cast(x as string))")
)

w2v = Word2Vec(vectorSize=50, minCount=1, inputCol="product_sequence_str", outputCol="product_embedding")
model = w2v.fit(user_sequences)
product_embeddings = model.getVectors()

# --- Ayarlar ---
top_k = 5   # Her kullanıcı için kaç öneri alacağız?

# --- Değerlendirme için boş bir liste ---
eval_results = []
# 1) Her kullanıcıyı al, leave‑one‑out: son ürünü ayır, gerisini eğitimde kullan
for row in user_sequences.select("Customer_ID", "product_sequence").collect():
    user_id = row["Customer_ID"]
    seq = row["product_sequence"]
    
    # En az 2 ürün olsun
    if len(seq) < 2:
        continue
    
    train_seq = seq[:-1]               # Kullanılacak n‑1 ürün
    actual_item = str(seq[-1])         # Test item
    
    # 2) Eğitim dizisindeki ürünlerin embedding'lerini çek
    vecs = (
        product_embeddings
        .filter(col("word").isin([str(p) for p in train_seq]))
        .select("vector")
        .rdd
        .map(lambda r: np.array(r[0].toArray()))
        .collect()
    )
    avg_vec = Vectors.dense(np.mean(vecs, axis=0).tolist())
    
    # 3) Ortalamaya en yakın top_k ürünü öner
    sims = (
        model
        .findSynonyms(avg_vec, top_k)
        .withColumnRenamed("word", "predicted")
        .withColumnRenamed("similarity", "score")
        .toPandas()
    )
    preds = sims["predicted"].astype(str).tolist()
    
    # 4) Performans ölçümü: hit@k ve rank
    hit = actual_item in preds
    rank = preds.index(actual_item) + 1 if hit else None
    
    eval_results.append({
        "Customer_ID": user_id,
        "actual": actual_item,
        "preds": preds,
        "hit": hit,
        "rank": rank
    })

# 5) Sonuçları bir DataFrame’e çevir ve global metric hesapla
eval_df = spark.createDataFrame(eval_results)

# Hit@K (accuracy)
total = eval_df.count()
hits  = eval_df.filter(col("hit") == True).count()
hit_rate = hits / total if total else 0

# Ortalama reciprocal rank (MRR)
from pyspark.sql.functions import avg, expr
mrr = (
    eval_df
    .filter(col("rank").isNotNull())
    .withColumn("recip_rank", expr("1.0 / rank"))
    .agg(avg("recip_rank"))
    .first()[0]
)

print(f"Evaluasyon sonuçları (n={total} kullanıcı):")
print(f"  • Hit@{top_k}: {hit_rate:.3f}")
print(f"  • MRR@{top_k}: {mrr:.3f}")

# İncelemek için birkaç satırı göster
eval_df.show(10, truncate=False)


25/04/17 01:55:30 WARN DAGScheduler: Broadcasting large task binary with size 12.0 MiB
25/04/17 01:55:31 WARN DAGScheduler: Broadcasting large task binary with size 12.4 MiB
25/04/17 01:55:32 WARN DAGScheduler: Broadcasting large task binary with size 12.4 MiB
25/04/17 01:55:33 WARN DAGScheduler: Broadcasting large task binary with size 12.0 MiB
25/04/17 01:55:34 WARN DAGScheduler: Broadcasting large task binary with size 12.4 MiB
                                                                                

Evaluasyon sonuçları (n=75962 kullanıcı):
  • Hit@5: 0.013
  • MRR@5: 0.489
+-----------+------+-----+-------------------------+----+
|Customer_ID|actual|hit  |preds                    |rank|
+-----------+------+-----+-------------------------+----+
|6          |59    |false|[216, 158, 243, 239, 130]|NULL|
|9          |91    |false|[110, 262, 256, 307, 243]|NULL|
|12         |303   |false|[158, 316, 216, 217, 243]|NULL|
|13         |176   |false|[243, 158, 282, 305, 216]|NULL|
|14         |123   |false|[262, 173, 243, 178, 152]|NULL|
|16         |28    |false|[189, 158, 243, 216, 305]|NULL|
|17         |68    |false|[262, 158, 217, 256, 316]|NULL|
|18         |16    |false|[243, 216, 158, 282, 189]|NULL|
|23         |255   |false|[110, 161, 217, 186, 316]|NULL|
|38         |169   |false|[243, 217, 158, 152, 272]|NULL|
+-----------+------+-----+-------------------------+----+
only showing top 10 rows



25/04/17 08:56:22 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 544875 ms exceeds timeout 120000 ms
25/04/17 08:56:22 WARN SparkContext: Killing executors is not supported by current scheduler.
25/04/17 08:56:23 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$

Time Based Recommendations

In [80]:
import json
from pyspark.sql.window import Window
from pyspark.sql.functions import col, lead, when, collect_list, array
from pyspark.sql.types import ArrayType, StringType

In [81]:
# Helper Functions for Time-Based Recommendations

def remove_duplicates(items):
    """Remove duplicates while preserving order"""
    if items is None:
        return None
    seen = set()
    result = []
    for item in items:
        if item not in seen:
            seen.add(item)
            result.append(item)
    return result

def create_sliding_windows(product_list, window_size=2):
    """Create sliding windows of unique items"""
    if not product_list or len(product_list) < window_size:
        return []
    unique_products = remove_duplicates(product_list)
    result = []
    for i in range(len(unique_products) - window_size + 1):
        result.append(unique_products[i:i+window_size])
    return result

def create_unique_basket(items):
    """Create a unique basket of items"""
    return list(set(items))


In [82]:
def build_next_purchase_model(sorted_df, verbose=False):
    """Build the next purchase prediction model"""
    # UDFs for various operations
    remove_duplicates_udf = udf(remove_duplicates, ArrayType(StringType()))
    create_windows_udf = udf(create_sliding_windows, ArrayType(ArrayType(StringType())))
    create_unique_basket_udf = udf(create_unique_basket, ArrayType(StringType()))
    
    # Create the window specification for partitioning by customer
    window_spec = Window.partitionBy("Customer_ID").orderBy("time_order", "Transaction_ID")
    
    # Get the next product for each purchase
    next_purchase_df = sorted_df.withColumn(
        "next_product", lead("products").over(window_spec)
    )
    
    # Filter out rows with null next products (last purchase of each customer)
    next_purchase_df = next_purchase_df.filter(col("next_product").isNotNull())
    
    # Group by product and aggregate the next products
    next_product_freq = next_purchase_df.groupBy("products").agg(
        F.count("*").alias("frequency"),
        F.collect_list("next_product").alias("next_product_list")
    )
    
    # Print top products if verbose is True
    if verbose:
        print("\nMost frequently purchased products and their follow-up products:")
        next_product_freq.orderBy(F.desc("frequency")).limit(5).show(truncate=False)
    
    # Build the transition matrix: product -> next products
    transition_matrix = {}
    next_items_df = next_product_freq.collect()
    
    for row in next_items_df:
        current_product = row["products"]
        next_products = row["next_product_list"]
        
        # Count the occurrences of each next product
        from collections import Counter
        product_counts = Counter(next_products)
        
        # Get the top 3 most frequent next products
        transition_matrix[current_product] = [item[0] for item in product_counts.most_common(3)]
    
    # Save the transition model to a file
    with open("next_purchase_model.json", "w") as f:
        json.dump({str(k): [str(i) for i in v] for k, v in transition_matrix.items()}, f)
    
    if verbose:
        print("\nTime-based recommendation model saved to 'next_purchase_model.json'")
    
    return transition_matrix


In [83]:
def get_customer_purchases(sorted_df, customer_id, verbose=False):
    """Belirli bir müşteri için satın alma geçmişini al ve ekrana bas"""
    customer_purchases = sorted_df.filter(col("Customer_ID") == customer_id) \
                             .orderBy("time_order", "Transaction_ID") \
                             .select("time_order", "products", "Product_Type") \
                             .collect()

    if not customer_purchases:
        print(f"\n⚠️ Müşteri ID {customer_id} için satın alma geçmişi bulunamadı.")
        return None

    print(f"\n📋 SATINALMA GEÇMİŞİ (Müşteri ID: {customer_id}):")
    print("┌" + "─" * 60 + "┐")
    for i, purchase in enumerate(customer_purchases, start=1):
        t = purchase["time_order"]
        year = int(t // 100)        # tam sayı yap
        month = int(t % 100)        # tam sayı yap

        prod_str = str(purchase["products"])
        type_str = str(purchase["Product_Type"])
        padding = max(0, 58 - len(prod_str) - len(type_str))

        print(f"│ {i}. {year}-{month:02d}: {prod_str} ({type_str})" + " " * padding + "│")
    print("└" + "─" * 60 + "┘")

    return customer_purchases


In [84]:
def recommend_next_purchase(sorted_df, next_product_matrix, customer_id):
    """Belirli bir müşteri için sonraki satın almaları öner"""
    latest_purchases = (
        sorted_df
        .filter(col("Customer_ID") == customer_id)
        .orderBy(F.desc("time_order"), F.desc("Transaction_ID"))
        .select("products", "time_order")
        .limit(3)
        .collect()
    )

    # Satın alma geçmişi yoksa popüler ürünleri döndür
    if not latest_purchases:
        popular_products = (
            sorted_df
            .groupBy("products")
            .count()
            .orderBy(F.desc("count"))
            .select("products")
            .limit(3)
            .collect()
        )
        return [row["products"] for row in popular_products], \
               "popüler ürünler (satın alma geçmişi yok)"

    # En son satın alınan ürün ve zamanını al
    latest_product = latest_purchases[0]["products"]
    t = latest_purchases[0]["time_order"]
    year = int(t // 100)
    month = int(t % 100)

    # Modelde varsa öneriyi dön
    if latest_product in next_product_matrix:
        recs = next_product_matrix[latest_product]
        reason = f"son satın alım: {latest_product} ({year}-{month:02d})"
        return recs, reason
    else:
        # Yoksa gene popüler ürünler
        popular_products = (
            sorted_df
            .groupBy("products")
            .count()
            .orderBy(F.desc("count"))
            .select("products")
            .limit(3)
            .collect()
        )
        recs = [row["products"] for row in popular_products]
        reason = f"popüler ürünler ({latest_product} için model yok)"
        return recs, reason


In [85]:
def format_recommendations(recommendations, reason):
    """
    Öneri listesini ve nedenini güzel bir kutu içinde ekrana basar.
    """
    print(f"\n🔮 ÖNERİLER ({reason}):")
    print("┌" + "─" * 50 + "┐")
    for i, rec in enumerate(recommendations, start=1):
        rec_str = str(rec)
        padding = max(0, 47 - len(rec_str))
        print(f"│ {i}. {rec_str}" + " " * padding + "│")
    print("└" + "─" * 50 + "┘")


In [88]:
# Initialize Spark session and load data
# Zaman sıralama için sütun oluştur
indexed_df = indexed_df.withColumn("time_order", 
                                    col("Year") * 100 + col("Month_Index"))

# Müşteri ve zamana göre sırala
sorted_df = indexed_df.orderBy("Customer_ID", "time_order", "Transaction_ID")

# Build next purchase model
next_product_matrix = build_next_purchase_model(sorted_df, verbose=True)

# Recommend products for each customer
customer_ids = ["10001.0"]
for customer_id in customer_ids:
    # Display purchase history if needed
    get_customer_purchases(sorted_df, customer_id, verbose=True)
    
    # Recommend products
    recommendations, reason = recommend_next_purchase(sorted_df, next_product_matrix, customer_id)
    format_recommendations(recommendations, reason)




Most frequently purchased products and their follow-up products:


25/04/17 00:20:12 WARN DAGScheduler: Broadcasting large task binary with size 12.0 MiB
25/04/17 00:20:13 WARN DAGScheduler: Broadcasting large task binary with size 12.0 MiB
25/04/17 00:20:14 WARN DAGScheduler: Broadcasting large task binary with size 12.4 MiB
25/04/17 00:20:14 WARN DAGScheduler: Broadcasting large task binary with size 12.4 MiB
25/04/17 00:20:15 WARN DAGScheduler: Broadcasting large task binary with size 12.4 MiB


+--------+---------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

25/04/17 00:20:16 WARN DAGScheduler: Broadcasting large task binary with size 12.0 MiB
25/04/17 00:20:16 WARN DAGScheduler: Broadcasting large task binary with size 12.0 MiB
25/04/17 00:20:17 WARN DAGScheduler: Broadcasting large task binary with size 12.4 MiB
25/04/17 00:20:17 WARN DAGScheduler: Broadcasting large task binary with size 12.4 MiB
25/04/17 00:20:18 WARN DAGScheduler: Broadcasting large task binary with size 12.4 MiB



Time-based recommendation model saved to 'next_purchase_model.json'


25/04/17 00:20:19 WARN DAGScheduler: Broadcasting large task binary with size 12.0 MiB
25/04/17 00:20:20 WARN DAGScheduler: Broadcasting large task binary with size 12.0 MiB
25/04/17 00:20:20 WARN DAGScheduler: Broadcasting large task binary with size 12.4 MiB
                                                                                


📋 SATINALMA GEÇMİŞİ (Müşteri ID: 10001.0):
┌────────────────────────────────────────────────────────────┐
│ 1. 2023-04: 272 (28.0)                                                   │
│ 2. 2023-04: 31 (1.0)                                                     │
│ 3. 2023-04: 90 (9.0)                                                     │
│ 4. 2023-05: 269 (24.0)                                                   │
│ 5. 2023-09: 11 (0.0)                                                     │
│ 6. 2023-10: 117 (6.0)                                                    │
└────────────────────────────────────────────────────────────┘


25/04/17 00:20:21 WARN DAGScheduler: Broadcasting large task binary with size 10.6 MiB



🔮 ÖNERİLER (son satın alım: 117 (2023-10)):
┌──────────────────────────────────────────────────┐
│ 1. 22                                             │
│ 2. 69                                             │
│ 3. 7                                              │
└──────────────────────────────────────────────────┘


In [87]:
#spark.stop()