In [43]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import when,col,sum

In [38]:
# Create or get your Spark session
spark = SparkSession.builder \
    .appName("Recommendation") \
    .config("spark.executor.memory", "16g") \
    .config("spark.driver.memory", "16g") \
    .config("spark.executor.cores", "8") \
    .config("spark.default.parallelism", "8") \
    .config("spark.sql.shuffle.partitions", "8") \
    .config("spark.memory.fraction", "0.8") \
    .config("spark.memory.storageFraction", "0.3") \
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
    .config("spark.kryoserializer.buffer.max", "1024m") \
    .getOrCreate()

In [39]:
# Load the CSV file into a DataFrame
df = spark.read.option("header", "true").csv("new_cleaned_data.csv")

In [40]:
df.printSchema()

root
 |-- Customer_ID: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Income: string (nullable = true)
 |-- Customer_Segment: string (nullable = true)
 |-- Year: string (nullable = true)
 |-- Month: string (nullable = true)
 |-- Total_Purchases: string (nullable = true)
 |-- Product_Category: string (nullable = true)
 |-- Product_Brand: string (nullable = true)
 |-- Product_Type: string (nullable = true)
 |-- Feedback: string (nullable = true)
 |-- Shipping_Method: string (nullable = true)
 |-- Payment_Method: string (nullable = true)
 |-- Ratings: string (nullable = true)
 |-- products: string (nullable = true)
 |-- Transaction_ID: string (nullable = true)



In [41]:
# List of string columns to index (modify based on your actual use case)
string_columns = [
    "Customer_ID", 
    "Country", 
    "Gender", 
    "Customer_Segment",
    "Product_Category", 
    "Product_Brand", 
    "Product_Type",
    "Shipping_Method", 
    "Payment_Method",
    "products", 
    "Transaction_ID"
]

# Create StringIndexer stages for each column
indexers = [
    StringIndexer(inputCol=col, outputCol=col + "_index", handleInvalid="keep")
    for col in string_columns
]


# Create and fit the Pipeline
pipeline = Pipeline(stages=indexers)
indexed_df = pipeline.fit(df).transform(df)


for col in string_columns:
    indexed_df = indexed_df.drop(col).withColumnRenamed(f"{col}_index", col)

indexed_df.show(5)

+----+------+------+-------+---------------+---------+-------+-----------+-------+------+----------------+----------------+-------------+------------+---------------+--------------+--------+--------------+
| Age|Income|  Year|  Month|Total_Purchases| Feedback|Ratings|Customer_ID|Country|Gender|Customer_Segment|Product_Category|Product_Brand|Product_Type|Shipping_Method|Payment_Method|products|Transaction_ID|
+----+------+------+-------+---------------+---------+-------+-----------+-------+------+----------------+----------------+-------------+------------+---------------+--------------+--------+--------------+
|26.0|   1.8|2023.0|   July|            3.0|     Good|    4.0|    22362.0|    0.0|   0.0|             0.0|             4.0|          7.0|        27.0|            1.0|           0.0|   245.0|      218625.0|
|26.0|   1.8|2023.0|    May|            7.0|Excellent|    5.0|    22362.0|    0.0|   0.0|             0.0|             1.0|          1.0|         0.0|            0.0|          

25/04/16 19:52:51 WARN DAGScheduler: Broadcasting large task binary with size 10.6 MiB


In [44]:
indexed_df = indexed_df.withColumn(
    "Feedback_Rating",
    when(col("Feedback") == "Bad", 1.0)
    .when(col("Feedback") == "Average", 2.0)
    .when(col("Feedback") == "Good", 3.0)
    .when(col("Feedback") == "Excellent", 4.0)
    .otherwise(None)
)

indexed_df = indexed_df.drop("Feedback")
indexed_df.show(5)


25/04/16 19:53:35 WARN DAGScheduler: Broadcasting large task binary with size 10.6 MiB


+----+------+------+-------+---------------+-------+-----------+-------+------+----------------+----------------+-------------+------------+---------------+--------------+--------+--------------+---------------+
| Age|Income|  Year|  Month|Total_Purchases|Ratings|Customer_ID|Country|Gender|Customer_Segment|Product_Category|Product_Brand|Product_Type|Shipping_Method|Payment_Method|products|Transaction_ID|Feedback_Rating|
+----+------+------+-------+---------------+-------+-----------+-------+------+----------------+----------------+-------------+------------+---------------+--------------+--------+--------------+---------------+
|26.0|   1.8|2023.0|   July|            3.0|    4.0|    22362.0|    0.0|   0.0|             0.0|             4.0|          7.0|        27.0|            1.0|           0.0|   245.0|      218625.0|            3.0|
|26.0|   1.8|2023.0|    May|            7.0|    5.0|    22362.0|    0.0|   0.0|             0.0|             1.0|          1.0|         0.0|            

In [45]:
null_counts = indexed_df.select([sum(col(c).isNull().cast("int")).alias(c) for c in indexed_df.columns])
null_counts.show()

+---+------+----+-----+---------------+-------+-----------+-------+------+----------------+----------------+-------------+------------+---------------+--------------+--------+--------------+---------------+
|Age|Income|Year|Month|Total_Purchases|Ratings|Customer_ID|Country|Gender|Customer_Segment|Product_Category|Product_Brand|Product_Type|Shipping_Method|Payment_Method|products|Transaction_ID|Feedback_Rating|
+---+------+----+-----+---------------+-------+-----------+-------+------+----------------+----------------+-------------+------------+---------------+--------------+--------+--------------+---------------+
|  0|     0|   0|    0|              0|      0|          0|      0|     0|               0|               0|            0|           0|              0|             0|       0|             0|              0|
+---+------+----+-----+---------------+-------+-----------+-------+------+----------------+----------------+-------------+------------+---------------+--------------+------

In [46]:
indexed_df = indexed_df.withColumn(
    "Month_Index",
    when((col("Month") == "January") | (col("Month") == "1.0"), 1)
    .when((col("Month") == "February") | (col("Month") == "2.0"), 2)
    .when((col("Month") == "March") | (col("Month") == "3.0"), 3)
    .when((col("Month") == "April") | (col("Month") == "4.0"), 4)
    .when((col("Month") == "May") | (col("Month") == "5.0"), 5)
    .when((col("Month") == "June") | (col("Month") == "6.0"), 6)
    .when((col("Month") == "July") | (col("Month") == "7.0"), 7)
    .when((col("Month") == "August") | (col("Month") == "8.0"), 8)
    .when((col("Month") == "September") | (col("Month") == "9.0"), 9)
    .when((col("Month") == "October") | (col("Month") == "10.0"), 10)
    .when((col("Month") == "November") | (col("Month") == "11.0"), 11)
    .when((col("Month") == "December") | (col("Month") == "12.0"), 12)
    .otherwise(0.0)
)
indexed_df = indexed_df.drop("Month")


In [47]:
null_counts = indexed_df.select([sum(col(c).isNull().cast("int")).alias(c) for c in indexed_df.columns])
null_counts.show()

+---+------+----+---------------+-------+-----------+-------+------+----------------+----------------+-------------+------------+---------------+--------------+--------+--------------+---------------+-----------+
|Age|Income|Year|Total_Purchases|Ratings|Customer_ID|Country|Gender|Customer_Segment|Product_Category|Product_Brand|Product_Type|Shipping_Method|Payment_Method|products|Transaction_ID|Feedback_Rating|Month_Index|
+---+------+----+---------------+-------+-----------+-------+------+----------------+----------------+-------------+------------+---------------+--------------+--------+--------------+---------------+-----------+
|  0|     0|   0|              0|      0|          0|      0|     0|               0|               0|            0|           0|              0|             0|       0|             0|              0|          0|
+---+------+----+---------------+-------+-----------+-------+------+----------------+----------------+-------------+------------+---------------+---

In [48]:
# Cast numerical columns
indexed_df = indexed_df.withColumn("Ratings", col("Ratings").cast("Double"))
indexed_df = indexed_df.withColumn("Age", col("Age").cast("Int"))
indexed_df = indexed_df.withColumn("Year", col("Year").cast("Int"))
indexed_df = indexed_df.withColumn("Income", col("Income").cast("Double"))
indexed_df = indexed_df.withColumn("Customer_ID", col("Customer_ID").cast("int"))
indexed_df = indexed_df.withColumn("Products", col("products").cast("int"))

# Weight parameter
alpha = 0.5

# Create a new column "Combined_Rating" as a weighted average of Ratings and Feedback_rating.
indexed_df = indexed_df.withColumn(
    "Combined_Rating",
    alpha * col("Ratings") + (1 - alpha) * col("Feedback_Rating")
)

indexed_df.select("Ratings", "Feedback_Rating", "Combined_Rating").show(5)


+-------+---------------+---------------+
|Ratings|Feedback_Rating|Combined_Rating|
+-------+---------------+---------------+
|    4.0|            3.0|            3.5|
|    5.0|            4.0|            4.5|
|    3.0|            3.0|            3.0|
|    2.0|            2.0|            2.0|
|    5.0|            4.0|            4.5|
+-------+---------------+---------------+
only showing top 5 rows



In [49]:
reordered_cols = [
    "Customer_ID", 
    "Age", 
    "Gender", 
    "Country", 
    "Income", 
    "Customer_Segment",
    "Transaction_ID", 
    "Year", 
    "Month_Index", 
    "Payment_Method", 
    "Shipping_Method", 
    "Total_Purchases", 
    "Products",
    "Product_Category", 
    "Product_Brand", 
    "Product_Type",
    "Ratings", 
    "Feedback_Rating",
    "Combined_Rating"
]

# Reorder the columns in the DataFrame
indexed_df = indexed_df.select(reordered_cols)

# Show the result
indexed_df.show(5)

+-----------+---+------+-------+------+----------------+--------------+----+-----------+--------------+---------------+---------------+--------+----------------+-------------+------------+-------+---------------+---------------+
|Customer_ID|Age|Gender|Country|Income|Customer_Segment|Transaction_ID|Year|Month_Index|Payment_Method|Shipping_Method|Total_Purchases|Products|Product_Category|Product_Brand|Product_Type|Ratings|Feedback_Rating|Combined_Rating|
+-----------+---+------+-------+------+----------------+--------------+----+-----------+--------------+---------------+---------------+--------+----------------+-------------+------------+-------+---------------+---------------+
|      22362| 26|   0.0|    0.0|   1.8|             0.0|      218625.0|2023|        7.0|           0.0|            1.0|            3.0|     245|             4.0|          7.0|        27.0|    4.0|            3.0|            3.5|
|      22362| 26|   0.0|    0.0|   1.8|             0.0|      145963.0|2023|        

25/04/16 19:53:51 WARN DAGScheduler: Broadcasting large task binary with size 10.6 MiB


In [50]:
null_counts = df.select([sum(col(c).isNull().cast("int")).alias(c) for c in df.columns])
null_counts.show()

+-----------+-------+---+------+------+----------------+----+-----+---------------+----------------+-------------+------------+--------+---------------+--------------+-------+--------+--------------+
|Customer_ID|Country|Age|Gender|Income|Customer_Segment|Year|Month|Total_Purchases|Product_Category|Product_Brand|Product_Type|Feedback|Shipping_Method|Payment_Method|Ratings|products|Transaction_ID|
+-----------+-------+---+------+------+----------------+----+-----+---------------+----------------+-------------+------------+--------+---------------+--------------+-------+--------+--------------+
|          0|      0|  0|     0|     0|               0|   0|    0|              0|               0|            0|           0|       0|              0|             0|      0|       0|             0|
+-----------+-------+---+------+------+----------------+----+-----+---------------+----------------+-------------+------------+--------+---------------+--------------+-------+--------+--------------+


### Recommendation Phase 1

1. Generate Recommendations with Ratings

In [51]:
from pyspark.ml.evaluation import RegressionEvaluator

In [52]:
train, test = indexed_df.randomSplit([0.8, 0.2], seed=42)

# Initialize the ALS (Alternating Least Squares) model.
als = ALS(
    userCol="Customer_ID",  
    itemCol="Products",      
    ratingCol="Ratings",     
    rank=10,                  
    maxIter=10,               
    regParam=0.1,             
    coldStartStrategy="drop"  
)

In [53]:
model = als.fit(train)
predictions = model.transform(test)
userRecs = model.recommendForAllUsers(5)
userRecs.show(5, truncate=False)

25/04/16 19:53:52 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/16 19:53:53 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/16 19:53:54 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/16 19:53:55 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/16 19:53:55 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/16 19:53:56 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/16 19:53:57 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/16 19:53:58 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/16 19:53:58 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/16 19:53:59 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/16 19:53:59 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/16 19:54:00 WARN DAGScheduler: Broadc

+-----------+----------------------------------------------------------------------------------------+
|Customer_ID|recommendations                                                                         |
+-----------+----------------------------------------------------------------------------------------+
|12         |[{124, 4.6717706}, {40, 4.399662}, {175, 4.3472805}, {267, 4.3231187}, {69, 4.208994}]  |
|13         |[{174, 5.0660167}, {248, 4.833373}, {302, 4.71884}, {36, 4.712051}, {159, 4.6419716}]   |
|14         |[{124, 4.5577626}, {310, 4.3268714}, {72, 4.218303}, {122, 4.086638}, {77, 4.0722814}]  |
|18         |[{14, 3.7602866}, {231, 3.7069128}, {155, 3.5947292}, {200, 3.4845002}, {22, 3.4713295}]|
|38         |[{171, 4.3633294}, {288, 4.1702466}, {227, 4.082798}, {299, 4.066179}, {88, 4.0311594}] |
+-----------+----------------------------------------------------------------------------------------+
only showing top 5 rows



25/04/16 19:54:15 WARN DAGScheduler: Broadcasting large task binary with size 12.6 MiB
                                                                                

In [54]:
evaluator = RegressionEvaluator(
    metricName="rmse",            
    labelCol="Ratings",     
    predictionCol="prediction"    
)
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error of 1 (RMSE):", rmse)

25/04/16 19:54:16 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/16 19:54:16 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/16 19:54:16 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/16 19:54:17 WARN DAGScheduler: Broadcasting large task binary with size 12.6 MiB


Root-mean-square error of 1 (RMSE): 2.075946774027161


2. Generate Recommendation with Feedbacks

In [55]:
#spark.stop()

als_fb = ALS(
    userCol="Customer_ID",   
    itemCol="Products",      
    ratingCol="Feedback_Rating",   
    rank=10,                       
    maxIter=10,                    
    regParam=0.1,                  
    coldStartStrategy="drop"       
)

als_model_fb = als_fb.fit(train)
userRecs = als_model_fb.recommendForAllUsers(5)
userRecs.show(5, truncate=False)

25/04/16 19:54:18 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/16 19:54:19 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/16 19:54:20 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/16 19:54:20 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/16 19:54:21 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/16 19:54:21 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/16 19:54:22 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/16 19:54:22 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/16 19:54:23 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/16 19:54:23 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/16 19:54:23 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/16 19:54:24 WARN DAGScheduler: Broadc

+-----------+------------------------------------------------------------------------------------------+
|Customer_ID|recommendations                                                                           |
+-----------+------------------------------------------------------------------------------------------+
|12         |[{124, 3.8721766}, {122, 3.867613}, {128, 3.8306646}, {49, 3.7383}, {120, 3.6640534}]     |
|13         |[{36, 4.2894664}, {127, 4.2145014}, {159, 4.189936}, {195, 4.1831694}, {292, 4.1579185}]  |
|14         |[{124, 3.8338265}, {122, 3.6338186}, {69, 3.443578}, {166, 3.3902285}, {77, 3.366441}]    |
|18         |[{155, 3.3747208}, {231, 3.1471965}, {200, 3.1333094}, {273, 3.1143727}, {122, 3.1120393}]|
|38         |[{171, 3.5413203}, {88, 3.4155073}, {288, 3.4029078}, {299, 3.36233}, {226, 3.345981}]    |
+-----------+------------------------------------------------------------------------------------------+
only showing top 5 rows



In [56]:
predictions = als_model_fb.transform(test)
evaluator = RegressionEvaluator(
    metricName="rmse",            
    labelCol="Feedback_Rating",     
    predictionCol="prediction"    
)
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error of 1 (RMSE):", rmse)

25/04/16 19:54:38 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/16 19:54:38 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/16 19:54:38 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
                                                                                

Root-mean-square error of 1 (RMSE): 1.6128865285338405


25/04/16 19:54:39 WARN DAGScheduler: Broadcasting large task binary with size 12.6 MiB


3. Generate Recommendations with Combined Ratings

In [57]:
als_combined = ALS(
    userCol="Customer_ID",   
    itemCol="Products",      
    ratingCol="Combined_Rating",   
    rank=10,                       
    maxIter=10,                    
    regParam=0.1,                  
    coldStartStrategy="drop"       
)

als_model_combined = als_combined.fit(train)
userRecs = als_model_combined.recommendForAllUsers(5)
userRecs.show(5, truncate=False)

predictions = als_model_combined.transform(test)
evaluator = RegressionEvaluator(
    metricName="rmse",            
    labelCol="Combined_Rating",     
    predictionCol="prediction"    
)
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error of 1 (RMSE):", rmse)

25/04/16 19:54:40 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/16 19:54:41 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/16 19:54:42 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/16 19:54:42 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/16 19:54:43 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/16 19:54:43 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/16 19:54:44 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/16 19:54:44 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/16 19:54:45 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/16 19:54:46 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/16 19:54:46 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/16 19:54:47 WARN DAGScheduler: Broadc

+-----------+-----------------------------------------------------------------------------------------+
|Customer_ID|recommendations                                                                          |
+-----------+-----------------------------------------------------------------------------------------+
|12         |[{124, 4.2506056}, {122, 3.9849882}, {40, 3.969384}, {128, 3.9380352}, {267, 3.8983276}] |
|13         |[{174, 4.621201}, {36, 4.4970493}, {159, 4.4098015}, {127, 4.380688}, {248, 4.3614435}]  |
|14         |[{124, 4.1920805}, {122, 3.85035}, {77, 3.7608159}, {310, 3.6747696}, {72, 3.645151}]    |
|18         |[{155, 3.4979124}, {231, 3.4545639}, {14, 3.348528}, {200, 3.3011057}, {273, 3.29994}]   |
|38         |[{171, 3.9489934}, {288, 3.7822936}, {88, 3.7263923}, {299, 3.7139812}, {226, 3.6598186}]|
+-----------+-----------------------------------------------------------------------------------------+
only showing top 5 rows



25/04/16 19:55:01 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/16 19:55:02 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/16 19:55:02 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
                                                                                

Root-mean-square error of 1 (RMSE): 1.8215680342473874


25/04/16 19:55:03 WARN DAGScheduler: Broadcasting large task binary with size 12.6 MiB


### Recommendation Phase 2

1. User Profile Vector

In [58]:
from pyspark.ml.feature import VectorAssembler

distinct_users = indexed_df.select("Customer_ID", "Age", "Gender", "Country", "Income", "Customer_Segment").distinct()

user_profile_vector = VectorAssembler(
    inputCols=["Customer_ID","Age", "Gender", "Country", "Income", "Customer_Segment"],
    outputCol="user_profile_vector"
).transform(distinct_users)


user_profile_vector.select("Customer_ID", "user_profile_vector").show(5, truncate=False)

25/04/16 19:55:03 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB


+-----------+------------------------------+
|Customer_ID|user_profile_vector           |
+-----------+------------------------------+
|38664      |[38664.0,26.0,1.0,2.0,2.0,0.0]|
|58182      |[58182.0,38.0,0.0,0.0,1.0,0.0]|
|22370      |[22370.0,31.0,1.0,4.0,1.5,0.0]|
|4925       |[4925.0,20.0,0.0,3.0,2.2,1.0] |
|58188      |[58188.0,30.0,0.0,0.0,2.0,0.0]|
+-----------+------------------------------+
only showing top 5 rows



25/04/16 19:55:04 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB


2. Product Vector

In [59]:

product_vector = VectorAssembler(
    inputCols=["Transaction_ID","Product_Category", "Product_Brand", "Product_Type","Products"],
    outputCol="product_vector"
).transform(indexed_df)


product_vector.select("Products", "product_vector").show(5, truncate=False)

+--------+-----------------------------+
|Products|product_vector               |
+--------+-----------------------------+
|245     |[218625.0,4.0,7.0,27.0,245.0]|
|10      |[145963.0,1.0,1.0,0.0,10.0]  |
|279     |[49995.0,2.0,11.0,22.0,279.0]|
|111     |[210534.0,0.0,2.0,9.0,111.0] |
|182     |[13653.0,2.0,3.0,18.0,182.0] |
+--------+-----------------------------+
only showing top 5 rows



25/04/16 19:55:04 WARN DAGScheduler: Broadcasting large task binary with size 8.3 MiB


3. Transaction Vector  

In [60]:

transaction_vector = VectorAssembler(
    inputCols=["Transaction_ID","Customer_ID","Month_Index","Year","Combined_Rating"],
    outputCol="transaction_vector"
).transform(indexed_df)


transaction_vector.select("Transaction_ID", "transaction_vector").show(5, truncate=False)

+--------------+---------------------------------+
|Transaction_ID|transaction_vector               |
+--------------+---------------------------------+
|218625.0      |[218625.0,22362.0,7.0,2023.0,3.5]|
|145963.0      |[145963.0,22362.0,5.0,2023.0,4.5]|
|49995.0       |[49995.0,22362.0,1.0,2023.0,3.0] |
|210534.0      |[210534.0,22362.0,7.0,2023.0,2.0]|
|13653.0       |[13653.0,11310.0,10.0,2023.0,4.5]|
+--------------+---------------------------------+
only showing top 5 rows



25/04/16 19:55:05 WARN DAGScheduler: Broadcasting large task binary with size 10.6 MiB


In [61]:
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.linalg import Vectors
scaler_user = MinMaxScaler(inputCol="user_profile_vector", outputCol="user_profile_scaled")
scaler_model_user = scaler_user.fit(user_profile_vector)
user_profile_scaled = scaler_model_user.transform(user_profile_vector)

user_profile_scaled.select("Customer_ID", "user_profile_scaled").show(5, truncate=False)

25/04/16 19:55:05 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB
25/04/16 19:55:05 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB
25/04/16 19:55:06 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB
25/04/16 19:55:06 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB


+-----------+--------------------------------------------------------------------------+
|Customer_ID|user_profile_scaled                                                       |
+-----------+--------------------------------------------------------------------------+
|38664      |[0.4457253527621507,0.15384615384615385,1.0,0.5,0.5,0.0]                  |
|58182      |(6,[0,1],[0.6707322696670663,0.38461538461538464])                        |
|22370      |[0.2578852716037997,0.25,1.0,1.0,0.25,0.0]                                |
|4925       |[0.05677626118232961,0.038461538461538464,0.0,0.75,0.6000000000000001,0.5]|
|58188      |[0.6708014387162224,0.23076923076923078,0.0,0.0,0.5,0.0]                  |
+-----------+--------------------------------------------------------------------------+
only showing top 5 rows



25/04/16 19:55:06 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB


In [62]:
# Then apply scaling
scaler_product = MinMaxScaler(inputCol="product_vector", outputCol="product_vector_scaled")
scaler_model_product = scaler_product.fit(product_vector)
product_scaled = scaler_model_product.transform(product_vector)

product_scaled.select("Products", "product_vector_scaled").show(5, truncate=False)

25/04/16 19:55:06 WARN DAGScheduler: Broadcasting large task binary with size 9.7 MiB
25/04/16 19:55:07 WARN DAGScheduler: Broadcasting large task binary with size 9.7 MiB
                                                                                

+--------+-----------------------------------------------------------------------+
|Products|product_vector_scaled                                                  |
+--------+-----------------------------------------------------------------------+
|245     |[0.7259189162267159,0.8,0.38888888888888884,0.84375,0.7728706624605678]|
|10      |[0.48465318590829104,0.2,0.05555555555555555,0.0,0.031545741324921134] |
|279     |[0.16600258989939237,0.4,0.611111111111111,0.6875,0.8801261829652997]  |
|111     |[0.6990536906066341,0.0,0.1111111111111111,0.28125,0.3501577287066246] |
|182     |[0.04533320051797988,0.4,0.16666666666666666,0.5625,0.5741324921135647]|
+--------+-----------------------------------------------------------------------+
only showing top 5 rows



25/04/16 19:55:08 WARN DAGScheduler: Broadcasting large task binary with size 8.3 MiB


In [64]:
# Then apply scaling
scaler_transaction = MinMaxScaler(inputCol="transaction_vector", outputCol="transaction_vector_scaled")
scaler_model_transaction = scaler_transaction.fit(transaction_vector)
transaction_vector_scaled = scaler_model_transaction.transform(transaction_vector)

transaction_vector_scaled.select("Transaction_ID", "transaction_vector_scaled").show(5, truncate=False)

25/04/16 19:55:26 WARN DAGScheduler: Broadcasting large task binary with size 12.4 MiB
25/04/16 19:55:27 WARN DAGScheduler: Broadcasting large task binary with size 12.4 MiB
                                                                                

+--------------+---------------------------------------------------------------------------------+
|Transaction_ID|transaction_vector_scaled                                                        |
+--------------+---------------------------------------------------------------------------------+
|218625.0      |[0.7259189162267159,0.2577930462049248,0.5454545454545454,0.0,0.7142857142857142]|
|145963.0      |[0.48465318590829104,0.2577930462049248,0.36363636363636365,0.0,1.0]             |
|49995.0       |[0.16600258989939237,0.2577930462049248,0.0,0.0,0.5714285714285714]              |
|210534.0      |[0.6990536906066341,0.2577930462049248,0.5454545454545454,0.0,0.2857142857142857]|
|13653.0       |[0.04533320051797988,0.13038365765931936,0.8181818181818182,0.0,1.0]             |
+--------------+---------------------------------------------------------------------------------+
only showing top 5 rows



25/04/16 19:55:27 WARN DAGScheduler: Broadcasting large task binary with size 10.6 MiB


In [83]:
# Make sure all the necessary columns are selected
user_profile_scaled = user_profile_scaled.select("Customer_ID", "user_profile_scaled")
product_scaled = product_scaled.select("Transaction_ID", "Products", "product_vector_scaled")
transaction_scaled = transaction_vector_scaled.select(
    "Transaction_ID", "Customer_ID", "transaction_vector_scaled", "Combined_Rating"
)# Join user and transaction data on Customer_ID
user_transaction_joined = transaction_scaled.join(user_profile_scaled, on="Customer_ID", how="inner")

# Now join with product data on Transaction_ID
hybrid_joined = user_transaction_joined.join(product_scaled, on="Transaction_ID", how="inner")

hybrid_assembler = VectorAssembler(
    inputCols=["user_profile_scaled", "transaction_vector_scaled", "product_vector_scaled"],
    outputCol="hybrid_vector"
)

hybrid_df = hybrid_assembler.transform(hybrid_joined)
hybrid_df.select("Customer_ID", "Products", "hybrid_vector", "Combined_Rating").show(5, truncate=False)

25/04/16 20:44:58 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB
25/04/16 20:44:58 WARN DAGScheduler: Broadcasting large task binary with size 8.3 MiB
25/04/16 20:44:59 WARN DAGScheduler: Broadcasting large task binary with size 10.6 MiB
25/04/16 20:45:00 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB


+-----------+--------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------+
|Customer_ID|Products|hybrid_vector                                                                                                                                                                                                                 |Combined_Rating|
+-----------+--------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------+
|38664      |29      |[0.4457253527621507,0.15384615384615385,1.0,0.5,0.5,0.0,0.8020984825845868,0.4457253527621507,0.8181818181818182,0.0,1.0,0.8020984825845868,0.6000000000000001,0.5,0.0625,0.0914826498422713]   

In [84]:
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, DoubleType
from pyspark.ml.linalg import DenseVector, VectorUDT
import numpy as np

user_vectors = hybrid_df.select("Customer_ID", "hybrid_vector").distinct()

# 1. Önce vektörleri aynı product altında toplayalım
vector_list_df = hybrid_df.groupBy("Products").agg(
    F.first("Transaction_ID").alias("Transaction_ID"),
    F.first("Customer_ID").alias("Customer_ID"),
    F.collect_list("hybrid_vector").alias("hybrid_vector_list")
)

# 2. UDF: Vektörleri ortalamak için Python fonksiyonu
def average_vectors(vectors):
    if not vectors:
        return None
    arrays = [np.array(v.toArray()) for v in vectors]
    avg_array = np.mean(arrays, axis=0)
    return avg_array.tolist()

average_vectors_udf = F.udf(average_vectors, ArrayType(DoubleType()))

# 3. Ortalamasını al
vector_list_df = vector_list_df.withColumn(
    "avg_vector_array", average_vectors_udf("hybrid_vector_list")
)

# 4. (Opsiyonel) Tekrar DenseVector'e çevir
def array_to_vector(arr):
    if arr is None:
        return None
    return DenseVector(arr)

array_to_vector_udf = F.udf(array_to_vector, VectorUDT())

# 5. Son hali
product_vectors = vector_list_df.withColumn(
    "product_vector", array_to_vector_udf("avg_vector_array")
).select("Products", "product_vector")

product_vectors.show(truncate=False)


25/04/16 20:45:07 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB
25/04/16 20:45:07 WARN DAGScheduler: Broadcasting large task binary with size 8.3 MiB
25/04/16 20:45:08 WARN DAGScheduler: Broadcasting large task binary with size 10.6 MiB
25/04/16 20:45:09 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/16 20:45:10 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
[Stage 1439:>                                                       (0 + 1) / 1]

+--------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Products|product_vector                                                                                                                                                                                                                                                                                                            |
+--------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|12      |[0.359917311

                                                                                

In [85]:
from pyspark.sql.functions import udf
def cosine_similarity(vec1, vec2):
    if vec1 is None or vec2 is None:
        return None
    vec1 = np.array(vec1)
    vec2 = np.array(vec2)
    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)
    if norm1 == 0 or norm2 == 0:
        return 0.0
    return float(np.dot(vec1, vec2) / (norm1 * norm2))

cosine_sim_udf = udf(cosine_similarity, DoubleType())

In [86]:
# Cross join: every user with every product
user_product_scores = user_vectors.crossJoin(product_vectors)

# Add cosine similarity
user_product_scores = user_product_scores.withColumn(
    "similarity", cosine_sim_udf("hybrid_vector", "product_vector")
)

In [87]:
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

windowSpec = Window.partitionBy("Customer_ID").orderBy(F.desc("similarity"))

top_n_recommendations = user_product_scores.withColumn("rank", row_number().over(windowSpec)) \
    .filter("rank <= 5")

top_n_recommendations.select("Customer_ID", "Products", "similarity").show(truncate=False)


25/04/16 20:45:12 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB
25/04/16 20:45:13 WARN DAGScheduler: Broadcasting large task binary with size 8.3 MiB
25/04/16 20:45:13 WARN DAGScheduler: Broadcasting large task binary with size 8.3 MiB
25/04/16 20:45:14 WARN DAGScheduler: Broadcasting large task binary with size 10.6 MiB
25/04/16 20:45:15 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/16 20:45:15 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/16 20:45:17 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/16 20:50:40 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB


+-----------+--------+------------------+
|Customer_ID|Products|similarity        |
+-----------+--------+------------------+
|12         |212     |0.8861749803080519|
|12         |213     |0.8837212971417542|
|12         |242     |0.8815541763986517|
|12         |239     |0.8814559570708499|
|12         |269     |0.881112519992375 |
|13         |269     |0.9080098769164253|
|13         |213     |0.9077187307490607|
|13         |292     |0.9048125536952083|
|13         |236     |0.9028009729000578|
|13         |275     |0.9026139564873609|
|14         |184     |0.8930218505842851|
|14         |181     |0.8905888486812419|
|14         |203     |0.8903263974207459|
|14         |205     |0.8889383055353763|
|14         |215     |0.8842558192230076|
|18         |286     |0.8985492299457141|
|18         |243     |0.8975734158204195|
|18         |315     |0.8969658103699043|
|18         |239     |0.896063901513035 |
|18         |212     |0.8958413647800829|
+-----------+--------+------------

                                                                                

In [88]:
from pyspark.ml.clustering import KMeans

kmeans = KMeans(k=5, seed=42, featuresCol="hybrid_vector", predictionCol="cluster")
kmeans_model = kmeans.fit(user_vectors)

user_clusters = kmeans_model.transform(user_vectors)
user_clusters.select("Customer_ID", "cluster").show(10)

25/04/16 20:50:41 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB
25/04/16 20:50:41 WARN DAGScheduler: Broadcasting large task binary with size 8.3 MiB
25/04/16 20:50:42 WARN DAGScheduler: Broadcasting large task binary with size 10.6 MiB
25/04/16 20:50:43 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/16 20:50:44 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB
25/04/16 20:50:44 WARN DAGScheduler: Broadcasting large task binary with size 8.3 MiB
25/04/16 20:50:45 WARN DAGScheduler: Broadcasting large task binary with size 10.6 MiB
25/04/16 20:50:46 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/16 20:50:47 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/16 20:50:48 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/16 20:50:48 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB
25/04/16 20:50:49 WARN DAGScheduler: Broadcastin

+-----------+-------+
|Customer_ID|cluster|
+-----------+-------+
|      38664|      0|
|      58182|      2|
|      22370|      4|
|       4925|      1|
|       4927|      4|
|       1962|      2|
|      38686|      2|
|      22394|      3|
|      58209|      1|
|       1964|      1|
+-----------+-------+
only showing top 10 rows



25/04/16 20:51:12 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
                                                                                

In [89]:
# Join cluster info back to hybrid_df
clustered_data = hybrid_df.join(user_clusters.select("Customer_ID", "cluster"), on="Customer_ID")

# Most popular products in each cluster (based on avg Combined_Rating or frequency)
popular_by_cluster = clustered_data.groupBy("cluster", "Products") \
    .agg(F.avg("Combined_Rating").alias("avg_rating")) \
    .orderBy("cluster", F.desc("avg_rating"))

popular_by_cluster.show(10, truncate=False)

25/04/16 20:51:13 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB
25/04/16 20:51:13 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB
25/04/16 20:51:14 WARN DAGScheduler: Broadcasting large task binary with size 8.2 MiB
25/04/16 20:51:14 WARN DAGScheduler: Broadcasting large task binary with size 8.3 MiB
25/04/16 20:51:15 WARN DAGScheduler: Broadcasting large task binary with size 10.5 MiB
25/04/16 20:51:15 WARN DAGScheduler: Broadcasting large task binary with size 10.6 MiB
25/04/16 20:51:16 WARN DAGScheduler: Broadcasting large task binary with size 12.4 MiB
25/04/16 20:51:16 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
25/04/16 20:51:17 WARN DAGScheduler: Broadcasting large task binary with size 12.6 MiB
[Stage 1615:>                                                       (0 + 8) / 8]

+-------+--------+------------------+
|cluster|Products|avg_rating        |
+-------+--------+------------------+
|0      |317     |4.168067226890757 |
|0      |121     |3.8320964749536177|
|0      |123     |3.813053097345133 |
|0      |124     |3.7895277207392195|
|0      |126     |3.7858880778588806|
|0      |122     |3.772727272727273 |
|0      |128     |3.7676470588235293|
|0      |125     |3.7367816091954023|
|0      |127     |3.7314629258517034|
|0      |120     |3.675704989154013 |
+-------+--------+------------------+
only showing top 10 rows



25/04/16 20:51:19 WARN DAGScheduler: Broadcasting large task binary with size 12.5 MiB
                                                                                