In [86]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import when, col

In [87]:
# Create or get your Spark session
spark = SparkSession.builder.appName("StringIndexExample").getOrCreate()

In [77]:
# # Stop current SparkSession
# spark.stop()

# # Restart SparkSession
# from pyspark.sql import SparkSession
# spark = SparkSession.builder.master("local[*]").appName("FixPythonWorker").getOrCreate()


In [88]:
# Load the CSV file into a DataFrame
df = spark.read.option("header", "true").csv("D:\\1CS\\Bil401\\RetailAnalysis\\new_cleaned_data.csv")

# List of string columns to index (modify based on your actual use case)
string_columns = [
    "Customer_ID", "Country", "Gender", "Customer_Segment",
    "Product_Category", "Product_Brand", "Product_Type",
    "Shipping_Method", "Payment_Method","products", "Transaction_ID"
]

# Create StringIndexer stages for each column
indexers = [
    StringIndexer(inputCol=col, outputCol=col + "_index", handleInvalid="keep")
    for col in string_columns
]

# Create and fit the Pipeline
pipeline = Pipeline(stages=indexers)
indexed_df = pipeline.fit(df).transform(df)

# Show all columns including the indexed ones
#indexed_df.show(truncate=False)


In [89]:
# Build a list of columns to display:
# Start with "Customer_ID", then for each column, add the original and its indexed version.
columns_to_show = [col for s in string_columns for col in (s, s + "_index")]

# Display the first 20 rows of the selected columns.
indexed_df.select(*columns_to_show).show(5)


+-----------+-----------------+-------+-------------+------+------------+----------------+----------------------+----------------+----------------------+-----------------+-------------------+------------+------------------+---------------+---------------------+--------------+--------------------+------------------+--------------+--------------+--------------------+
|Customer_ID|Customer_ID_index|Country|Country_index|Gender|Gender_index|Customer_Segment|Customer_Segment_index|Product_Category|Product_Category_index|    Product_Brand|Product_Brand_index|Product_Type|Product_Type_index|Shipping_Method|Shipping_Method_index|Payment_Method|Payment_Method_index|          products|products_index|Transaction_ID|Transaction_ID_index|
+-----------+-----------------+-------+-------------+------+------------+----------------+----------------------+----------------+----------------------+-----------------+-------------------+------------+------------------+---------------+---------------------+---

In [90]:
# =====================================================
# 1. User Feature Profiles (user_features_df)
# =====================================================

# Group by user to extract user-specific attributes
user_df = indexed_df.groupBy("Customer_ID_index").agg(
    F.first("Country").alias("Country"),
    F.first(F.col("Age").cast("double")).alias("Age"),
    F.first(F.col("Income").cast("double")).alias("Income"),
    F.first("Gender").alias("Gender"),
    F.first("Customer_Segment").alias("Customer_Segment")
)

# Define transformations for categorical user attributes.
# For Country:
country_indexer = StringIndexer(inputCol="Country", outputCol="Country_index", handleInvalid="keep")
country_encoder = OneHotEncoder(inputCol="Country_index", outputCol="Country_encoded")

# Define transformations for categorical user attributes.
# For Gender:
gender_indexer = StringIndexer(inputCol="Gender", outputCol="Gender_index", handleInvalid="keep")
gender_encoder = OneHotEncoder(inputCol="Gender_index", outputCol="Gender_encoded")

# For Customer_Segment:
segment_indexer = StringIndexer(inputCol="Customer_Segment", outputCol="Customer_Segment_index", handleInvalid="keep")
segment_encoder = OneHotEncoder(inputCol="Customer_Segment_index", outputCol="Customer_Segment_encoded")

# Assemble all user features into a single vector.
user_assembler = VectorAssembler(
    inputCols=["Country_encoded","Age", "Income", "Gender_encoded", "Customer_Segment_encoded"],
    outputCol="user_features"
)

# Build and apply the pipeline for user features.
user_pipeline = Pipeline(stages=[country_indexer, country_encoder, gender_indexer, gender_encoder, segment_indexer, segment_encoder, user_assembler])
user_model = user_pipeline.fit(user_df)
user_features_df = user_model.transform(user_df)

print("User feature headers:", user_assembler.getInputCols())
# Show a sample of user features.
user_features_df.select("Customer_ID_index", "user_features").show(5, truncate=False)


User feature headers: ['Country_encoded', 'Age', 'Income', 'Gender_encoded', 'Customer_Segment_encoded']
+-----------------+---------------------------------------+
|Customer_ID_index|user_features                          |
+-----------------+---------------------------------------+
|0.0              |(12,[1,5,6,8,9],[1.0,46.0,1.7,1.0,1.0])|
|1.0              |(12,[0,5,6,7,9],[1.0,22.0,2.2,1.0,1.0])|
|7.0              |(12,[1,5,6,7,9],[1.0,20.0,2.1,1.0,1.0])|
|8.0              |(12,[3,5,6,8,9],[1.0,20.0,2.0,1.0,1.0])|
|18.0             |(12,[2,5,6,7,9],[1.0,46.0,2.1,1.0,1.0])|
+-----------------+---------------------------------------+
only showing top 5 rows



In [91]:
# =====================================================
# 2. Generate Item Feature Profiles (item_features_df)
# =====================================================

# Group by item to extract item-specific attributes.
item_df = indexed_df.groupBy("products_index").agg(
    F.first("Product_Category").alias("Product_Category"),
    F.first("Product_Brand").alias("Product_Brand"),
    F.first("Product_Type").alias("Product_Type")
)

# Define transformations for categorical item attributes.
# For Product_Category:
cat_indexer = StringIndexer(inputCol="Product_Category", outputCol="Product_Category_index", handleInvalid="keep")
cat_encoder = OneHotEncoder(inputCol="Product_Category_index", outputCol="Product_Category_encoded")

# For Product_Brand:
brand_indexer = StringIndexer(inputCol="Product_Brand", outputCol="Product_Brand_index", handleInvalid="keep")
brand_encoder = OneHotEncoder(inputCol="Product_Brand_index", outputCol="Product_Brand_encoded")

# For Product_Type:
type_indexer = StringIndexer(inputCol="Product_Type", outputCol="Product_Type_index", handleInvalid="keep")
type_encoder = OneHotEncoder(inputCol="Product_Type_index", outputCol="Product_Type_encoded")

# Assemble all item features into a single vector.
item_assembler = VectorAssembler(
    inputCols=["Product_Category_encoded", "Product_Brand_encoded", "Product_Type_encoded"],
    outputCol="item_features"
)

# Build and apply the pipeline for item features.
item_pipeline = Pipeline(stages=[cat_indexer, cat_encoder, brand_indexer, brand_encoder, type_indexer, type_encoder, item_assembler])
item_model = item_pipeline.fit(item_df)
item_features_df = item_model.transform(item_df)

print("Item feature headers:", item_assembler.getInputCols())
# Show a sample of item features.
item_features_df.select("products_index", "item_features").show(5, truncate=False)


Item feature headers: ['Product_Category_encoded', 'Product_Brand_encoded', 'Product_Type_encoded']
+--------------+----------------------------+
|products_index|item_features               |
+--------------+----------------------------+
|0.0           |(57,[3,18,52],[1.0,1.0,1.0])|
|1.0           |(57,[4,13,31],[1.0,1.0,1.0])|
|2.0           |(57,[3,17,52],[1.0,1.0,1.0])|
|3.0           |(57,[3,17,52],[1.0,1.0,1.0])|
|4.0           |(57,[3,17,52],[1.0,1.0,1.0])|
+--------------+----------------------------+
only showing top 5 rows



In [92]:
#Transform feedback and rating to integer values
indexed_df = indexed_df.withColumn(
    "feedback_rating",
    when(col("feedback") == "Bad", 1.0)
    .when(col("feedback") == "Average", 2.0)
    .when(col("feedback") == "Good", 3.0)
    .when(col("feedback") == "Excellent", 4.0)
    .otherwise(None)
)

# Cast Ratings column to Double
indexed_df = indexed_df.withColumn("Ratings", col("Ratings").cast("Double"))
indexed_df = indexed_df.withColumn("Age", col("Age").cast("Int"))
indexed_df = indexed_df.withColumn("Income", col("Income").cast("Double"))
indexed_df.show(5)


+-----------+-------+---+------+------+----------------+------+-------+---------------+----------------+-----------------+------------+---------+---------------+--------------+-------+------------------+--------------+-----------------+-------------+------------+----------------------+----------------------+-------------------+------------------+---------------------+--------------------+--------------+--------------------+---------------+
|Customer_ID|Country|Age|Gender|Income|Customer_Segment|  Year|  Month|Total_Purchases|Product_Category|    Product_Brand|Product_Type| Feedback|Shipping_Method|Payment_Method|Ratings|          products|Transaction_ID|Customer_ID_index|Country_index|Gender_index|Customer_Segment_index|Product_Category_index|Product_Brand_index|Product_Type_index|Shipping_Method_index|Payment_Method_index|products_index|Transaction_ID_index|feedback_rating|
+-----------+-------+---+------+------+----------------+------+-------+---------------+----------------+--------

In [93]:
# Weight parameter
alpha = 0.5

# Create a new column "Combined_Rating" as a weighted average of Ratings and Feedback_rating.
indexed_df = indexed_df.withColumn(
    "Combined_Rating",
    alpha * col("Ratings") + (1 - alpha) * col("Feedback_rating")
)

indexed_df.select("Ratings", "Feedback_rating", "Combined_Rating").show(5)


+-------+---------------+---------------+
|Ratings|Feedback_rating|Combined_Rating|
+-------+---------------+---------------+
|    4.0|            3.0|            3.5|
|    5.0|            4.0|            4.5|
|    3.0|            3.0|            3.0|
|    2.0|            2.0|            2.0|
|    5.0|            4.0|            4.5|
+-------+---------------+---------------+
only showing top 5 rows



In [None]:
#CLEANING
from pyspark.sql.types import StringType

# Get all string-type columns
string_cols = [f.name for f in indexed_df.schema.fields if isinstance(f.dataType, StringType)]

# Add 'Customer_ID' to the list of columns to drop
cols_to_drop = string_cols + ['Customer_ID']

# Drop them
indexed_df = indexed_df.drop(*cols_to_drop)

# Show remaining columns
print(indexed_df.columns)



['Age', 'Income', 'Ratings', 'Customer_ID_index', 'Country_index', 'Gender_index', 'Customer_Segment_index', 'Product_Category_index', 'Product_Brand_index', 'Product_Type_index', 'Shipping_Method_index', 'Payment_Method_index', 'products_index', 'Transaction_ID_index', 'feedback_rating', 'Combined_Rating']


# Recommendation Phase

### 1. Generate Recommendations with Ratings

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator

# Split the data into training (80%) and test (20%) sets.
# The seed parameter ensures reproducibility.
train, test = indexed_df.randomSplit([0.8, 0.2], seed=42)

# Initialize the ALS (Alternating Least Squares) model.
# This model is used for collaborative filtering based on user-item interactions.
als = ALS(
    userCol="Customer_ID_index",   # The numeric user ID column.
    itemCol="products_index",      # The numeric item (product) ID column.
    ratingCol="Ratings_index",     # The column containing the ratings.
    rank=10,                       # The number of latent factors to use.
    maxIter=10,                    # The maximum number of iterations to run.
    regParam=0.1,                  # The regularization parameter to prevent overfitting.
    coldStartStrategy="drop"       # Drop any rows in the test set that result in NaN predictions.
)

# Fit the ALS model using the training data.
model = als.fit(train)

# Use the trained model to generate predictions on the test data.
predictions = model.transform(test)

# Generate the top 5 product recommendations for each user.
# The output DataFrame includes each user (using their numeric ID)
# and an array of the top 5 recommendations (each with an item ID and a predicted rating).
userRecs = model.recommendForAllUsers(5)

# Display the first 5 rows of the recommendations DataFrame without truncating the output.
userRecs.show(5, truncate=False)

+-----------------+-----------------------------------------------------------------------------------------+
|Customer_ID_index|recommendations                                                                          |
+-----------------+-----------------------------------------------------------------------------------------+
|31               |[{114, 3.365354}, {185, 3.232731}, {119, 3.0735805}, {107, 3.0631726}, {68, 3.0317197}]  |
|34               |[{156, 2.8354557}, {227, 2.6421156}, {233, 2.4985762}, {280, 2.4845135}, {44, 2.4782188}]|
|53               |[{23, 3.0790992}, {241, 2.598433}, {267, 2.5407896}, {282, 2.4733748}, {225, 2.3913903}] |
|65               |[{133, 3.3341796}, {301, 3.1203952}, {306, 3.0393007}, {286, 2.950754}, {196, 2.8867276}]|
|78               |[{110, 3.6403105}, {301, 3.5453634}, {25, 3.432964}, {290, 3.330237}, {138, 3.3069453}]  |
+-----------------+-----------------------------------------------------------------------------------------+
only showi

In [25]:
# Evaluate the model's performance using Root Mean Squared Error (RMSE).
# RMSE provides a measure of how far off the predictions are from the actual ratings.
evaluator = RegressionEvaluator(
    metricName="rmse",            # Metric to evaluate the predictions.
    labelCol="Ratings_index",     # The true ratings.
    predictionCol="prediction"    # The predicted ratings from the model.
)
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error of 1 (RMSE):", rmse)

Root-mean-square error of 1 (RMSE): 1.8949615546580254


### 2. Generate Recommendation with of Feedbacks

In [104]:
# Split the data into training (80%) and test (20%) sets for the feedback model.
train_fb, test_fb = indexed_df.randomSplit([0.8, 0.2], seed=42)

# Initialize the ALS model using the feedback-based rating column.
als_fb = ALS(
    userCol="Customer_ID_index",   # User identifier (numeric)
    itemCol="products_index",      # Item identifier (numeric)
    ratingCol="feedback_rating",   # Use the numeric feedback as the rating signal
    rank=10,                       # Number of latent factors
    maxIter=10,                    # Number of iterations
    regParam=0.1,                  # Regularization parameter
    coldStartStrategy="drop"       # Drop NaN predictions during evaluation
)

# Train the ALS model on the full indexed dataset.
als_model_fb = als_fb.fit(indexed_df)

# Generate top 5 recommendations for every user.
userRecs = als_model_fb.recommendForAllUsers(5)

# Display a sample of the ALS recommendations.
userRecs.show(5, truncate=False)


+-----------------+-----------------------------------------------------------------------------------------+
|Customer_ID_index|recommendations                                                                          |
+-----------------+-----------------------------------------------------------------------------------------+
|31               |[{317, 4.0936956}, {181, 4.0158234}, {121, 4.008103}, {140, 3.9531443}, {104, 3.9278402}]|
|34               |[{127, 3.9885442}, {47, 3.8163028}, {172, 3.7226963}, {299, 3.7008548}, {232, 3.6916926}]|
|53               |[{124, 3.7288048}, {125, 3.6961417}, {317, 3.574184}, {12, 3.5740798}, {5, 3.560285}]    |
|65               |[{121, 3.7714088}, {124, 3.714796}, {1, 3.7112274}, {109, 3.7063365}, {147, 3.491488}]   |
|78               |[{317, 4.4578514}, {124, 4.2468185}, {128, 3.9579995}, {121, 3.941046}, {127, 3.8532882}]|
+-----------------+-----------------------------------------------------------------------------------------+
only showi

In [105]:
# Generate predictions on the test set.
predictions_fb = als_model_fb.transform(test_fb)

# Evaluate the model's performance using Root Mean Squared Error (RMSE).
# RMSE provides a measure of how far off the predictions are from the actual ratings.
evaluator_fb = RegressionEvaluator(
    metricName="rmse",            # Metric to evaluate the predictions.
    labelCol="feedback_rating",     # The true ratings.
    predictionCol="prediction"    # The predicted ratings from the model.
)
rmse = evaluator_fb.evaluate(predictions_fb)
print("Root-mean-square error of Feedback model (RMSE):", rmse)


Root-mean-square error of Feedback model (RMSE): 0.3324427998719281
