In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline


In [2]:
# Create or get your Spark session
spark = SparkSession.builder.appName("StringIndexExample").getOrCreate()

In [14]:
# Load the CSV file into a DataFrame
df = spark.read.option("header", "true").csv("D:\\1CS\\Bil401\\RetailAnalysis\\new_cleaned_data.csv")

# List of string columns to index (modify based on your actual use case)
string_columns = [
    "Customer_ID", "Country", "Gender", "Customer_Segment",
    "Product_Category", "Product_Brand", "Product_Type", "Feedback",
    "Shipping_Method", "Payment_Method", "Ratings","products"
]

# Create StringIndexer stages for each column
indexers = [
    StringIndexer(inputCol=col, outputCol=col + "_index", handleInvalid="keep")
    for col in string_columns
]

# Create and fit the Pipeline
pipeline = Pipeline(stages=indexers)
indexed_df = pipeline.fit(df).transform(df)

# Show all columns including the indexed ones
#indexed_df.show(truncate=False)


In [15]:
# Build a list of columns to display:
# Start with "Customer_ID", then for each column, add the original and its indexed version.
columns_to_show = ["Customer_ID"] + [col for s in string_columns for col in (s, s + "_index")]

# Display the first 20 rows of the selected columns.
indexed_df.select(*columns_to_show).show(5)


+-----------+-----------+-----------------+-------+-------------+------+------------+----------------+----------------------+----------------+----------------------+-----------------+-------------------+------------+------------------+---------+--------------+---------------+---------------------+--------------+--------------------+-------+-------------+------------------+--------------+
|Customer_ID|Customer_ID|Customer_ID_index|Country|Country_index|Gender|Gender_index|Customer_Segment|Customer_Segment_index|Product_Category|Product_Category_index|    Product_Brand|Product_Brand_index|Product_Type|Product_Type_index| Feedback|Feedback_index|Shipping_Method|Shipping_Method_index|Payment_Method|Payment_Method_index|Ratings|Ratings_index|          products|products_index|
+-----------+-----------+-----------------+-------+-------------+------+------------+----------------+----------------------+----------------+----------------------+-----------------+-------------------+------------+--

In [20]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

# Cast the Ratings column to float (if not already numeric)
indexed_df = indexed_df.withColumn("Ratings_index", indexed_df["Ratings_index"])

# Split the data into training and test sets
train, test = indexed_df.randomSplit([0.8, 0.2], seed=42)

# Initialize the ALS model
als = ALS(
    userCol="Customer_ID_index",   # User identifier (numeric)
    itemCol="products_index",      # Item identifier (numeric)
    ratingCol="Ratings_index",           # Rating column
    rank=10,                       # Number of latent factors
    maxIter=10,                    # Number of iterations
    regParam=0.1,                  # Regularization parameter
    coldStartStrategy="drop"       # Drop NaN predictions during evaluation
)

# Fit the model on the training data
model = als.fit(train)

# Generate predictions on the test set
predictions = model.transform(test)

# Evaluate the model using Root Mean Squared Error (RMSE)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="Ratings_index", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error (RMSE):", rmse)

# Generate top 5 product recommendations for each user
userRecs = model.recommendForAllUsers(5)
userRecs.show(5, truncate=False)


Root-mean-square error (RMSE): 1.8949615546580254
+-----------------+-----------------------------------------------------------------------------------------+
|Customer_ID_index|recommendations                                                                          |
+-----------------+-----------------------------------------------------------------------------------------+
|31               |[{114, 3.365354}, {185, 3.232731}, {119, 3.0735805}, {107, 3.0631726}, {68, 3.0317197}]  |
|34               |[{156, 2.8354557}, {227, 2.6421156}, {233, 2.4985762}, {280, 2.4845135}, {44, 2.4782188}]|
|53               |[{23, 3.0790992}, {241, 2.598433}, {267, 2.5407896}, {282, 2.4733748}, {225, 2.3913903}] |
|65               |[{133, 3.3341796}, {301, 3.1203952}, {306, 3.0393007}, {286, 2.950754}, {196, 2.8867276}]|
|78               |[{110, 3.6403105}, {301, 3.5453634}, {25, 3.432964}, {290, 3.330237}, {138, 3.3069453}]  |
+-----------------+---------------------------------------------------