Data Import

In [None]:
import os
from getpass import getpass

import pandas as pd

# Location of the raw Online Retail export on Google Drive (Colab path).
file_path = "/content/drive/MyDrive/BDM_PROJECT/Online_Retail.csv"

# The CSV lives under the Drive mount point, so make sure Drive is mounted
# before reading; otherwise a fresh "Restart & Run All" fails on this cell.
if not os.path.exists(file_path):
    from google.colab import drive
    drive.mount('/content/drive')

# The export is read with the ISO-8859-1 (Latin-1) encoding.
df = pd.read_csv(file_path, encoding='ISO-8859-1')

df.head()


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,01/12/10 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,01/12/10 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,01/12/10 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,01/12/10 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,01/12/10 8:26,3.39,17850.0,United Kingdom


In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): the CSV read earlier in this notebook uses a path under this
# mount point, so on a fresh runtime the mount has to happen before that read.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Data Cleansing

In [None]:
# Inspect column dtypes and non-null counts, then tally missing values per column
df.info()
df.isna().sum()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    541909 non-null  object 
 1   StockCode    541909 non-null  object 
 2   Description  540455 non-null  object 
 3   Quantity     541909 non-null  int64  
 4   InvoiceDate  541909 non-null  object 
 5   UnitPrice    541909 non-null  float64
 6   CustomerID   406829 non-null  float64
 7   Country      541909 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 33.1+ MB


Unnamed: 0,0
InvoiceNo,0
StockCode,0
Description,1454
Quantity,0
InvoiceDate,0
UnitPrice,0
CustomerID,135080
Country,0


In [None]:
# Keep only the rows that carry a CustomerID; rows without one are discarded
df = df[df['CustomerID'].notna()]


In [None]:
# Drop cancelled transactions: invoices whose number begins with 'C'
is_cancelled = df['InvoiceNo'].astype(str).str.startswith('C')
df = df.loc[~is_cancelled]


In [None]:
# Convert types
# InvoiceDate strings look like "01/12/10 8:26": day first, two-digit year.
# Pass the format explicitly so pandas does not have to guess between
# day-first and month-first (and so a row in another layout fails loudly).
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'], format='%d/%m/%y %H:%M')
# CustomerID was loaded as float64; go through int64 first so the string key
# is "17850" rather than "17850.0".
df['CustomerID'] = df['CustomerID'].astype('int64').astype(str)
df['StockCode'] = df['StockCode'].astype(str)


  df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])


In [None]:
# Retain only the rows whose Quantity is strictly positive
positive_qty = df['Quantity'].gt(0)
df = df.loc[positive_qty]


In [None]:
# Per-row total (Quantity x UnitPrice); useful for behavior analysis
df['TotalPrice'] = df['Quantity'].mul(df['UnitPrice'])


In [None]:
# Only the last expression of a cell is rendered automatically, so show the
# preview and the summary statistics explicitly (display is the IPython builtin).
display(df.head())
display(df.describe())

# (distinct customers, distinct stock codes) in the cleaned data
df['CustomerID'].nunique(), df['StockCode'].nunique()


(4339, 3665)

In [None]:
# Persist the cleaned transactions to the working directory, without the index column
df.to_csv(path_or_buf="cleaned_data.csv", index=False)

In [None]:
# Customer x product matrix: one row per CustomerID, one column per StockCode,
# each cell holding the summed Quantity (0 where the pair never occurs).
interaction_matrix = pd.pivot_table(
    df,
    values='Quantity',
    index='CustomerID',
    columns='StockCode',
    aggfunc='sum',
    fill_value=0,
)


In [None]:
# Print the shape explicitly: a bare expression is only rendered when it is
# the last statement of the cell.
print(interaction_matrix.shape)
interaction_matrix.head()


StockCode,10002,10080,10120,10123C,10124A,10124G,10125,10133,10135,11001,...,90214V,90214W,90214Y,90214Z,BANK CHARGES,C2,DOT,M,PADS,POST
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12346.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12347.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12348.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,9
12349.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
12350.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [None]:
# Presence/absence version of the matrix: every positive cell becomes 1,
# every other cell is left untouched. The original matrix is not modified.
binary_matrix = interaction_matrix.mask(interaction_matrix > 0, 1)


In [None]:
# Save for future steps (the CustomerID index is written as the first column)
interaction_matrix.to_csv(path_or_buf="interaction_matrix.csv")


In [None]:
# Install PySpark into the kernel's environment (used by the ALS cells below).
%pip install pyspark




In [None]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator


In [None]:
# Reuse the active Spark session if there is one, otherwise start a session
# named "EcommerceRecommendation".
spark = (
    SparkSession.builder
    .appName("EcommerceRecommendation")
    .getOrCreate()
)


In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn a StockCode -> integer id mapping, then apply it to every row
product_encoder = LabelEncoder().fit(df['StockCode'])
df['ProductID'] = product_encoder.transform(df['StockCode'])


In [None]:
# Hand only the user, item and quantity columns over to Spark
als_columns = ['CustomerID', 'ProductID', 'Quantity']
spark_df = spark.createDataFrame(df[als_columns])


In [None]:
from pyspark.sql.functions import col

# Cast both id columns to int in one chained expression
# (Quantity is already numeric and is left as-is)
spark_df = (
    spark_df
    .withColumn("CustomerID", col("CustomerID").cast("int"))
    .withColumn("ProductID", col("ProductID").cast("int"))
)


In [None]:
# Hold out 20% of the interactions for evaluation (seeded split).
(training, test) = spark_df.randomSplit([0.8, 0.2], seed=42)

# Explicit-feedback ALS using Quantity as the rating.
# NOTE(review): purchase quantities behave more like implicit feedback;
# consider implicitPrefs=True with a ranking metric — confirm against the
# modelling goal before changing, since RMSE below assumes explicit ratings.
als = ALS(
    userCol="CustomerID",
    itemCol="ProductID",
    ratingCol="Quantity",
    nonnegative=True,
    implicitPrefs=False,
    coldStartStrategy="drop",  # drop rows whose prediction would be NaN
    seed=42,  # factors are initialised randomly; fix the seed so runs are repeatable
)

model = als.fit(training)


In [None]:
# RMSE between the held-out Quantity values and the model's predictions
evaluator = RegressionEvaluator(
    labelCol="Quantity",
    predictionCol="prediction",
    metricName="rmse",
)
predictions = model.transform(test)
rmse = evaluator.evaluate(predictions)
print(f"RMSE: {rmse:.2f}")


RMSE: 66.53


In [None]:
user_id = 12346  # change this to a real user from your dataset

# One-row Spark frame holding the target customer, then request the top 5 items
user_df = spark.createDataFrame(data=[(user_id,)], schema=["CustomerID"])
user_recs = model.recommendForUserSubset(user_df, numItems=5)
user_recs.show(truncate=False)


+----------+----------------------------------------------------------------------------------------+
|CustomerID|recommendations                                                                         |
+----------+----------------------------------------------------------------------------------------+
|12346     |[{1997, 74215.0}, {3026, 10513.51}, {2898, 9378.983}, {99, 7324.2656}, {108, 5661.6265}]|
+----------+----------------------------------------------------------------------------------------+



In [None]:
# ProductID <-> StockCode lookup table. LabelEncoder assigns id i to
# classes_[i], so classes_ itself is the inverse mapping.
product_id_map = pd.DataFrame({
    'ProductID': list(range(len(product_encoder.classes_))),
    'StockCode': product_encoder.classes_
})

# Display recommendations
rec_rows = user_recs.collect()
if not rec_rows:
    # No row came back for this user (e.g. an id the model was not trained on);
    # indexing [0] here would raise IndexError.
    print("No recommendations found")
else:
    for rec in rec_rows[0]['recommendations']:
        # Direct positional lookup instead of filtering the whole frame per item
        stock_code = product_encoder.classes_[rec['ProductID']]
        print(f"Recommended Product: {stock_code} | Score: {rec['rating']:.2f}")


Recommended Product: 23166 | Score: 74215.00
Recommended Product: 84826 | Score: 10513.51
Recommended Product: 84568 | Score: 9378.98
Recommended Product: 17096 | Score: 7324.27
Recommended Product: 18007 | Score: 5661.63


API Integration

In [None]:
!pip install pyngrok




In [None]:
from pyngrok import ngrok

# Read the ngrok authtoken from the environment, or prompt for it, instead of
# embedding it in the notebook where anyone who can read the file can copy it.
# NOTE: the token that was previously hard-coded in this cell should be
# revoked and regenerated in the ngrok dashboard.
_ngrok_token = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: ")
ngrok.set_auth_token(_ngrok_token)


In [None]:
from flask import Flask, request, jsonify
import threading
from pyngrok import ngrok

app = Flask(__name__)

@app.route("/recommendations", methods=["GET"])
def recommend_products():
    """Return the top-5 ALS recommendations for one customer as JSON.

    Query parameters:
        user_id: integer CustomerID to look up.

    Responses:
        200 -- {"user_id", "recommendations": [{"stock_code", "score"}, ...]},
               or {"user_id", "message"} when the model returns no row.
        400 -- {"error"} when user_id is missing or not an integer.
        500 -- {"error"} when producing the recommendations fails.
    """
    user_id = request.args.get("user_id")

    # Validate the input up front so client mistakes get a 400 instead of a
    # 200 response that merely carries an error payload.
    if user_id is None:
        return jsonify({"error": "Missing required query parameter 'user_id'"}), 400
    try:
        user_id_int = int(user_id)
    except ValueError:
        return jsonify({"error": f"user_id must be an integer, got {user_id!r}"}), 400

    try:
        user_df = spark.createDataFrame([(user_id_int,)], ["CustomerID"])
        user_recs = model.recommendForUserSubset(user_df, 5).collect()

        if not user_recs:
            return jsonify({"user_id": user_id, "message": "No recommendations found"})

        # Build the ProductID -> StockCode lookup once per request instead of
        # scanning the whole mapping frame for every recommended item.
        stock_codes = product_id_map.set_index('ProductID')['StockCode']
        result = [
            {"stock_code": stock_codes[rec['ProductID']], "score": round(rec['rating'], 2)}
            for rec in user_recs[0]['recommendations']
        ]

        return jsonify({"user_id": user_id, "recommendations": result})
    except Exception as e:
        # Same error payload as before, but signal the failure with a 500
        # status and record the traceback in the server log.
        app.logger.exception("Failed to build recommendations for user_id=%s", user_id)
        return jsonify({"error": str(e)}), 500

# Port shared by the local Flask server and the ngrok tunnel, so the two
# cannot drift apart.
API_PORT = 5000

# Run Flask in a thread so it doesn't block
def run_flask(port=API_PORT):
    """Serve `app` on `port`; this call blocks until the server stops."""
    app.run(port=port)

# Start the Flask app in the background. daemon=True lets the Python process
# exit without waiting on the server thread.
thread = threading.Thread(target=run_flask, daemon=True)
thread.start()

# Create the public ngrok URL
public_url = ngrok.connect(API_PORT)
print("✅ Your public API is live at:", public_url)


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m


✅ Your public API is live at: NgrokTunnel: "https://ece3-35-225-41-59.ngrok-free.app" -> "http://localhost:5000"
