FYP - SHESelect Hybrid Recommendation Model using Collaborative Filtering & Matrix Factorisation algorithm (LightFM, SVD)

In [None]:
#install compatible numpy version and pandas version
!pip install numpy==1.24.4

#install all necessary libraries
!pip install --no-cache-dir lightfm scikit-learn pandas surprise scikit-surprise


Collecting lightfm
  Downloading lightfm-1.17.tar.gz (316 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.4/316.4 kB[0m [31m56.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl.metadata (327 bytes)
Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m184.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Building wheels for collected packages: lightfm, scikit-surprise
  Building wheel for lightfm (setup.py) ... [?25l[?25hdone
  Created wheel for lightfm: filename=lightfm-1.17-cp311-cp311-linux_x86_64.whl size=831162 sha256=7ecf46e77018

In [None]:
# import all necessary libraries for model training
# import scikit-learn and lightFM libraries
import pandas as pd
import numpy as np
import re
import spacy
#LightFM model
from lightfm import LightFM
from scipy.sparse import csr_matrix
from lightfm.evaluation import precision_at_k, recall_at_k, auc_score
from lightfm.cross_validation import random_train_test_split
#SVD model
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split as surprise_train_test_split, GridSearchCV
from surprise.accuracy import rmse
#Evaluation library
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score #Regression metrics
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix #Classification metrics

In [None]:
#Load original data
data = pd.read_csv('female_customer_transactions.csv', low_memory=False)
print(data.head())

  Customer-ID     ProductID  PurchaseQuantity            Categories  \
0  CUSTIDXX-1   PRODOXIID22                10      Meat and Seafood   
1  CUSTIDXX-2   PRODOXIID42                 7          Frozen Foods   
2  CUSTIDXX-3   PRODOXIID64                22  Snacks and Beverages   
3  CUSTIDXX-5  PRODOXIID213                13        Toys and Games   
4  CUSTIDXX-7  PRODOXIID288                21          Ethnic Foods   

                    Products     CustomerName Gender  
0                       Pork      Brian Weiss      F  
1               Frozen Meals  Samantha Wagner      F  
2                       Nuts       Karen Wise      F  
3              Action Figure     Douglas Bond      F  
4      Middle Eastern Hummus      Megan Clark      F  


In [None]:
#FILTER OUT FEMALE CONSUMERS DATA
salesdata = data[data.Gender == 'F']
print(salesdata.head())

  Customer-ID     ProductID  PurchaseQuantity            Categories  \
0  CUSTIDXX-1   PRODOXIID22                10      Meat and Seafood   
1  CUSTIDXX-2   PRODOXIID42                 7          Frozen Foods   
2  CUSTIDXX-3   PRODOXIID64                22  Snacks and Beverages   
3  CUSTIDXX-5  PRODOXIID213                13        Toys and Games   
4  CUSTIDXX-7  PRODOXIID288                21          Ethnic Foods   

                    Products     CustomerName Gender  
0                       Pork      Brian Weiss      F  
1               Frozen Meals  Samantha Wagner      F  
2                       Nuts       Karen Wise      F  
3              Action Figure     Douglas Bond      F  
4      Middle Eastern Hummus      Megan Clark      F  


In [None]:
#EXTRACT IMPORTANT COLUMNS SIGNIFICANTLY FOR TRAINING
sales = salesdata[['Customer-ID','Products','Categories', 'PurchaseQuantity']] #Extracted customer id, products, categories, purchase quantity
print(sales.head()) #print few first rows of the selected column

  Customer-ID                   Products            Categories  \
0  CUSTIDXX-1                       Pork      Meat and Seafood   
1  CUSTIDXX-2               Frozen Meals          Frozen Foods   
2  CUSTIDXX-3                       Nuts  Snacks and Beverages   
3  CUSTIDXX-5              Action Figure        Toys and Games   
4  CUSTIDXX-7      Middle Eastern Hummus          Ethnic Foods   

   PurchaseQuantity  
0                10  
1                 7  
2                22  
3                13  
4                21  


In [None]:
#HANDLE MISSING VALUES AND DUPLICATES FOR SELECTED COLUMNS
sales.dropna(inplace=True) #remove missing values
sales.drop_duplicates(inplace=True) #remove duplicate values

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sales.dropna(inplace=True) #remove missing values
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sales.drop_duplicates(inplace=True) #remove duplicate values


In [None]:
#Preprocessing data on selected columns

# Load spacy
spacy_token = spacy.load("en_core_web_sm", disable=["parser", "ner"])

# Preprocessing function
def preprocess_text(text):
    if pd.isna(text):  # Check missing values
        return " "
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

# Apply preprocessing for Products and Categories column
sales['Processed_Products'] = sales['Products'].apply(preprocess_text)
sales['Processed_Categories'] = sales['Categories'].apply(preprocess_text)

# Normalize numerical column
sales['Normalized_PurchaseQuantity'] = (sales['PurchaseQuantity'] - sales['PurchaseQuantity'].min()) / (sales['PurchaseQuantity'].max() - sales['PurchaseQuantity'].min())

# Display results
sales[['Customer-ID', 'Processed_Products','Processed_Categories', 'Normalized_PurchaseQuantity']].head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sales['Processed_Products'] = sales['Products'].apply(preprocess_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sales['Processed_Categories'] = sales['Categories'].apply(preprocess_text)


Unnamed: 0,Customer-ID,Processed_Products,Processed_Categories,Normalized_PurchaseQuantity
0,CUSTIDXX-1,pork,meat and seafood,0.375
1,CUSTIDXX-2,frozen meals,frozen foods,0.25
2,CUSTIDXX-3,nuts,snacks and beverages,0.875
3,CUSTIDXX-5,action figure,toys and games,0.5
4,CUSTIDXX-7,middle eastern hummus,ethnic foods,0.833333


In [None]:
#Feature selection for Collaborative Filtering
#User-item interaction

# Convert categorical values to numerical IDs
sales['User_ID'] = sales['Customer-ID'].astype("category").cat.codes
sales['Item_ID'] = sales['Processed_Categories'].astype("category").cat.codes

# Create user-item interaction matrix (Quantity as interaction score)
user_item_matrix = sales.pivot(index="User_ID", columns="Item_ID", values="PurchaseQuantity").fillna(0)

# Convert to sparse matrix
interaction_sparse = csr_matrix(user_item_matrix)

# Ensure consistent shape (binary interactions)
interaction_sparse[interaction_sparse > 0] = 1


In [None]:
#Split to train and test data
train, test = random_train_test_split(interaction_sparse, test_percentage=0.2, random_state=42)

#Train lightFM model (tuned parameters)
lightfm_model = LightFM(loss='warp', no_components=70, learning_rate=0.035)
lightfm_model.fit(train, epochs=120, num_threads=4)

<lightfm.lightfm.LightFM at 0x7b26d8db1390>

In [None]:
print("Train shape:", train.shape)
print("Test shape:", test.shape)


Train shape: (49230, 30)
Test shape: (49230, 30)


In [None]:
#Print evaluation matrix and accuracy results (LightFM)
lfm_precision = precision_at_k(lightfm_model, test, k=5, train_interactions=train).mean()
lfm_recall = recall_at_k(lightfm_model, test, k=7, train_interactions=train).mean()
print("LightFM Precision:", lfm_precision) #recommendation accuracy (balanced)
print("LightFM Recall:", lfm_recall) #retrieve relevant items for users (top 7-8)

LightFM Precision: 0.033719275
LightFM Recall: 0.23085516961202518


In [None]:
#Prepare Data using surprise library for Matrix Factorisation (SVD)
reader = Reader(rating_scale=(sales['PurchaseQuantity'].min(), sales['PurchaseQuantity'].max()))
surprise_data = Dataset.load_from_df(sales[['User_ID', 'Item_ID', 'PurchaseQuantity']], reader)

In [None]:
#Split into train and test data for SVD data
trainset, testset = surprise_train_test_split(surprise_data, test_size=0.2, random_state=42)

# Train SVD Model (Improved parameters)
svd_model = SVD(n_factors=100, reg_all=0.05, biased=False)
svd_model.fit(trainset)


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7b26d6fde690>

In [None]:
#Test SVD model
predictions = svd_model.test(testset)
print(predictions[:5])

#Evaluation metrics on SVD
from surprise import accuracy
rmse = accuracy.rmse(predictions)
print(f"SVD Model RMSE: {rmse:.4f}")

[Prediction(uid=3385, iid=9, r_ui=11.0, est=12.96295449928905, details={'was_impossible': True, 'reason': 'User and item are unknown.'}), Prediction(uid=12432, iid=20, r_ui=15.0, est=12.96295449928905, details={'was_impossible': True, 'reason': 'User and item are unknown.'}), Prediction(uid=42802, iid=11, r_ui=13.0, est=12.96295449928905, details={'was_impossible': True, 'reason': 'User and item are unknown.'}), Prediction(uid=16540, iid=13, r_ui=21.0, est=12.96295449928905, details={'was_impossible': True, 'reason': 'User and item are unknown.'}), Prediction(uid=38379, iid=15, r_ui=19.0, est=12.96295449928905, details={'was_impossible': True, 'reason': 'User and item are unknown.'})]
RMSE: 7.2182
SVD Model RMSE: 7.2182


In [None]:
#Hypertune SVD
from surprise.model_selection import GridSearchCV

param_grid = {
    'n_factors': [50, 100, 150],
    'reg_all': [0.01, 0.02, 0.05]
}

grid_search = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
grid_search.fit(surprise_data)

print(f"Best RMSE: {grid_search.best_score['rmse']}")
print(f"Best Params: {grid_search.best_params['rmse']}")

Best RMSE: 7.2160741498964525
Best Params: {'n_factors': 150, 'reg_all': 0.05}


In [None]:
#Retrain for hybrid combination
#SVD model
reader = Reader(rating_scale=(sales['PurchaseQuantity'].min(), sales['PurchaseQuantity'].max()))
surprise_data = Dataset.load_from_df(sales[['User_ID', 'Item_ID', 'PurchaseQuantity']], reader)
trainset = surprise_data.build_full_trainset()
testset = trainset.build_testset()
#training
svd_model = SVD(n_factors=100, reg_all=0.05)
svd_model.fit(trainset)

#LightFM model with index correction
from lightfm.data import Dataset as LFM_Dataset
lfm_dataset = LFM_Dataset()
lfm_dataset.fit(sales['User_ID'].unique(), sales['Item_ID'].unique())
(interactions, _) = lfm_dataset.build_interactions([(u, i) for u, i in zip(sales['User_ID'], sales['Item_ID'])])
#training
lightfm_model = LightFM(loss='warp')
lightfm_model.fit(interactions, epochs=10, num_threads=4)

#Map to LightFM indices
user_id_map = {user: i for i, user in enumerate(sales['User_ID'].unique())}
item_id_map = {item: i for i, item in enumerate(sales['Item_ID'].unique())}

In [None]:
# Hybrid Model Combination
# Combine both SVD and LightFM predictions (weighted average)
def hybrid_prediction(user, item, alpha=0.5):
    svd_score = svd_model.predict(user, item).est
    # Ensure valid ranges
    user_idx = user_id_map.get(user)
    item_idx = item_id_map.get(item)
    if user_idx is not None and item_idx is not None:
        lightfm_score = lightfm_model.predict(np.array([user_idx]), np.array([item_idx]))[0]
    else:
        lightfm_score = svd_score
    return alpha * svd_score + (1 - alpha) * lightfm_score

In [None]:
#Final hybrid model evaluation
# Prepare test set
test_data = [(uid, iid, true_r) for (uid, iid, true_r) in testset]

# Generate hybrid model predictions
y_true = []
y_pred = []

for uid, iid, true_r in test_data:
    if uid in user_id_map and iid in item_id_map:  # Ensure index mapping exists
        pred = hybrid_prediction(uid, iid)
        y_true.append(true_r)
        y_pred.append(pred)

# Calculate RMSE & MAE
from sklearn.metrics import mean_squared_error, mean_absolute_error
rmse = np.sqrt(mean_squared_error(y_true, y_pred))
mae = mean_absolute_error(y_true, y_pred)

print(f"Hybrid Model RMSE: {rmse:.4f}")
print(f"Hybrid Model MAE: {mae:.4f}")

Hybrid Model RMSE: 6.2843
Hybrid Model MAE: 5.1979


In [None]:
!pip install optuna #install before hypertuning

Collecting optuna
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.3.0-py3-none-any.whl (386 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.6/386.6 kB[0m [31m24.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.15.2-py3-none-any.whl (231 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.9/231.9 kB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.15.2 colorlog-6.9.0 optuna-4.3.0


In [None]:
# Hypertuning using Optuna
import optuna
def objective(trial):
    n_factors = trial.suggest_int('n_factors', 50, 150)
    reg_all = trial.suggest_float('reg_all', 0.02, 0.1)
    epochs = trial.suggest_int('epochs', 10, 30)
    alpha = trial.suggest_float('alpha', 0.1, 0.9)

    # Train SVD Model
    svd_model = SVD(n_factors=n_factors, reg_all=reg_all)
    svd_model.fit(trainset)

    # Train LightFM Model
    lightfm_model = LightFM(loss='warp')
    lightfm_model.fit(interactions, epochs=epochs, num_threads=4)

    # Hybrid Predictions
    y_true, y_pred = [], []
    for uid, iid, actual in test_data:
        if uid in user_id_map and iid in item_id_map:
            svd_pred = svd_model.predict(uid, iid).est
            lfm_pred = lightfm_model.predict(np.array([user_id_map[uid]]), np.array([item_id_map[iid]]))[0]
            hybrid_pred = alpha * svd_pred + (1 - alpha) * lfm_pred

            y_true.append(actual)
            y_pred.append(hybrid_pred)

    # RMSE Calculation
    if len(y_true) > 0:
        rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    else:
        rmse = float("inf")

    return rmse

# Run Optimization (10 Trials for Speed, Increase for Better Results)
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=10)

# Best Parameters
print(f"Best RMSE: {study.best_value:.4f}")
print(f"Best Parameters: {study.best_params}")

[I 2025-04-19 04:04:38,326] A new study created in memory with name: no-name-117106c1-f119-4e59-baf3-bc561467712c
[I 2025-04-19 04:05:07,045] Trial 0 finished with value: 9.808307105155267 and parameters: {'n_factors': 89, 'reg_all': 0.0852193715971729, 'epochs': 16, 'alpha': 0.20781459711781425}. Best is trial 0 with value: 9.808307105155267.
[I 2025-04-19 04:05:18,793] Trial 1 finished with value: 2.6661850324711125 and parameters: {'n_factors': 110, 'reg_all': 0.05708858731521438, 'epochs': 11, 'alpha': 0.7989384741403744}. Best is trial 1 with value: 2.6661850324711125.
[I 2025-04-19 04:05:30,556] Trial 2 finished with value: 8.477605801558 and parameters: {'n_factors': 60, 'reg_all': 0.05884465148540258, 'epochs': 14, 'alpha': 0.3170761267323169}. Best is trial 1 with value: 2.6661850324711125.
[I 2025-04-19 04:05:55,863] Trial 3 finished with value: 7.811414505806371 and parameters: {'n_factors': 132, 'reg_all': 0.06936766503254933, 'epochs': 16, 'alpha': 0.36997010563007693}. Be

Best RMSE: 2.1070
Best Parameters: {'n_factors': 55, 'reg_all': 0.09924123213609178, 'epochs': 20, 'alpha': 0.8517059785458897}


In [None]:
#Final training using hyperparameters
# Apply Best Parameters identified
best_params = {'n_factors': 148, 'reg_all': 0.06100388024867412, 'epochs': 28, 'alpha': 0.8171559921836541}

# Train SVD
final_svd = SVD(n_factors=best_params['n_factors'], reg_all=best_params['reg_all'])
final_svd.fit(trainset)

# Train LightFM
final_lightfm = LightFM(loss='warp')
final_lightfm.fit(interactions, epochs=best_params['epochs'], num_threads=4)

# Hybrid Predictions
final_y_true, final_y_pred = [], []
for uid, iid, actual in test_data:
    if uid in user_id_map and iid in item_id_map:
        svd_pred = final_svd.predict(uid, iid).est
        lfm_pred = lightfm_model.predict(np.array([user_id_map[uid]]), np.array([item_id_map[iid]]))[0]
        hybrid_pred = best_params['alpha'] * svd_pred + (1 - best_params['alpha']) * lfm_pred

        final_y_true.append(actual)
        final_y_pred.append(hybrid_pred)

# Final RMSE
final_rmse = np.sqrt(mean_squared_error(final_y_true, final_y_pred))
print(f"Final Hybrid Model RMSE: {final_rmse:.4f}")

Final Hybrid Model RMSE: 2.4508


In [None]:
#Testing on recommendation for random user (User-item interaction)
import random

# Map Item_ID to Product_Name
item_id_to_name = sales[['Item_ID', 'Products']].drop_duplicates().set_index('Item_ID')['Products'].to_dict()

# Function to generate random user and get recommendations
def recommend_items_random(n=5):
    user = random.choice(list(user_id_map.keys()))  # Random user
    print(f"Generating recommendations for Random User: {user}")

    user_idx = user_id_map[user]
    item_scores = {}
    y_true = []
    y_pred = []

    for item in item_id_map.keys():
        item_idx = item_id_map[item]

        # Hybrid Prediction
        svd_pred = final_svd.predict(user, item).est
        lfm_pred = lightfm_model.predict(np.array([user_id_map[uid]]), np.array([item_id_map[iid]]))[0]
        hybrid_pred = best_params['alpha'] * svd_pred + (1 - best_params['alpha']) * lfm_pred

        item_scores[item] = hybrid_pred

        # Get actual value from dataset
        actual_value = sales.loc[(sales['User_ID'] == user) & (sales['Item_ID'] == item), 'PurchaseQuantity']

        if not actual_value.empty:
            y_true.append(actual_value.values[0])  # Append actual value
            y_pred.append(hybrid_pred)  # Append predicted value

    # Get Top-N Recommendations
    recommended_items = sorted(item_scores, key=item_scores.get, reverse=True)[:n]

    print(f"Top {n} Recommended Items for Random User {user}: {recommended_items}")

    # Compute RMSE & MAE for accuracy
    if y_true and y_pred:
        rmse = np.sqrt(mean_squared_error(y_true, y_pred))
        mae = mean_absolute_error(y_true, y_pred)
        print(f"Recommendation RMSE: {rmse:.4f}")
        print(f"Recommendation MAE: {mae:.4f}")
    else:
        print("No actual purchase data available for accuracy calculation.")

    return user, recommended_items


In [None]:
#Recommend based on purchase history
def recommend_product_names_for_user(user_id_original, top_n=5):
    if user_id_original not in user_id_map:
        print("User ID not found in mapping.")
        return []

    user_idx = user_id_map[user_id_original]
    known_items = set(sales[sales['User_ID'] == user_id_original]['Item_ID'].tolist())
    all_items = set(sales['Item_ID'].unique())
    unknown_items = list(all_items - known_items)

    recommendations = []
    actuals = []
    predictions = []

    # Get max purchase quantity (for normalization)
    max_qty = sales['PurchaseQuantity'].max()

    for item in unknown_items:
        if item in item_id_map:
            purchase_qty = sales[(sales['User_ID'] == user_id_original) & (sales['Item_ID'] == item)]['PurchaseQuantity'].sum()

            # Normalize quantity using log scale (or use min-max)
            normalized_qty = np.log1p(purchase_qty)  # log1p handles 0 safely

            # Get predictions from both models
            svd_pred = final_svd.predict(user_id_original, item).est
            lfm_pred = final_lightfm.predict(np.array([user_id_map[user_id_original]]), np.array([item_id_map[item]]))[0]

            # Combine SVD and LFM with tuned alpha
            hybrid_pred = best_params['alpha'] * svd_pred + (1 - best_params['alpha']) * lfm_pred

            # Combine with normalized purchase qty (use weight to avoid overpowering)
            final_score = 0.8 * hybrid_pred + 0.2 * normalized_qty

            recommendations.append((item, final_score))
            actuals.append(purchase_qty)
            predictions.append(final_score)

    # Normalize scores between 1 and 5
    scores_only = [score for _, score in recommendations]
    min_score, max_score = min(scores_only), max(scores_only)
    normalized_recommendations = [
        (item, ((score - min_score) / (max_score - min_score)) * 4 + 1)
        for item, score in recommendations
    ]

    # Sort and select top
    normalized_recommendations.sort(key=lambda x: x[1], reverse=True)
    top_items = normalized_recommendations[:top_n]

    # Map to product names
    result = [(item, score, item_id_to_name.get(item, "Unknown Product")) for item, score in top_items]

    # RMSE
    if actuals and predictions:
        rmse = np.sqrt(np.mean([(a - p) ** 2 for a, p in zip(actuals, predictions)]))
        print(f"\nImproved RMSE for Purchase History-based Recommendation: {rmse:.4f}")

    return result

In [None]:
# Get random user and generate recommendations
random_user, top_items = recommend_items_random(n=5)

# Use the random user for product recommendations
product_recommendations = recommend_product_names_for_user(random_user, top_n=5)

# Output recommended products
print(f"\nTop 5 product recommendations for Random User {random_user}:")
for idx, (item_id, score, product_name) in enumerate(product_recommendations, 1):
    print(f"{idx}. {product_name} (Item ID: {item_id}) - Score: {score:.4f}")

Generating recommendations for Random User: 6671
Top 5 Recommended Items for Random User 6671: [10, 5, 8, 11, 9]
Recommendation RMSE: 0.8643
Recommendation MAE: 0.8643

Improved RMSE for Purchase History-based Recommendation: 8.3826

Top 5 product recommendations for Random User 6671:
1. Whipped Cream (Item ID: 10) - Score: 5.0000
2.     Soft Drinks (Item ID: 5) - Score: 4.8182
3.     Dessert (Item ID: 11) - Score: 4.6577
4.     Instant Noodles (Item ID: 8) - Score: 4.6550
5.     Soy Sauce (Item ID: 9) - Score: 4.6181


In [None]:
#Save both trained models
import joblib

alpha = 0.5 #equal weights for models

class HybridRecommender:
    def __init__(self, svd_model, lightfm_model, alpha):
        self.svd = svd_model
        self.lightfm = lightfm_model
        self.alpha = alpha
        self.user_map = {}
        self.item_map = {}
        self.default_score = 0

    def update_user_map(self, user_id):
        if user_id not in self.user_map:
            new_index = len(self.user_map)
            self.user_map[user_id] = new_index
            print(f"User '{user_id}' added to user_map with index {new_index}")

    def update_item_map(self, item_id):
        if item_id not in self.item_map:
            new_index = len(self.item_map)
            self.item_map[item_id] = new_index
            print(f"Item '{item_id}' added to item_map with index {new_index}")

    def predict(self, user_id, item_id):
        self.update_user_map(user_id)
        self.update_item_map(item_id)

        user_idx = self.user_map.get(user_id)
        item_idx = self.item_map.get(item_id)

        if item_idx is None:
            print(f"Item '{item_id}' not in training data. Using default score.")
            return self.default_score  # Use default score for unseen items

        if user_idx is None:
            print(f"User '{user_id}' unknown to model. Using default score.")
            svd_pred = self.default_score  # Default score for unknown user
            lfm_pred = self.default_score  # Default score for unknown user
        else:
            try:
                svd_pred = self.svd.predict(user_id, item_id).est
            except Exception as e:
                print(f"SVD failed for user {user_id}, item {item_id}: {e}")
                svd_pred = self.default_score

            try:
                lfm_pred = self.lightfm.predict(np.array([user_idx]), np.array([item_idx]))[0]
            except Exception as e:
                print(f"LightFM failed for user {user_id}, item {item_id}: {e}")
                lfm_pred = self.default_score

        hybrid_score = self.alpha * svd_pred + (1 - self.alpha) * lfm_pred
        print(f"Score for user '{user_id}' and item '{item_id}': {hybrid_score}")
        return hybrid_score


# Create the HybridRecommender model
recommender = HybridRecommender(final_svd, lightfm_model, alpha)

# Save the hybrid recommender model
joblib.dump(recommender, 'hybrid_recommender_model.pkl')


['hybrid_recommender_model.pkl']