LightFM

In [1]:
from scipy.sparse import coo_matrix
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k, recall_at_k, auc_score
import numpy as np
import pandas as pd
from lightfm.cross_validation import random_train_test_split
import pickle
import json
from joblib import load



In [2]:
# Load the purchase log. Customer IDs are cast to strings because they are
# later looked up with the string returned by input().
data = pd.read_csv("shopping_behavior_updated.csv").astype({'Customer ID': str})

In [3]:
# Create age group (pd.cut bins are right-inclusive by default, so '<25' also contains age 25)
data['Age Group'] = pd.cut(
    data['Age'],
    bins=[0, 25, 35, 50, 100],
    labels=['<25', '25-35', '35-50', '>50'],
)
# Review Rating -> positive (weight 3.0, rating >= 4.0) or negative (weight 0.5)
data['weighted_liked'] = np.where(data['Review Rating'] >= 4.0, 3.0, 0.5)

In [6]:
# Vocabulary of user features: every observed age group, gender and
# previous-purchase count, each prefixed with the name of its column.
user_features = [
    *(f"Age Group:{group}" for group in data['Age Group'].unique()),
    *(f"Gender:{gender}" for gender in data['Gender'].unique()),
    *(f"Previous Purchases:{purchases}" for purchases in data['Previous Purchases'].unique()),  # added Previous Purchases
]

# Vocabulary of item features: the raw category, season and color values (no prefix).
item_features = [
    value
    for column in ('Category', 'Season', 'Color')
    for value in data[column].unique()
]


In [7]:
# Dataset: register every customer, item and feature name so that each one
# gets an internal index (read back later through dataset.mapping()).
dataset = Dataset()
dataset.fit(
    users=data['Customer ID'],
    items=data['Item Purchased'],
    user_features=user_features,
    item_features=item_features,
)

In [8]:
# build_interactions returns (interactions, weights). The interactions matrix is
# binary; the per-interaction `weighted_liked` values live in the weights matrix.
# The weights are kept under a name (instead of being discarded) so they can be
# passed as `sample_weight` when a model is fitted.
# NOTE(review): the train/test split is made from `interactions_matrix` only, so
# the weights would have to be split consistently before being used for training.
(interactions_matrix, interaction_weights_matrix) = dataset.build_interactions(
    list(zip(data['Customer ID'], data['Item Purchased'], data['weighted_liked']))
)

# One (customer, [feature names]) entry per row. Columns are iterated directly
# instead of using DataFrame.iterrows(), which builds a Series for every row.
user_features_matrix = dataset.build_user_features(
    [
        (
            customer,
            [
                f"Age Group:{age_group}",
                f"Gender:{gender}",
                f"Previous Purchases:{purchases}",  # added Previous Purchases
            ],
        )
        for customer, age_group, gender, purchases in zip(
            data['Customer ID'], data['Age Group'], data['Gender'], data['Previous Purchases']
        )
    ]
)

# One (item, [category, season, color]) entry per row; an item that occurs in
# several rows contributes one entry per occurrence, as before.
item_features_matrix = dataset.build_item_features(
    [
        (item, [category, season, color])
        for item, category, season, color in zip(
            data['Item Purchased'], data['Category'], data['Season'], data['Color']
        )
    ]
)

In [9]:
# Put 10% of the interactions into a test split; the fixed seed makes the
# split repeatable across runs.
train_interactions, test_interactions = random_train_test_split(
    interactions_matrix,
    test_percentage=0.1,
    random_state=np.random.RandomState(42),
)

In [None]:
# import pickle

# best_auc = 0
# best_config = None
# best_model = None

# for loss in ['warp', 'bpr', 'logistic', 'warp-kos']:
#     for components in [50, 100, 200, 300]:
#         for epochs in [10, 30, 50]:
#             # Initialize the model
#             model = LightFM(loss=loss, no_components=components, random_state=42)
            
#             # Train the model
#             model.fit(train_interactions, 
#                       user_features=user_features_matrix, 
#                       item_features=item_features_matrix, 
#                       epochs=epochs, 
#                       num_threads=4)
            
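#             # Calculate AUC (NOTE: scored on the full interactions_matrix, which includes the training interactions, so this is not a held-out test AUC)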
#             test_auc = auc_score(model, interactions_matrix, 
#                                  user_features=user_features_matrix, 
#                                  item_features=item_features_matrix, 
#                                  num_threads=4).mean()
            
#             print(f"Loss: {loss}, Components: {components}, Epochs: {epochs}, AUC: {test_auc:.4f}")
            
#             # Update the best model if this configuration performs better
#             if test_auc > best_auc:
#                 best_auc = test_auc
#                 best_config = (loss, components, epochs)
#                 best_model = model  # Save the best model

# # Display the best configuration and AUC
# print(f"Best configuration: Loss = {best_config[0]}, Components = {best_config[1]}, Epochs = {best_config[2]}")
# print(f"Highest AUC: {best_auc:.4f}")

# from joblib import dump

# # Assuming `best_model` is your trained LightFM model
# dump(best_model, 'best_lightfm_model.joblib')
# print("Model saved successfully with joblib.")


Loss: warp, Components: 50, Epochs: 10, AUC: 0.5433
Loss: warp, Components: 50, Epochs: 30, AUC: 0.8192
Loss: warp, Components: 50, Epochs: 50, AUC: 0.8206
Loss: warp, Components: 100, Epochs: 10, AUC: 0.5263
Loss: warp, Components: 100, Epochs: 30, AUC: 0.8204
Loss: warp, Components: 100, Epochs: 50, AUC: 0.8212
Loss: warp, Components: 200, Epochs: 10, AUC: 0.5379
Loss: warp, Components: 200, Epochs: 30, AUC: 0.8206
Loss: warp, Components: 200, Epochs: 50, AUC: 0.8211
Loss: warp, Components: 300, Epochs: 10, AUC: 0.5292
Loss: warp, Components: 300, Epochs: 30, AUC: 0.8213
Loss: warp, Components: 300, Epochs: 50, AUC: 0.8218
Loss: bpr, Components: 50, Epochs: 10, AUC: 0.4998
Loss: bpr, Components: 50, Epochs: 30, AUC: 0.5179
Loss: bpr, Components: 50, Epochs: 50, AUC: 0.5734
Loss: bpr, Components: 100, Epochs: 10, AUC: 0.4988
Loss: bpr, Components: 100, Epochs: 30, AUC: 0.5254
Loss: bpr, Components: 100, Epochs: 50, AUC: 0.5637
Loss: bpr, Components: 200, Epochs: 10, AUC: 0.5038
Loss: 

In [10]:
# Load the saved model.
# NOTE(review): joblib files are pickle-based, so only load a model file that
# comes from a trusted source.
MODEL_PATH = 'best_lightfm_model.joblib'
model = load(MODEL_PATH)
print("Model loaded successfully with joblib!")

Model loaded successfully with joblib!


Predict

In [11]:
def recommend_similar_items_with_scores(clicked_product, customer_id, model, dataset, user_features, item_features, data, top_n=10):
    """Rank the items most similar to a clicked product and attach the customer's predicted scores.

    Similarity is the raw dot product between the clicked product's embedding
    and every item embedding; no length normalisation is applied.

    Args:
        clicked_product: Item label; must be a key of ``dataset.mapping()[2]``.
        customer_id: Customer label; must be a key of ``dataset.mapping()[0]``.
        model: Object providing ``predict`` and ``get_item_representations``.
        dataset: Object whose ``mapping()`` returns the label -> index dictionaries.
        user_features: User feature matrix forwarded to ``model.predict``.
        item_features: Item feature matrix forwarded to the model.
        data: DataFrame with 'Item Purchased' and 'Category' columns.
        top_n: Maximum number of recommendations to return.

    Returns:
        A list of ``(item, category, similarity, score)`` tuples ordered by
        descending similarity. The clicked product itself is left out.

    Raises:
        KeyError: If the product or the customer is missing from the mappings.
    """
    mappings = dataset.mapping()
    user_id_map, item_id_map = mappings[0], mappings[2]

    # Translate the external labels into the model's internal indices.
    clicked_index = item_id_map[clicked_product]
    user_index = user_id_map[customer_id]

    # Predicted score of every item for this customer.
    all_item_indices = np.arange(len(item_id_map))
    predicted_scores = model.predict(
        user_ids=user_index,
        item_ids=all_item_indices,
        user_features=user_features,
        item_features=item_features,
    )

    # Only the embeddings (second element of the returned pair) are needed.
    embeddings = model.get_item_representations(item_features)[1]
    similarities = np.dot(embeddings, embeddings[clicked_index])

    # Walk the items from most to least similar, skipping the clicked product.
    index_to_item = {index: label for label, index in item_id_map.items()}
    ranked_indices = [
        index for index in np.argsort(-similarities)
        if index_to_item[index] != clicked_product
    ]

    recommendations = []
    for index in ranked_indices[:top_n]:
        label = index_to_item[index]
        # Category of the first row in which this item was purchased.
        category = data.loc[data['Item Purchased'] == label, 'Category'].values[0]
        recommendations.append((label, category, similarities[index], predicted_scores[index]))
    return recommendations


In [12]:
def recommend_similar_users_with_scores(customer_id, model, dataset, user_features, item_features, data, top_n=10):
    """Recommend items favoured by the users most similar to a customer.

    The ``top_n`` users whose embeddings have the largest dot product with the
    customer's embedding are selected. For each of them the ``top_n`` items
    with the highest predicted score are collected. The collected items are
    then ranked by the score predicted for the *target* customer.

    Args:
        customer_id: Customer label; must be a key of ``dataset.mapping()[0]``.
        model: Object providing ``predict`` and ``get_user_representations``.
        dataset: Object whose ``mapping()`` returns the label -> index dictionaries.
        user_features: User feature matrix forwarded to the model.
        item_features: Item feature matrix forwarded to ``model.predict``.
        data: Unused; kept so the signature matches the sibling recommenders.
        top_n: Number of similar users, items per user, and returned items.

    Returns:
        A list of at most ``top_n`` ``(item, score)`` tuples in descending score
        order. Each item appears once; items with equal scores keep the order
        in which they were first collected, so the result is deterministic.

    Raises:
        KeyError: If the customer is missing from the user mapping.
    """
    mappings = dataset.mapping()
    user_id_map, item_id_map = mappings[0], mappings[2]
    user_index = user_id_map[customer_id]

    # Only the embeddings (second element of the returned pair) are needed.
    _, user_embeddings = model.get_user_representations(user_features)
    similarities = np.dot(user_embeddings, user_embeddings[user_index])

    # Indices of the most similar users, excluding the customer. They are
    # converted to plain ints because predict() is handed a single user id.
    neighbour_indices = [
        int(index) for index in np.argsort(-similarities) if index != user_index
    ][:top_n]

    # Scores of every item for the target customer; used for the final ranking.
    item_ids = np.arange(len(item_id_map))
    scores = model.predict(
        user_ids=user_index,
        item_ids=item_ids,
        user_features=user_features,
        item_features=item_features,
    )

    index_to_item = {index: label for label, index in item_id_map.items()}

    # Keyed by item, so an item liked by several neighbours is kept only once
    # and the iteration order does not depend on string hashing.
    score_by_item = {}
    for neighbour_index in neighbour_indices:
        neighbour_scores = model.predict(
            user_ids=neighbour_index,
            item_ids=item_ids,
            user_features=user_features,
            item_features=item_features,
        )
        for index in np.argsort(-neighbour_scores)[:top_n]:
            score_by_item.setdefault(index_to_item[index], scores[index])

    # sorted() is stable, so equal scores keep their first-seen order.
    ranked = sorted(score_by_item.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:top_n]


In [13]:
def combined_recommendations(clicked_product, customer_id, model, dataset, user_features, item_features, data, top_n=10):
    """Merge item-based and user-based recommendations into one ranked list.

    Item-based results come from ``recommend_similar_items_with_scores`` and
    user-based results from ``recommend_similar_users_with_scores``. After
    merging, similarities and scores are shifted so that their minimum is 0,
    and items are ranked by ``similarity + score`` (a missing similarity
    counts as 0).

    Args:
        clicked_product: Item label the recommendations are anchored on.
        customer_id: Customer label the scores are predicted for.
        model, dataset, user_features, item_features: Forwarded unchanged to
            the two underlying recommenders.
        data: DataFrame with 'Item Purchased' and 'Category' columns.
        top_n: Size requested from each recommender and maximum result length.

    Returns:
        A list of at most ``top_n`` tuples
        ``(item, category, similarity, score, similarity_or_0 + score)`` in
        descending order of the last element. ``similarity`` is ``None`` for
        items that only the user-based recommender produced. An empty list is
        returned when neither recommender yields anything.
    """
    item_recommendations = recommend_similar_items_with_scores(
        clicked_product, customer_id, model, dataset, user_features, item_features, data, top_n=top_n
    )

    # {item: (category, similarity, score)}; the item-based results already
    # carry the category, so it is reused instead of being looked up again.
    merged = {
        item: (category, similarity, score)
        for item, category, similarity, score in item_recommendations
    }

    user_recommendations = recommend_similar_users_with_scores(
        customer_id, model, dataset, user_features, item_features, data, top_n=top_n
    )

    for item, score in user_recommendations:
        if item in merged:
            category, old_similarity, old_score = merged[item]
            # The user-based side has no similarity, so the existing value is
            # averaged with 0; a zero/missing similarity becomes None.
            # NOTE(review): this halves the similarity of items found by BOTH
            # strategies, which lowers their rank — confirm this is intended.
            new_similarity = old_similarity / 2 if old_similarity else None
            merged[item] = (category, new_similarity, (old_score + score) / 2)
        else:
            category = data.loc[data['Item Purchased'] == item, 'Category'].values[0]
            merged[item] = (category, None, score)

    # Nothing to rank (e.g. top_n == 0): avoid calling min() on an empty sequence.
    if not merged:
        return []

    # Shift similarity and score values so that the smallest of each becomes 0.
    known_similarities = [similarity for _, similarity, _ in merged.values() if similarity is not None]
    min_similarity = min(known_similarities) if known_similarities else 0
    min_score = min(score for _, _, score in merged.values())

    shifted = {
        item: (
            category,
            None if similarity is None else similarity - min_similarity,
            score - min_score,
        )
        for item, (category, similarity, score) in merged.items()
    }

    # Rank by similarity + score. Dictionary keys are already unique, so no
    # further de-duplication is needed.
    ranked = sorted(
        shifted.items(),
        key=lambda entry: (entry[1][1] or 0) + entry[1][2],
        reverse=True,
    )

    return [
        (item, category, similarity, score, (similarity or 0) + score)
        for item, (category, similarity, score) in ranked[:max(top_n, 0)]
    ]


In [14]:
# Input the clicked product and customer ID. Both values are used as exact-match
# keys of the dataset mappings, so stray surrounding whitespace is stripped.
clicked_product = input("Enter the product you clicked on (clicked product): ").strip()
customer_id = input("Enter the customer ID: ").strip()

# Generate combined recommendations
combined_recommendation_list = combined_recommendations(
    clicked_product, customer_id, model, dataset, user_features_matrix, item_features_matrix, data, top_n=10
)

# Display the result
print(f"\nProduct '{clicked_product}' is clicked.")
print("\nSimilar products:")

# Print only the product names in the recommendation list
for item, *_ in combined_recommendation_list:
    print(item)



Product 'Sunglasses' is clicked.

Similar products:
Hat
Jewelry
Belt
Handbag
Scarf
Backpack
Gloves
Jacket
Coat
Skirt


SVD

In [3]:
import pandas as pd
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from sklearn.metrics import roc_auc_score


In [5]:

# MinMaxScaler is used below but is not imported by the import cells shown in
# this notebook, so it is brought into scope here to survive a fresh kernel.
from sklearn.preprocessing import MinMaxScaler

# NOTE(review): this cell expects `data` to have 'rating', 'userID' and 'itemID'
# columns, while the frame loaded in the LightFM section uses 'Review Rating',
# 'Customer ID' and 'Item Purchased' — confirm which frame this section runs on.
# NOTE(review): `Dataset` (surprise) and `dataset` below replace the LightFM
# objects of the same names if both sections run in one kernel.

# Normalize the 'rating' column to [0, 1] using MinMaxScaler
scaler = MinMaxScaler()
data['normalized_rating'] = scaler.fit_transform(data[['rating']])


# Build the Surprise Dataset using the normalized rating
reader = Reader(rating_scale=(0, 1))  # Adjust the scale to match normalized ratings
dataset = Dataset.load_from_df(data[["userID", "itemID", "normalized_rating"]], reader)


In [6]:

# Hold out 10% of the ratings as the test set (fixed seed for a repeatable split)
trainset, testset = train_test_split(
    dataset,
    test_size=0.1,
    random_state=42,
)

# Build the SVD model (200 factors, 30 epochs, per-epoch logging) and train it
svd = SVD(
    n_factors=200,
    n_epochs=30,
    random_state=42,
    verbose=True,
)
svd.fit(trainset)

# Predict the ratings of the held-out test set
predictions = svd.test(testset)

# Function to calculate AUC
def auc_score(predictions, threshold=0.8):
    """Compute the ROC AUC of estimated ratings against thresholded true ratings.

    Args:
        predictions: Sequence of objects exposing ``r_ui`` (true rating) and
            ``est`` (estimated rating).
        threshold: A true rating at or above this value counts as positive (1),
            anything below as negative (0).

    Returns:
        The value returned by ``roc_auc_score`` for the binarised labels and
        the estimated ratings.
    """
    # Binarise the true ratings; the estimates are used as ranking scores as is.
    labels = [int(prediction.r_ui >= threshold) for prediction in predictions]
    estimates = [prediction.est for prediction in predictions]
    return roc_auc_score(labels, estimates)

# Calculate AUC
# NOTE(review): 0.8 on the min-max normalized scale maps back to 4.0 only if the
# original ratings span exactly 0-5 — confirm against the actual rating range.
auc = auc_score(predictions, threshold=0.8)  # Threshold corresponds to ~4.0 in the original scale
print(f"AUC: {auc:.4f}")

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 25
Processing epoch 26
Processing epoch 27
Processing epoch 28
Processing epoch 29
AUC: 0.5095
