In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, average_precision_score
from surprise import Dataset, Reader
from surprise import SVD, accuracy
from surprise.model_selection import train_test_split as surprise_train_test_split

In [2]:
# Load datasets
customer_data = pd.read_csv("/Users/kuriankgeorge/Desktop/aws/Capstone/08_recommender/data/Recommnder/Customer.csv")
prod_cat_info = pd.read_csv("/Users/kuriankgeorge/Desktop/aws/Capstone/08_recommender/data/Recommnder/prod_cat_info.csv")
transactions = pd.read_csv("/Users/kuriankgeorge/Desktop/aws/Capstone/08_recommender/data/Recommnder/Transactions.csv")

In [3]:
# Data Preprocessing
# Merge the datasets to get product information in the transactions
data = transactions.merge(prod_cat_info, left_on='prod_cat_code', right_on='prod_cat_code')
data = data.merge(customer_data, left_on='cust_id', right_on='customer_Id')

In [4]:
print(data.head())


   transaction_id  cust_id   tran_date  prod_subcat_code  prod_cat_code  Qty  \
0     80712190438   270351  28-02-2014                 1              1   -5   
1     80712190438   270351  28-02-2014                 1              1   -5   
2     80712190438   270351  28-02-2014                 1              1   -5   
3     80712190438   270351  20-02-2014                 1              1    5   
4     80712190438   270351  20-02-2014                 1              1    5   

   Rate    Tax  total_amt Store_type  prod_cat  prod_sub_cat_code prod_subcat  \
0  -772  405.3    -4265.3     e-Shop  Clothing                  4        Mens   
1  -772  405.3    -4265.3     e-Shop  Clothing                  1       Women   
2  -772  405.3    -4265.3     e-Shop  Clothing                  3        Kids   
3   772  405.3     4265.3     e-Shop  Clothing                  4        Mens   
4   772  405.3     4265.3     e-Shop  Clothing                  1       Women   

   customer_Id         DOB Gende

In [5]:
# Create a User-Item matrix for collaborative filtering
user_item_matrix = data.pivot_table(index='cust_id', columns='prod_subcat', values='Qty', fill_value=0)


In [6]:
# Splitting the data into train and test sets
train_data, test_data = train_test_split(user_item_matrix.values, test_size=0.2, random_state=42)

In [7]:
# Collaborative Filtering using SVD
reader = Reader(rating_scale=(0, user_item_matrix.values.max()))
surprise_data = Dataset.load_from_df(data[['cust_id', 'prod_subcat', 'Qty']], reader)
trainset, testset = surprise_train_test_split(surprise_data, test_size=0.2)

In [8]:
model = SVD()
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x129b17e10>

In [9]:
# Evaluate the model
predictions = model.test(testset)
accuracy.rmse(predictions)

RMSE: 2.0199


2.019885423793758

In [10]:
def recommend_products(user_id, num_recommendations=5):
    user_index = user_item_matrix.index.get_loc(user_id)
    user_ratings = user_item_matrix.iloc[user_index]

    # Get unseen products
    unseen_products = user_ratings[user_ratings == 0]

    # Predict ratings for unseen products
    predicted_ratings = [model.predict(user_id, prod_subcat).est for prod_subcat in unseen_products.index]
    recommendations = pd.DataFrame({'product': unseen_products.index, 'predicted_rating': predicted_ratings})
    
    # Get top N recommendations
    return recommendations.nlargest(num_recommendations, 'predicted_rating')

In [12]:
# Example recommendation for a customer
customer_id_example = 269696 
recommended_products = recommend_products(customer_id_example)
print(f'Recommendations for Customer {customer_id_example}:')
print(recommended_products)

Recommendations for Customer 269696:
        product  predicted_rating
10  Non-Fiction          2.776654
1          Bath          2.762526
8       Kitchen          2.735153
5       Fiction          2.725897
2      Children          2.671067


In [13]:
 #Evaluate the performance using precision, recall, F1-score, and mean average precision

In [15]:
true_labels = []
predicted_labels = []

for uid, iid, true_r, est, _ in predictions:
    true_labels.append(true_r)
    predicted_labels.append(est)

# Convert to numpy arrays for metric calculations
true_labels = np.array(true_labels)
predicted_labels = np.array(predicted_labels)

In [16]:
# Binarize the predictions based on a threshold (e.g., 0.5) for binary classification metrics
threshold = 0.5
predicted_labels_binary = (predicted_labels >= threshold).astype(int)
true_labels_binary = (true_labels > 0).astype(int) 

In [17]:
# Evaluate the performance using precision, recall, F1-score, and mean average precision
precision = precision_score(true_labels_binary, predicted_labels_binary, average='binary')
recall = recall_score(true_labels_binary, predicted_labels_binary, average='binary')
f1 = f1_score(true_labels_binary, predicted_labels_binary, average='binary')
mean_avg_precision = average_precision_score(true_labels_binary, predicted_labels_binary)

print(f'Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}, Mean Average Precision: {mean_avg_precision:.4f}')

Precision: 0.9202, Recall: 0.9287, F1-score: 0.9245, Mean Average Precision: 0.9193
