In [None]:
# AI-driven product recommendation system to increase cross-selling and upselling

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
import scipy.sparse as sp

#(a) we chose content-based filtering

# Fixed seed for the randomized SVD solver so the reduced features (and therefore
# the recommendations) are identical on every Restart & Run All.
RANDOM_STATE = 42

df = pd.read_csv('final.csv', delimiter=",", encoding='ISO-8859-1')

# Remove duplicates of (item_name, manufacturing_country, supplier): such duplicates have near-identical
# feature vectors, so they dominate the similarity ranking and the recommender ends up suggesting copies.
# .copy() makes the de-duplicated frame independent of the original, so the column assignment below
# no longer raises SettingWithCopyWarning.
df = df.drop_duplicates(subset=['item_name', 'manufacturing_country', 'supplier'], keep='first').copy()

# Combine store region and supplier into one categorical key so products sharing both
# are treated as related items.
df['region_supplier'] = df['store_region'] + '_' + df['supplier']

# One-hot encode the combined column (one binary column per region/supplier pair).
df_dummies = pd.get_dummies(df['region_supplier'])

# TF-IDF over product descriptions captures textual similarity between products.
# max_df=0.95 drops terms that appear in almost every description; max_features caps the vocabulary
# to limit memory usage. Missing descriptions are treated as empty text because the vectorizer
# rejects NaN documents; df itself is not modified.
descmat = TfidfVectorizer(stop_words='english', max_features=650, max_df=0.95).fit_transform(
    df['description'].fillna('')
)

# Reduce dimensionality before combining the feature groups to keep memory usage manageable.
# The explicit float cast keeps the SVD input dtype independent of the dummy dtype, which
# differs between pandas versions (uint8 vs. bool).
reduced_dummies = TruncatedSVD(n_components=10, random_state=RANDOM_STATE).fit_transform(
    df_dummies.to_numpy(dtype=float)
)

reduceddescmat = TruncatedSVD(n_components=30, random_state=RANDOM_STATE).fit_transform(descmat)

# Wrap both reduced (dense) matrices as CSR so they can be concatenated column-wise with sp.hstack.
sparsereduceddescmat = sp.csr_matrix(reduceddescmat)
sparsereduced_dummies = sp.csr_matrix(reduced_dummies)

# Combine all feature matrices: one row per product, description components followed by
# region/supplier components.
features = sp.hstack([sparsereduceddescmat, sparsereduced_dummies])

# Pairwise cosine similarity: entry (i, j) is the similarity score between product i and product j,
# where i and j are row positions in df.
similarity_matrix = cosine_similarity(features)

# Function to get recommendations based on item index
def recco(item_index, sim_matrix, df, top_n=1):
    """Return the ``top_n`` items most similar to the item at ``item_index``.

    Parameters
    ----------
    item_index : int
        Row position of the item of interest in ``sim_matrix`` / ``df``.
        Negative positions count from the end, as with normal indexing.
    sim_matrix : 2-D indexable of similarity scores
        ``sim_matrix[i][j]`` is the similarity between items ``i`` and ``j``.
    df : pandas.DataFrame
        Item table whose row order matches ``sim_matrix``.
    top_n : int, default 1
        Number of recommendations to return; values <= 0 yield an empty frame.

    Returns
    -------
    pandas.DataFrame
        Rows of ``df`` for the recommended items, most similar first. The item
        itself is never included. Ties keep their original row order.
    """
    row_scores = sim_matrix[item_index]
    n_items = len(row_scores)

    # Normalise a negative position to its non-negative equivalent; otherwise the
    # self-exclusion below compares e.g. -1 against n_items - 1, never matches, and
    # the item ends up being recommended to itself.
    self_pos = item_index + n_items if item_index < 0 else item_index

    # Stable sort, descending by similarity score.
    ranked = sorted(range(n_items), key=lambda pos: row_scores[pos], reverse=True)

    # Exclude the item itself.
    candidates = [pos for pos in ranked if pos != self_pos]

    # Clamp so a negative top_n does not slice from the wrong end of the ranking.
    item_indices = candidates[:max(top_n, 0)]
    return df.iloc[item_indices]

# Challenges:
#   1. The dataset is large, so we had to cut it down by applying dimensionality reduction (truncated SVD).
#   2. The recommendation system returned duplicates at first, so we removed that bias by dropping duplicate
#      (item_name, manufacturing_country, supplier) combinations at the start.

# (b) To evaluate the impact of personalized recommendations on sales, we can record the percentage of
#     recommendations that lead to purchases, or monitor the long-term impact on customer retention.

  df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/final.csv', delimiter=",", encoding='ISO-8859-1')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['region_supplier'] = df['store_region'] + '_' + df['supplier']


In [None]:
# reference for the 264 products
# Positional lookup table of product names: the label of each entry equals the
# product's row position in df (and therefore in similarity_matrix).
df_prod = df['item_name'].reset_index(drop=True).rename_axis('index')

print(df_prod)


index
0            Chinet Comfort Hot Cups with Lids 16oz
1                Chobani Greek Yogurt Variety Pack 
2                      Folgers Classic Roast Coffee
3                         Spunkmeyer Muffin Variety
4      Nature Valley Biscuit Sandwich Variety Pack 
                           ...                     
259                                K Cups Hot Cocoa
260                                Gardettos Snacks
261                         Dr. Pepper - 12 oz cans
262                             Snickers Bars 1.8oz
263                            Tootsie Roll Midgets
Name: item_name, Length: 264, dtype: object


In [None]:
# usage
purchasedproduct = 'K Cups Hot Cocoa'  # set purchasedproduct to any product name listed in the df_prod reference

# Look the product up by exact name first. If that finds nothing, retry ignoring
# leading/trailing whitespace, because some catalogue names carry trailing spaces
# and would otherwise be unmatched when typed without them.
matches = df_prod.index[df_prod == purchasedproduct]
if len(matches) == 0:
    matches = df_prod.index[df_prod.str.strip() == purchasedproduct.strip()]
if len(matches) == 0:
    # Fail with an actionable message instead of a bare IndexError.
    raise ValueError(f"Product {purchasedproduct!r} not found in df_prod; check the product reference list.")

# Row position of the first matching product.
idx = matches.values[0]

recommended = recco(idx, similarity_matrix, df)['item_name'].values[0]

print(f"Item of interest: {df.iloc[idx]['item_name']}\nRecommended product: {recommended}")


Item of interest: K Cups Hot Cocoa
Recommended product: Swiss Miss Hot Cocoa Mix 1 oz
