# Instacart Recommendation Engine

This Jupyter notebook builds a recommendation engine using vector embeddings. The engine uses features such as product names, shopping-cart contents and user behaviour to generate relevant recommendations for users.

The notebook is organized into the following sections:

0. Flowchart
1. Initial Setup: Loading necessary libraries and dependencies.
2. Data Loading and Pre-processing: Loading and preparing the required data for the recommendation engine.
3. Build Vector Embeddings: Creating vector embeddings for different features based on order carts.
    - 3a. Build Product Embedding by Name: Vector representation built from the stemmed words of each product.
    - 3b. Build Product Embedding by Cart: Vector representation for product (id).
    - 3c. Build Product Embedding by Department: Vector representation by Department.
4. Concatenate Embeddings & Generate Representations: Combining the vector embeddings from the previous steps into one vector per product, then generating a representation for each order (the average of the product vectors in that order) and for each user (the average over everything the user has ever ordered).
5. Vector Similarity Search (Output): Using the concatenated embeddings to perform similarity search and generate recommendations.
6. Saving objects




# Flowchart

![title](img/flowchart.svg)

## Initial Setup

In [None]:
!pip install gensim
!pip install annoy
!pip install nltk

In [None]:
import os

import numpy as np
import pandas as pd
from pandas.core.common import flatten

from gensim.models import Word2Vec
import seaborn as sns

from annoy import AnnoyIndex

import nltk
nltk.download('wordnet')

from tqdm import tqdm
import zipfile as zp
from art import *
import matplotlib.pyplot as plt

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

import plotly.offline as pyo
import plotly.graph_objs as go
from plotly.tools import FigureFactory as FF
import plotly.express as px

# from PyDictionary import PyDictionary P
import random
import time

#import scikitplot as skplt

#to enable the inline plotting
%matplotlib inline 

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Change to location of data (Instacart Market Basket Analysis Kaggle data)
# As written this resolves to the `data` folder inside the parent of the
# current working directory.
data_dir = os.path.join(os.path.dirname(os.getcwd()), 'data')

## Data loading and Pre-processing

In [None]:
# Reading the csv files into corresponding dataframes
# Every file is read from `data_dir` with pandas' default CSV settings.
# Load products, aisles and departments
products = pd.read_csv(os.path.join(data_dir, "products.csv"))
aisles = pd.read_csv(os.path.join(data_dir, "aisles.csv"))
departments = pd.read_csv(os.path.join(data_dir, "departments.csv"))

# Load orders dataset
# `orders` holds the order headers; the two `order_products__*` files hold the
# order lines. Only the `prior` lines are used by the visible cells below;
# `order_products_train` is loaded but not referenced again in this view.
orders = pd.read_csv(os.path.join(data_dir, "orders.csv"))
order_products_prior = pd.read_csv(os.path.join(data_dir, "order_products__prior.csv"))
order_products_train = pd.read_csv(os.path.join(data_dir, "order_products__train.csv"))

# Translate the numeric `order_dow` codes (0-6) into day names.
# NOTE(review): 0 = Saturday is the convention chosen by this notebook;
# confirm it against the dataset documentation.
days_of_week = dict(enumerate(['Saturday',
                               'Sunday',
                               'Monday',
                               'Tuesday',
                               'Wednesday',
                               'Thursday',
                               'Friday']))

# Swap the codes for their names and keep the column as an ordered
# categorical whose category order runs from Saturday to Friday.
orders['order_dow'] = pd.Categorical(
    orders['order_dow'].replace(to_replace=days_of_week),
    categories=list(days_of_week.values()),
    ordered=True,
)

# `daytime` is the day name directly followed by the hour of day, as one string.
orders['daytime'] = orders['order_dow'].astype(str) + orders['order_hour_of_day'].astype(str)

In [None]:
## Product name stemming and lemmatisation
# Lower-case the product names so that tokens are case-insensitive.
products['products_mod'] = products['product_name'].str.lower()
# Clean special characters: every non-word character becomes a space.
# The pattern is a raw string so that `\W` reaches the regex engine instead of
# being parsed as an (invalid) Python string escape.
products['products_mod'] = products['products_mod'].str.replace(r'\W', ' ', regex=True)
# Split products into terms: Tokenize on whitespace.
products['products_mod'] = products['products_mod'].str.split()
# Merge the department and aisle names into the dataframe.
# NOTE(review): `how='outer'` also keeps departments/aisles that have no
# product; confirm that is intended (a left join would keep products only).
products = pd.merge(products, departments, on="department_id", how='outer')
products = pd.merge(products, aisles, on="aisle_id", how='outer')
# Extend each product's token list with its aisle and department names:
# build [tokens, aisle, department] per row, then flatten the nested token list.
products['products_mod'] = products[['products_mod', 'aisle', 'department']].values.tolist()
products['products_mod'] = products['products_mod'].apply(lambda x: list(flatten(x)))

# Normalise every token in a single pass: WordNet lemmatisation first,
# then Snowball stemming of the lemma.
lemma = nltk.wordnet.WordNetLemmatizer()
sno = nltk.stem.SnowballStemmer('english')
products['products_lemma'] = products['products_mod'].apply(
    lambda row: [sno.stem(lemma.lemmatize(item)) for item in row]
)

# Lookup table: product_id -> product_name.
prod_dict = dict(zip(products.product_id, products.product_name))

In [None]:
## Attach each product's department and aisle ids to the prior order lines
## (inner join on product_id)
order_products_prior = pd.merge(
    order_products_prior,
    products[['product_id', 'department_id', 'aisle_id']],
    on='product_id',
    how='inner',
)
## Collect the product ids of every order into one list per order_id
## These baskets are the training sentences for the cart embedding below
order_baskets = (
    order_products_prior
    .groupby('order_id')['product_id']
    .apply(list)
)

In [None]:
%%time
# Per-order lists of department ids and of aisle ids, one entry per order line
department_basket, aisle_basket = (
    order_products_prior.groupby('order_id')[id_column].apply(list)
    for id_column in ('department_id', 'aisle_id')
)

# The same baskets with repeated ids collapsed through a set
# (so the order of ids inside a basket is not preserved)
department_basket_unique = department_basket.apply(lambda ids: list(set(ids)))
aisle_basket_unique = aisle_basket.apply(lambda ids: list(set(ids)))

## Word2Vec models

In [None]:
## Configuration

# Vector size for word embeddings
# Justification: A vector size of 8 strikes a balance between capturing semantic information and manageable dimensionality.
WORD_VECTOR_SIZE = 8
# Vector size for product embeddings
# Justification: A vector size of 64 allows for capturing complex relationships and characteristics of the products.
PRODUCT_VECTOR_SIZE = 64
# Vector size for department embeddings
# Justification: A vector size of 2 effectively represents the limited number of department categories with low computational complexity.
DEPARTMENT_VECTOR_SIZE = 2

# CPU cores used as Word2Vec worker threads.
# os.cpu_count() returns None when the count cannot be determined, so fall
# back to a single worker instead of passing None on to the model.
WORKER_COUNT = os.cpu_count() or 1
# Minimum frequency before dropping set to 1 to include each and every product, even if it was purchased once
MIN_COUNT = 1

In [None]:
### Training the `Word2Vec` model based on product lemma

# Window as wide as the longest token list, so that every token of a product
# can serve as context for every other token of the same product.
window_max = products['products_lemma'].apply(len).max()

# vector_size=WORD_VECTOR_SIZE: dimensionality of each token embedding.
# min_count=MIN_COUNT: minimum token frequency for a token to be kept.
w2vec_model = Word2Vec(list(products['products_lemma']), vector_size=WORD_VECTOR_SIZE, window=window_max,
                       min_count=MIN_COUNT, workers=WORKER_COUNT)

### Vector calculation for products
# A product's word vector is the element-wise average of the vectors of the
# tokens in its `products_lemma` list. It is later concatenated with the
# other embeddings to form the product representation.

# product_id -> averaged token vector, built in dataframe row order.
# The two needed columns are iterated directly instead of materialising a
# Series per row with iterrows().
prods_w2v = {
    product_id: np.average([w2vec_model.wv[word] for word in lemmas], axis=0)
    for product_id, lemmas in tqdm(zip(products['product_id'], products['products_lemma']),
                                   total=len(products))
}

# Save the vectors to the dataframe. The dict preserves insertion order, which
# is the row order used above, so a positional assignment lines up.
products['vectors_word'] = list(prods_w2v.values())

In [None]:
%%time
### Training the `Word2Vec` model based on cart
# products which are ordered together will be closer in vector space

# Define maximum window for longest order cart
window_max = order_baskets.apply(len).max()

# w2vec model: every order basket is one training sentence of product ids
w2vec_model = Word2Vec(list(order_baskets), vector_size=PRODUCT_VECTOR_SIZE, window=window_max,
                       min_count=MIN_COUNT, workers=WORKER_COUNT)

# get vectors for each product
# A product that never appears in `order_baskets` has no trained vector.
# Check the vocabulary explicitly and give such products a zero vector.
# NOTE(review): for an integer key that is missing from the vocabulary,
# gensim's `wv[key]` is understood to either treat the key as a positional
# index (silently returning another product's vector) or raise KeyError --
# confirm against the installed gensim version.
products['vectors_product'] = products.product_id.apply(
    lambda x: w2vec_model.wv[x]
    if x in w2vec_model.wv.key_to_index
    else np.zeros(PRODUCT_VECTOR_SIZE, dtype=np.float32)
)

In [None]:
%%time
# Window wide enough to span the longest department basket
window_max = department_basket.map(len).max()

# Department-level Word2Vec: each order's list of department ids is one sentence
w2vec_model = Word2Vec(
    sentences=department_basket.tolist(),
    vector_size=DEPARTMENT_VECTOR_SIZE,
    window=window_max,
    min_count=MIN_COUNT,
    workers=WORKER_COUNT,
)

# Give every product the embedding of its department
products['vectors_dept'] = products['department_id'].apply(lambda dept_id: w2vec_model.wv[dept_id])

## Vector concatenation and order/user representations

In [None]:
## Using `annoy` model to calculate the similarity between products
def annoy_build(df, id, vector_size, metric='euclidean', trees=10, seed=42):
    """Build an Annoy index from a dataframe of embeddings.

    Args:
        df: DataFrame with one row per item. It must contain a ``vectors``
            column and the column named by ``id``.
        id: Name of the column holding the item ids passed to
            ``AnnoyIndex.add_item``. (The name shadows the builtin; it is
            kept so existing keyword callers keep working.)
        vector_size: Length of every vector stored in ``df['vectors']``.
        metric: Distance metric name handed to ``AnnoyIndex``.
        trees: Number of trees passed to ``AnnoyIndex.build``.
        seed: Seed passed to ``AnnoyIndex.set_seed`` before items are added.

    Returns:
        The built ``AnnoyIndex``.
    """
    m = AnnoyIndex(vector_size, metric=metric)
    m.set_seed(seed)
    # Walk the two needed columns side by side; iterrows() would build a
    # full Series for every row just to read these two values.
    for item_id, vector in zip(df[id], df['vectors']):
        m.add_item(item_id, vector)
    m.build(trees)
    return m

In [None]:
## Join the word, cart and department vectors of each product, in that order,
## into one flat list that serves as the product embedding
products['vectors'] = products.apply(
    lambda row: list(row['vectors_word']) + list(row['vectors_product']) + list(row['vectors_dept']),
    axis=1,
)

In [None]:
%%time
### Build the `annoy` index over the product embeddings
### The index can be queried for the products nearest to a given product embedding
### (the vector length is read from the row labelled 0)
p = annoy_build(df=products, id='product_id', vector_size=len(products.loc[0, 'vectors']))

In [None]:
%%time
### Build the `annoy` index over orders
### An order is represented by the average of its products' vectors, each read
### back from the product index `p`; the index finds orders similar to an embedding
order_w2v = {
    order_id: np.average([p.get_item_vector(item_id) for item_id in basket], axis=0)
    for order_id, basket in tqdm(order_baskets.items())
}

# One row per order: its id, its product list and its averaged vector
df_order_baskets = pd.DataFrame(
    {
        'order_id': order_baskets.index,
        'product_id': order_baskets.values,
    }
)
df_order_baskets['vectors'] = list(order_w2v.values())

# Uses annoy_build's default distance metric.
b = annoy_build(df_order_baskets, 'order_id', len(df_order_baskets['vectors'][0]))

In [None]:
%%time
def func_avg(x):
    """Return the element-wise mean of the vectors in ``x`` (mean over axis 0)."""
    return np.mean(x, axis=0)

### Build the `annoy` index over users
### A user is represented by the average of the vectors of all their orders;
### the index can be used to identify similar users for a given embedding

# Attach the order header columns (including user_id) to every order vector.
user_basket = pd.merge(df_order_baskets, orders, on="order_id", how='inner')

df_user_basket = user_basket[['user_id', 'vectors', 'product_id']]
# One row per user: `vectors` becomes the list of that user's order vectors,
# `product_id` the list of that user's baskets.
df_user_basket = df_user_basket.groupby('user_id').agg(list)
# Per-element work goes through `apply`. Recent pandas versions warn that
# calling Series.agg with a function that acts on each element is deprecated.
df_user_basket['vectors'] = df_user_basket['vectors'].apply(func_avg).apply(tuple)
# Flatten the user's baskets into one list of distinct product ids.
df_user_basket['product_id'] = df_user_basket['product_id'].apply(
    lambda baskets: list({item for basket in baskets for item in basket})
)
df_user_basket = df_user_basket.reset_index()

# Uses annoy_build's default distance metric.
u = annoy_build(df_user_basket, 'user_id', len(df_user_basket.vectors[0]))

In [None]:
# %%time 
# ### Train `annoy` for `daytime` data
# daytime_basket = pd.merge(df_order_baskets, orders, on='order_id', how='inner')
# daytime_basket = daytime_basket.groupby('daytime').apply(lambda x: [list(x['vectors']), list(x['product_id'])]).apply(pd.Series)
# daytime_basket.columns =['vectors','product_id']
# daytime_basket['vectors'] = daytime_basket['vectors'].apply(lambda x: tuple(np.average(x, axis=0)))
# daytime_basket['product_id'] = daytime_basket['product_id'].apply(lambda x: [item for sublist in x for item in sublist])
# daytime_basket['product_id'] = daytime_basket['product_id'].apply(lambda x: list(set(x)))
# df_daytime_basket = daytime_basket.reset_index().reset_index().rename(columns={'index':'daytime_id'})
# # Specify the metric to be used for computing distances. 
# d = annoy_build(df_daytime_basket, 'daytime_id', len(df_daytime_basket.vectors[0]))

## Testing Embeddings

In [None]:
# Inputs can be of multiple types
# 1. Only products as input - get the centroid vector of the products, then find similar carts and rank their products
# 2. Only user_ids as input - get the centroid vector of the users, then find similar carts and rank their products
# 3. Both products and user_ids as input - get the product and user centroids, find the products nearest to the product centroid, then rank them against the user centroid
# 4. No input - find products based on daytime (not implemented yet)

In [None]:
# Sample inputs for the demo calls further down.
# Product ids, matched against `products.product_id`.
product_list = [47136, 2529, 8990]
# User ids, looked up in the user index `u`.
user_list = [1,2,3]
# Not referenced by the visible code; get_basket_by_daytime is still a stub.
daytime = 127

In [None]:
def rank_by_euclidean(df, vector):
    """Rank the rows of ``df`` by Euclidean distance to ``vector``, nearest first.

    Args:
        df: DataFrame with a ``vectors`` column holding one numeric sequence
            per row, each of the same length as ``vector``.
        vector: Reference vector (list, tuple or numpy array).

    Returns:
        A new DataFrame sorted by ascending ``distance``. The ``distance``
        column is also written onto the ``df`` that was passed in.
    """
    # Convert once so that plain lists work on either side of the subtraction.
    target = np.asarray(vector)
    df['distance'] = df['vectors'].apply(lambda x: np.linalg.norm(np.asarray(x) - target))
    # A smaller distance means a closer match, so the best rows come first.
    df = df.sort_values('distance', ascending=True)
    return df

def rank_by_dot_product(df, vector):
    """Rank the rows of ``df`` by the dot product of their vector with ``vector``.

    A ``dot_prod`` column is written onto ``df`` and a copy sorted by that
    column, largest value first, is returned.
    """
    scores = df['vectors'].map(lambda candidate: np.dot(candidate, vector))
    df['dot_prod'] = scores
    return df.sort_values('dot_prod', ascending=False)

def compose_basket_by_cart(product_vector, input = None, n_items = 15, method='euclidean', n_neighbours = 100):
    """Recommend products drawn from the orders closest to ``product_vector``.

    The ``n_neighbours`` nearest orders are fetched from the basket index
    ``b``, their products are pooled, ranked against ``product_vector`` and
    the first ``n_items`` are returned.

    Args:
        product_vector: Query embedding, in the same space as the vectors
            stored in ``b`` and in ``products['vectors']``.
        input: Optional collection of product ids to leave out of the result
            (typically the products the query was built from). The name
            shadows the builtin; it is kept for keyword callers.
        n_items: Maximum number of products to return.
        method: ``'euclidean'`` ranks with ``rank_by_euclidean``; any other
            value ranks with ``rank_by_dot_product``.
        n_neighbours: Number of nearest orders used as the candidate pool.

    Returns:
        DataFrame with columns ``product_name``, ``department`` and ``aisle``
        and at most ``n_items`` rows, with a fresh 0-based index.
    """
    order_list = b.get_nns_by_vector(product_vector, n_neighbours)

    # Pool the distinct products of all neighbouring orders. A set avoids
    # re-copying a growing list on every iteration.
    candidate_ids = set()
    for order in order_list:
        candidate_ids.update(order_baskets[order])

    sel_df = pd.DataFrame({"product_id": list(candidate_ids)}).merge(products, on='product_id', how='inner')
    if method == 'euclidean':
        sel_df = rank_by_euclidean(sel_df, product_vector)
    else:
        sel_df = rank_by_dot_product(sel_df, product_vector)
    if input is not None:
        sel_df = sel_df[~sel_df.product_id.isin(input)]
    sel_df = sel_df[['product_name', 'department', 'aisle']].head(n_items).reset_index(drop=True)
    return sel_df

In [None]:
def get_centroid(vector_list):
    """Return the centroid (element-wise mean over axis 0) of ``vector_list``."""
    stacked = np.asanyarray(vector_list)
    return stacked.mean(axis=0)

def get_centroid_by_annoy_obj(obj_ann, obj_list):
    """Return the centroid of the vectors stored in ``obj_ann`` for the ids in ``obj_list``.

    Each id is resolved with ``obj_ann.get_item_vector`` and the resulting
    vectors are averaged by ``get_centroid``.
    """
    item_vectors = [obj_ann.get_item_vector(obj_id) for obj_id in obj_list]
    return get_centroid(item_vectors)

def get_basket_by_product_list(product_list, n_items = 15):
    """Recommend products for a list of product ids.

    Args:
        product_list: Product ids to base the recommendation on.
        n_items: Maximum number of products to return.

    Returns:
        The DataFrame produced by ``compose_basket_by_cart`` for the centroid
        of the given products, with those products themselves excluded.

    Raises:
        ValueError: If none of the ids is present in ``products``.
    """
    selected = products[products.product_id.isin(product_list)]
    if selected.empty:
        # Without any vector there is no centroid to query with.
        raise ValueError("none of the given product ids exist in `products`")
    prod_vector = get_centroid(selected.vectors.tolist())
    # compose_basket_by_cart filters on product_id, so the exclusion list must
    # hold ids; product names would never match and nothing would be removed.
    return compose_basket_by_cart(prod_vector, input=selected.product_id.tolist(), n_items=n_items)

def get_basket_by_user_list(user_list, n_items = 15):
    """Recommend up to ``n_items`` products for a list of user ids.

    The users' vectors are read from the user index ``u``, averaged into one
    centroid, and that centroid is passed to ``compose_basket_by_cart``.
    """
    centroid = get_centroid_by_annoy_obj(obj_ann=u, obj_list=user_list)
    basket = compose_basket_by_cart(centroid, n_items=n_items)
    return basket

def get_basket_by_user_product(product_list, user_list, n_items = 15, n_candidates = 1000):
    """Recommend products near the given products, ranked against the given users.

    Args:
        product_list: Product ids that define the product centroid.
        user_list: User ids whose vectors (from the user index ``u``) define
            the user centroid.
        n_items: Maximum number of products to return.
        n_candidates: Number of nearest products fetched from the product
            index ``p`` before ranking.

    Returns:
        DataFrame with columns ``product_name``, ``department`` and ``aisle``
        and at most ``n_items`` rows, with a fresh 0-based index.

    Raises:
        ValueError: If none of the product ids is present in ``products``.
    """
    # Get product vector
    selected = products[products.product_id.isin(product_list)]
    if selected.empty:
        # Without any vector there is no centroid to query with.
        raise ValueError("none of the given product ids exist in `products`")
    prod_vector = get_centroid(selected.vectors.tolist())
    # Get the nearest candidate products
    similar_products = p.get_nns_by_vector(prod_vector, n_candidates)
    sel_df = pd.DataFrame({"product_id": similar_products}).merge(products, on='product_id', how='inner')

    # Get user vector
    user_vector = get_centroid_by_annoy_obj(u, user_list)
    # Rank the candidates by aggregated user vector
    sel_df = rank_by_euclidean(sel_df, user_vector)
    # Remove input products: compare ids with ids (names would never match)
    sel_df = sel_df[~sel_df.product_id.isin(selected.product_id)]
    # Return top n
    sel_df = sel_df[['product_name', 'department', 'aisle']].head(n_items).reset_index(drop=True)
    return sel_df

def get_basket_by_daytime(daytime_id):
    """Placeholder for daytime-based recommendations; not implemented.

    Accepts ``daytime_id``, does nothing with it and returns ``None``.
    """
    return None

In [None]:
# Recommend products for the sample products and sample users combined.
get_basket_by_user_product(product_list, user_list)

In [None]:
# Recommend products based on the sample products only.
get_basket_by_product_list(product_list)

In [None]:
# Recommend products based on the sample users only.
get_basket_by_user_list(user_list)

## Save objects

In [None]:
%%time
### This section will save resources
### These resources can later be used by an app to run the engine

# Output folder: `res` inside the parent of the current working directory.
save_path = os.path.join(os.path.dirname(os.getcwd()), 'res')

def save_annoy(obj, n):
    """Save an Annoy index to ``<save_path>/<n>.ann``.

    Args:
        obj: Object exposing ``save(path)`` (here: a built Annoy index).
        n: File name without extension.
    """
    # Make sure the output folder exists before writing into it.
    os.makedirs(save_path, exist_ok=True)
    path = os.path.join(save_path, n + ".ann")
    obj.save(path)

## Save annoy objects
# p: product index, u: user index, b: order (basket) index.
save_annoy(p, "product")
save_annoy(u, "user")
save_annoy(b, "basket")
# The daytime index is not built (its cell is commented out), so it is not saved.
#save_annoy(d, "daytime")

In [None]:
%%time
### Save dataframes to avoid pre-processing
def save_dataframe(obj, n):
    """Pickle a pandas object to ``<save_path>/<n>.pkl`` and print a confirmation.

    Args:
        obj: Object exposing ``to_pickle(path)`` (DataFrame or Series).
        n: File name without extension.
    """
    # Make sure the output folder exists before writing into it.
    os.makedirs(save_path, exist_ok=True)
    path = os.path.join(save_path, n + ".pkl")
    obj.to_pickle(path)
    print(path, "saved !")

# `products` carries the product metadata together with all embedding columns.
save_dataframe(products, 'products')
# `order_baskets` is the per-order list of product ids (a Series, not a DataFrame).
save_dataframe(order_baskets, 'order_baskets')
# The daytime dataframe is not built (its cell is commented out), so it is not saved.
#save_dataframe(df_daytime_basket, 'df_daytime_basket')