## Load relevant libraries

In [2]:
import os

import numpy as np
import pandas as pd
from pandas.core.common import flatten
from annoy import AnnoyIndex
from gensim.models import Word2Vec
from efficient_apriori import apriori
import seaborn as sns

import nltk
nltk.download('wordnet')

from tqdm import tqdm
import zipfile as zp
from art import *
import matplotlib.pyplot as plt

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

import plotly.offline as pyo
import plotly.graph_objs as go
from plotly.tools import FigureFactory as FF
import plotly.express as px

# from PyDictionary import PyDictionary P
import random
import time

#import scikitplot as skplt

#to enable the inline plotting
%matplotlib inline 

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package wordnet to /Users/mrpapa/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Load Data and Pre-processing

In [3]:
# Change to location of data (Instacart Market Basket Analysis Kaggle data)
data_dir = os.path.join(os.path.dirname(os.getcwd()), 'data')

In [4]:
# Reading the csv files into corresponding dataframes
# Load products, aisles and departments
products = pd.read_csv(os.path.join(data_dir, "products.csv"))
aisles = pd.read_csv(os.path.join(data_dir, "aisles.csv"))
departments = pd.read_csv(os.path.join(data_dir, "departments.csv"))

# Load orders dataset
orders = pd.read_csv(os.path.join(data_dir, "orders.csv"))
order_products_prior = pd.read_csv(os.path.join(data_dir, "order_products__prior.csv"))
order_products_train = pd.read_csv(os.path.join(data_dir, "order_products__train.csv"))

# Replacing numbers with their corresponding day of week
days_of_week = {0: 'Saturday', 
                1: 'Sunday', 
                2: 'Monday',
                3: 'Tuesday',
                4: 'Wednesday',
                5: 'Thursday',
                6: 'Friday'}
# Define the categories of days of week sorted normally from Saturday to Friday
orders['order_dow'] = orders['order_dow'].replace(to_replace=days_of_week)
orders['order_dow'] = pd.Categorical(orders['order_dow'], 
                                     ordered=True, 
                                     categories=list(days_of_week.values()))

orders['daytime'] = orders['order_dow'].astype('str') + orders['order_hour_of_day'].astype('str')

In [5]:
## Product name stemming and lemmatisation
products['products_mod'] = products['product_name'].str.lower()
# Clean special characters.
products['products_mod'] = products['products_mod'].str.replace('\W', ' ', regex=True)
# Split products into terms: Tokenize.
products['products_mod'] = products['products_mod'].str.split()
# Merge the department and aisle names into the dataframe. 
products = pd.merge(products, departments, on="department_id", how='outer')
products = pd.merge(products, aisles, on="aisle_id", how='outer')
# Remove synonyms here in the list
products['products_mod'] = products[['products_mod', 'aisle', 'department']].values.tolist()
products['products_mod'] = products['products_mod'].apply(lambda x:list(flatten(x)))

lemma = nltk.wordnet.WordNetLemmatizer()
sno = nltk.stem.SnowballStemmer('english')
products['products_lemma'] = products['products_mod'].apply(lambda row:[lemma.lemmatize(item) for item in row])
products['products_lemma'] = products['products_lemma'].apply(lambda row:[sno.stem(item) for item in row])

In [6]:
order_products_prior = order_products_prior.merge(products[['product_id', 'department_id', 'aisle_id']], on='product_id')

In [7]:
## Group all products for a single order into a list
## This will be later used to generate embeddings
order_baskets = order_products_prior.groupby('order_id')['product_id'].apply(list)

In [8]:
%%time
department_basket = order_products_prior.groupby('order_id')['department_id'].apply(list)
aisle_basket = order_products_prior.groupby('order_id')['aisle_id'].apply(list)

department_basket_unique = department_basket.apply(lambda x: list(set(x)))
aisle_basket_unique = aisle_basket.apply(lambda x: list(set(x)))

CPU times: user 1min 48s, sys: 49.5 s, total: 2min 38s
Wall time: 2min 59s


## Word2Vec models

In [72]:
## Config
WORD_VECTOR_SIZE = 8
PRODUCT_VECTOR_SIZE = 64
DEPARTMENT_VECTOR_SIZE = 2
AISLE_VECTOR_SIZE = 64

In [73]:
### Training the `Word2Vec` model based on product lemma

# Defining the maximun window
window_max = max(products['products_lemma'].apply(lambda x:len(x)))

# size=20: In order to make `Word2Vec` a little bit quicker and for memory efficiency we're going to use 20 dimensions.
# window=49: In order to make sure all words are used in training the model, we're going to set a large.
w2vec_model = Word2Vec(list(products['products_lemma']), vector_size=WORD_VECTOR_SIZE, window=window_max, min_count=1, workers=6)

### Vector calculation for products
# Loop through each product and obtain the average of each string that makes a product. <br>
# This will be the vector representation of the product. <br>
# The vector representation of the product will be used to calculate the similarity between products. <br>
# The similarity between products will be used to recommend products to the user. <br>

# Loop through each word in the product name to generate the vector.
prods_w2v = dict()
for row, product in tqdm(products.iterrows()):
    word_vector = list()
    for word in product['products_lemma']:
        word_vector.append(w2vec_model.wv[word])

    prods_w2v[product['product_id']] = np.average(word_vector, axis=0)

# Save vector values in list form to the dataframe.
products['vectors_word'] = prods_w2v.values()

49688it [00:01, 28349.08it/s]


In [38]:
%%time
### Training the `Word2Vec` model based on cart
# products which are ordered together will be closer in vector space

# Define maximum window for longest order cart
window_max = order_baskets.apply(len).max()

# w2vec model
w2vec_model = Word2Vec(list(order_baskets), vector_size=PRODUCT_VECTOR_SIZE, window=window_max, min_count=1, workers=6)

# get vectors for each product
products['vectors_product'] = products.product_id.apply(lambda x: w2vec_model.wv[x])

CPU times: user 6min 36s, sys: 11.3 s, total: 6min 47s
Wall time: 1min 44s


In [74]:
%%time
# # Define maximum window for longest order cart
window_max = department_basket.apply(len).max()

# w2vec model
w2vec_model = Word2Vec(list(department_basket), vector_size=DEPARTMENT_VECTOR_SIZE, window=window_max, min_count=1, workers=6)

# get vectors for each product
products['vectors_aisle'] = products.department_id.apply(lambda x: w2vec_model.wv[x])

# Define maximum window for longest order cart
# window_max = aisle_basket.apply(len).max()

# # w2vec model
# w2vec_model = Word2Vec(list(aisle_basket), vector_size=AISLE_VECTOR_SIZE, window=window_max, min_count=1, workers=-1)

# # get vectors for each product
# products['vectors_aisle'] = products.aisle_id.apply(lambda x: w2vec_model.wv[x])

CPU times: user 25.4 s, sys: 3.66 s, total: 29 s
Wall time: 27.2 s


In [14]:
# ## Save plotly html interactive plot of tsne visualization

# aisle = products[['aisle', 'vectors_aisle']].drop_duplicates('aisle').reset_index(drop=True)

# # # Apply t-SNE
# labels = aisle['aisle']
# vectors = np.array(aisle['vectors_aisle'].tolist())

# tsne = TSNE(n_components=2, random_state=42)
# embedded_vectors = tsne.fit_transform(vectors)

# # Create a DataFrame for the embedded vectors
# embedded_df = pd.DataFrame(embedded_vectors, columns=['Dimension 1', 'Dimension 2'])
# embedded_df['aisle'] = labels
# embedded_df = embedded_df.merge(products[['aisle', 'department']].drop_duplicates(), on='aisle', how='left')

# # Create an interactive scatter plot with Plotly
# fig = px.scatter(embedded_df, x='Dimension 1', y='Dimension 2', hover_data=['aisle'], color='department')


# fig.write_html("aisle.html")

In [75]:
## Concatenate all vectors together to form a single representation of product embedding
products['vectors'] = products.apply(lambda x: [*x['vectors_word'], *x['vectors_product'], *x['vectors_aisle']], axis=1)
# products['vectors'] = products['vectors_product']

In [84]:
## Using `annoy` model to calculate the similarity between products
def annoy_build(df, id, vector_size, metric='euclidean'):
    trees = 10
    m = AnnoyIndex(vector_size, metric=metric) 
    m.set_seed(42)
    for _, row in df.iterrows():
        m.add_item(row[id], row['vectors'])
    m.build(trees)
    return m

In [85]:
%%time
### Train `annoy` for `product` dataset

# Specify the metric to be used for computing distances. 
p = annoy_build(products, 'product_id', len(products['vectors'][0]))

CPU times: user 1.64 s, sys: 176 ms, total: 1.82 s
Wall time: 1.6 s


In [18]:
## Create order cart embeddings

## Testing vector embeddings

In [19]:
prod_dict = dict(zip(products.product_id, products.product_name))

In [171]:
# input_name = ['Hair Balance Shampoo', 'Gillette Sensor 3 Disposable Razors']
input_name = ['Stuffed Pasta Shells']
# input_name = ['Hair Balance Shampoo', 'Gillette Sensor 3 Disposable Razors']

selected = products[products.product_name.isin(input_name)]
prod_vector = np.average(selected.vectors.tolist(), axis=0)

nns_prod_w_d = p.get_nns_by_vector(prod_vector, 20 + len(input_name), include_distances=True)
sel_df = pd.DataFrame({"product_id": nns_prod_w_d[0], "distance": nns_prod_w_d[1]}).merge(products, on='product_id', how='inner')
sel_df = sel_df[~sel_df.product_id.isin(selected.product_id.tolist())]
sel_df[['product_name', 'department', 'aisle']].head(15).reset_index(drop=True)

Unnamed: 0,product_name,department,aisle
0,Chicken Corn Taquitos,frozen,frozen meals
1,Thai Style Yellow Curry Chicken,frozen,frozen meals
2,Cajun Style Chicken Alfredo,frozen,frozen meals
3,Cafe Steamers Creamy Chicken & Noodles,frozen,frozen meals
4,Steak & Jalapeno Burrito,frozen,frozen meals
5,"Burgers, Gourmet",frozen,frozen meals
6,Parmesan Garlic Bread,frozen,frozen appetizers sides
7,Small Round Cheese Ravioli,frozen,frozen meals
8,Roasted Vegetable & Goat Cheese Flatbread Crus...,frozen,frozen pizza
9,Chimichangas Beef Bean,frozen,frozen meals


## Order cart embedding

In [83]:
%%time
### Train `annoy` for `orders` dataset
order_w2v = dict()
for index, row in tqdm(order_baskets.items()):
    word_vector = list()
    for item_id in row:
        word_vector.append(p.get_item_vector(item_id))
    order_w2v[index] = np.average(word_vector, axis=0)

df_order_baskets = pd.DataFrame({'order_id': order_baskets.index, 'product_id': order_baskets.values})
df_order_baskets['vectors'] = order_w2v.values()

# Specify the metric to be used for computing distances. 
b = annoy_build(df_order_baskets, 'order_id', len(df_order_baskets['vectors'][0]))

3214874it [02:03, 25968.41it/s]


CPU times: user 1min 57s, sys: 7.61 s, total: 2min 4s
Wall time: 2min 6s


In [172]:
compose_basket_by_cart(prod_vector)

Unnamed: 0,product_name,department,aisle
0,Asian Noodles Teriyaki,international,asian foods
1,Mini Beef Corn Dogs,frozen,frozen appetizers sides
2,New York City Slices Gourmet Pizza,frozen,frozen pizza
3,Hot Buffalo Wings Potato Chips,deli,prepared meals
4,Twice Baked Potato,deli,prepared meals
5,Tuscan Inspired Uncured Two Meat Pizza,frozen,frozen meals
6,Broccoli & Cheese Couscous,dry goods pasta,instant foods
7,Chicken Empanadas,frozen,frozen appetizers sides
8,Shin Ramyun Noodle Soup,dry goods pasta,instant foods
9,Mini Crab Cakes,frozen,frozen meals


## User embedding 

In [125]:
%%time
def func_avg(x):
    return np.average(x, axis=0)

user_basket = pd.merge(df_order_baskets, orders, on="order_id", how='inner')

df_user_basket = user_basket[['user_id', 'vectors', 'product_id']]
df_user_basket = df_user_basket.groupby('user_id').agg(list)
df_user_basket['vectors'] = df_user_basket['vectors'].agg(func_avg).apply(tuple)
df_user_basket['product_id'] = df_user_basket['product_id'].agg(lambda x: list(set([item for sublist in x for item in sublist])))
df_user_basket = df_user_basket.reset_index()

CPU times: user 13.8 s, sys: 32.9 s, total: 46.7 s
Wall time: 1min


In [140]:
%%time
# Specify the metric to be used for computing distances. 
u = annoy_build(df_user_basket, 'user_id', len(df_user_basket.vectors[0]))

CPU times: user 6.97 s, sys: 434 ms, total: 7.41 s
Wall time: 5.54 s


In [158]:
%%time 
### Train `annoy` for `daytime` data
daytime_basket = pd.merge(df_order_baskets, orders, on='order_id', how='inner')
daytime_basket = daytime_basket.groupby('daytime').apply(lambda x: [list(x['vectors']), list(x['product_id'])]).apply(pd.Series)
daytime_basket.columns =['vectors','product_id']
daytime_basket['vectors'] = daytime_basket['vectors'].apply(lambda x: tuple(np.average(x, axis=0)))
daytime_basket['product_id'] = daytime_basket['product_id'].apply(lambda x: [item for sublist in x for item in sublist])
daytime_basket['product_id'] = daytime_basket['product_id'].apply(lambda x: list(set(x)))
df_daytime_basket = daytime_basket.reset_index().reset_index().rename(columns={'index':'daytime_id'})
# Specify the metric to be used for computing distances. 
d = annoy_build(df_daytime_basket, 'daytime_id', len(df_daytime_basket.vectors[0]))

CPU times: user 10.1 s, sys: 43.7 s, total: 53.7 s
Wall time: 1min 20s


## Testing Embeddings

In [173]:
# Inputs can be of multiple types
# 1. Only products input - Get centroid vector of products, then find carts that are similar and rank products
# 2. Only user_ids input - Get centroid vector of user_ids, then find carts that are similar and rank products
# 3. Both products and user_ids as input - Get products and user centroid, then find products by product and dot product with user centroid to find best products
# 4. No input - Find products based on daytime

In [190]:
product_list = [47136, 2529, 8990]
user_list = [1,2,3]
daytime = 127

In [184]:
def rank_by_euclidean(df, vector):
    df['distance'] = df['vectors'].apply(lambda x: np.linalg.norm(x - vector))
    df = df.sort_values('distance', ascending=False)
    return df

def rank_by_dot_product(df, vector):
    df['dot_prod'] = df['vectors'].apply(lambda x: np.dot(x, vector))
    df = df.sort_values('dot_prod', ascending=False)
    return df

def compose_basket_by_cart(product_vector, input = None, n_items = 15, method='euclidean', n_neighbours = 100):
    order_list = b.get_nns_by_vector(product_vector, n_items)
    fpl = []
    for order in order_list:
        fpl = fpl + order_baskets[order]
    
    fpl = list(set(fpl))

    sel_df = pd.DataFrame({"product_id": fpl}).merge(products, on='product_id', how='inner')
    if method == 'euclidean':
        sel_df = rank_by_euclidean(sel_df, prod_vector)
    else:  
        sel_df = dot_prod_rank(sel_df, prod_vector)
    if input is not None:
        sel_df = sel_df[~sel_df.product_id.isin(input)]
    sel_df = sel_df[['product_name', 'department', 'aisle']].head(n_items).reset_index(drop=True)
    return sel_df

In [201]:
def get_centroid(vector_list):
    return np.average(vector_list, axis=0)

def get_centroid_by_annoy_obj(obj_ann, obj_list):
    w2v_list = []
    for obj_id in obj_list:
        w2v_list.append(obj_ann.get_item_vector(obj_id))
    return get_centroid(w2v_list)

def get_basket_by_product_list(product_list, n_items = 15):
    selected = products[products.product_id.isin(product_list)]
    prod_vector = get_centroid(selected.vectors.tolist())
    return compose_basket_by_cart(prod_vector, input=selected.product_name.tolist(), n_items=n_items)

def get_basket_by_user_list(user_list, n_items = 15):
    user_vector = get_centroid_by_annoy_obj(u, user_list)
    return compose_basket_by_cart(user_vector, n_items=n_items)

def get_basket_by_user_product(product_list, user_list, n_items = 15):
    # Get product vector
    selected = products[products.product_id.isin(product_list)]
    prod_vector = get_centroid(selected.vectors.tolist())
    # Get 1000 nearest products
    similar_prod_1000 = p.get_nns_by_vector(prod_vector, 1000)
    sel_df = pd.DataFrame({"product_id": similar_prod_1000}).merge(products, on='product_id', how='inner')

    # Get user vector
    user_vector = get_centroid_by_annoy_obj(u, user_list)
    # Rank 1000 products by aggregated user vector
    sel_df = rank_by_euclidean(sel_df, user_vector)
    # Remove input products
    sel_df = sel_df[~sel_df.product_id.isin(selected.product_name.tolist())]
    # Return top n
    sel_df = sel_df[['product_name', 'department', 'aisle']].head(n_items).reset_index(drop=True)
    return sel_df

def get_basket_by_daytime(daytime_id):
    pass

In [215]:
len(prod_vector)

74

In [205]:
#get_basket_by_user_product(product_list, user_list)

In [206]:
#get_basket_by_product_list(product_list)

In [207]:
#get_basket_by_user_list(user_list)

## Save objects

In [208]:
%%time
### This section will save resources
### These resources can later be used by an app to run the engine

save_path = os.path.join(os.path.dirname(os.getcwd()), 'res')

def save_annoy(obj, n):
    path = os.path.join(save_path, n + ".ann")
    obj.save(path)

## Save annoy objects
save_annoy(p, "product")
save_annoy(u, "user")
save_annoy(b, "basket")
#save_annoy(d, "daytime")

CPU times: user 4.78 ms, sys: 1.22 s, total: 1.23 s
Wall time: 6.11 s


In [214]:
%%time
### Save dataframes to avoid pre-processing
def save_dataframe(obj, n):
    path = os.path.join(save_path, n + ".pkl")
    obj.to_pickle(path)
    print(path, "saved !")

save_dataframe(products, 'products')
save_dataframe(order_baskets, 'order_baskets')
#save_dataframe(df_daytime_basket, 'df_daytime_basket')

/Users/mrpapa/upwork/nlp/res/products.pkl saved !
/Users/mrpapa/upwork/nlp/res/order_baskets.pkl saved !
CPU times: user 5.99 s, sys: 1.11 s, total: 7.09 s
Wall time: 7.88 s
