In [1]:
!pip install annoy
!pip install efficient-apriori



In [2]:
!pip install art
!pip install plotly



In [3]:
import os

import numpy as np
import pandas as pd
from pandas.core.common import flatten
from annoy import AnnoyIndex
from gensim.models import Word2Vec
from efficient_apriori import apriori
import seaborn as sns

import nltk
nltk.download('wordnet')

from tqdm import tqdm
import zipfile as zp
from art import *
import matplotlib.pyplot as plt

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

import plotly.offline as pyo
import plotly.graph_objs as go
from plotly.tools import FigureFactory as FF
import plotly.express as px

# from PyDictionary import PyDictionary P
import random
import time

#import scikitplot as skplt

#to enable the inline plotting
%matplotlib inline 

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package wordnet to /Users/mrpapa/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
# Converting the days and hours from numbers to their interpretable form
import datetime
days_of_week = {0: 'Saturday', 
                1: 'Sunday', 
                2: 'Monday',
                3: 'Tuesday',
                4: 'Wednesday',
                5: 'Thursday',
                6: 'Friday'}
hour_nums = list(range(24))
hours_of_day = {hour_num:datetime.time(hour_num).strftime("%I:00 %p") for hour_num in hour_nums}

## Reading the Instacart dataset

In [5]:
# Change to location of data (Instacart Market Basket Analysis Kaggle data)
data_dir = os.path.join(os.path.dirname(os.getcwd()), 'data')

In [6]:
# Reading the csv files into corresponding dataframes
# Load products, aisles and departments
products = pd.read_csv(os.path.join(data_dir, "products.csv"))
aisles = pd.read_csv(os.path.join(data_dir, "aisles.csv"))
departments = pd.read_csv(os.path.join(data_dir, "departments.csv"))

# Load orders dataset
orders = pd.read_csv(os.path.join(data_dir, "orders.csv"))
order_products_prior = pd.read_csv(os.path.join(data_dir, "order_products__prior.csv"))
order_products_train = pd.read_csv(os.path.join(data_dir, "order_products__train.csv"))

# Replacing numbers with their corresponding hour representation
# orders['order_hour_of_day'] = orders['order_hour_of_day'].replace(to_replace=hours_of_day)
# orders['order_hour_of_day'] = pd.Categorical(orders['order_hour_of_day'], 
#                                              ordered=True, 
#                                              categories=list(hours_of_day.values()))

# Replacing numbers with their corresponding day of week
# Define the categories of days of week sorted normally from Saturday to Friday
orders['order_dow'] = orders['order_dow'].replace(to_replace=days_of_week)
orders['order_dow'] = pd.Categorical(orders['order_dow'], 
                                     ordered=True, 
                                     categories=list(days_of_week.values()))

orders['daytime'] = orders['order_dow'].astype('str') + orders['order_hour_of_day'].astype('str')

## Defining Constants

In [7]:
# Maximum order value (+1 for limit)
MAX_ORDER_LIMIT = orders.order_id.max() + 1
# Limiting the number of orders to process
orders_limit = MAX_ORDER_LIMIT # use 10000/100000 for limits or MAX ORDER
# Number of orders/baskets to pull similar to the requested
orders_returns = 15
# Number of dimensions of the vector annoy is going to store. 
vector_size = 64
# Number of trees for queries. When making a query the more trees the easier it is to go down the right path. 
trees = 10
# Number of product recommendation as maximum
NUMBER_OUTPUT_PRODUCTS = 10

# Sample size for the TSNE model and plot
tsne_size = 1000
# Threshold for a minimum support
threshold = 1e-3
# Threshold for the maximun number of products to bring
threshold_top = 10
# Threshold for distance, based on the quantile calculation of the basket distances
threshold_distance= 0.1

## Merge datasets

In [8]:
products['products_mod'] = products['product_name'].str.lower()
# Clean special characters.
products['products_mod'] = products['products_mod'].str.replace('\W', ' ', regex=True)
# Split products into terms: Tokenize.
products['products_mod'] = products['products_mod'].str.split()
# Merge the department and aisle names into the dataframe. 
products = pd.merge(products, departments, on="department_id", how='outer')
products = pd.merge(products, aisles, on="aisle_id", how='outer')
# Remove synonyms here in the list
products['products_mod'] = products[['products_mod', 'aisle', 'department']].values.tolist()
products['products_mod'] = products['products_mod'].apply(lambda x:list(flatten(x)))

In [9]:
# Steam and lemmatisation of the product name
# https://stackoverflow.com/a/25082458/3780957
# https://en.wikipedia.org/wiki/Lemmatisation

lemma = nltk.wordnet.WordNetLemmatizer()
sno = nltk.stem.SnowballStemmer('english')
products['products_lemma'] = products['products_mod'].apply(lambda row:[lemma.lemmatize(item) for item in row])
products['products_lemma'] = products['products_lemma'].apply(lambda row:[sno.stem(item) for item in row])

## Word2Vec models

In [10]:
### Training the `Word2Vec` model based on product lemma

# Defining the maximun window
window_max = max(products['products_lemma'].apply(lambda x:len(x)))

# size=20: In order to make `Word2Vec` a little bit quicker and for memory efficiency we're going to use 20 dimensions.
# window=49: In order to make sure all words are used in training the model, we're going to set a large.
w2vec_model = Word2Vec(list(products['products_lemma']), vector_size=vector_size, window=window_max, min_count=1, workers=-1)

### Vector calculation for products
# Loop through each product and obtain the average of each string that makes a product. <br>
# This will be the vector representation of the product. <br>
# The vector representation of the product will be used to calculate the similarity between products. <br>
# The similarity between products will be used to recommend products to the user. <br>

# Loop through each word in the product name to generate the vector.
prods_w2v = dict()
for row, product in tqdm(products.iterrows()):
    word_vector = list()
    for word in product['products_lemma']:
        word_vector.append(w2vec_model.wv[word])

    prods_w2v[product['product_id']] = np.average(word_vector, axis=0)

# Save vector values in list form to the dataframe.
# products['vectors'] = prods_w2v.values()

49688it [00:01, 31370.70it/s]


In [11]:
%%time
### Training the `Word2Vec` model based on cart
# products which are ordered together will be closer in vector space

# Get orders dataset to extract cart data
orders_filter = order_products_prior[order_products_prior.order_id < orders_limit]
order_baskets = orders_filter.groupby('order_id')['product_id'].apply(list)

# Define maximum window for longest order cart
window_max = order_baskets.apply(len).max()

# w2vec model
w2vec_model = Word2Vec(list(order_baskets), vector_size=vector_size, window=window_max, min_count=1, workers=-1)

# get vectors for each product
products['vectors'] = products.product_id.apply(lambda x: w2vec_model.wv[x])

CPU times: user 38.1 s, sys: 8.11 s, total: 46.2 s
Wall time: 49.8 s


## Calculate Approximate Nearest Neighbours (using ANNOY)

In [12]:
## Using `annoy` model to calculate the similarity between products
# The `annoy` model is a library to search for points in space that are close to a given query point. <br>
# It also creates large read-only file-based data structures that are mmpped into memory so that many processes may share the same data. <br>
# For our case, we will use the `annoy` model to calculate the similarity between products. <br>
# The `annoy` model is trained by taking as input a matrix of pairwise similarities between objects and converting them into probabilities using a Gaussian kernel. <br>
# It then tries to minimize the Kullback–Leibler divergence between the joint probabilities of the low-dimensional embedding and the high-dimensional data. <br>

def annoy_build(df, id, metric='euclidean'):
    m = AnnoyIndex(vector_size, metric=metric) 
    m.set_seed(42)
    for _, row in df.iterrows():
        m.add_item(row[id], row['vectors'])
    m.build(trees)
    return m

def build_hnsw(df, id ,metric='euclidean'):
    pass

In [13]:
%%time
### Train `annoy` for `product` dataset
# We need to specify ahead of time to annoy that there are 20 vector dimensions. Defined as a constant at `vector_size`.
# We also specify we want the model to find distances using `euclidean` distance.

# Specify the metric to be used for computing distances. 
p = annoy_build(products, 'product_id')

CPU times: user 4.79 s, sys: 324 ms, total: 5.11 s
Wall time: 4.81 s


In [14]:
%%time
### Train `annoy` for `orders` dataset
order_w2v = dict()
for index, row in tqdm(order_baskets.items()):
    word_vector = list()
    for item_id in row:
        word_vector.append(p.get_item_vector(item_id))
    order_w2v[index] = np.average(word_vector, axis=0)

df_order_baskets = pd.DataFrame({'order_id': order_baskets.index, 'product_id': order_baskets.values})
df_order_baskets['vectors'] = order_w2v.values()

# Specify the metric to be used for computing distances. 
b = annoy_build(df_order_baskets, 'order_id')

3214874it [01:47, 29911.61it/s]


CPU times: user 3min 58s, sys: 13.4 s, total: 4min 11s
Wall time: 3min 21s


In [15]:
%%time
### Train `annoy` for `user` dataset
# Creating an `annoy` object to index the `user` information
user_basket = pd.merge(df_order_baskets, orders, on="order_id", how='inner')
user_basket = user_basket.groupby('user_id').apply(lambda x: [list(x['vectors']), list(x['product_id'])]).apply(pd.Series)
user_basket.columns =['vectors','product_id']
user_basket['vectors'] = user_basket['vectors'].apply(lambda x: tuple(np.average(x, axis=0)))
user_basket['product_id'] = user_basket['product_id'].apply(lambda x: [item for sublist in x for item in sublist])
user_basket['product_id'] = user_basket['product_id'].apply(lambda x: list(set(x)))
df_user_basket = user_basket.reset_index()

# Specify the metric to be used for computing distances. 
u = annoy_build(df_user_basket, 'user_id')

CPU times: user 42.8 s, sys: 1min 39s, total: 2min 22s
Wall time: 2min 55s


In [16]:
%%time 
### Train `annoy` for `daytime` data
daytime_basket = pd.merge(df_order_baskets, orders, on='order_id', how='inner')
daytime_basket = daytime_basket.groupby('daytime').apply(lambda x: [list(x['vectors']), list(x['product_id'])]).apply(pd.Series)
daytime_basket.columns =['vectors','product_id']
daytime_basket['vectors'] = daytime_basket['vectors'].apply(lambda x: tuple(np.average(x, axis=0)))
daytime_basket['vectors_list'] = daytime_basket['vectors'].apply(list)
daytime_basket['product_id'] = daytime_basket['product_id'].apply(lambda x: [item for sublist in x for item in sublist])
daytime_basket['product_id'] = daytime_basket['product_id'].apply(lambda x: list(set(x)))
df_daytime_basket = daytime_basket.reset_index().reset_index().rename(columns={'index':'daytime_id'})
# Specify the metric to be used for computing distances. 
d = annoy_build(df_daytime_basket, 'daytime_id')

CPU times: user 9.52 s, sys: 32.7 s, total: 42.2 s
Wall time: 50.3 s


### Profile Daytime embeddings using tSNE

In [17]:
# import matplotlib.pyplot as plt
# from sklearn.manifold import TSNE
# import plotly.express as px

# daytime_index = orders[['order_dow', 'order_hour_of_day', 'daytime']].drop_duplicates().sort_values('order_dow').reset_index(drop = True).reset_index().rename(columns={'index':'daytime_id'})

# day_shape_mapping = {
#     'Monday': 'circle',
#     'Tuesday': 'square',
#     'Wednesday': 'diamond',
#     'Thursday': 'cross',
#     'Friday': 'star',
#     'Saturday': 'triangle-up',
#     'Sunday': 'triangle-down'
# }

# # Extract labels and vectors
# labels = df_daytime_basket['daytime']
# vectors = np.array(daytime_basket['vectors'].tolist())

# # Apply t-SNE
# tsne = TSNE(n_components=2, random_state=42)
# embedded_vectors = tsne.fit_transform(vectors)

# # Create a DataFrame for the embedded vectors
# embedded_df = pd.DataFrame(embedded_vectors, columns=['Dimension 1', 'Dimension 2'])
# embedded_df['daytime'] = labels
# embedded_df = embedded_df.merge(daytime_index, on='daytime')
# embedded_df['Shape'] = embedded_df['order_dow'].apply(lambda x: day_shape_mapping[x])

# # Create an interactive scatter plot with Plotly
# fig = px.scatter(embedded_df, x='Dimension 1', y='Dimension 2', color='order_hour_of_day', symbol='order_dow', hover_data=['daytime'])

# # Update plot title and axis labels
# fig.update_layout(
#     title='t-SNE Visualization',
#     xaxis_title='Dimension 1',
#     yaxis_title='Dimension 2',
#     showlegend=True,
#     coloraxis_colorbar=dict(yanchor="top", y=1, x=0,
#                                           ticks="outside")
# )

# fig.write_html("daytime_profile.html")

## Association Rule Mining to get best recommendations

In [75]:
## Similarity between products
### Define the function to calculate the similarity between products

# List the unique products maintaining the original order
def unique_preserve_order(seq):
    seen = set()
    seen_add = seen.add
    return [x for x in seq if not (x in seen or seen_add(x))]

# Sort recommendations by `lift`, and filter if the products are too close
def product_lift(basket, input = None, order_baskets=order_baskets, th_support=threshold, th_n=threshold_top, products=products):
    # Force to include the manual `input`
    recommendations = basket['product_id'].tolist()
    if input is not None:
        recommendations.extend(input)
    recommendations = set(recommendations)
    
    # Get all instances where either 1 or many products in recommendations were ordered together
    # Identify all orders where atleast 1 recommended product is available
    df_ = order_baskets[order_baskets.apply(lambda x: any(i in recommendations for i in x))].tolist()
    # For each order cart, only keep recommended products in cart
    order_baskets_ = [[i for i in sublist if i in recommendations] for sublist in df_]

    # Calculate `apriori` rules using a efficient library to speed up the calculation
    _, rules = apriori(order_baskets_, min_support=th_support, min_confidence=1e-2, max_length=5)
    
    # Multiple filters, but due to the lack of orders, are limiting the number of results, so a simple filter is active
    if input is not None:
        rules_rhs = filter(lambda rule: \
            not all(x in rule.rhs for x in input)
            , rules)
    else:
        rules_rhs = rules

    # Combine all the rules found in the data
    # Sorted by highest lift
    rule_combined = list()
    for rule in sorted(rules_rhs, key=lambda rule: rule.lift, reverse=True):
        # print(rule)
        rule_combined.extend(rule.rhs)

    # List the unique products maintaining the original order
    product_recommendation = unique_preserve_order(rule_combined)

    ## The following code, filters the recommendations after `lift`, based on the distance between the products
    # List of products
    prod = pd.DataFrame({'product_id': product_recommendation})
    prod_cross_join = prod.merge(prod, how='cross')
    # Calculate the distance between all the products
    prod_cross_join['distance'] = prod_cross_join.apply(lambda row: p.get_distance(row['product_id_x'], row['product_id_y']), axis=1)
    # Remove the same product (distance==0)
    prod_cross_join = prod_cross_join[prod_cross_join['distance']!=0]
    prod_cross_join.sort_values('distance', ascending=False)
    # Looking for closest products
    # Threshold for the filter, 10% of the distance (defined at `threshold_distance` constant)
    th_distance = np.quantile(prod_cross_join, threshold_distance)
    for id in product_recommendation:
        to_be_removed = prod_cross_join.loc[(prod_cross_join['product_id_x']==id) & (prod_cross_join['distance']<th_distance), 'product_id_y']
        prod_cross_join = prod_cross_join[~prod_cross_join['product_id_x'].isin(to_be_removed)]
    # List of final recommendations after the filters and thresholds
    prod_after_filtered = prod_cross_join['product_id_x'].unique()
    # Retain the order from the `lift`
    product_recommendation_filtered = pd.DataFrame({'product_recommendation': product_recommendation}).set_index('product_recommendation').loc[prod_after_filtered].reset_index()
    # Recall the products in the previous order
    product_recommendation_product = products.set_index("product_id").loc[product_recommendation_filtered['product_recommendation']].reset_index()

    return product_recommendation_product[['product_name', 'department', 'aisle']].head(th_n)

# Finds the recommended basket, based on the `Word2Vec` vector as input
def basket_recompose(w2v, b=b, order_baskets=order_baskets):
    # Search for a similar basket in `b`
    similar_baskets = b.get_nns_by_vector(w2v, orders_returns, search_k=-1, include_distances=False)
    basket_recompose = pd.DataFrame({'order_id': similar_baskets, 'product_id': order_baskets[similar_baskets].values}).explode('product_id')

    return basket_recompose

In [127]:
def get_simple_recommendation(input_vector):
    product_list = p.get_nns_by_vector(input_vector, n=15)
    return products[products.product_id.isin(product_list)][['product_name', 'department', 'aisle']].reset_index(drop=True)
    
def filter_dt_recommendation(x_product = [], x_user = [], daytime_id = None):

    input = None
    user_basket = None
    product_basket = None
    final_vector_list = list()
    
    basket = pd.DataFrame()
    if x_user:
        word_vector = list()
        for user in x_user:
            word_vector.append(tuple(u.get_item_vector(user)))
        user_w2v = np.average(word_vector, axis=0)
        final_vector_list.append(user_w2v)
        
        user_basket = basket_recompose(user_w2v)
        basket = pd.concat([basket, user_basket], axis=0)

    if x_product:
        word_vector = list()
        for item_id in x_product:
            word_vector.append(p.get_item_vector(item_id))
        product_w2v = np.average(word_vector, axis=0)
        final_vector_list.append(product_w2v)
        
        similar_products = p.get_nns_by_vector(product_w2v, 100 + len(x_product), search_k=-1, include_distances=False)
        product_basket = pd.DataFrame({'order_id': 0, 'product_id': similar_products})
        product_basket = product_basket[~product_basket['product_id'].isin(x_product)]
        basket = pd.concat([basket, product_basket], axis=0)
        input = x_product

    basket = basket.reset_index(drop=True).drop_duplicates('product_id')

    # If daytime is available, filter those products which were ever sold in that daytime + 4 similar daytimes
    if daytime_id is not None:
        DAYTIME_NEIGHBOURS = 10
        similar_daytime = d.get_nns_by_item(daytime_id, n=DAYTIME_NEIGHBOURS)
        filter_list_of_list = df_daytime_basket[df_daytime_basket.daytime_id.isin(similar_daytime)]['product_id'].tolist()
        filter_list = list(set([i for sublist in filter_list_of_list for i in sublist]))
        basket = basket[basket.product_id.isin(filter_list)]

    if len(final_vector_list) > 1:
        final_vector = np.average(final_vector_list, axis=0)
    else:
        final_vector = final_vector_list[0]
    
    try:
        return product_lift(basket, input), user_basket, product_basket, basket, final_vector
    except Exception as e:
        print(e)
        return get_simple_recommendation(final_vector), user_basket, product_basket, basket, final_vector

In [126]:
%%time
# basket = filter_dt_recommendation(x_product = [26405, 46149], x_user = [1], daytime_id=165)
df, userb, prodb, basket, input_vector = filter_dt_recommendation(x_product = [26405, 46149], x_user = [1], daytime_id=165)
for prod in [26405, 46149]:
    print(products[products.product_id == prod]['product_name'].item())
rec

XL Pick-A-Size Paper Towel Rolls
Zero Calorie Cola
CPU times: user 9.54 s, sys: 34.6 s, total: 44.1 s
Wall time: 51.6 s


Unnamed: 0,product_name,department,aisle
0,Soda,beverages,soft drinks
1,Zero Calorie Cola,beverages,soft drinks
2,Strawberries,produce,fresh fruits
3,Organic Whole String Cheese,dairy eggs,packaged cheese
4,Organic Strawberries,produce,fresh fruits
5,Organic Whole Milk,dairy eggs,milk
6,No Salt Added Black Beans,canned goods,canned meals beans
7,Organic Large Extra Fancy Fuji Apple,produce,fresh fruits
8,Shredded Parmesan,dairy eggs,packaged cheese
9,Organic Turkey Bacon,meat seafood,hot dogs bacon sausage


## Save resources

In [21]:
%%time
### This section will save resources
### These resources can later be used by an app to run the engine

save_path = os.path.join(os.path.dirname(os.getcwd()), 'res')

def save_annoy(obj, n):
    path = os.path.join(save_path, n + ".ann")
    obj.save(path)

## Save annoy objects
save_annoy(p, "product")
save_annoy(u, "user")
save_annoy(b, "basket")
save_annoy(d, "daytime")

CPU times: user 3.76 ms, sys: 1.12 s, total: 1.13 s
Wall time: 6.01 s


In [79]:
%%time
### Save dataframes to avoid pre-processing
def save_dataframe(obj, n):
    path = os.path.join(save_path, n + ".pkl")
    obj.to_pickle(path)
    print(path, "saved !")

save_dataframe(products, 'products')
save_dataframe(order_baskets, 'order_baskets')
save_dataframe(df_daytime_basket, 'df_daytime_basket')

/Users/mrpapa/upwork/nlp/res/products.pkl saved !
/Users/mrpapa/upwork/nlp/res/order_baskets.pkl saved !
/Users/mrpapa/upwork/nlp/res/df_daytime_basket.pkl saved !
CPU times: user 1.57 s, sys: 3.25 s, total: 4.82 s
Wall time: 5.73 s


In [123]:
product_list = [47136, 2529, 8990]
users = []
daytime = 127

In [124]:
df, userb, prodb, basket, input_vector =  filter_dt_recommendation(product_list, users, daytime)

Cannot set a DataFrame with multiple columns to the single column distance


In [125]:
df

Unnamed: 0,product_name,department,aisle
1963,Veggie Stix,snacks,chips pretzels
4658,SeriousMilk Classic Milk Chocolate Bar,snacks,candy chocolate
5017,Nuggets Milk Chocolate With Almond,snacks,candy chocolate
5924,"Macadamias, Sea Salt & Cracked Pepper",snacks,nuts seeds dried fruit
10646,Pumpkin Pie Filling,pantry,baking ingredients
12295,Superfruit Pomegranate Green Tea,beverages,tea
18127,Cashew Milk Cappuccino Non-Dairy Frozen Dessert,frozen,ice cream ice
19052,Premium Ice,frozen,ice cream ice
23738,Cherry Flavor Primadophilus Kids Chewables,personal care,digestion
23914,5 Symptom Digestive Relief,personal care,digestion
