In [1]:
!pip install annoy
!pip install efficient-apriori



In [2]:
!pip install art
!pip install plotly



In [1]:
import numpy as np
import pandas as pd
from pandas.core.common import flatten
from annoy import AnnoyIndex
from gensim.models import Word2Vec
from efficient_apriori import apriori
import seaborn as sns

import nltk
nltk.download('wordnet')

from tqdm import tqdm
import zipfile as zp
from art import *
import matplotlib.pyplot as plt

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

import plotly.offline as pyo
import plotly.graph_objs as go
from plotly.tools import FigureFactory as FF
import plotly.express as px

# from PyDictionary import PyDictionary P
import random
import time

#import scikitplot as skplt

#to enable the inline plotting
%matplotlib inline 

import warnings
warnings.filterwarnings('ignore')

sns.set_style("darkgrid")
sns.color_palette("Greens", 10)

[nltk_data] Error loading wordnet: <urlopen error [Errno 60] Operation
[nltk_data]     timed out>


In [2]:
# define Seaborn color palette to use
colors = sns.color_palette("Greens", 10)

In [3]:
# define the fig size of all figures
plt.rcParams["figure.figsize"] = (10,6)
# the directory to unzip the data in
data_directory_path = '/Users/mrpapa/upwork/nlp/'

In [4]:
# Converting the days and hours from numbers to their interpretable form
import datetime
days_of_week = {0: 'Saturday', 
                1: 'Sunday', 
                2: 'Monday',
                3: 'Tuesday',
                4: 'Wednesday',
                5: 'Thursday',
                6: 'Friday'}
hour_nums = list(range(24))
hours_of_day = {hour_num:datetime.time(hour_num).strftime("%I:00 %p") for hour_num in hour_nums}

In [5]:
# Annotate text on graph
def annotate_text(p, append_to_text='%'):
    for p in ax.patches:
        txt = str(p.get_height().round(2)) + append_to_text
        txt_x = p.get_x() + p.get_width()/2.
        txt_y = 0.92*p.get_height()
        ax.text(txt_x,txt_y,txt, fontsize=12, color='Black', ha='center', va='bottom')

## Reading the Instacart dataset

In [6]:
# Reading the csv files into corresponding dataframes
aisles = pd.read_csv("data/aisles.csv")
order_products_prior = pd.read_csv("data/order_products__prior.csv")
order_products_train = pd.read_csv("data/order_products__train.csv")
products = pd.read_csv("data/products.csv")
orders = pd.read_csv("data/orders.csv")
departments = pd.read_csv("data/departments.csv")

# Replacing numbers with their corresponding day of week
# Define the categories of days of week sorted normally from Saturday to Friday
orders['order_dow'] = orders['order_dow'].replace(to_replace=days_of_week)

orders['order_daytime'] = orders['order_dow'] + orders['order_hour_of_day'].astype('str')

## Defining Constants

In [7]:
# Limiting the number of orders to process
orders_limit = 100000
# Color constants for the console
COLOR_CONSTANT = {'input': '\033[94m', 'warning': '\033[93m', 'error': '\033[91m', 'note': '\033[96m', 'end': '\033[0m'}
# Number of orders/baskets to pull similar to the requested
orders_returns = 15
# Number of dimensions of the vector annoy is going to store. 
vector_size = 64
# Number of trees for queries. When making a query the more trees the easier it is to go down the right path. 
trees = 10
# Number of product recommendation as maximum
#NUMBER_OUTPUT_PRODUCTS = 10
# Sample size for the TSNE model and plot
tsne_size = 1000
# Threshold for a minimum support
threshold = 1e-3
# Threshold for the maximun number of products to bring
threshold_top = 10
# Threshold for distance, based on the quantile calculation of the basket distances
threshold_distance= 0.1

## Merge datasets

In [8]:
products['products_mod'] = products['product_name'].str.lower()
# Clean special characters.
products['products_mod'] = products['products_mod'].str.replace('\W', ' ', regex=True)
# Split products into terms: Tokenize.
products['products_mod'] = products['products_mod'].str.split()

In [9]:
# Merge the department and aisle names into the dataframe. 
products = pd.merge(products, departments, on="department_id", how='outer')
products = pd.merge(products, aisles, on="aisle_id", how='outer')

In [10]:
# Remove synonyms here in the list
products['products_mod'] = products[['products_mod', 'aisle', 'department']].values.tolist()
products['products_mod'] = products['products_mod'].apply(lambda x:list(flatten(x)))

In [11]:
# Steam and lemmatisation of the product name
# https://stackoverflow.com/a/25082458/3780957
# https://en.wikipedia.org/wiki/Lemmatisation

lemma = nltk.wordnet.WordNetLemmatizer()
sno = nltk.stem.SnowballStemmer('english')
products['products_lemma'] = products['products_mod'].apply(lambda row:[lemma.lemmatize(item) for item in row])
products['products_lemma'] = products['products_lemma'].apply(lambda row:[sno.stem(item) for item in row])

In [12]:
products.to_csv("lemma_product.csv", index=False)

In [13]:
### Training the `Word2Vec` model
# The `Word2Vec` model is a shallow neural network that is trained to reconstruct linguistic contexts of words. <br>
# The model takes as input a large corpus of text and produces a vector space, typically of several hundred dimensions, with each unique word in the corpus being assigned a corresponding vector in the space. <br>
# Word vectors are positioned in the vector space such that words that share common contexts in the corpus are located in close proximity to one another in the space. <br>
# The model is trained by taking each sentence in the corpus, sliding a window of fixed size over it and trying to predict the word in the middle of the window, given the words on the sides of the window as input. <br>

# Defining the maximun window
window_max = max(products['products_lemma'].apply(lambda x:len(x)))

# size=20: In order to make `Word2Vec` a little bit quicker and for memory efficiency we're going to use 20 dimensions.
# window=49: In order to make sure all words are used in training the model, we're going to set a large.
w2vec_model = Word2Vec(list(products['products_lemma']), vector_size=vector_size, window=window_max, min_count=1, workers=-1)

### Vector calculation for products
# Loop through each product and obtain the average of each string that makes a product. <br>
# This will be the vector representation of the product. <br>
# The vector representation of the product will be used to calculate the similarity between products. <br>
# The similarity between products will be used to recommend products to the user. <br>

# Loop through each word in the product name to generate the vector.
prods_w2v = dict()
for row, product in tqdm(products.iterrows()):
    word_vector = list()
    for word in product['products_lemma']:
        word_vector.append(w2vec_model.wv[word])

    prods_w2v[product['product_id']] = np.average(word_vector, axis=0)

# Save vector values in list form to the dataframe.
products['vectors'] = prods_w2v.values()

49688it [00:01, 31659.58it/s]


In [14]:
## TSNE model plot function
# The `TSNE` model is a non-linear dimensionality reduction technique that is particularly well-suited for embedding high-dimensional data into a space of two or three dimensions, which can then be visualized in a scatter plot. <br>
# Specifically, it models each high-dimensional object by a two- or three-dimensional point in such a way that similar objects are modeled by nearby points and dissimilar objects are modeled by distant points with high probability. <br>
# The `TSNE` model is trained by taking as input a matrix of pairwise similarities between objects and converting them into probabilities using a Gaussian kernel. <br>
# It then tries to minimize the Kullback–Leibler divergence between the joint probabilities of the low-dimensional embedding and the high-dimensional data. <br>

def tsne_plot(df, title, color=None, product_flag=False, auto_open=True, sample_size=tsne_size):
    # Data sample, to speedup the execution
    df_tsne_data = df.sample(n=sample_size, random_state=42)

    # Train the TSNE MODEL
    tsne_model = TSNE(perplexity=30, n_components=2, init='pca', n_iter=3500, random_state=42)
    new_values = tsne_model.fit_transform(np.array(list(df_tsne_data['vectors'])))

    # Prepare data
    x = list()
    y = list()
    for i in range(new_values.shape[0]):
        x.append(new_values[i][0])
        y.append(new_values[i][1])

    if color is not None:
        marker_ = dict(color=list(df_tsne_data[color]), colorscale='rdpu', showscale=False)
        if product_flag:
            text_ = df_tsne_data[['product_name', 'aisle', 'department']].agg('<br>'.join, axis=1)
        else:
            text_ = color + ": " +  df_tsne_data[color].astype(str)
    else:
        marker_ = text_ = None
    
    trace = go.Scatter(
        x = x,
        y = y,
        mode = 'markers',
        text = text_,
        hoverinfo = 'text',
        marker = marker_
    )

    layout = go.Layout(
        title = title,
        hovermode = 'closest',
        xaxis = dict(title='Dimension one', autorange=True),
        yaxis = dict(title='Dimension two', autorange=True))

    # Create plot
    fig = go.Figure(data=[trace], layout=layout)

    fig.show()

In [15]:
## TSNE model plot function, with selection ----

def tsne_plot2(df, title, selection, hover=None, auto_open=True, sample_size=tsne_size):

    # Data sample, to speedup the execution
    df_tsne_data = df.sample(n=sample_size, random_state=42)
    df_tsne_data['size'] = 1
    df_tsne_data['color'] = 'Others'

    selection = selection.copy()  # To avoid a warning
    selection['size'] = 5
    selection['color'] = 'Selection'

    df_tsne_data = df_tsne_data.append(selection)

    # Train the TSNE MODEL
    tsne_model = TSNE(perplexity=30, n_components=2, init='pca', n_iter=3500, random_state=42)
    tsne_values = tsne_model.fit_transform(np.array(list(df_tsne_data['vectors'])))

    df_tsne_data['tsne-2d-one'] = tsne_values[:, 0]
    df_tsne_data['tsne-2d-two'] = tsne_values[:, 1]

    if hover is not None:
        df_tsne_data['hover'] = df_tsne_data[hover]
    else:
        df_tsne_data['hover'] = df_tsne_data[['product_name', 'aisle', 'department']].agg('<br>'.join, axis=1)

    df_tsne_data.sort_values(by='color', ascending=False, inplace=True)

    fig = px.scatter(df_tsne_data, x="tsne-2d-one", y="tsne-2d-two",
                    color='color', 
                    size="size", size_max=8,
                    title=title,
                    hover_data=['hover'],
                    labels={
                        "tsne-2d-one": "Dimension one",
                        "tsne-2d-two": "Dimension two",
                        "color": "Color reference"
                    })
    fig.show()

In [16]:
## Using `annoy` model to calculate the similarity between products
# The `annoy` model is a library to search for points in space that are close to a given query point. <br>
# It also creates large read-only file-based data structures that are mmpped into memory so that many processes may share the same data. <br>
# For our case, we will use the `annoy` model to calculate the similarity between products. <br>
# The `annoy` model is trained by taking as input a matrix of pairwise similarities between objects and converting them into probabilities using a Gaussian kernel. <br>
# It then tries to minimize the Kullback–Leibler divergence between the joint probabilities of the low-dimensional embedding and the high-dimensional data. <br>

def annoy_build(df, id, metric='euclidean'):
    m = AnnoyIndex(vector_size, metric=metric) 
    m.set_seed(42)
    for _, row in df.iterrows():
        m.add_item(row[id], row['vectors'])
    m.build(trees)
    return m

In [17]:
### Train `annoy` for `product` dataset
# We need to specify ahead of time to annoy that there are 20 vector dimensions. Defined as a constant at `vector_size`.
# We also specify we want the model to find distances using `euclidean` distance.

# Specify the metric to be used for computing distances. 
p = annoy_build(products, 'product_id')
p.save("product_build.annoy")

True

In [18]:
### Train `annoy` for `orders` dataset
# In order to obtain the vector for each list we need to import the orders csv. <br>
# The order_products_prior has the order_id and the product_id (this is why we keeping product IDs as a key is useful).
# limit the number of orders we are operating on. 

orders_filter = order_products_prior[order_products_prior.order_id < orders_limit]
order_baskets = orders_filter.groupby('order_id')['product_id'].apply(list)

order_w2v = dict()
for index, row in tqdm(order_baskets.items()):
    word_vector = list()
    for item_id in row:
        word_vector.append(p.get_item_vector(item_id))
    order_w2v[index] = np.average(word_vector, axis=0)

df_order_baskets = pd.DataFrame({'order_id': order_baskets.index, 'product_id': order_baskets.values})
df_order_baskets['vectors'] = order_w2v.values()

# Specify the metric to be used for computing distances. 
b = annoy_build(df_order_baskets, 'order_id')
b.save("basket_build.annoy")

93947it [00:03, 31241.44it/s]


True

In [41]:
a = ['Coconut Creme Eggs', 'Easter Crème Eggs 4 Count, 4.8 oz pkg', 'Milk Chocolate Peanut Butter Eggs']

In [39]:
productid_map = dict(zip(products.product_name, products.product_id))

In [38]:
products.product_id.nunique() 

49688

In [37]:
products.product_name.nunique() 

49688

[4491, 11612, 20254]

In [19]:
### Train `annoy` for `user` dataset
# Creating an `annoy` object to index the `user` information

user_basket = pd.merge(df_order_baskets, orders, on="order_id", how='inner')
user_basket = user_basket.groupby('user_id').apply(lambda x: [list(x['vectors']), list(x['product_id'])]).apply(pd.Series)
user_basket.columns =['vectors','product_id']
user_basket['vectors'] = user_basket['vectors'].apply(lambda x: tuple(np.average(x, axis=0)))
user_basket['product_id'] = user_basket['product_id'].apply(lambda x: [item for sublist in x for item in sublist])
user_basket['product_id'] = user_basket['product_id'].apply(lambda x: list(set(x)))
df_user_basket = user_basket.reset_index()

# Specify the metric to be used for computing distances. 
u = annoy_build(df_user_basket, 'user_id')
u.save("user_build.annoy")

True

In [20]:
### Train 'annoy' for 'day-time' dataset
# Create day-time based order dataset
# orders['order_dow'] = orders['order_dow'].astype('str') 
# orders['order_hour_of_day'] = pd.to_datetime(orders['order_hour_of_day'].astype('str'))
# orders['order_hour_of_day'] = orders['order_hour_of_day'].apply(lambda x: x.strftime("%H"))

# orders['order_daytime'] = orders['order_dow'] + orders['order_hour_of_day']

daytime_basket = pd.merge(df_order_baskets, orders, on="order_id", how='inner')
daytime_basket = daytime_basket.groupby('order_daytime').apply(lambda x: [list(x['vectors']), list(x['product_id'])]).apply(pd.Series)
daytime_basket.columns =['vectors','product_id']
daytime_basket['vectors'] = daytime_basket['vectors'].apply(lambda x: tuple(np.average(x, axis=0)))
daytime_basket['product_id'] = daytime_basket['product_id'].apply(lambda x: [item for sublist in x for item in sublist])
daytime_basket['product_id'] = daytime_basket['product_id'].apply(lambda x: list(set(x)))
df_daytime_basket = daytime_basket.reset_index()

# Specify the metric to be used for computing distances. 
df_daytime_basket = df_daytime_basket.reset_index().rename(columns={'index':'daytime_id'})
d = annoy_build(df_daytime_basket, 'daytime_id')
d.save("daytime_build.annoy")

True

In [299]:
order_baskets.to_pickle('order_basket.pkl')

In [22]:
df_user_basket.to_csv("df_user_basket.csv", index=False)

In [23]:
df_daytime_basket.to_csv("df_daytime_basket.csv", index=False)

In [319]:
# Define the string you want to search for
search_string = 'chocol'

# Filter rows based on the presence of the search string in the list
products[products['products_lemma'].apply(lambda x: search_string in x)]

Unnamed: 0,product_id,product_name,aisle_id,department_id,products_mod,department,aisle,products_lemma,vectors
0,1,Chocolate Sandwich Cookies,61,19,"[chocolate, sandwich, cookies, cookies cakes, ...",snacks,cookies cakes,"[chocol, sandwich, cooki, cookies cak, snack]","[-3.445819e-05, -0.0009065453, -0.0018344745, ..."
3,172,Gluten Free All Natural Chocolate Chip Cookies,61,19,"[gluten, free, all, natural, chocolate, chip, ...",snacks,cookies cakes,"[gluten, free, all, natur, chocol, chip, cooki...","[0.0016820944, -0.0024560867, -0.0007397007, 0..."
7,559,Cookie Chips Crunchy Dark Chocolate Chocolate ...,61,19,"[cookie, chips, crunchy, dark, chocolate, choc...",snacks,cookies cakes,"[cooki, chip, crunchi, dark, chocol, chocol, c...","[0.00056251494, -0.0016667483, -0.0023854182, ..."
16,796,Chocolate Reese's Peanut Butter Cup Creme Oreo,61,19,"[chocolate, reese, s, peanut, butter, cup, cre...",snacks,cookies cakes,"[chocol, rees, s, peanut, butter, cup, creme, ...","[-0.003945703, 0.00061626884, 0.0020643375, 0...."
18,1129,Organic Family Recipe Chocolate Chip Cookies,61,19,"[organic, family, recipe, chocolate, chip, coo...",snacks,cookies cakes,"[organ, famili, recip, chocol, chip, cooki, co...","[0.0010658684, -0.0034217073, -0.002860622, 0...."
...,...,...,...,...,...,...,...,...,...
49307,19399,Organic Semi Sweet Chocolate Chips,6,2,"[organic, semi, sweet, chocolate, chips, other...",other,other,"[organ, semi, sweet, chocol, chip, other, other]","[0.0025311585, 0.001968342, 0.0010151336, -0.0..."
49330,20897,Milk Chocolate Coconut Bar,6,2,"[milk, chocolate, coconut, bar, other, other]",other,other,"[milk, chocol, coconut, bar, other, other]","[-0.0019966848, 0.0017560007, 0.0011367133, -0..."
49410,28072,Dark Chocolate Malt Balls,6,2,"[dark, chocolate, malt, balls, other, other]",other,other,"[dark, chocol, malt, ball, other, other]","[-0.0048537985, 0.0040092133, 0.0021434398, -0..."
49442,31197,Chocolate Covered Raisins,6,2,"[chocolate, covered, raisins, other, other]",other,other,"[chocol, cover, raisin, other, other]","[-0.0069007664, 0.004802491, 0.0015015441, -0...."


In [318]:
products['products_lemma']

0            [chocol, sandwich, cooki, cookies cak, snack]
1        [nutter, butter, cooki, bite, go, pak, cookies...
2              [danish, butter, cooki, cookies cak, snack]
3        [gluten, free, all, natur, chocol, chip, cooki...
4        [mini, nilla, wafer, munch, pack, cookies cak,...
                               ...                        
49683    [organ, black, mission, fig, bulk dried fruits...
49684    [crystal, ginger, chunk, bulk dried fruits veg...
49685         [veget, chip, bulk dried fruits veget, bulk]
49686    [natur, sweet, plantain, chip, bulk dried frui...
49687    [fit, super, a, juic, cold, press, carrot, app...
Name: products_lemma, Length: 49688, dtype: object

In [None]:
## Similarity between products
### Define the function to calculate the similarity between products

# List the unique products maintaining the original order
def unique_preserve_order(seq):
    seen = set()
    seen_add = seen.add
    return [x for x in seq if not (x in seen or seen_add(x))]

# Sort recommendations by `lift`, and filter if the products are too close
def product_lift(basket, input = None, order_baskets=order_baskets, th_support=threshold, th_n=threshold_top, products=products):
    # Force to include the manual `input`
    recommendations = basket['product_id'].tolist()
    if input is not None:
        recommendations.extend(input)
    recommendations = set(recommendations)

    # Baskets with only the recommended products by the w2v
    order_baskets_ = order_baskets.explode()
    order_baskets_ = order_baskets_[order_baskets_.isin(recommendations)]
    order_baskets_ = order_baskets_.groupby(level=0).apply(list)
    order_baskets_ = order_baskets_.to_list()

    # Calculate `apriori` rules using a efficient library to speed up the calculation
    _, rules = apriori(order_baskets_, min_support=th_support, min_confidence=1e-2, max_length=5)
    
    # Multiple filters, but due to the lack of orders, are limiting the number of results, so a simple filter is active
    if input is not None:
        rules_rhs = filter(lambda rule: \
            not all(x in rule.rhs for x in input)
            , rules)
    else:
        rules_rhs = rules

    # Combine all the rules found in the data
    # Sorted by highest lift
    rule_combined = list()
    for rule in sorted(rules_rhs, key=lambda rule: rule.lift, reverse=True):
        # print(rule)
        rule_combined.extend(rule.rhs)

    # List the unique products maintaining the original order
    product_recommendation = unique_preserve_order(rule_combined)

    ## The following code, filters the recommendations after `lift`, based on the distance between the products
    # List of products
    prod = pd.DataFrame({'product_id': product_recommendation})
    prod_cross_join = prod.merge(prod, how='cross')
    # Calculate the distance between all the products
    prod_cross_join['distance'] = prod_cross_join.apply(lambda row: p.get_distance(row['product_id_x'], row['product_id_y']), axis=1)
    # Remove the same product (distance==0)
    prod_cross_join = prod_cross_join[prod_cross_join['distance']!=0]
    prod_cross_join.sort_values('distance', ascending=False)
    # Looking for closest products
    # Threshold for the filter, 10% of the distance (defined at `threshold_distance` constant)
    th_distance = np.quantile(prod_cross_join, threshold_distance)
    for id in product_recommendation:
        to_be_removed = prod_cross_join.loc[(prod_cross_join['product_id_x']==id) & (prod_cross_join['distance']<th_distance), 'product_id_y']
        prod_cross_join = prod_cross_join[~prod_cross_join['product_id_x'].isin(to_be_removed)]
    # List of final recommendations after the filters and thresholds
    prod_after_filtered = prod_cross_join['product_id_x'].unique()
    # Retain the order from the `lift`
    product_recommendation_filtered = pd.DataFrame({'product_recommendation': product_recommendation}).set_index('product_recommendation').loc[prod_after_filtered].reset_index()
    # Recall the products in the previous order
    product_recommendation_product = products.set_index("product_id").loc[product_recommendation_filtered['product_recommendation']].reset_index()

    return product_recommendation_product[['product_name', 'department', 'aisle']].head(th_n)

# Finds the recommended basket, based on the `Word2Vec` vector as input
def basket_recompose(w2v, b=b, order_baskets=order_baskets):
    # Search for a similar basket in `b`
    similar_baskets = b.get_nns_by_vector(w2v, orders_returns, search_k=-1, include_distances=False)
    basket_recompose = pd.DataFrame({'order_id': similar_baskets, 'product_id': order_baskets[similar_baskets].values}).explode('product_id')

    return basket_recompose

In [262]:
daytime_w2v = d.get_item_vector(0)
basket_dt = basket_recompose(daytime_w2v)

In [274]:
### Calculate baskets based on different inputs
# Based on different inputs, a different method of calculating a basket.

def basket_multi_input(product_list=[], user_list=[], daytime=None):

    basket_main = pd.DataFrame()
    input_product_list = []
    
    # Product list
    product_w2v = None
    if product_list:
        p_word_vector = list()
        for item_id in product_list:
            p_word_vector.append(p.get_item_vector(item_id))
        product_w2v = np.average(p_word_vector, axis=0)

        # Search for a similar basket in `b`
        basket_prod = basket_recompose(product_w2v)
        # Remove the manually selected products. Cleanup the output
        basket_prod = basket_prod[~basket_prod['product_id'].isin(product_list)]
        input_product_list = input_product_list + product_list
        basket_main = pd.concat([basket_main, basket_prod], axis=0)
    
    # User list
    selection_w2v = None
    if user_list:
        u_word_vector = list()
        for item_id in user_list:
            u_word_vector.append(tuple(u.get_item_vector(item_id)))
        user_w2v = np.average(u_word_vector, axis=0)
        basket = basket_recompose(user_w2v)
        # Products from the list of users
        input = df_user_basket.loc[df_user_basket['user_id'].isin(x), 'product_id']
        input = [item for sublist in input for item in sublist]
        input_product_list = input_product_list + input
        
        basket_main = pd.concat([basket_main, basket], axis=0)

    if daytime is not None:
        daytime_w2v = d.get_item_vector(daytime)
        basket_dt = basket_recompose(daytime_w2v)

        input = df_daytime_basket.loc[df_daytime_basket['daytime_id'] == daytime, 'product_id'].item()
        input_product_list = input_product_list + input
        basket_main = pd.concat([basket_main, basket_dt], axis=0)
    
    return product_lift(basket_main, input_product_list)
    
# From a list of products, recommends a basket
def basket_input_product_list(x):
    word_vector = list()
    for item_id in x:
        word_vector.append(p.get_item_vector(item_id))
    product_w2v = np.average(word_vector, axis=0)

    # Search for a similar basket in `b`
    basket = basket_recompose(product_w2v)
    # Remove the manually selected products. Cleanup the output
    basket = basket[~basket['product_id'].isin(x)]
    
    basket_input = products[products['product_id'].isin(x)]
    basket_input_names = basket_input['product_name'].values

    return product_lift(basket, x), basket_input_names, basket_input

# Form a particular user, recommends a basket. Also report the users that are similar to the input.
def basket_input_user(x):
    user_w2v = u.get_item_vector(x)
    selection_w2v = pd.DataFrame({'user_id': x, 'vectors': [tuple(user_w2v),]})

    # Search for similar users in `u`
    similar_users = u.get_nns_by_item(x, orders_returns, search_k=-1, include_distances=False)[1:]

    # Products from the user
    input = df_user_basket.loc[df_user_basket['user_id'] == x, 'product_id'][0]
    products_user_input = products[products['product_id'].isin(input)]
    products_user_input_name = products_user_input['product_name'].tolist()

    # Search for a similar basket in `b`
    basket = basket_recompose(user_w2v)
    return product_lift(basket, input), similar_users, selection_w2v, products_user_input_name

# From a list of users, recommends a basket
def basket_input_user_list(x):
    word_vector = list()
    for item_id in x:
        word_vector.append(tuple(u.get_item_vector(item_id)))
    user_w2v = np.average(word_vector, axis=0)
    # Selected users
    selection_w2v = pd.DataFrame({'user_id': list(x,), 'vectors': list(word_vector,)})

    # Products from the list of users
    input = df_user_basket.loc[df_user_basket['user_id'].isin(x), 'product_id']
    input = [item for sublist in input for item in sublist]
    products_user_input = products[products['product_id'].isin(input)]
    products_user_input_name = products_user_input['product_name'].tolist()

    # Search for a similar basket in `b`
    basket = basket_recompose(user_w2v)
    return product_lift(basket, input), x, selection_w2v, products_user_input_name

In [281]:
%%time
x = [33120, 28985, 9327]
df = basket_multi_input([33120, 28985, 9327], [206205, 4])

CPU times: user 1 s, sys: 170 ms, total: 1.17 s
Wall time: 1.62 s


In [279]:
%%time
x = [33120]
df = basket_multi_input([33120], [206205])

CPU times: user 576 ms, sys: 130 ms, total: 706 ms
Wall time: 778 ms


In [282]:
df

Unnamed: 0,product_name,department,aisle
0,Organic Garlic,produce,fresh vegetables
1,Limes,produce,fresh fruits
2,Organic Hass Avocado,produce,fresh fruits
3,Green Bell Pepper,produce,fresh vegetables
4,Organic Low Sodium Chicken Broth,canned goods,soup broth bouillon
5,Feta Cheese Crumbles,dairy eggs,packaged cheese
6,Organic Whole Milk,dairy eggs,milk
7,Organic Extra Firm Tofu,deli,tofu meat alternatives
8,Total Greek Strained Yogurt,dairy eggs,yogurt
9,Uncured Genoa Salami,deli,lunch meat


In [266]:
basket_prod = basket_recompose(pw)
basket_prod = basket_prod[~basket_prod['product_id'].isin(x)]

basket_user = basket_recompose(uw)
basket_user = basket_user[~basket_user['product_id'].isin(x)]

In [31]:
s = AnnoyIndex()

TypeError: function missing required argument 'f' (pos 1)

In [30]:
s = AnnoyIndex.load("basket_build.annoy")

TypeError: descriptor 'load' for 'annoy.Annoy' objects doesn't apply to a 'str' object

In [270]:
pd.concat([basket_user, basket_prod], axis=0)


Unnamed: 0,order_id,product_id
0,30805,945
0,30805,23085
0,30805,2469
0,30805,32137
0,30805,21026
...,...,...
14,35940,27521
14,35940,17794
14,35940,44683
14,35940,45200


In [267]:
basket_prod

Unnamed: 0,order_id,product_id
0,8353,48775
0,8353,23165
0,8353,13114
0,8353,36144
0,8353,45044
...,...,...
14,35940,27521
14,35940,17794
14,35940,44683
14,35940,45200


In [245]:
%%time
product_lift(basket, x)

CPU times: user 753 ms, sys: 106 ms, total: 860 ms
Wall time: 888 ms


Unnamed: 0,product_name,department,aisle
0,Organic Rosemary,produce,fresh herbs
1,Limes,produce,fresh fruits
2,Organic Hass Avocado,produce,fresh fruits
3,Organic Zucchini,produce,fresh vegetables
4,Green Bell Pepper,produce,fresh vegetables
5,Organic Low Sodium Chicken Broth,canned goods,soup broth bouillon
6,Feta Cheese Crumbles,dairy eggs,packaged cheese
7,Organic Extra Firm Tofu,deli,tofu meat alternatives
8,Organic Whole Milk,dairy eggs,milk
9,Total Greek Strained Yogurt,dairy eggs,yogurt


In [None]:
### Run user interface ----
clear_console()

INPUT_TYPES = {1: 'Input product [Using name]', 2: 'Input product [Using ID]', 3: 'Input user', 4: 'Input users list', 5: 'Input order', 6: 'TSNE plots', 7: 'Auto EDA', 8: 'Orders and frequency EDA', 9: 'Exit'}
select_continue = 1  # Start the loop

# Infinite loop, will stop  by pressing Ctrl+C or selecting 'Exit' when prompt
while select_continue < max(INPUT_TYPES.keys()):
    # Prompt the user to continue or stop the infinite rounds
    select_continue = read_positive(message_try='Choose the type of input %s: ' % (
                                        ' '.join('\n{}: {}'.format(k, v) for k, v in INPUT_TYPES.items())),
                                    message_error='Only accepts the values %s, try again.' % (tuple(INPUT_TYPES.keys()),)
                                    )

    if select_continue < max(INPUT_TYPES.keys()) :
        clear_console()
        tprint(INPUT_TYPES[select_continue], "Standard")

    # Selection based on the key from `INPUT_TYPES`
    if select_continue == 1:
        # Multiple entries until the user press `Enter` to finish
        # https://www.geeksforgeeks.org/python-get-a-list-as-input-from-user/
        try:
            select_list = list()
            while True:
                select_ = read_product_name()
                if select_:
                    select_list.append(select_)
                else:
                    raise ValueError
        except:
            out_ = basket_input_product_list(select_list)
            print_color('Selected:')
            print(out_[1])
            print('\n')
            print_color('Recommendation:')
            print(out_[0])
            print('\n')
            print_color('Plot will open in Internet browser, please wait...')
            tsne_plot2(products, title='Selected `products` between others', selection=out_[2])

    if select_continue == 2:
        # Multiple entries until the user press `Enter` to finish
        # https://www.geeksforgeeks.org/python-get-a-list-as-input-from-user/
        try:
            select_list = list()
            while True:
                select_ = read_positive(message_try='Product number [ENTER to finish]: ', allow_enter=True)
                select_list.append(select_)
        except:
            out_ = basket_input_product_list(select_list)
            print_color('Selected:')
            print(out_[1])
            print('\n')
            print_color('Recommendation:')
            print(out_[0])
            print('\n')
            print_color('Plot will open in Internet browser, please wait...')
            tsne_plot2(products, title='Selected `products` between others', selection=out_[2])

    elif select_continue == 3:
        select_ = read_positive(message_try='Reference user: ')
        out_ = basket_input_user(select_)
        print_color('Similar to users:')
        print(out_[1])
        print('\n')
        print_color('Products purchased in previous orders:')
        print(out_[3])
        print('\n')
        print_color('Recommendation:')
        print(out_[0])
        print('\n')
        print_color('Plot will open in Internet browser, please wait...')
        tsne_plot2(df_user_basket, title='Selected `user` between others', selection=out_[2], hover='user_id')

    elif select_continue == 4:
        try:
            select_list = list()
            while True:
                select_ = read_positive(message_try='User number [e.g., [23, 27, 66]. ENTER to finish]: ', allow_enter=True)
                select_list.append(select_)
        except:
            out_ = basket_input_user_list(select_list)
            print_color('Selected:')
            print(out_[1])
            print('\n')
            print_color('Products purchased in previous orders:')
            print(out_[3])
            print('\n')
            print_color('Recommendation:')
            print(out_[0])
            print('\n')
            print_color('Plot will open in Internet browser, please wait...')
            tsne_plot2(df_user_basket, title='Selected `users` between others', selection=out_[2], hover='user_id')

    elif select_continue == 5:
        select_ = read_positive(message_try='Reference order: ')
        out_ = basket_input_order(select_)
        print_color('Products purchased in previous orders:')
        print(out_[2])
        print('\n')
        print_color('Recommendation:')
        print(out_[0])
        print('\n')
        print_color('Plot will open in Internet browser, please wait...')
        tsne_plot2(df_order_baskets, title='Selected `order` between others', selection=out_[1], hover='order_id')

    elif select_continue == 6:
        print_color('Please wait')
        # Create 3 different plots using TSNE algorithm
        tsne_plot(products, title='Products', color='department_id', product_flag=True)
        tsne_plot(df_user_basket, title='User average', color='user_id')
        tsne_plot(df_order_baskets, title='Order average', color='order_id')
        print_color('Done. Check your Internet browser.')

    elif select_continue == 7:
        print_color('Please wait')
        ProfileReport(products[['product_id', 'product_name', 'department', 'aisle']], title="Exploratory Data Analysis: `Products`").to_file("EDA/eda_products.html")
        ProfileReport(orders, title="Exploratory Data Analysis: `Orders and Users`").to_file("EDA/eda_orders_users.html")
        ProfileReport(orders_filter, title="Exploratory Data Analysis: `Orders and Products` (filtered)").to_file("EDA/eda_orders_products.html")
        print_color('Done. Check `EDA` folder.')

    elif select_continue == 8:
        print_color('Check popup window')
        eda(order_baskets, orders_filter, products)
