In [1]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec

from pandas.core.common import flatten
import nltk

from tqdm import tqdm

In [2]:
orders = pd.read_csv("data/orders.csv") 

In [3]:
# Limiting the number of orders to process
orders_limit = 10000
# Color constants for the console
COLOR_CONSTANT = {'input': '\033[94m', 'warning': '\033[93m', 'error': '\033[91m', 'note': '\033[96m', 'end': '\033[0m'}
# Number of orders/baskets to pull similar to the requested
orders_returns = 15
# Number of dimensions of the vector annoy is going to store. 
vector_size = 25
# Number of trees for queries. When making a query the more trees the easier it is to go down the right path. 
trees = 10
# Number of product recommendation as maximum
#NUMBER_OUTPUT_PRODUCTS = 10
# Sample size for the TSNE model and plot
tsne_size = 1000
# Threshold for a minimum support
threshold = 1e-3
# Threshold for the maximun number of products to bring
threshold_top = 10
# Threshold for distance, based on the quantile calculation of the basket distances
threshold_distance= 0.1

In [4]:
products = pd.read_csv("data/products.csv")
products['product_name'] = products.product_name.str.lower()
product_dict = dict(zip(products.product_id, products.product_name))

In [184]:
products['products_mod'] = products['product_name'].str.lower()
# Clean special characters.
products['products_mod'] = products['products_mod'].str.replace('\W', ' ', regex=True)
# Split products into terms: Tokenize.
products['products_mod'] = products['products_mod'].str.split()

# Merge the department and aisle names into the dataframe. 
products = pd.merge(products, departments, on="department_id", how='outer')
products = pd.merge(products, aisles, on="aisle_id", how='outer')

# Remove synonyms here in the list
products['products_mod'] = products[['products_mod', 'aisle', 'department']].values.tolist()
products['products_mod'] = products['products_mod'].apply(lambda x:list(flatten(x)))

In [186]:
# Steam and lemmatisation of the product name
# https://stackoverflow.com/a/25082458/3780957
# https://en.wikipedia.org/wiki/Lemmatisation

lemma = nltk.wordnet.WordNetLemmatizer()
sno = nltk.stem.SnowballStemmer('english')
products['products_lemma'] = products['products_mod'].apply(lambda row:[lemma.lemmatize(item) for item in row])
products['products_lemma'] = products['products_lemma'].apply(lambda row:[sno.stem(item) for item in row])

In [193]:
products.loc[0, ['products_lemma', 'products_mod']]

products_lemma        [chocol, sandwich, cooki, cookies cak, snack]
products_mod      [chocolate, sandwich, cookies, cookies cakes, ...
Name: 0, dtype: object

In [204]:
products

Unnamed: 0,product_id,product_name,aisle_id,department_id,products_mod,department,aisle,products_lemma,vectors
0,1,chocolate sandwich cookies,61,19,"[chocolate, sandwich, cookies, cookies cakes, ...",snacks,cookies cakes,"[chocol, sandwich, cooki, cookies cak, snack]","[0.0048274104, -0.000982171, 0.0135972, 0.0127..."
1,78,nutter butter cookie bites go-pak,61,19,"[nutter, butter, cookie, bites, go, pak, cooki...",snacks,cookies cakes,"[nutter, butter, cooki, bite, go, pak, cookies...","[0.00053272134, -0.0016033737, 0.010740572, 0...."
2,102,danish butter cookies,61,19,"[danish, butter, cookies, cookies cakes, snacks]",snacks,cookies cakes,"[danish, butter, cooki, cookies cak, snack]","[0.007038258, -0.005356223, 0.013606298, 0.012..."
3,172,gluten free all natural chocolate chip cookies,61,19,"[gluten, free, all, natural, chocolate, chip, ...",snacks,cookies cakes,"[gluten, free, all, natur, chocol, chip, cooki...","[0.0021328337, 0.0015880795, 0.003318248, 0.00..."
4,285,mini nilla wafers munch pack,61,19,"[mini, nilla, wafers, munch, pack, cookies cak...",snacks,cookies cakes,"[mini, nilla, wafer, munch, pack, cookies cak,...","[0.0013800311, -0.0027976793, 0.014260448, 0.0..."
...,...,...,...,...,...,...,...,...,...
49683,22827,organic black mission figs,18,10,"[organic, black, mission, figs, bulk dried fru...",bulk,bulk dried fruits vegetables,"[organ, black, mission, fig, bulk dried fruits...","[0.0052806833, -0.00082850707, -0.00867612, -0..."
49684,28655,crystallized ginger chunks,18,10,"[crystallized, ginger, chunks, bulk dried frui...",bulk,bulk dried fruits vegetables,"[crystal, ginger, chunk, bulk dried fruits veg...","[-0.001904256, -0.002966276, -0.010072536, -0...."
49685,30365,vegetable chips,18,10,"[vegetable, chips, bulk dried fruits vegetable...",bulk,bulk dried fruits vegetables,"[veget, chip, bulk dried fruits veget, bulk]","[-0.01041863, -0.012444427, -0.016481701, -0.0..."
49686,38007,naturally sweet plantain chips,18,10,"[naturally, sweet, plantain, chips, bulk dried...",bulk,bulk dried fruits vegetables,"[natur, sweet, plantain, chip, bulk dried frui...","[0.0021803195, -0.00030821623, -0.011328056, -..."


In [202]:
## Training the `Word2Vec` model
# The `Word2Vec` model is a shallow neural network that is trained to reconstruct linguistic contexts of words. <br>
# The model takes as input a large corpus of text and produces a vector space, typically of several hundred dimensions, with each unique word in the corpus being assigned a corresponding vector in the space. <br>
# Word vectors are positioned in the vector space such that words that share common contexts in the corpus are located in close proximity to one another in the space. <br>
# The model is trained by taking each sentence in the corpus, sliding a window of fixed size over it and trying to predict the word in the middle of the window, given the words on the sides of the window as input. <br>

# Defining the maximun window
window_max = max(products['products_lemma'].apply(lambda x:len(x)))

# size=20: In order to make `Word2Vec` a little bit quicker and for memory efficiency we're going to use 20 dimensions.
# window=49: In order to make sure all words are used in training the model, we're going to set a large.
w2vec_model = Word2Vec(list(products['products_lemma']), vector_size=vector_size, window=window_max, min_count=1, workers=-1)

### Vector calculation for products
# Loop through each product and obtain the average of each string that makes a product. <br>
# This will be the vector representation of the product. <br>
# The vector representation of the product will be used to calculate the similarity between products. <br>
# The similarity between products will be used to recommend products to the user. <br>

# Loop through each word in the product name to generate the vector.
prods_w2v = dict()
for row, product in tqdm(products.iterrows()):
    word_vector = list()
    for word in product['products_lemma']:
        word_vector.append(w2vec_model.wv[word])

    prods_w2v[product['product_id']] = np.average(word_vector, axis=0)

# Save vector values in list form to the dataframe.
products['vectors'] = prods_w2v.values()

49688it [00:01, 30440.11it/s]


In [203]:
products

Unnamed: 0,product_id,product_name,aisle_id,department_id,products_mod,department,aisle,products_lemma,vectors
0,1,chocolate sandwich cookies,61,19,"[chocolate, sandwich, cookies, cookies cakes, ...",snacks,cookies cakes,"[chocol, sandwich, cooki, cookies cak, snack]","[0.0048274104, -0.000982171, 0.0135972, 0.0127..."
1,78,nutter butter cookie bites go-pak,61,19,"[nutter, butter, cookie, bites, go, pak, cooki...",snacks,cookies cakes,"[nutter, butter, cooki, bite, go, pak, cookies...","[0.00053272134, -0.0016033737, 0.010740572, 0...."
2,102,danish butter cookies,61,19,"[danish, butter, cookies, cookies cakes, snacks]",snacks,cookies cakes,"[danish, butter, cooki, cookies cak, snack]","[0.007038258, -0.005356223, 0.013606298, 0.012..."
3,172,gluten free all natural chocolate chip cookies,61,19,"[gluten, free, all, natural, chocolate, chip, ...",snacks,cookies cakes,"[gluten, free, all, natur, chocol, chip, cooki...","[0.0021328337, 0.0015880795, 0.003318248, 0.00..."
4,285,mini nilla wafers munch pack,61,19,"[mini, nilla, wafers, munch, pack, cookies cak...",snacks,cookies cakes,"[mini, nilla, wafer, munch, pack, cookies cak,...","[0.0013800311, -0.0027976793, 0.014260448, 0.0..."
...,...,...,...,...,...,...,...,...,...
49683,22827,organic black mission figs,18,10,"[organic, black, mission, figs, bulk dried fru...",bulk,bulk dried fruits vegetables,"[organ, black, mission, fig, bulk dried fruits...","[0.0052806833, -0.00082850707, -0.00867612, -0..."
49684,28655,crystallized ginger chunks,18,10,"[crystallized, ginger, chunks, bulk dried frui...",bulk,bulk dried fruits vegetables,"[crystal, ginger, chunk, bulk dried fruits veg...","[-0.001904256, -0.002966276, -0.010072536, -0...."
49685,30365,vegetable chips,18,10,"[vegetable, chips, bulk dried fruits vegetable...",bulk,bulk dried fruits vegetables,"[veget, chip, bulk dried fruits veget, bulk]","[-0.01041863, -0.012444427, -0.016481701, -0.0..."
49686,38007,naturally sweet plantain chips,18,10,"[naturally, sweet, plantain, chips, bulk dried...",bulk,bulk dried fruits vegetables,"[natur, sweet, plantain, chip, bulk dried frui...","[0.0021803195, -0.00030821623, -0.011328056, -..."


In [6]:
train_orders = pd.read_csv("data/order_products__train.csv")
prior_orders = pd.read_csv("data/order_products__prior.csv")

In [7]:
train_orders.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,1,49302,1,1
1,1,11109,2,1
2,1,10246,3,0
3,1,49683,4,0
4,1,43633,5,1


In [8]:
prior_orders.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


In [33]:
order_df = pd.concat([train_orders, prior_orders])
order_df = order_df.merge(products[['product_id', 'product_name']], on='product_id', how='left')

In [40]:
data = order_df.groupby('order_id').apply(lambda order:order['product_name'].tolist())
data_arr = data.values

In [115]:
%%time
sentence_df = order_df.groupby('order_id').apply(lambda order:order['product_id'].tolist())
sentence = sentence_df.values

CPU times: user 1min 3s, sys: 32.7 s, total: 1min 35s
Wall time: 1min 59s


In [118]:
longest = np.max(sentence_df.apply(len))

In [119]:
%%time
model = gensim.models.Word2Vec(sentence, vector_size=128, window=longest, min_count=2, workers = 4)

In [120]:
vocab = list(model.wv.key_to_index.keys())

In [122]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca.fit(model.wv.vectors)

In [123]:
# data.sample(5)

In [124]:
# [i for i in data_arr if "organic marinara pasta sauce" in i]

In [129]:
import random

In [171]:
pid = random.randrange(1,max(vocab))
pname = product_dict[pid]
print(pname)

similar = model.wv.most_similar(positive=pid, topn=10)
[(product_dict[i[0]], i[1]) for i in similar]

teriyaki chicken


[('mozzarella sticks', 0.8877257704734802),
 ('grilled chicken taquitos', 0.870657205581665),
 ('chicken enchilada casserole', 0.8661288619041443),
 ('pear halves', 0.8410782814025879),
 ('smoked pulled pork', 0.8347561359405518),
 ('petite dill pickles', 0.828694224357605),
 ('honey bbq glazed chicken wings', 0.8267912864685059),
 ('omeprazole acid reducer tablets', 0.8219950795173645),
 ('flame grilled beef patty', 0.8219634294509888),
 ('double chocolate muffins', 0.8195507526397705)]

In [174]:
model.wv.save("word2vec.wordvectors")

In [172]:
l = [i for i in data if 'teriyaki chicken' in i]

In [170]:
product_dict[i[0]]

'light and lean mattar paneer'

In [173]:
l

[['fat free milk',
  'seedless cucumbers',
  'bag of organic bananas',
  'shredded mexican blend cheese',
  'organic whole milk yogurt',
  'raw shrimp',
  'orange juice',
  'beef hot dogs',
  'teriyaki chicken'],
 ['small curd lowfat 2% milkfat cottage cheese', 'teriyaki chicken'],
 ['extralean ground turkey breast',
  'diet pepsi pack',
  'diet iced tea',
  'ready rice  pilaf',
  'ready rice roasted chicken flavored',
  'trop50 no pulp orange juice w/ calcium + vitamin d, 50% less sugar & calories',
  'shrimp & vegetable stir fry',
  'steamfresh premium selects broccoli florets',
  'deli counter turkey breast',
  "chef's favorites lightly seasoned asian medley",
  'teriyaki chicken'],
 ['boneless & skinless chicken breasts',
  'teriyaki chicken',
  'pineapple chunks',
  'organic crimini mushrooms'],
 ['organic baby spinach',
  'bag of organic bananas',
  'usda aa extra large eggs',
  'celery sticks',
  'shredded parmigiano reggiano',
  'teriyaki & pineapple chicken meatballs',
  'teri

In [68]:
def get_batch(vocab, model, n_batches=3):
    output = list()
    for i in range(0, n_batches):
        rand_int = np.random.randint(len(vocab), size=1)[0]
        suggestions = model.simi(positive=[vocab[rand_int]], topn=5)
        suggest = list()
        for i in suggestions:
            suggest.append(i[0])
        output += suggest
        output.append(vocab[rand_int])
    return output

def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
    """From Tensorflow's tutorial."""
    assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
    plt.figure(figsize=(21, 21))  #in inches
    for i, label in enumerate(labels):
        x, y = low_dim_embs[i,:]
        plt.scatter(x, y)
        plt.annotate(label,
                     xy=(x, y),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.show()

In [69]:
from matplotlib import pyplot as plt
embeds = []
labels = []
for item in get_batch(vocab, model, n_batches=5):
    embeds.append(pca.transform(model[item])[0])
    labels.append(products.loc[int(item)]['product_name'])
embeds = np.array(embeds)
plot_with_labels(embeds, labels)

AttributeError: 'Word2Vec' object has no attribute 'most_similar'

In [104]:
from gensim.test.utils import common_texts
from gensim.models import Phrases

In [106]:
# Train a bigram detector.
bigram_transformer = Phrases(common_texts)

# Apply the trained MWE detector to a corpus, using the result to train a Word2vec model.
# model = Word2Vec(bigram_transformer[common_texts], min_count=1)