# MIT-GSL Uruguay 

## January 2020

-----

# Week - 2 | Lesson - 03 
# NLP: Product embeddings

1. After introducing the concept of embeddings through NLP, we extend the notion of embeddings to other settings
2. Note that embeddings are dense continuous representations for discrete, sparse tokens - this makes embeddings widely applicable
3. We will use the concept of embeddings to understand the world of e-commerce better 

---

# Word2Vec and its Applications to Market-Basket Data

# Instacart Grocery Dataset

### Source: https://www.instacart.com/datasets/grocery-shopping-2017

1. Instacart is an online grocery delivery service
2. They have made available 3M grocery orders for over 200K users
3. They provide between 4 to 100 orders for each user and each order contains the sequence of products purchased
4. We also have a brief description of the products

### Overview:
1. We will use this data to build an understanding of word embeddings and investigate their application to downstream tasks
2. For this purpose, we will consider each purchase basket to be a sentence with an unordered sequence of words

---

# 0. Import Modules

In [2]:
# ==============================================
# 0. Module imports
# ==============================================

import pandas as pd
pd.options.display.max_colwidth = 100
import numpy as np
from itertools import product
import csv

# w2v
import gensim

# text processing
from nltk import sent_tokenize, word_tokenize
from nltk.tokenize import ToktokTokenizer
import string
import re # regular expressions
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics.pairwise import cosine_similarity

# T-Sne
#import umap
#from openTSNE import TSNE, TSNEEmbedding, affinity, initialization
#from openTSNE import initialization
#from openTSNE.callbacks import ErrorLogger
from sklearn.manifold import TSNE


# utils
from sklearn import utils


# parallel processing
import multiprocessing
from joblib import delayed, Parallel

# time code
import time

# 2-d visualiztion
%matplotlib inline
from ggplot import *
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

import os
#os.chdir("/pool001/madhavk/gsl-uruguay/W-02-NLP/")

In [4]:
# =========================================================
# set directories
# =========================================================

import os
EC2 = True  # If using EC2 (for data directory/paths)

# Select path based off of local or remote
if not EC2:
    wd = "/pool001/madhavk/gsl-uruguay/W-02-NLP/"
else:
    wd = "/home/ubuntu/machine_learning_aws/"
os.chdir(wd)

EC2 = True  # If using EC2 (for data directory/paths)
if not EC2:
    # raw data
    raw_data_dir = "nlp-data/in-grocery/instacart_2017_05_01/"
    # processed data
    process_dir = "nlp-data/in-grocery/prepared-data/"  
else:
    # raw data
    raw_data_dir = "data/in-grocery/instacart_2017_05_01"
    # processed data
    process_dir = "data/in-grocery/prepared-data/"

In [5]:
# =========================================================
# processed files
# =========================================================

os.listdir(process_dir)

['orders-split-v1.csv',
 'all-orders-wide-v1.csv',
 'all-orders-long-v1.csv',
 'products-merged-v1.csv']

In [6]:
# =========================================================
# global parameters
# =========================================================

# show entire value of cell in pandas
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_columns', 500)

# number of cpus
cpus = multiprocessing.cpu_count()
f"Number of CPUs: {cpus}"

'Number of CPUs: 4'

------

# 1. Import grocery data

## 1.1. Order level data

In [7]:
# =========================================================
# order-level data
# =========================================================

orders_wide = pd.read_csv(process_dir + "all-orders-wide-v1.csv")
# This data set has one row per order with the products ordered in the product_id column. 
# Products are separated by space.
print(orders_wide.shape)

(3152555, 4)


In [8]:
display(orders_wide.head(10))

Unnamed: 0,order_id,product_id,num_products,eval
0,2,33120 28985 9327 45918 30035 17794 40141 1819,8,prior
1,3,33754 24838 17704 21903 17668 46667 17461 32665,8,prior
2,4,46842 39758 27761 10054 21351 22598 34862 40285 17616 25146 32645 41276,12,prior
3,5,13176 47329 27966 23909 48370 13245 27360 6348 40878 6184 48002 20914 37011 12962 45698 41176 48366 47209 46522 38693 48825 8479,22,prior
4,7,34050 46802,2,prior
5,9,21405 47890 11182 2014 29193 34203 14992 31506 23288 44533 18362 432 3990 14183,14,prior
6,10,24852 4796 31717 47766 4605 1529 21137 22122 34134 27156 14992 49235 26842 3464 25720,15,prior
7,11,30162 5994 1313 31506,4,prior
8,12,30597 15221 43772 37886 37215 34335 26910 38888 38050 29471,10,prior
9,13,17330 27407 35419 196 44635 26878 25783 41290 33198 23020 36086 3800 25952,13,prior


In [9]:
# =========================================================
# orders meta-data
# =========================================================

orders_meta = pd.read_csv(process_dir + "orders-split-v1.csv")
# This dataset includes the meta data for each order, i.e., the user who ordered it, order day of the week, order time
print(orders_meta.shape)

(3346083, 7)


In [10]:
display(orders_meta.head(10))

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,eval
0,2539329,1,1,2,8,,prior
1,2398795,1,2,3,7,15.0,prior
2,473747,1,3,3,12,21.0,prior
3,2254736,1,4,4,7,29.0,prior
4,431534,1,5,4,15,28.0,prior
5,3367565,1,6,2,7,19.0,prior
6,550135,1,7,1,9,20.0,prior
7,3108588,1,8,1,14,14.0,prior
8,2295261,1,9,1,16,0.0,prior
9,2550362,1,10,4,8,30.0,prior


----

## 1.2. Merge train-val-test split

In [11]:
# =========================================================
# merge orders-wide and orders-meta
# =========================================================

orders_wide.drop(labels = "eval", axis = 1, inplace = True)
orders_wide = pd.merge(orders_wide, # data - 1
                       orders_meta[["order_id", "user_id", "eval"]], # data - 2
                       on = "order_id", # merge key
                       how = "left") # left join
print(orders_wide.shape)

(3152555, 5)


In [12]:
display(orders_wide.head(10))

Unnamed: 0,order_id,product_id,num_products,user_id,eval
0,2,33120 28985 9327 45918 30035 17794 40141 1819,8,202279,prior
1,3,33754 24838 17704 21903 17668 46667 17461 32665,8,205970,prior
2,4,46842 39758 27761 10054 21351 22598 34862 40285 17616 25146 32645 41276,12,178520,prior
3,5,13176 47329 27966 23909 48370 13245 27360 6348 40878 6184 48002 20914 37011 12962 45698 41176 48366 47209 46522 38693 48825 8479,22,156122,prior
4,7,34050 46802,2,142903,prior
5,9,21405 47890 11182 2014 29193 34203 14992 31506 23288 44533 18362 432 3990 14183,14,139016,prior
6,10,24852 4796 31717 47766 4605 1529 21137 22122 34134 27156 14992 49235 26842 3464 25720,15,135442,prior
7,11,30162 5994 1313 31506,4,143742,prior
8,12,30597 15221 43772 37886 37215 34335 26910 38888 38050 29471,10,152610,prior
9,13,17330 27407 35419 196 44635 26878 25783 41290 33198 23020 36086 3800 25952,13,45082,prior


In [13]:
# =========================================================
# eval-set distribution
# =========================================================

orders_wide["eval"].value_counts()

prior    2959079
train    116333 
val      38701  
test     38442  
Name: eval, dtype: int64

In [14]:
# clear some space
del orders_meta

----

## 1.3. Import product info data

In [15]:
products = pd.read_csv(process_dir + "products-merged-v1.csv")
print(products.shape)

(49688, 6)


In [16]:
display(products.head(10))

Unnamed: 0,product_id,product_name,aisle_id,department_id,aisle,department
0,1,Chocolate Sandwich Cookies,61,19,cookies cakes,snacks
1,2,All-Seasons Salt,104,13,spices seasonings,pantry
2,3,Robust Golden Unsweetened Oolong Tea,94,7,tea,beverages
3,4,Smart Ones Classic Favorites Mini Rigatoni With Vodka Cream Sauce,38,1,frozen meals,frozen
4,5,Green Chile Anytime Sauce,5,13,marinades meat preparation,pantry
5,6,Dry Nose Oil,11,11,cold flu allergy,personal care
6,7,Pure Coconut Water With Orange,98,7,juice nectars,beverages
7,8,Cut Russet Potatoes Steam N' Mash,116,1,frozen produce,frozen
8,9,Light Strawberry Blueberry Yogurt,120,16,yogurt,dairy eggs
9,10,Sparkling Orange Juice & Prickly Pear Beverage,115,7,water seltzer sparkling water,beverages


In [17]:
#===============================================
# top departments
#===============================================
products["department"].value_counts()

personal care      6563
snacks             6264
pantry             5371
beverages          4365
frozen             4007
dairy eggs         3449
household          3085
canned goods       2092
dry goods pasta    1858
produce            1684
bakery             1516
deli               1322
missing            1258
international      1139
breakfast          1115
babies             1081
alcohol            1054
pets               972 
meat seafood       907 
other              548 
bulk               38  
Name: department, dtype: int64

-----

### In-class exercise

In [20]:
#===============================================
# top aisles
#===============================================
# can you figure out which aisles host the most number of products?
products["aisle"].value_counts().head(6)

missing                 1258
candy chocolate         1246
ice cream ice           1091
vitamins supplements    1038
yogurt                  1026
chips pretzels          989 
Name: aisle, dtype: int64

---

# 2. Data exploration

In [21]:
# long-form of orders
orders_long = pd.read_csv(process_dir + "all-orders-long-v1.csv")
print(orders_long.shape)

(32019330, 5)


In [22]:
# top-20 observations in the data frame
display(orders_long.head(20))

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,eval
0,2,33120,1,1,prior
1,2,28985,2,1,prior
2,2,9327,3,0,prior
3,2,45918,4,1,prior
4,2,30035,5,0,prior
5,2,17794,6,1,prior
6,2,40141,7,1,prior
7,2,1819,8,1,prior
8,3,33754,1,1,prior
9,3,24838,2,1,prior


## 2.1. Most frequently purchased products

In [23]:
# most frequently bought products
most_freq_purchased = pd.DataFrame(orders_long["product_id"].value_counts()) # count the number times each product-id appears in data frame
most_freq_purchased.reset_index(drop = False, inplace = True) # complying with pandas indexing 
most_freq_purchased.columns = ["product_id", "freq"] # assign column names
display(most_freq_purchased.head(10))

Unnamed: 0,product_id,freq
0,24852,491291
1,13176,394930
2,21137,275577
3,21903,251705
4,47209,220877
5,47766,184224
6,47626,160792
7,16797,149445
8,26209,146660
9,27845,142813


In [24]:
# merge with product info from the meta dataset
most_freq_purchased = pd.merge(most_freq_purchased, 
                               products, 
                               on = "product_id", 
                               how = "left")
display(most_freq_purchased.head(20))

Unnamed: 0,product_id,freq,product_name,aisle_id,department_id,aisle,department
0,24852,491291,Banana,24,4,fresh fruits,produce
1,13176,394930,Bag of Organic Bananas,24,4,fresh fruits,produce
2,21137,275577,Organic Strawberries,24,4,fresh fruits,produce
3,21903,251705,Organic Baby Spinach,123,4,packaged vegetables fruits,produce
4,47209,220877,Organic Hass Avocado,24,4,fresh fruits,produce
5,47766,184224,Organic Avocado,24,4,fresh fruits,produce
6,47626,160792,Large Lemon,24,4,fresh fruits,produce
7,16797,149445,Strawberries,24,4,fresh fruits,produce
8,26209,146660,Limes,24,4,fresh fruits,produce
9,27845,142813,Organic Whole Milk,84,16,milk,dairy eggs


----

### In-class exercise

In [25]:
# Can you figure out the least popular products?
display(most_freq_purchased.tail(20))

Unnamed: 0,product_id,freq,product_name,aisle_id,department_id,aisle,department
14320,15612,206,Unwrapped Bites,45,19,candy chocolate,snacks
14321,20434,206,Sardines in Olive Oil,95,15,canned meat seafood,canned goods
14322,22315,206,Slow Churned Chocolate Light Ice Cream,37,1,ice cream ice,frozen
14323,39328,206,Organic Mexican Chocolate Ice Cream,37,1,ice cream ice,frozen
14324,12549,206,"Salad Dressing, Bleu Cheese, Vegan",89,13,salad dressing toppings,pantry
14325,35355,205,94% Fat Free Microwave Butter Popcorn,23,19,popcorn jerky,snacks
14326,5634,205,Organic Pasture Raised Local Eggs,86,16,eggs,dairy eggs
14327,18975,205,Uncured Cherry Smoked Bacon,106,12,hot dogs bacon sausage,meat seafood
14328,20814,205,Sensitive with Iron Infant Formula,92,18,baby food formula,babies
14329,46815,205,Ultimate Omega 1280mg omega-3,47,11,vitamins supplements,personal care


In [26]:
# most popular aisle
aisle = most_freq_purchased.groupby("aisle").agg({"freq": "sum"})
aisle = aisle.reset_index(drop = False, inplace = False).sort_values("freq", ascending = False)
display(aisle)

Unnamed: 0,aisle,freq
50,fresh fruits,3782826
53,fresh vegetables,3553596
98,packaged vegetables fruits,1823815
133,yogurt,1475395
93,packaged cheese,990483
83,milk,918611
131,water seltzer sparkling water,868265
25,chips pretzels,717827
119,soy lactosefree,655592
11,bread,588261


----

### In-class exercise

In [29]:
# Can you figure out the most popular department, i.e., the department from where most products are purchased?
dpt = most_freq_purchased.groupby("department").agg({"freq": "sum"})
dpt = dpt.reset_index(drop = False, inplace = False).sort_values("freq", ascending = False)
display(dpt)

Unnamed: 0,department,freq
19,produce,9840710
7,dairy eggs,5522579
20,snacks,2767610
3,beverages,2641843
10,frozen,2178717
16,pantry,1763171
2,bakery,1168021
8,deli,1046768
6,canned goods,1032645
9,dry goods pasta,833181


In [96]:
# most purchased department

-----

## 2.2. Co-purchased products

In [30]:
# ===========================================
# Generate co-purchase matrix
# ===========================================

def CoPurchaseMatrix(orders_wide, product_info = True):
    '''
    orders_wide is order-level data with one row per order
    '''
    count_vec = CountVectorizer(ngram_range = (1,1), binary = True, 
                            token_pattern = "\\b\\w+\\b") # sku counts
    pur_mat = count_vec.fit_transform(orders_wide["product_id"])
    co_pur_mat = (pur_mat.T * pur_mat) #copurchase matrix
    co_pur_mat.setdiag(0) # set diagonal to 0
    co_pur_mat_df = pd.DataFrame(co_pur_mat.todense()) # convert to data frame
    co_pur_mat_df.index = count_vec.vocabulary_ # row names from sku-ids
    co_pur_mat_df.columns = count_vec.vocabulary_ # column names from sku-ids
    co_pur_mat_df = co_pur_mat_df.where(np.triu(np.ones(co_pur_mat_df.shape)).astype(np.bool)) # consider the uppre tri
    co_pur_mat_df = co_pur_mat_df.stack().reset_index() # melt to sku-1 and sku-2 per row
    co_pur_mat_df.columns = ["product_id_1", "product_id_2", "copur"]
    co_pur_mat_df = co_pur_mat_df.loc[co_pur_mat_df["copur"] > 0, :] # subset for copur > 0
    co_pur_mat_df_top = co_pur_mat_df.sort_values(["product_id_1", "copur"], ascending = False)
    co_pur_mat_df_top = co_pur_mat_df_top.drop_duplicates(["product_id_1"], keep = "first")
    co_pur_mat_df_top["product_id_1"] = co_pur_mat_df_top["product_id_1"].astype(int) # fix data types
    co_pur_mat_df_top["product_id_2"] = co_pur_mat_df_top["product_id_2"].astype(int) # fix data types
    co_pur_mat_df_top = co_pur_mat_df_top.sort_values(["copur"], ascending = False).reset_index(drop = True)
    if product_info:
        co_pur_mat_df_top = pd.merge(co_pur_mat_df_top, products, how = "left", 
                                     left_on = "product_id_1", right_on = "product_id")
        co_pur_mat_df_top.drop("product_id", axis = 1, inplace = True)
        co_pur_mat_df_top = pd.merge(co_pur_mat_df_top, products, how = "left",
                                     left_on = "product_id_2", right_on = "product_id", 
                                     suffixes = ["_1", "_2"])
        co_pur_mat_df_top.drop("product_id", axis = 1, inplace = True)
        col_order = ['product_id_1', 'product_id_2', 'copur', 'product_name_1', 'product_name_2',
                             'aisle_1', 'aisle_2', 'department_1', 'department_2', 
                             'aisle_id_1', 'aisle_id_2', 'department_id_1', 'department_id_2']
        co_pur_mat_df_top = co_pur_mat_df_top[col_order]
    return(co_pur_mat_df_top)

In [31]:
# calculate copurchases
copur = CoPurchaseMatrix(orders_wide = orders_wide, product_info = True)
print(copur.shape)

(14332, 13)


In [99]:
display(copur[["product_name_1", "product_name_2", "copur", "aisle_1", "aisle_2"]].head(20))

Unnamed: 0,product_name_1,product_name_2,copur,aisle_1,aisle_2
0,Organic Lacinato (Dinosaur) Kale,Soft Eating Strawberry Flavored Licorice,64761.0,fresh vegetables,candy chocolate
1,"Mighty 4 Sweet Potato, Blueberry, Millet & Greek Yogurt Tots Snack",Nut Delight Fruit & Nut Bar,58330.0,baby food formula,energy granola bars
2,Nut Delight Fruit & Nut Bar,Carrot Bunch,55611.0,energy granola bars,fresh vegetables
3,Natural Finely Shredded Triple Cheddar Cheese,Nut Delight Fruit & Nut Bar,53395.0,packaged cheese,energy granola bars
4,Italian Style Meatballs & Mozzarella Sandwiches,Nut Delight Fruit & Nut Bar,43180.0,frozen meals,energy granola bars
5,Organic Granny Smith Apples,Garlic Spice Blend Paste,28998.0,fresh fruits,packaged vegetables fruits
6,Pecan Pie Fruit & Nut Food Bar,Soft Eating Strawberry Flavored Licorice,26812.0,energy granola bars,candy chocolate
7,Garlic Spice Blend Paste,Carrot Bunch,25766.0,packaged vegetables fruits,fresh vegetables
8,"Almond Coconut Bar, Organic",2nd Foods Organic Pear and Spinach Baby Food,22923.0,energy granola bars,baby food formula
9,Soft Eating Strawberry Flavored Licorice,Spicy Minis Guacamole,22027.0,candy chocolate,fresh dips tapenades


----

# 3. Split train-val-test datasets


In [32]:
#===============================================
# split train-val-test
#===============================================

train = orders_wide.loc[orders_wide["eval"].isin(["prior", "train"]), :]
val = orders_wide.loc[orders_wide["eval"] == "val", :]
test = orders_wide.loc[orders_wide["eval"] == "test", :]
print("train size:", train.shape)
print("val size:", val.shape)
print("test size:", test.shape)

train size: (3075412, 5)
val size: (38701, 5)
test size: (38442, 5)


In [38]:
# clear more space
del orders_wide, orders_long

In [39]:
display(train.head())

Unnamed: 0,order_id,product_id,num_products,user_id,eval
0,2,33120 28985 9327 45918 30035 17794 40141 1819,8,202279,prior
1,3,33754 24838 17704 21903 17668 46667 17461 32665,8,205970,prior
2,4,46842 39758 27761 10054 21351 22598 34862 40285 17616 25146 32645 41276,12,178520,prior
3,5,13176 47329 27966 23909 48370 13245 27360 6348 40878 6184 48002 20914 37011 12962 45698 41176 48366 47209 46522 38693 48825 8479,22,156122,prior
4,7,34050 46802,2,142903,prior


## 3.1. Random sample for faster processing

In [40]:
#===============================================
# randomly sample training data
#===============================================

sample_size = 1000000
train = train.sample(n = sample_size)
train = train.reset_index(drop = True)
print(train.shape)

(1000000, 5)


In [41]:
display(train.head(10))

Unnamed: 0,order_id,product_id,num_products,user_id,eval
0,2572955,19836 5069 27104 42356 44234 26709 30257 28433 3801 24852 31395 45646 34050,13,178767,prior
1,924677,4920 46650 27307 24852 24841 8518 34969,7,34856,prior
2,132114,45541 38679 27845 36011 24852 27966 21137 25832,8,130089,prior
3,2579730,24852 28204 41950 37107 6046 49683 330 41596 24184 20114 26800 34450 3464 47008 22260 44303 248 16349 27521 25659 21903 38293 27104,23,96344,prior
4,1489300,43122 22825 37646 24954 33754 14437,6,154385,prior
5,2046312,38003 21333 33846 28986 46842 18418 16818 2151 24571,9,55063,prior
6,901490,24838 42768 27521 47626 22935 47209 8277 24184 46802 17948 40377 3142 4781 25126 46064,15,147251,prior
7,134163,10504 16797 40042 10960 24787 5373 25647,7,136001,prior
8,1043054,27845 40174 31683 12572 12258 33000 12206 41198 37317 33198 6182,11,39685,prior
9,1673393,47862 24964 49156,3,202319,prior


## 3.2. Tokenize sentences

In [42]:
# start pool process for parallel procressing
pool = multiprocessing.Pool(processes = cpus)

In [45]:
# text processing
from nltk import sent_tokenize, word_tokenize

import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [46]:
# training sentences
t0 = time.time()
train_orders = pool.map(word_tokenize, train["product_id"])
t1 = time.time()
print(len(train_orders))

1000000


In [47]:
# time taken
print(f"Time Taken: {t1 - t0}")

Time Taken: 63.63620114326477


In [48]:
# training orders
train_orders[0:3]

[['19836',
  '5069',
  '27104',
  '42356',
  '44234',
  '26709',
  '30257',
  '28433',
  '3801',
  '24852',
  '31395',
  '45646',
  '34050'],
 ['4920', '46650', '27307', '24852', '24841', '8518', '34969'],
 ['45541', '38679', '27845', '36011', '24852', '27966', '21137', '25832']]

In [49]:
# validation sentences
t0 = time.time()
val_orders = pool.map(word_tokenize, val["product_id"])
t1 = time.time()
print(len(val_orders))

38701


----

### In-class exercise

In [111]:
# what do the first three validation orders look like?

In [112]:
# Can you similarly tokenize the test sentences?
#print(len(test_orders))

----

# 4. Word2Vec sample model

## 4.1. Define and train model

In [60]:
print(type(train_orders))
print(len(train_orders))

<class 'list'>
1000000


In [50]:
#===============================================
# define and train model
#===============================================

w2v_1 = gensim.models.Word2Vec(sentences = train_orders,
                               workers = multiprocessing.cpu_count(),
                               seed = 1234)

## Model properties

In [51]:
#===============================================
# vocabulary
#===============================================

# vocabulary length
f"Vocab length: {len(w2v_1.wv.vocab)}"

'Vocab length: 14340'

In [52]:
# sample vocabulary
list(w2v_1.wv.vocab.keys())[0:5]

['19836', '5069', '27104', '42356', '44234']

In [53]:
# length of corpus
print(w2v_1.corpus_count)

1000000


In [54]:
# number of iterations
w2v_1.iter

5

In [55]:
#===============================================
# product vectors
#===============================================

# enter product-id
prod_id = "1"
print(w2v_1[prod_id].shape)
print("------------")
print("------------")
print(w2v_1[prod_id])

(100,)
------------
------------
[-4.04510736e-01  5.81333399e-01  4.53743726e-01 -3.50309789e-01
  1.01281011e+00 -4.07895654e-01 -5.56387961e-01  5.87131798e-01
  8.55092287e-01  9.39807117e-01  7.45321512e-01 -8.14930975e-01
 -8.72512102e-01  7.91543871e-02  9.06610668e-01 -3.82677168e-01
 -4.74929512e-01 -5.41775189e-02 -2.49912918e-01 -1.83316660e+00
 -8.09821561e-02  8.36983323e-01 -4.07992959e-01  1.27453935e+00
  6.72552466e-01 -1.81895447e+00 -1.16033173e+00  9.22205448e-01
 -1.01271415e+00 -4.68485922e-01 -8.93574238e-01  1.16683180e-02
 -4.44470018e-01 -4.93991613e-01  5.31884909e-01  4.11142975e-01
 -8.66689801e-01  8.19667816e-01 -8.06315720e-01 -9.77191567e-01
 -5.36102057e-01  2.46548280e-01  2.07711667e-01  4.12979960e-01
  8.24673533e-01  1.07451487e+00 -4.57732707e-01 -7.52725959e-01
 -4.10204858e-01 -8.96955729e-02  3.86765540e-01 -3.32888126e-01
 -4.38780040e-01  1.95192918e-01 -9.68766034e-01  1.15196693e+00
  3.79069000e-01 -5.19667804e-01  8.28152597e-01  1.85116

## 4.2. Update embeddings

In [56]:
#===============================================
# update model weights
#===============================================
w2v_1.train(sentences = train_orders, total_examples = w2v_1.corpus_count, epochs = 2)

(19314833, 20196396)

## 4.3. Inspect model output

In [57]:
#===============================================
# upadted product vectors
#===============================================

# enter product-id
prod_id = "1"
print(w2v_1[prod_id].shape)
print("------------")
print("------------")
print(w2v_1[prod_id])

(100,)
------------
------------
[-0.7753562   1.101924    0.79193944 -0.1209721   0.9255502  -0.49209473
 -0.5375001   0.66311926  1.2073712   1.0305341   0.8318689  -0.92228687
 -1.0315843  -0.13880645  0.74502397 -0.19811006 -0.31725487 -0.27960244
 -0.27866358 -1.96529     0.17912069  0.7192371  -0.22476996  1.4866871
  0.8997672  -2.232609   -1.09567     1.2788435  -0.9064601  -0.9304165
 -0.79422253 -0.09606735 -0.47052908 -0.4087471   0.67744124  0.01524378
 -1.0514189   1.0494012  -0.72372305 -1.2782072  -0.4718718   0.09444378
  0.45311087  0.68848443  0.7339318   1.3622518  -0.67777956 -0.6778548
 -0.52416694 -0.17035656  0.34380886 -0.34908834 -0.53268635  0.17979494
 -0.9853808   1.1113174   0.6251342  -0.37770325  0.66359603  1.9883057
  0.06227816 -1.2932522  -0.1873026  -0.52454215  0.5209079   0.1737606
 -0.0990522  -0.10883834  0.18628901 -0.3774401   0.32058585 -0.06658533
  0.03432867  0.40749118 -1.217024   -0.6472818   0.39767978  0.27677667
  0.4414988  -0.6221793

--------

# 5. Improve W2V model

In [61]:
#===============================================
# model parameters
#===============================================

# size of embedding matrix
emb_size = 100

# context window size
cxt_window = 10

# batch size for gradient update
batch_size = 10000

# down-sample high frequency words
hfs = 0.001

# learning rate
lr = 0.05

In [62]:
#===============================================
# define model
#===============================================

w2v_1 = gensim.models.Word2Vec(sentences = train_orders,
                               size = emb_size, # number of columns in embedding matrix
                               hs = 1, # hierarchical softmax
                               negative = 0, # negative sampling
                               window = cxt_window, # context window
                               min_count = 1, # minimum frequency count
                               batch_words = batch_size, # batch size for update
                               alpha = lr, # learning rate
                               sample = hfs, # down sample high frequency words 
                               workers = cpus,
                               seed = 1234)

## 5.1. Score on validation and test

In [69]:
#===============================================
# Score model to get log-likelihood
#===============================================

def ScoreW2V(test_sent, model, normalize = True, avg_over_sent = True):
    test_score = model.score(test_sent, total_sentences = len(test_sent), 
                             chunksize = 100,
                             queue_factor = 2,
                             report_delay = 1)
    if normalize:
        test_score = [test_score[x]/len(test_sent[x]) for x in range(len(test_sent))]
    else:
        test_score = list(test_score)
    if avg_over_sent:
        test_score = np.mean(test_score)
    return test_score

In [64]:
w2v_1_val = ScoreW2V(test_sent = val_orders, model = w2v_1, normalize = True, avg_over_sent = True)

In [65]:
w2v_1_val

-7.527941547609724

## 5.2. Similar products

In [66]:
#===============================================
# similar products
#===============================================

# sample product
prod_id = "10"

# product info for sample product
display(products.loc[products["product_id"].isin([prod_id]), :])

Unnamed: 0,product_id,product_name,aisle_id,department_id,aisle,department


In [67]:
#===============================================
# similarity from model
#===============================================

w2v_1.wv.most_similar(prod_id)

[('44375', 0.7991302609443665),
 ('6475', 0.6662318110466003),
 ('4138', 0.6114770174026489),
 ('4493', 0.5634533762931824),
 ('17080', 0.5379359126091003),
 ('8192', 0.4694351255893707),
 ('32380', 0.4646366238594055),
 ('28004', 0.44108134508132935),
 ('19125', 0.42980924248695374),
 ('21006', 0.413822740316391)]

In [128]:
#===============================================
# lookup product info
#===============================================
most_similar_prods = [y[0] for y in w2v_1.wv.most_similar(positive = prod_id)]
most_similar_prods = products.loc[products["product_id"].isin(most_similar_prods), :]
display(most_similar_prods)

Unnamed: 0,product_id,product_name,aisle_id,department_id,aisle,department
455,456,Sparkling Blueberry,77,7,soft drinks,beverages
4137,4138,Arancita Rossa,77,7,soft drinks,beverages
4492,4493,Italian Sparkling Pomegranate And Orange Soda,115,7,water seltzer sparkling water,beverages
5882,5883,Organic Strawberry Lemonade,98,7,juice nectars,beverages
6474,6475,Limonata Sparkling Lemon Beverage,77,7,soft drinks,beverages
14765,14766,Sparkling Grapefruit,77,7,soft drinks,beverages
17079,17080,Clementina Sparkling Beverage,115,7,water seltzer sparkling water,beverages
18388,18389,Blood Orange Italian Soda,77,7,soft drinks,beverages
28003,28004,Limonata Sparkling Beverage,115,7,water seltzer sparkling water,beverages
44374,44375,Canned Aranciata Orange,77,7,soft drinks,beverages


In [129]:
#===============================================
# most dissimilar products
#===============================================
most_dissimilar_prods = [y[0] for y in w2v_1.wv.most_similar(negative = [prod_id])]
most_dissimilar_prods = products.loc[products["product_id"].isin(most_dissimilar_prods), :]
display(most_dissimilar_prods)

Unnamed: 0,product_id,product_name,aisle_id,department_id,aisle,department
951,952,"Flaxseed Meal, Whole Ground",17,13,baking ingredients,pantry
1246,1247,Sloppy Joes Seasoning Mix,5,13,marinades meat preparation,pantry
17510,17511,Crunchy Seven Nut & Seed Butter,88,13,spreads,pantry
17595,17596,Fruity Cheerios Cereal,121,14,cereal,breakfast
18170,18171,Natural Sunflower Spread,88,13,spreads,pantry
22039,22040,Fresh Cut Blue Lake No Salt Added Cut Green Beans,81,15,canned jarred vegetables,canned goods
34853,34854,"Steamfresh Selects Frozen Broccoli, Cauliflower & Carrots",116,1,frozen produce,frozen
40375,40376,Steamfresh Sweet Peas,116,1,frozen produce,frozen
40620,40621,Broccoli & Cauliflower,116,1,frozen produce,frozen
45964,45965,Steel Cut Oats,130,14,hot cereal pancake mixes,breakfast


----

### In-class exercise

In [130]:
# find the most similar products to 
prod_id = "100"

# product info for sample product
display(products.loc[products["product_id"].isin([prod_id]), :])

Unnamed: 0,product_id,product_name,aisle_id,department_id,aisle,department
99,100,Peanut Butter & Strawberry Jam Sandwich,38,1,frozen meals,frozen


In [131]:
# which products are most similar to peanut better and strawberry jam sandwich

In [132]:
# what about the most dissimilar products?

----

# 5. Product maps using t-sne 

## 5.1. Extract all product vectors

In [133]:
#===============================================
# product vectors
#===============================================

items = [key for key in w2v_1.wv.vocab.keys()] # all product-ids
word_vec = [list(w2v_1[item]) for item in items] # "word" vector for each product id
word_vec = np.array(word_vec) # convert to array
word_vec_df = pd.DataFrame(word_vec) # convert to data frame
w2v_vec_names = ["wv" + str(x + 1) for x in range(word_vec_df.shape[1])] # column names
word_vec_df.columns = w2v_vec_names # assign column names
word_vec_df["product_id"] = items # include product id in data frame
word_vec_df["product_id"] = word_vec_df["product_id"].astype(int) # convert to type integer for later merge
word_vec_df = word_vec_df[["product_id"] + w2v_vec_names] # re-order columns
print(word_vec_df.shape)

(14340, 101)


In [134]:
display(word_vec_df.head(10))

Unnamed: 0,product_id,wv1,wv2,wv3,wv4,wv5,wv6,wv7,wv8,wv9,wv10,wv11,wv12,wv13,wv14,wv15,wv16,wv17,wv18,wv19,wv20,wv21,wv22,wv23,wv24,wv25,wv26,wv27,wv28,wv29,wv30,wv31,wv32,wv33,wv34,wv35,wv36,wv37,wv38,wv39,wv40,wv41,wv42,wv43,wv44,wv45,wv46,wv47,wv48,wv49,wv50,wv51,wv52,wv53,wv54,wv55,wv56,wv57,wv58,wv59,wv60,wv61,wv62,wv63,wv64,wv65,wv66,wv67,wv68,wv69,wv70,wv71,wv72,wv73,wv74,wv75,wv76,wv77,wv78,wv79,wv80,wv81,wv82,wv83,wv84,wv85,wv86,wv87,wv88,wv89,wv90,wv91,wv92,wv93,wv94,wv95,wv96,wv97,wv98,wv99,wv100
0,27888,0.991016,-0.055347,-0.618443,-0.112379,0.84573,-0.161779,-0.23541,-0.780843,-0.411044,1.131562,0.257362,-0.02304,0.325801,-1.278199,0.935081,0.000162,0.693156,1.378203,-0.0971,1.450118,-0.849971,0.208069,0.172751,0.453095,-0.224899,0.798142,-0.861846,0.218438,-2.469534,-1.191987,0.312043,1.023138,-0.282084,-0.693851,0.873696,-0.546057,-1.030167,-0.544695,-0.800045,0.457829,0.370236,0.657401,-0.654934,-0.689802,-0.332016,-0.374241,0.932325,-1.502272,-0.258942,0.085221,-0.035942,-0.226732,0.716972,0.753459,0.702796,-0.443695,-0.01027,-0.230515,0.254941,0.204216,0.535912,-0.457416,-0.701698,-1.84158,0.732887,-0.750403,-0.144997,0.118411,-0.243787,-1.045582,1.04945,-0.56225,0.684943,0.306815,0.70936,-0.453412,0.323479,-1.063863,0.113524,-1.039436,-0.081299,1.041267,-0.461687,-0.756298,-0.168442,0.976666,-0.11287,0.614994,-0.269176,-0.582358,0.928558,-1.4949,-0.043209,0.213914,1.543325,-0.165199,-1.53197,-1.706078,0.239023,-0.962737
1,21137,0.48894,-1.071994,-0.169053,0.005896,0.645513,0.226873,0.00835,-0.393018,-0.112132,0.418457,0.209269,0.490734,0.599445,0.400796,-0.235424,0.461379,-0.19841,-0.260389,-0.210395,0.083492,-0.417034,0.250042,-0.369893,0.436977,0.134372,0.377746,-1.016585,0.177668,0.142176,-0.281023,0.159387,0.085806,0.428145,-0.171543,0.399599,0.580232,0.023975,-0.551147,0.521113,0.201741,0.457943,0.602929,-0.45098,-0.309515,-0.022506,0.562622,-0.36606,-0.868004,-0.482,0.020069,0.228861,-0.100942,-0.268868,0.26037,-0.066975,-0.480731,-0.036262,0.618339,-0.224918,0.118141,0.027317,-0.581062,-0.175178,-0.592972,-0.452428,-0.660195,-0.810445,-0.743833,0.263803,0.820481,-0.176059,-0.167931,0.008489,0.06438,-0.28438,0.423906,0.074715,-0.066025,-0.053279,-0.015526,0.540443,0.221875,0.008151,-0.161192,-0.36198,-0.481994,-0.482474,0.057734,-0.114625,-0.342058,-0.060914,0.56112,0.252346,0.225239,0.093766,-0.403118,-0.465855,0.036376,0.202066,-0.187435
2,46979,0.585247,-0.580192,0.229223,-0.557701,-0.21262,-0.392506,-0.76817,-0.566538,-0.408713,-0.56611,0.284612,0.372882,-0.368035,0.246818,0.441033,0.187344,0.093674,0.079909,0.107954,0.007604,-0.041029,0.614836,-0.199368,0.357653,0.013743,-0.256645,0.367435,-0.270438,0.148032,-0.07488,-0.037992,-0.25179,0.129986,0.213501,0.704362,-0.478214,-0.416858,0.273656,-0.275017,-0.99856,-0.591249,0.595047,0.185176,-0.174462,-0.069881,-0.122427,-0.934116,0.034601,-0.609276,0.132374,-0.003258,0.282898,-0.673007,-0.262482,-0.460662,-0.609691,0.02373,0.180362,-0.152889,0.327437,0.308289,0.127156,0.671862,1.021824,-0.541045,0.595015,-0.235154,0.191937,0.248536,0.452162,-0.27858,0.155287,0.165723,0.139932,0.121873,0.442441,-0.505026,-0.658693,0.373433,-0.450443,0.104672,0.409075,0.586718,0.233794,-0.28245,0.235543,-0.059182,-0.401147,0.046142,-0.032179,0.224874,-0.05454,-0.246128,-0.104254,-0.178707,0.250769,-0.332059,-0.043698,-0.269527,-0.413866
3,21903,0.434819,-0.224153,0.012649,-0.553659,-0.075412,-0.257416,-0.385184,-0.547945,0.124002,0.070073,0.032035,-0.00186,-0.308162,-0.003908,-0.380619,0.680212,-0.089089,-0.058465,0.005208,-0.445409,-0.342089,0.757735,-0.251449,0.003823,-0.08607,0.441477,-0.338572,-0.219321,0.281067,-0.295155,-0.480073,-0.257283,0.112104,0.463972,0.371576,0.188066,0.082587,0.278639,-0.30279,-0.650368,-0.77015,0.428568,-0.357414,-0.210241,0.194125,-0.463022,-0.677743,-0.206159,-0.182604,0.163094,0.043676,-0.274664,-0.396438,0.023866,-0.10376,-0.447145,-0.123595,0.730513,0.02989,0.597319,0.066663,0.476858,0.098387,-0.063495,0.087863,-0.125483,-0.467052,-0.429125,0.406182,-0.378713,-0.431863,-0.645996,0.098259,0.312438,0.198682,0.381043,0.009886,-0.220812,0.073221,-0.795371,-0.001049,0.269766,0.202934,-0.6928,0.148736,0.014512,0.003849,-0.586731,-0.753334,0.107392,-0.501824,-0.052725,0.072045,-0.55822,-0.038528,-0.272866,-0.129835,0.21828,-0.419997,-0.257231
4,49683,-0.049623,-0.534763,-0.707338,-0.477019,-1.50556,0.585213,-1.889815,0.392291,-0.086377,-0.028838,0.936506,0.451442,1.019759,-0.060308,0.924176,0.840881,0.20544,-0.703371,-0.548277,0.425135,0.583462,1.040204,-1.585193,-0.076489,0.908613,0.18134,1.426365,-0.156262,0.213217,-0.624737,-1.338209,0.710292,-0.24535,1.142536,0.619312,0.016358,-0.690533,1.014226,0.017052,-0.743026,-1.185745,0.378496,-0.316733,-0.899677,1.357139,-0.92323,-0.073746,0.283599,2.042454,0.05864,0.464912,-0.180829,-1.116603,-0.111913,-0.211854,-0.390978,0.079483,-0.068723,0.006308,0.417501,1.09326,1.467909,1.237343,0.43038,-0.795036,0.187856,-0.84144,-0.220719,-0.509319,0.971712,0.522735,-0.697037,-1.113412,0.474727,-0.081141,-0.263586,-0.149639,0.100196,1.237195,-0.50705,-0.124594,0.299179,-0.230294,-0.170366,0.21444,-0.217718,0.316395,-0.933991,-0.726848,0.805449,1.057533,-0.618818,-1.067718,-0.025348,0.152755,-0.642574,0.542651,-0.009784,-0.608545,1.254043
5,10749,0.645574,-0.545723,-0.393763,-0.648167,2.032516,-0.376805,-0.065862,-1.10282,-1.021526,-1.810159,0.321852,1.398401,-0.812233,-1.098623,-1.348334,-0.070146,-0.583207,0.031744,0.631258,-0.967493,1.043396,-0.337173,0.501597,-0.779834,-0.791076,1.844341,0.231213,-0.380001,0.07568,0.570225,0.668406,-0.73104,0.495195,0.019183,0.957285,-0.206571,0.149202,-1.203705,-0.003303,-0.61665,0.506839,-0.3952,0.901999,-0.537309,-0.630042,0.664708,-0.874858,-0.856624,-0.239243,0.487979,-0.984103,-0.739511,-1.864703,-0.054747,-0.210808,0.939329,0.586178,0.342434,-1.227133,-0.232737,1.638732,-1.389683,0.806077,-1.104511,0.124272,-1.063019,-1.186152,-1.766789,1.028826,0.41765,0.290937,-1.174519,-0.634217,0.782197,-0.74008,0.465363,0.360025,0.284332,-0.406627,-0.364404,-0.164608,-0.227985,1.357166,-0.32583,0.13903,0.95553,-0.393255,-0.554762,0.823933,0.573133,-1.10023,-0.088634,0.262438,0.570447,0.049114,-0.83586,-0.517945,-0.267511,-0.394208,-0.104273
6,1244,1.167482,-1.824675,-0.000657,0.214223,0.135031,-1.113626,-0.181973,-0.480506,-0.186647,0.180691,0.755087,0.851954,-1.021616,0.589651,-0.677689,-0.458203,0.57166,0.845086,-0.000882,-1.68578,1.411893,0.306602,0.631874,-0.277806,-0.917689,1.202549,-1.88745,0.126046,0.439075,-0.255589,0.733304,0.236376,0.006376,-0.374914,0.741193,-0.529836,0.098084,-1.305673,1.760272,-0.286633,-0.453928,-0.388195,0.767929,-0.914401,-0.385411,1.761825,-0.902061,-1.904744,-0.973367,0.336977,-1.136791,0.366595,-0.8306,-0.810949,-0.948222,1.084326,-0.237001,1.769902,-1.620376,0.495024,3.294459,-0.28993,0.700462,0.453867,-0.101414,0.489078,-0.606938,-1.535769,0.246308,1.628106,2.120446,-0.546357,-0.521431,0.733549,-0.394643,-0.261685,1.139693,1.304614,0.482331,0.189009,0.445727,0.112337,-0.005788,-0.933175,0.372464,0.171226,0.557263,-0.628716,-0.174228,0.47454,-0.880423,1.621027,1.531805,0.640519,0.065501,-0.939629,-0.911412,0.580552,-1.066682,-0.055762
7,24549,0.537242,0.084371,0.856804,1.553439,-0.700766,-0.086313,-1.845856,0.528019,1.98028,0.523658,0.636062,0.107429,0.17834,-0.753219,0.141916,1.463032,1.061287,-0.174397,0.065489,0.197293,0.96381,1.126998,-1.399552,-0.489506,0.22646,-0.216752,-0.577918,2.113178,-1.644146,-0.282701,1.118914,-1.321378,0.644366,0.263169,-0.546911,-0.597486,-0.852419,-0.652368,0.782375,-0.416092,-0.177159,-0.191269,0.928932,1.270393,-0.889399,-1.518192,0.251023,1.433392,-0.092335,0.134467,-0.958144,0.721561,0.58064,-0.756938,0.962579,0.490791,-0.721272,2.353241,-0.087374,-1.648346,-1.095874,-0.179073,2.306875,-1.206472,1.059403,-0.605918,0.124758,-1.563219,0.721532,-0.711832,1.748825,-0.720733,-0.179139,1.711,0.289804,0.817922,-0.19155,0.472988,-2.500397,0.18352,-0.131575,-1.303024,-0.45516,1.725162,-0.331795,0.864578,-0.063119,-0.228587,-0.247452,-1.981557,0.743804,-0.402773,1.887477,0.087734,-0.433637,-0.230832,-0.615138,0.464756,1.269189,0.125945
8,32192,1.439429,0.811559,-0.122025,0.406574,-0.406231,-0.206368,-0.593515,-0.131405,-0.702731,0.265212,-1.429797,-0.80737,-1.204991,-0.132231,0.06462,-0.233888,0.787059,0.452166,0.130484,1.361783,-1.417143,-0.661384,-1.27537,-0.851455,-0.553551,0.525029,0.572607,-0.366949,0.004244,-0.157267,1.373517,0.088249,0.23166,-0.999208,-1.208114,1.65884,-0.37409,0.049595,-0.432088,-0.18644,-1.126535,0.697217,0.289748,1.551484,0.77453,-0.878347,0.012486,-0.458575,0.042718,0.102588,-0.244002,-0.123589,-0.661967,-0.399119,-0.232336,0.880266,0.422695,-0.650166,0.345803,-1.00239,-0.701907,0.560951,-0.180103,0.218222,0.603974,0.579493,-0.005074,0.677266,-0.775609,0.230677,1.038542,0.121798,0.07216,0.641429,-0.223722,-0.648307,0.228176,0.357181,0.941371,0.483523,-1.008186,-0.473454,0.101563,0.193801,-0.481233,1.428299,-0.056349,0.918364,0.032688,-0.621941,-0.136032,-0.154511,0.353988,0.072266,1.009315,-0.487643,-0.229117,-1.121546,-0.100319,0.056383
9,38777,0.621222,-1.377849,-0.194323,0.478726,0.589449,0.199232,-0.317532,-0.088121,0.258901,0.340827,0.110791,0.404158,0.763451,-0.252299,0.066437,-0.397476,0.179455,-0.455909,-0.31629,0.569588,0.356162,-0.023918,-0.191066,-0.236628,-0.01431,0.245127,-1.039256,0.462992,-0.187963,-0.276525,0.000189,-0.128162,0.461991,-0.452035,0.130445,0.531785,0.576413,-0.455512,0.581332,0.746371,0.472588,0.515113,-0.416082,-0.70041,0.058143,1.094586,-0.186239,-0.782022,0.229512,0.143029,0.471555,-0.089087,-0.464384,0.560773,0.189747,0.499263,0.120004,0.057445,-0.260132,0.1122,0.619624,-0.285313,-0.416213,-0.394584,0.267156,0.103544,-0.422658,-0.997863,0.424626,0.650213,-0.19727,-0.762006,0.371641,0.316861,0.055102,0.703557,0.649634,0.136683,-0.224295,-0.076791,0.210587,0.369158,-0.149722,-0.120058,-0.499743,0.199556,-0.028547,0.0896,-0.017268,-0.432484,0.574678,0.555396,0.117178,0.654623,0.854425,-0.867769,-0.92099,0.03975,-0.143283,-0.433473


## Merge product vectors with product info

In [135]:
#===============================================
# merge relevant columns
#===============================================

# relevant columns from product info
prod_info_cols = ["product_id", "product_name", "department", "aisle"]

# merge
word_vec_df = pd.merge(products[prod_info_cols], word_vec_df, on = "product_id", how = "inner")
print(word_vec_df.shape)

(14340, 104)


In [136]:
display(word_vec_df.head(10))

Unnamed: 0,product_id,product_name,department,aisle,wv1,wv2,wv3,wv4,wv5,wv6,wv7,wv8,wv9,wv10,wv11,wv12,wv13,wv14,wv15,wv16,wv17,wv18,wv19,wv20,wv21,wv22,wv23,wv24,wv25,wv26,wv27,wv28,wv29,wv30,wv31,wv32,wv33,wv34,wv35,wv36,wv37,wv38,wv39,wv40,wv41,wv42,wv43,wv44,wv45,wv46,wv47,wv48,wv49,wv50,wv51,wv52,wv53,wv54,wv55,wv56,wv57,wv58,wv59,wv60,wv61,wv62,wv63,wv64,wv65,wv66,wv67,wv68,wv69,wv70,wv71,wv72,wv73,wv74,wv75,wv76,wv77,wv78,wv79,wv80,wv81,wv82,wv83,wv84,wv85,wv86,wv87,wv88,wv89,wv90,wv91,wv92,wv93,wv94,wv95,wv96,wv97,wv98,wv99,wv100
0,1,Chocolate Sandwich Cookies,snacks,cookies cakes,0.744708,1.295512,-1.193265,0.598794,-0.641248,1.223336,0.919235,0.809076,0.790224,-1.406452,0.579844,-0.417504,0.320172,-1.006021,-1.247783,0.728549,0.282868,-0.163652,0.442921,0.128002,1.018741,-1.040465,0.126721,0.412622,0.333163,0.891142,-0.438378,-0.414526,-0.543069,-1.434892,-0.36965,-0.234173,-0.450169,0.144263,-0.889828,-0.092579,0.095353,-0.893399,-0.008923,-0.70394,-0.921108,-1.429784,0.343726,-0.504728,-1.58324,0.781951,-0.361618,-0.487692,0.108233,-1.003854,0.44551,0.412661,2.036789,0.664207,-0.936389,0.153528,-0.561427,0.708672,-1.219241,-0.778229,0.468867,-0.237254,-0.688795,0.463018,0.221844,-1.234477,-0.520427,0.230692,-1.529698,0.867693,-0.80247,-0.222265,-0.507642,0.082833,-0.856651,0.124394,-0.133245,-1.038114,-0.809275,-0.205799,-0.915241,-0.091222,-0.302496,0.530782,0.226141,-0.134771,-0.599206,-0.370918,0.447913,0.08971,-0.735691,-0.266758,-1.274789,-1.471167,0.414221,-0.482969,-0.722963,-0.789088,-0.890457,0.141746
1,3,Robust Golden Unsweetened Oolong Tea,beverages,tea,-0.156999,-0.097948,1.033049,-0.075712,-0.156379,0.859535,-0.202249,-0.69284,1.155985,-0.154656,1.157791,-0.287568,0.291294,0.716055,-0.463947,-0.26647,-0.43225,-0.462481,-0.609656,0.9397,0.484884,-0.209098,0.338069,1.126593,-0.584675,-1.76212,-1.089806,1.540631,0.531554,0.231101,0.813328,0.308757,-1.116319,0.098051,0.718829,0.123319,-0.361023,-0.134135,0.584352,-0.445583,0.308385,0.375197,-0.108583,-1.362428,0.132718,-1.448387,-0.245276,1.475555,1.410261,-0.275512,0.245381,-0.178379,0.160996,-0.317731,-0.28679,-0.621966,1.150026,-1.059867,-1.085093,-0.115807,0.323545,-1.090988,0.613781,0.329271,1.433473,0.560549,-0.153129,-0.580917,-1.053341,0.052376,-0.34375,1.419617,0.731421,0.625508,0.882422,-0.762834,-0.411799,0.003922,-1.166005,-0.027614,0.316082,-0.358464,-0.382854,1.93014,-0.251959,0.190719,0.46801,0.38275,0.392147,-0.921297,0.853766,0.890511,0.992793,0.780575,0.511955,0.541127,-0.39919,0.485569,-0.989308,-0.412728
2,4,Smart Ones Classic Favorites Mini Rigatoni With Vodka Cream Sauce,frozen,frozen meals,0.632169,-0.042714,0.447641,0.565454,-0.173308,-1.259952,-0.374098,0.565182,-0.501129,-0.326884,-1.161234,0.58826,1.618659,0.294504,0.15479,0.866748,-1.177649,-0.134442,-0.567716,-0.921026,0.309276,-0.598171,2.971751,-0.787572,-0.64975,-0.231039,1.806901,1.559718,0.50179,0.281565,0.014173,0.633412,0.325695,0.359038,-0.0701,0.124627,-0.962404,-0.134815,-0.145717,1.183958,0.241399,0.964502,1.174005,1.029604,0.15526,-0.624118,-0.044555,0.265823,1.074033,-1.058899,-0.206956,0.068395,1.224924,0.802075,-0.673816,2.257773,-1.068509,0.063531,1.522198,0.739853,1.355033,0.423484,0.501739,0.359911,0.319578,-0.454862,0.050564,0.328704,0.205272,-0.856989,0.670041,1.876261,1.597733,1.334862,-0.317515,-0.957292,-1.040504,-0.442202,-0.018599,0.2461,0.799682,-0.482413,-1.251701,-1.626282,-0.359774,0.296589,-0.049335,0.718455,1.660907,-0.985967,2.556258,-0.344786,-0.415751,-1.505674,-0.344438,0.000373,-0.931319,2.007142,-0.076104,-0.533512
3,10,Sparkling Orange Juice & Prickly Pear Beverage,beverages,water seltzer sparkling water,-0.253488,0.374834,-1.06959,0.956164,-0.311371,1.789404,0.26726,0.387516,-0.654127,1.027248,1.398794,0.410207,0.221405,1.188907,0.842219,0.270739,-1.056333,-1.88476,1.011292,0.223562,-0.034032,-0.055429,0.418091,0.480409,1.238376,0.885619,0.360691,3.544943,-0.051665,-0.017406,-0.987518,0.134707,-0.523529,0.258269,-0.30471,-0.891016,-0.382807,-0.938065,-0.871738,-0.025178,-0.21168,-0.965195,-0.190047,-1.046221,0.048076,-0.790697,1.053874,1.348278,1.777241,-0.112067,0.000398,-0.285349,0.411035,-1.454198,-0.068388,-0.343851,0.311053,-0.464327,-0.959483,1.488464,-1.142036,-1.981344,0.644177,-0.701146,0.385863,0.544416,-0.302204,0.667317,0.317814,-0.475939,0.729246,0.753712,0.588491,-0.187501,0.174661,-0.5897,1.184333,-0.253011,-1.038517,0.01551,-0.417327,-0.130255,0.523872,-0.082021,0.122164,0.55126,-0.024936,-0.428157,1.421382,0.789992,1.751679,-0.564681,-0.353809,0.96465,-0.701595,0.662892,-0.655238,0.252089,0.170364,-0.982335
4,12,Chocolate Fudge Layer Cake,frozen,frozen dessert,1.327038,1.344844,-0.715764,-0.140443,1.049643,-1.912242,0.343646,0.159571,-0.79021,0.403136,-0.281968,0.412603,0.347124,1.009087,0.802364,-0.692366,1.123618,-0.177917,0.107383,-1.384778,0.550639,-0.357951,-0.501989,0.804199,0.894451,-1.266077,-1.041235,-0.659717,-0.556226,-0.151734,-0.490867,-0.322606,0.701626,-0.448401,0.353176,-0.590934,-0.581699,-0.513539,0.366323,1.075075,0.502353,-0.451745,-0.928443,0.407792,0.541153,-1.002005,0.241243,-0.807081,1.255953,-0.177165,-0.706863,-0.288796,-0.049234,0.116173,-0.272984,0.631281,-0.386951,-2.198615,-0.302028,-0.46028,0.325847,0.030885,-2.105733,-0.270276,1.108249,0.379686,-0.683982,0.882951,0.176824,-0.823371,0.993661,-0.008259,0.203629,1.187102,0.553098,-0.10743,0.448566,0.580543,-0.688241,1.708184,0.44948,-1.134583,1.173021,-0.158643,0.477433,0.029551,0.278665,-0.013011,-1.108271,0.093632,0.029749,0.551239,-0.421093,-1.260726,0.405004,-0.380647,0.492613,0.691278,0.682424,-0.482249
5,23,Organic Turkey Burgers,meat seafood,packaged poultry,1.281689,1.048959,-0.087085,-0.259186,-0.41775,0.409531,-0.595253,0.745968,-0.450151,-0.047111,0.056733,1.116437,0.861204,-0.395207,-0.584873,-0.651139,-0.617091,1.195289,0.56252,0.331357,0.763006,0.769858,0.069841,0.29765,0.836339,-0.241667,0.974203,-0.348714,-0.601945,0.630662,0.56235,-1.417201,0.23381,-0.579337,1.458099,-0.150728,-0.004849,0.582879,-0.113335,0.627711,-1.29425,1.653909,-0.254007,0.560506,-0.912542,-0.60036,-1.010657,1.168578,-0.796146,-0.259093,0.277253,-0.89189,0.277,-0.817577,0.596441,-0.051523,-0.921772,-0.592638,1.776701,0.159778,0.066023,0.366919,0.43034,-0.610881,-0.48897,0.391611,-0.859604,-0.577904,0.62155,-0.338586,-0.246057,-1.836237,-0.957246,0.825464,0.366084,-0.917018,0.076385,-1.110705,0.380992,1.050979,-0.236705,0.103676,-0.513434,-0.679433,0.222142,0.004132,-0.127225,-0.501494,0.646623,0.50386,0.890724,-0.595622,-0.338392,0.451462,1.401664,-0.631341,-1.221392,0.302681,-0.342512,-0.041452
6,25,Salted Caramel Lean Protein & Fiber Bar,snacks,energy granola bars,0.704936,0.527594,-0.475766,0.067871,0.984812,1.075257,-1.00641,-0.84898,0.164817,-0.694122,-0.858648,0.779288,3.092391,1.050593,-0.034361,0.696025,-1.684608,0.124925,-1.535254,-0.288634,0.666161,-0.630741,0.411031,0.133784,-0.341806,-2.152892,-0.865416,-0.224923,0.07162,-0.099577,-0.056801,0.544065,2.118198,-0.337014,-0.159262,0.275371,0.672277,1.183733,0.063948,0.093446,0.352131,-1.317648,-0.495123,0.29075,0.283799,-0.911738,0.052114,-1.439304,-1.440612,-1.647701,-1.04306,-0.065286,-1.257465,0.521029,0.712134,-0.327804,-0.797467,1.466682,0.300995,1.538655,0.157716,0.329096,0.350415,-0.46552,0.136987,1.000664,1.11993,-2.255718,0.922869,-1.994733,1.333807,1.17683,-0.992273,-0.330782,-0.411566,-1.977853,0.125064,0.05945,-0.455411,0.003345,1.619707,0.247498,0.861991,0.500372,1.801972,2.886704,0.790417,-1.375822,-1.843633,-0.772261,0.662342,-0.105223,-0.373791,0.486574,0.021705,0.158155,0.534222,0.6782,-0.169715,0.545667
7,26,Fancy Feast Trout Feast Flaked Wet Cat Food,pets,cat food care,-0.25993,0.479054,0.024661,-0.974695,0.613846,-0.443279,1.217526,-0.139749,0.159328,-0.975453,2.365123,-0.199709,2.725972,-1.561348,0.871912,-1.512866,-0.64549,0.299155,0.716994,1.545311,0.166697,-1.288145,0.09195,1.136037,-0.337469,-1.059111,-0.069138,0.499691,-0.078959,2.206135,0.659091,-0.456524,0.315582,-2.157763,0.435822,0.659313,0.482266,-0.927864,-0.003133,0.881113,-1.201548,-1.370942,2.1467,-0.676909,1.899045,1.216807,-0.021967,0.233578,1.347754,0.147646,-1.287271,1.272144,0.970496,-0.846075,-1.04565,1.422048,1.107175,-1.341389,0.613931,0.832464,-0.10832,1.837682,0.063495,-0.459702,1.215817,-1.332855,0.222142,0.482275,-0.245079,-0.826059,0.196069,1.595432,0.019567,0.848055,1.254083,-0.293945,-0.856024,0.362408,-1.621047,-1.857358,0.807549,-0.243153,2.209505,3.598671,0.766132,0.634362,0.015468,0.892359,-0.313393,-0.777657,-0.091984,0.608246,-0.604637,0.547842,2.109301,-0.441909,-3.178446,0.897954,-0.31346,1.518307
8,28,Wheat Chex Cereal,breakfast,cereal,1.551934,-0.063192,-0.126314,0.054126,0.358737,0.194197,0.906389,-0.212437,0.634078,1.009118,-0.537664,0.097468,-1.044648,-1.020048,-0.417367,0.381773,-0.746397,-0.231332,0.096689,0.362068,-0.743048,-1.272645,-0.303648,-0.632498,-0.216862,0.003694,1.527987,0.264572,-0.626162,-0.461936,2.192944,0.356213,-0.532494,-0.400109,0.382776,0.305221,-0.500394,0.495915,0.660836,-0.011022,-0.511125,0.236746,-0.917441,0.863931,0.456143,0.281974,-0.844871,0.272576,-0.276962,-0.586735,-1.545401,-0.888963,0.950916,-0.252548,-0.188981,-0.302334,1.324622,0.278279,1.409999,-0.299727,0.169171,-1.258821,-0.724491,0.393844,0.586699,-0.548479,0.298617,0.291076,-0.070926,0.54922,-0.177431,0.396786,-0.235213,-1.852426,-0.833704,-0.13567,-0.221717,0.793202,-0.122024,0.621324,0.93873,-0.354383,-0.082007,0.200955,1.726734,0.172192,0.423436,1.599597,-0.599566,-0.550859,0.551727,0.831676,0.88752,-1.81873,0.17402,-0.159686,-0.213589,-0.185836,1.071417,-0.431435
9,29,Fresh Cut Golden Sweet No Salt Added Whole Kernel Corn,canned goods,canned jarred vegetables,-0.099535,0.026925,0.092476,0.040248,0.219087,-0.41171,-0.085742,0.118383,0.354839,0.216073,-0.419646,0.034342,0.322212,-0.962055,0.497153,-1.257611,-0.352518,-1.605562,-0.774982,0.082881,0.671315,-0.919024,0.647485,-0.317767,0.762318,-0.529241,0.406091,-0.115517,0.162307,0.384497,-0.206539,-0.28806,0.029108,-0.197216,0.607624,-1.182837,-1.029887,-0.082268,0.226504,0.352733,-0.039552,-0.759861,-0.079501,1.087697,-0.711835,-0.78451,0.125496,0.870484,-1.085535,0.384175,-0.641595,-0.037602,0.701198,-0.254137,-0.837875,1.1407,1.041522,0.189193,0.11809,-0.194093,0.120151,0.595512,-0.175523,1.014627,-0.866555,0.73841,-0.175047,-0.672686,1.470878,0.179683,0.132148,0.016398,-0.031339,-0.717995,-0.359915,-0.58154,0.036269,0.42813,0.577695,0.297692,1.562186,-0.416386,0.191496,-0.393401,0.312084,0.145899,0.273367,0.32845,-0.346311,-0.187861,0.156926,-0.040625,-0.490074,-0.517148,-0.40848,1.033066,0.542465,0.262616,-0.649964,-0.491701


## 5.2. Fit t-sne

In [137]:
# =================================================
# fit t-sne
# =================================================
tsne = TSNE(n_components = 2, verbose = 1, perplexity = 35, n_iter = 400)

In [138]:
# fit
t0 = time.time()
tsne_fit = tsne.fit_transform(word_vec_df[w2v_vec_names])
t1 = time.time()

[t-SNE] Computing 106 nearest neighbors...
[t-SNE] Indexed 14340 samples in 0.077s...
[t-SNE] Computed neighbors for 14340 samples in 24.032s...
[t-SNE] Computed conditional probabilities for sample 1000 / 14340
[t-SNE] Computed conditional probabilities for sample 2000 / 14340
[t-SNE] Computed conditional probabilities for sample 3000 / 14340
[t-SNE] Computed conditional probabilities for sample 4000 / 14340
[t-SNE] Computed conditional probabilities for sample 5000 / 14340
[t-SNE] Computed conditional probabilities for sample 6000 / 14340
[t-SNE] Computed conditional probabilities for sample 7000 / 14340
[t-SNE] Computed conditional probabilities for sample 8000 / 14340
[t-SNE] Computed conditional probabilities for sample 9000 / 14340
[t-SNE] Computed conditional probabilities for sample 10000 / 14340
[t-SNE] Computed conditional probabilities for sample 11000 / 14340
[t-SNE] Computed conditional probabilities for sample 12000 / 14340
[t-SNE] Computed conditional probabilities for s

In [139]:
# time taken
f"Time Taken: {t1 - t0}"

'Time Taken: 187.86533784866333'

## T-sne component data frame

In [140]:
#===============================================
# create t-sne data frame
#===============================================

tsne_df = word_vec_df[["product_name", "department", "aisle"]]

# extract t-sne dimensions
tsne_df["x_tsne"] = tsne_fit[:,0]
tsne_df["y_tsne"] = tsne_fit[:,1]
print(tsne_df.describe())

             x_tsne        y_tsne
count  14340.000000  14340.000000
mean   0.081110     -0.013953    
std    10.179037     7.783475    
min   -22.075830    -19.978592   
25%   -7.855527     -5.532616    
50%   -0.820201     -0.028690    
75%    8.695611      5.579124    
max    22.841021     21.091007   


## Plot with ggplot

In [141]:
#===============================================
# subset data for plot
#===============================================

# select only top departments
select_dept = ["produce", "babies", "beverages"]
tsne_plot_df = tsne_df.loc[tsne_df["department"].isin(select_dept), :]
print(tsne_plot_df.shape)

(2726, 5)


In [None]:
tsne_plot = ggplot(tsne_plot_df, aes(x = "x_tsne", y = "y_tsne", color = "department") ) \
        + geom_point(size = 70, alpha = 0.5) \
        + ggtitle("T-sne on product vectors") \
        + xlab(" ") + ylab(" ")
tsne_plot

---

## In-class exercise

## Learning embeddings using Skip-gram

- We will now use another method to train embeddings called skip-gram

In [None]:
#===============================================
# model parameters
#===============================================

# size of embedding matrix
emb_size  # set between 50-100

# context window size
cxt_window # set between 2-10 

# batch size for gradient update
batch_size # set between 2000 to 10000

# learning rate
lr # set between 0.001 to 0.01

In [None]:
#===============================================
# define and train model
#===============================================

t0 = time.time()
### Write model code here
t1 = time.time()

In [None]:
# time taken
f"Time Taken: {t1 - t0}"

In [None]:
# find the most similar products to 
prod_id = "1000"

# product info for sample product
display(products.loc[products["product_id"].isin([prod_id]), :])