## Set working directory

In [2]:
import os
os.getcwd()

'/mnt/c/Users/jessi/OneDrive/Desktop'

In [3]:
os.chdir('/mnt/c/Users/jessi/OneDrive/Desktop')

In [4]:
os.getcwd()

'/mnt/c/Users/jessi/OneDrive/Desktop'

## Read in and prepare data

In [5]:
# Import the libraries to be used
import pandas as pd
import gensim
import numpy as np
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

In [6]:
# Read in the datasets we will be using. 
prior_orders = pd.read_csv('Instacart Data/order_products__prior.csv')
products = pd.read_csv('Instacart Data/products.csv')

In [7]:
prior_orders.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


In [8]:
#check for missing values, noting none.
prior_orders.isnull().sum()

order_id             0
product_id           0
add_to_cart_order    0
reordered            0
dtype: int64

In [9]:
# Split the dataset into train and test sets (80/20) split, based on order_id.
from sklearn.model_selection import GroupShuffleSplit
train_set, test_set = next(GroupShuffleSplit(test_size=.20, n_splits=2, random_state = 47).split(prior_orders, groups=prior_orders['order_id']))

train = prior_orders.iloc[train_set]
test = prior_orders.iloc[test_set]

In [9]:
# Because we will be training our model on product_id, we will be converting the column to string.
train["product_id"] = train["product_id"].astype(str) 

In [10]:
#Obtain sentences, where a sentence is the list of all product_ids per order.
sentences = train.groupby("order_id")['product_id'].apply(list)

In [11]:
#Save the longest sentest length. This will be used in training our model.
longest = np.max(sentences.apply(len))

In [12]:
# Convert our sentences to an array.
sentences = np.array(sentences)

## Train the word2vec model

NOTE: in order to be able to reproduce a trained model, the model must be limited to a single worker thread (workers = 1), as per the gensim documentation. However, running even the base model with workers = 1 was taking far too long that it was not feasible. If the below models are rerun, they will likely result in different results.

Source: https://radimrehurek.com/gensim/models/word2vec.html

In [None]:
# Train the base model skip-gram. We're using the longest sentence (defined above) as the window size and choosing to ignore no words (setting min_count at 0) to capture as many words as possible for when running this model with the train_order dataset below.
# sg = 1: 1 for skip-gram
# hs = 0: if 1, hierarchial softmax used
# ns_exponent = 0: samples all words equally
# iter = 5: default, number of iterations over the corpus

#model_base_split = gensim.models.Word2Vec(sentences, sg=1, size=100, window=longest, min_count=0, hs=0, ns_exponent=0, iter=5, workers=4)
#model_base_split.save('/mnt/c/Users/jessi/OneDrive/Desktop/model_base_split')

The calculation in a Word2Vec model is intensive, as each iteration sums over all words in the vocabulary. Negative sampling is a way to lessen this; instead of predicting the probability of words being nearby, we predict the probability if the words are nearby neighbors or not. To see if this improves our recommendations, we train a model with negative sampling: model_ns10_split.

Source: https://towardsdatascience.com/nlp-101-negative-sampling-and-glove-936c88f3bc68

In [None]:
# Train the model with negative sampling of 10.
#model_ns10_split = gensim.models.Word2Vec(sentences, sg=1, size=100, window=longest, min_count=0, hs=0, ns_exponent=0, negative = 10, iter=5, workers=4)
#model_ns10_split.save('/mnt/c/Users/jessi/OneDrive/Desktop/model_ns10_split')

Another parameter we can manipulate is vector size, so we train a model to output feature vectors of size 200, instead of size 100. This model is below: model_s200_split.

In [None]:
# Train the model with vector size of 200.
#model_s200_split = gensim.models.Word2Vec(sentences, sg=1, size=200, window=longest, min_count=0, hs=0, ns_exponent=0, iter=5, workers=4)
#model_s200_split.save('/mnt/c/Users/jessi/OneDrive/Desktop/model_s200_split')

## Create the dictionary of product_ids and product_names.

In [10]:
# We want to create a dictionary to be able to look up what each product_id is. To start, we merge merge prior_orders and products to have product_id and product_name in one dataframe.
prior_orders = pd.read_csv('Instacart Data/order_products__prior.csv')
prior_orders_merge = pd.merge(prior_orders, products, on="product_id")

In [11]:
prior_orders_merge

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id
0,2,33120,1,1,Organic Egg Whites,86,16
1,26,33120,5,0,Organic Egg Whites,86,16
2,120,33120,13,0,Organic Egg Whites,86,16
3,327,33120,5,1,Organic Egg Whites,86,16
4,390,33120,28,1,Organic Egg Whites,86,16
...,...,...,...,...,...,...,...
32434484,3265099,43492,3,0,Gourmet Burger Seasoning,104,13
32434485,3361945,43492,19,0,Gourmet Burger Seasoning,104,13
32434486,3267201,33097,2,0,Piquillo & Jalapeno Bruschetta,81,15
32434487,3393151,38977,32,0,Original Jerky,100,21


In [12]:
# Create dictionary of product_id and product_name.
prior_subset = prior_orders_merge[["product_id", "product_name"]]
prior_subset.drop_duplicates(inplace=True, subset='product_id', keep="last")
prior_dict = prior_subset.groupby('product_id')['product_name'].apply(list).to_dict()

## Look at recommendations of a random order on the prior_orders_test dataset

In [13]:
# Load models.
model_base_split = gensim.models.Word2Vec.load('/mnt/c/Users/jessi/OneDrive/Desktop/model_base_split')
model_ns10_split = gensim.models.Word2Vec.load('/mnt/c/Users/jessi/OneDrive/Desktop/model_ns10_split')
model_s200_split = gensim.models.Word2Vec.load('/mnt/c/Users/jessi/OneDrive/Desktop/model_s200_split')

In [14]:
# Look at an overview of the model. They each have the same number of words, which is expected.
print(model_base_split)
print(model_ns10_split)
print(model_s200_split)

Word2Vec(vocab=49641, size=100, alpha=0.025)
Word2Vec(vocab=49641, size=100, alpha=0.025)
Word2Vec(vocab=49641, size=200, alpha=0.025)


In [15]:
#test.sample(1)
#Result: order_id 201920

In [16]:
# Look at all of the items that were in order #207820 (randomly selected order).
test.loc[test['order_id'] == 207820]

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
1970840,207820,22802,1,1
1970841,207820,38768,2,1
1970842,207820,6184,3,0
1970843,207820,21903,4,1
1970844,207820,10424,5,0
1970845,207820,33081,6,0


In [17]:
# Look up the product names of the items in order 207820.
temp_test = np.array(test['product_id'][test['order_id'] == 207820])

for row in temp_test:
    print(prior_dict[int(row)])

['Mineral Water']
['Sweet Kale Salad Mix']
['Clementines']
['Organic Baby Spinach']
['Triple Creme Brie']
['Chicken Apple Sausage']


In [18]:
# Convert the model into vectors.
word_vectors = model_base_split.wv
word_vectors_ns10 = model_ns10_split.wv
word_vectors_s200 = model_s200_split.wv

In [19]:
#Look up the index of order_id in the dataset containing a list of all unique order_ids used for predictions
x = test.loc[test['order_id'] == 207820]
x['product_id'] = x['product_id'].astype(str)
x = x['product_id']

In [20]:
# Calculate the average vector for order 207820 by taking each feature vector of each ID and taking an average.
xx = []
for i in x:
    a = model_base_split[i]
    xx.append(a)
    avg_base = np.mean(xx, axis=0)
    
yy = []
for i in x:
    a = model_ns10_split[i]
    yy.append(a)
    avg_ns10 = np.mean(yy, axis=0)
    
zz = []
for i in x:
    a = model_s200_split[i]
    zz.append(a)
    avg_s200 = np.mean(zz, axis=0)

In [21]:
# Run the averaged vectors through the model to get the similar items.
base_pred = model_base_split.similar_by_vector(avg_base)
ns10_pred = model_ns10_split.similar_by_vector(avg_ns10)
s200_pred = model_s200_split.similar_by_vector(avg_s200)

In [22]:
# Look at the predicted items
for row in base_pred:
    print(prior_dict[int(row[0])])

['Bing Dark Red Sweet Cherries']
['Grapefruit Holiday Box']
['Eternal Alkaline Water']
['Garlic Naan Bread']
['Natural Chicken Breast Strips']
['Premium Organic Quinoa']
['Organic Dry Roasted & Salted Cashews']
['Gold Nugget Mandarins']
['Calcium 600mg + D3 Tablets']
['Organic Sopressata']


In [23]:
for row in ns10_pred:
    print(prior_dict[int(row[0])])

['Bing Dark Red Sweet Cherries']
['Calcium 600mg + D3 Tablets']
['Garlic Naan Bread']
['Grapefruit Holiday Box']
['Gold Nugget Mandarins']
['Eternal Alkaline Water']
['Pure Mint With Herbal Accent Sugar Free Gum']
['White Flesh Nectarines']
['Organic Sopressata']
['Organic Dry Roasted & Salted Cashews']


In [24]:
for row in s200_pred:
    print(prior_dict[int(row[0])])

['Bing Dark Red Sweet Cherries']
['Garlic Naan Bread']
['Gold Nugget Mandarins']
['Eternal Alkaline Water']
['Calcium 600mg + D3 Tablets']
['Grapefruit Holiday Box']
['Organic Dry Roasted & Salted Cashews']
['Organic Sopressata']
['Natural Chicken Breast Strips']
['Premium Organic Quinoa']


With the original items being Mineral Water, Sweet Kale Salad Mix, Clementins, Organic Baby Spinach, Triple Cream Brie, and Chicken Apple Sausage, we have some items that appear to be reasonable predictions, such as the Bing Dark Red Sweet Cherries, Gold Nugget Mandarines, Eternal Alkaline Water, and White Flesh Nectarines. Our models do appear to be reasonable so far on the test part of the training dataset.

## Testing the model

In [25]:
# Read in the test set.
test_orders = pd.read_csv('Instacart Data/order_products__train.csv')

In [26]:
test_orders

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,1,49302,1,1
1,1,11109,2,1
2,1,10246,3,0
3,1,49683,4,0
4,1,43633,5,1
...,...,...,...,...
1384612,3421063,14233,3,1
1384613,3421063,35548,4,1
1384614,3421070,35951,1,1
1384615,3421070,16953,2,1


In [27]:
# Convert the product_id column to strings.
test_orders["product_id"] = test_orders["product_id"].astype(str) 

In [28]:
# Check to see which product ids are not in the word2vec model vocabulary.

# Obtain a list of all the unique product ids in the test_orders_train dataframe
temp1 = test_orders["product_id"].unique()

In [29]:
# Check if the product ids are in the vocabulary.

check = []
for word in temp1:
    x = word in word_vectors.vocab
    check.append([word, x])
    
check_ns10 = []
for word in temp1:
    x = word in word_vectors_ns10.vocab
    check_ns10.append([word, x])
    
check_s200 = []
for word in temp1:
    x = word in word_vectors_s200.vocab
    check_s200.append([word, x])

In [30]:
# Convert the list to a dataframe.
check = pd.DataFrame(check)
check_ns10 = pd.DataFrame(check_ns10)
check_s200 = pd.DataFrame(check_s200)

In [31]:
# Check if the resulting dataframes are true (they should be, as all models have the same vocabulary).
print(check.equals(check_ns10))
print(check.equals(check_s200))
print(len(check[check[1] == False]))

True
True
21


Because the model was trained on the orders in the prior_order data, if a product is in the train_order dataset but not in the prior_order dataset, the model will not be able to use it to make a prediction. As such, we are using only the orders that have products in the prior_order dataset; there are only 21 items that are not included in the model vocabulary. Because all of the models had the same check results, we will proceed using just the check of the base model.

In [32]:
# Rename column names in the check dataframe for merging.
check = check.rename(columns={0: 'product_id', 1: 'exists'})

In [33]:
# Create new dataframe that indicates whether or not product ids exist.
test_orders_check = pd.merge(test_orders, check, on='product_id')

In [34]:
# Create new dataframe to have only product ids that exist.
test_orders_exist = test_orders_check[test_orders_check['exists'] == True]

In [35]:
# Look at the dataframe.
test_orders_exist.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,exists
0,1,49302,1,1,True
1,816049,49302,7,1,True
2,1242203,49302,1,1,True
3,1383349,49302,11,1,True
4,1787378,49302,8,0,True


In [36]:
# Obtain sentences.
sentences_train_true = test_orders_exist.groupby("order_id")['product_id'].apply(list)
longest = np.max(sentences_train_true.apply(len))

### Run the various models.

In [37]:
# Calculate the average of the vectors (per order) that result from putting each id through the model.
sentences_train_single_temp_base = []

for i in range(0,len(sentences_train_true)):
    x = np.mean(model_base_split[sentences_train_true.iloc[i]], axis=0)
    sentences_train_single_temp_base.append(x)

#for model with negative sampling 10
sentences_train_single_temp_ns10 = []

for i in range(0,len(sentences_train_true)):
    x = np.mean(model_ns10_split[sentences_train_true.iloc[i]], axis=0)
    sentences_train_single_temp_ns10.append(x)
    
#for model with vector size 200
sentences_train_single_temp_s200 = []

for i in range(0,len(sentences_train_true)):
    x = np.mean(model_s200_split[sentences_train_true.iloc[i]], axis=0)
    sentences_train_single_temp_s200.append(x)

In [38]:
# Run the averaged vectors through the model to get the similar items.
predictions_base = []

for i in range(0,len(sentences_train_single_temp_base)):
    x = model_base_split.similar_by_vector(sentences_train_single_temp_base[i])
    predictions_base.append(x)
    
#for model with negative sampling 10
predictions_ns10 = []

for i in range(0,len(sentences_train_single_temp_ns10)):
    x = model_ns10_split.similar_by_vector(sentences_train_single_temp_ns10[i])
    predictions_ns10.append(x)
    
#for model with vector size 200
predictions_s200 = []

for i in range(0,len(sentences_train_single_temp_s200)):
    x = model_s200_split.similar_by_vector(sentences_train_single_temp_s200[i])
    predictions_s200.append(x)

In [39]:
# Obtain a list of all the unique order_ids in the test_order_train dataset to merge with the recommendations into a dataframe.
test_order_ids = test_orders_exist["order_id"].unique()

In [40]:
# Create a dataframe.
preds_base = (test_order_ids, predictions_base)
preds_base = pd.DataFrame(preds_base)

preds_ns10 = (test_order_ids, predictions_ns10)
preds_ns10 = pd.DataFrame(preds_ns10)

preds_s200 = (test_order_ids, predictions_s200)
preds_s200 = pd.DataFrame(preds_s200)

In [41]:
# Transpose the dataframe.
preds_base = preds_base.T
preds_ns10 = preds_ns10.T
preds_s200 = preds_s200.T

In [42]:
# Rename columns.
preds_base2 = preds_base.rename(columns={0: 'order_id', 1: 'predicted products'})
preds_ns10_2 = preds_ns10.rename(columns={0: 'order_id', 1: 'predicted products'})
preds_s200_2 = preds_s200.rename(columns={0: 'order_id', 1: 'predicted products'})

### Compare predicted results versus the actual products per order.

In [43]:
#Randomly select an order_id.
#test_orders.sample(1)
#Result: 232454

In [44]:
# Look at all of the items that were in order #232454 (randomly selected order).
test_orders.loc[test_orders['order_id'] == 232454]

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
93612,232454,14852,1,0
93613,232454,45,2,1
93614,232454,39993,3,1
93615,232454,44142,4,1
93616,232454,45066,5,0
93617,232454,35989,6,1
93618,232454,31553,7,1
93619,232454,36315,8,1
93620,232454,19488,9,1
93621,232454,881,10,1


In [45]:
# Look up the product names of the items in order 38.
temp_original = np.array(test_orders['product_id'][test_orders['order_id'] == 232454])

for row in temp_original:
    print(prior_dict[int(row)])

['All Natural Marinara Sauce']
['European Cucumber']
['Vine Ripe Tomatoes']
['Red Onion']
['Honeycrisp Apple']
['Coconut Flavored Sparkling Water']
['Fresh Ginger Root']
['Low Fat Split Pea Soup']
['Select-A-Size White Paper Towels']
['Heavy Duty Scrub Sponges']
['Tall Kitchen Bags, Drawstring, Lavender, 13 Gal, Mega Pack']
['Yellow Onions']
['100% Natural Beef Broth']
['Super Spinach! Baby Spinach, Baby Bok Choy, Sweet Baby Kale']
['Coffee, Coffee BuzzBuzzBuzz!® Ice Cream']
['Traditional Rope Hung Smoked Scottish Salmon']
['Green Tea With Ginseng and Honey']


In [46]:
#Look up the index of order_id in the dataset containing a list of all unique order_ids used for predictions
np.where(test_order_ids == 232454)

(array([84030]),)

In [47]:
# Look at the predicted items
temp = preds_base2.iloc[84030]['predicted products']
temp_ns10 = preds_ns10_2.iloc[84030]['predicted products']
temp_s200 = preds_s200_2.iloc[84030]['predicted products']

In [48]:
# The results from the base model appears to have recommended products most similar to the original products. 
for row in temp:
    print(prior_dict[int(row[0])])

['Cookies']
['Ice Cream Cake Celebration']
['Variety Pack Grab & Snack']
['Unsweetened Lemon Flavor Real Brewed Tea']
["Organic Fruit Snacks Bunch O' Berries"]
['Clear Strips']
['Whitening Plus Scope Crest Complete Multi-Benefit Cool Peppermint Flavor Liquid Gel Toothpaste']
['Original Cheesy Made Easy Macaroni & Cheese Dinner']
['Homestyle Pretzels']
['Natural Premium Coconut Water & Pineapple Juice From Concentrate']


In [49]:
#The below is the result displayed in the PowerPoint.
for row in temp_ns10:
    print(prior_dict[int(row[0])])

['Ice Cream Cake Celebration']
['Cookies']
['Unsweetened Lemon Flavor Real Brewed Tea']
['Variety Pack Grab & Snack']
["Organic Fruit Snacks Bunch O' Berries"]
['Natural Premium Coconut Water & Pineapple Juice From Concentrate']
['Strawberry Banana on the Bottom Greek Yogurt']
['Chips Deluxe Mini Rainbow Cookies']
['Clear Strips']
["Frosted St. Patrick's Day Cookies"]


In [50]:
for row in temp_s200:
    print(prior_dict[int(row[0])])

['Cookies']
['Ice Cream Cake Celebration']
['Variety Pack Grab & Snack']
['Clear Strips']
['Strawberry Banana on the Bottom Greek Yogurt']
['Unsweetened Lemon Flavor Real Brewed Tea']
['Special K Fudge Mini Brownies']
['Whitening Plus Scope Crest Complete Multi-Benefit Cool Peppermint Flavor Liquid Gel Toothpaste']
['Cool Brew Peach Black Iced Tea']
['Plenti Greek Coconut Low Fat Yogurt']


The original order included a variety of produce, soup, and general household items. 

All three models made recommendations that were reasonable, but while the base model had four reasonable recommendations (cookies and ice cream (desserts, like the ice cream), unsweetened lemon flavor real brewed tea (tea, like the green tea with ginseng and honey), and natural premium coconut water and pineapple juice from concentrate (coconut flavored water), the model with negative sampling and the model with the vector size of 200 had six and five, respectively.

The items in the negative sampling model that appear reasonable are: ice cream cake celebration, cookies, chips deluxe mini rainbow cookies, and frosted St. Patrick's Day cookies (desserts), unsweetened lemon flavor real brewed tea (tea), natural premium coconut water (coconut water).

The iteams in the model with vector size 200 that appear reasonable are: cookies, ice cream cake celebration, Special K fudge mini brownies (desserts), unsweetened lemon flavor real brewed tea and cool brew peach black iced tea (tea).

In this example, the negative sampling model performed the best (this was the result in the PowerPoint).

## Checking vectors of similar and dissimilar products

In [51]:
# Create a dataframe to have both product_id and product_name
prior_orders_merge = pd.merge(prior_orders, products, on = "product_id")

In [52]:
#Look at the top most similar items to Apple Juice
sim_apple_juice = model_ns10_split.most_similar("38200")[:5]
sim_apple_juice

[('32156', 0.8086172342300415),
 ('25146', 0.776132345199585),
 ('41290', 0.7720734477043152),
 ('39108', 0.7273651361465454),
 ('43967', 0.7117173671722412)]

In [53]:
# Look up what the similar products are.
for row in sim_apple_juice:
    print(prior_dict[int(row[0])])

['Cranberry Juice Cocktail']
['Original Orange Juice']
['Lemonade']
['Pulp Free Orange Juice']
['Raspberry Lemonade']


In [54]:
# Check the differences between vectors for similar products and dissimilar products
# We expect similar products to have more similar vectors and dissimilar products to have less similar vectors.
x = model_ns10_split["32156"]
y = model_ns10_split["25146"]
z = model_ns10_split["41290"]
aa = model_ns10_split["39108"]
bb = model_ns10_split["43967"]

In [55]:
# Look at the mean of the absolute value of differences of the vectors of the first item versus the frest
print(np.mean(abs(x-y)))
print(np.mean(abs(x-z)))
print(np.mean(abs(x-aa)))
print(np.mean(abs(x-bb)))

0.14101681
0.15557656
0.15521586
0.17296939


## Comparing product recommendation results to market basket analysis

In [56]:
# See if one of the recommended items for Banana is Organic Fuji Apple, as that was identified as a strong relationship.
prior_dict[24852]

['Banana']

In [57]:
#Look at the ten top most similar items to bananas
sim_bananas = model_ns10_split.most_similar("24852")[:10]
sim_bananas

[('28204', 0.869930624961853),
 ('4920', 0.8174550533294678),
 ('45066', 0.8151119947433472),
 ('46385', 0.8117666840553284),
 ('47144', 0.8072843551635742),
 ('6662', 0.8040908575057983),
 ('21956', 0.7975308299064636),
 ('40221', 0.797337532043457),
 ('9818', 0.7961570024490356),
 ('39651', 0.7952178716659546)]

In [58]:
# Look up what the similar products are.
for row in sim_bananas:
    print(prior_dict[int(row[0])])

['Organic Fuji Apple']
['Seedless Red Grapes']
['Honeycrisp Apple']
['Apple Wedges']
['Unsweetened Original Almond Breeze Almond Milk']
['Sliced Red Beets']
['Yogurt, Greek, Nonfat, Strained, Blended with Strawberry']
['Instant Rice, Enriched Long Grain Rice']
['Cld/Flu Van Chrry']
['Honey Lemon Flavor Oral Mist Cold Remedy']


## Comparison of recommendations of toilet paper

In [59]:
prior_dict[31353]

['Toilet Tissue Rolls']

In [60]:
#Look at the ten top most similar items to bananas
sim_bananas = model_ns10_split.most_similar("31353")[:5]
sim_bananas

[('42705', 0.9007684588432312),
 ('4904', 0.8948682546615601),
 ('49162', 0.8912460803985596),
 ('10579', 0.8908408284187317),
 ('12775', 0.8886433839797974)]

In [61]:
# Look up what the similar products are.
for row in sim_bananas:
    print(prior_dict[int(row[0])])

['Proactive Health Sensitive Stomach Cat Food']
['Whisk Broom with Dust Pan']
['Gold Temptation Refreshing Shower Gel']
['Himalayan Pink Salt Liquid Hand Soap']
['Berries Strawberry']
