# Word2Vec Basket Analysis on Instacart Dataset
### Using UMAP and TensorFlow Projector

In [1]:
import io
import random

import pandas as pd
from gensim.models import Word2Vec

# Display the installed pandas version so the run environment is recorded.
pd.__version__

'1.1.4'

### Prepare Data from _Instacart 2017_ dataset:
"The Instacart Online Grocery Shopping Dataset 2017"<br/>Accessed from https://www.instacart.com/datasets/grocery-shopping-2017 on March 23, 2020

In [2]:
# Read the two CSV files used below from the local dataset folder:
# the product table and the order/product rows of the "train" file.
insta_products = pd.read_csv("instacart_2017_05_01/products.csv")
insta_items = pd.read_csv("instacart_2017_05_01/order_products__train.csv")

### Denormalize and clean dataset. Use only the train ```eval_set```.

In [3]:
# Attach the product record to every order line, keep only the columns used
# downstream, and discard rows that contain missing values.
columns_to_keep = ["order_id", "product_id", "product_name"]
df = (
    insta_items.merge(insta_products, on="product_id", how="left")
    .loc[:, columns_to_keep]
    .dropna()
    # Convert the product key to str for Word2Vec.
    .assign(
        product_id=lambda frame: frame["product_id"].astype("int").astype("str")
    )
)

In [4]:
# Count the distinct orders and distinct products in the cleaned frame.
key_columns = ["order_id", "product_id"]
df.loc[:, key_columns].nunique()

order_id      131209
product_id     39123
dtype: int64

### Reduce the training dataset to a sample of 25_000 orders

In [5]:
# Seed the RNG so the same 25_000 order ids are drawn on every run.
random.seed(42)
all_order_ids = df["order_id"].unique().tolist()
sample_orders = random.sample(all_order_ids, 25_000)
len(sample_orders)

25000

### Generate a list of the ```product_id``` for all the sample orders

In [6]:
# Keep only the rows of the sampled orders, then collect each order's
# product ids into one list per order (one "sentence" per basket).
sampled_rows = df[df["order_id"].isin(sample_orders)]
basket_per_order = sampled_rows.groupby("order_id")["product_id"].apply(list)
purchases = basket_per_order.tolist()

largest_basket = max(map(len, purchases))
print("Max products in one order: ", largest_basket)

Max products in one order:  76


### Train the Word2Vec model


In [7]:
# Hyperparameters, collected in one place so they are easy to inspect/tune.
# Per the gensim docs, sg=0 selects CBOW and hs=0 with negative>0 selects
# negative sampling.
w2v_params = dict(
    window=50,
    size=200,
    sg=0,
    hs=0,
    negative=10,
    alpha=0.03,
    min_alpha=0.0007,
    seed=42,
    workers=4,
    min_count=5,
)
model = Word2Vec(**w2v_params)

# No corpus is passed to the constructor, so the vocabulary is built and the
# model is trained explicitly on the sampled baskets.
model.build_vocab(purchases)
model.train(
    purchases,
    total_examples=model.corpus_count,
    epochs=20,
    report_delay=1,
)

print(model)

Word2Vec(vocab=8230, size=200, alpha=0.03)


### Create a dictionary mapping ```product_id``` to ```product_name``` to be used for TensorFlow Projector

In [8]:
# Map each product_id to the first product_name recorded for it.
names_by_product = df.groupby("product_id")["product_name"]
products_dict = names_by_product.agg("first").to_dict()

### Create the ```vecs.tsv``` and ```meta.tsv``` files to be imported to https://projector.tensorflow.org/

In [9]:
# Export the trained embeddings: vecs.tsv gets one tab-separated vector per
# line, and meta.tsv gets the matching "<product_name>: <product_id>" label on
# the same line number.
# The `with` block closes both files even if a lookup or write fails midway.
with io.open("vecs.tsv", "w", encoding="utf-8") as out_vect, io.open(
    "meta.tsv", "w", encoding="utf-8"
) as out_meta:
    for product_id in model.wv.vocab:
        out_meta.write(products_dict[product_id] + ": " + str(product_id) + "\n")
        # Index the KeyedVectors (model.wv), not the model itself:
        # Word2Vec.__getitem__ is deprecated and is removed in gensim 4.
        vector = model.wv[product_id]
        out_vect.write("\t".join(str(value) for value in vector) + "\n")

  out_vect.write('\t'.join([str(j) for j in model[i]]) + "\n")
