# Libraries

In [1]:
import gensim
import numpy as np
import pandas as pd
import weaviate
from ranx import Qrels, Run, evaluate


# Utility Functions

In [61]:
def build_training_corpus(train_df, itemid_docid_lookup):
    """Builds a list of TaggedDocument for training doc2vec model in gensim"""
    # lookup the doc id for the item purchased
    x = train_df.reset_index().set_index("item_purchase")\
        .join(itemid_docid_lookup.set_index("item_id"))\
        .set_index("session_id")

    # consolidate all item views into a single list by doc id
    x = x.groupby(["doc_id"])\
        .agg(lambda x: np.concatenate(x).tolist())\
        .rename(columns={"item_views": "tokens"})

    def f(x):
        return gensim.models.doc2vec.TaggedDocument(x.values[0], [x.index.values[0]])

    x = x.groupby("doc_id")\
        .agg({"tokens": f})\
        .rename(columns={"tokens": "tagged_doc"})

    return x[:]["tagged_doc"].tolist()


def compute_mrr(df):
    # this is the query id
    assert df.index.name == "session_id"

    # this is the ground truth i.e. item purchased
    assert "item_purchase" in df.columns

    # this is the items the model predicted will be bought given the session
    assert "predicted_purchase" in df.columns

    qrels_dict = {}
    run_dict = {}

    for query, correct_result in zip(df.index, df["item_purchase"]):
        qrels_dict[query] = {correct_result: 1}

    for query, proposed_results in zip(df.index, df["predicted_purchase"]):
        run_dict[query] = {item_id: rank for rank,
                           item_id in enumerate(proposed_results, 1)}

    qrels = Qrels(qrels_dict)
    run = Run(run_dict)

    return evaluate(qrels, run, "mrr")


def make_vectorizer_fn(model):
    def f(query):
        # query is a list of item views
        v = model.infer_vector(query)
        return v
    return f


def make_weaviate_inference_fn(client, topn=100):
    def f(query_vector):
        results = client.query.get("Item", ["itemid"])\
            .with_limit(topn)\
            .with_near_vector({
                "vector": query_vector
            })\
            .do()

        items = results["data"]["Get"]["Item"]
        return [item["itemid"] for item in items]

    return f


def make_gensim_inference_fn(model, itemid_docid_lookup, topn=100, ):
    vectorizer_fn = make_vectorizer_fn(model)

    def f(item_views):

        v = vectorizer_fn(item_views)

        predictions = model.dv.most_similar([v], topn=topn)

        predicted_docids = [doc_id for (doc_id, _) in predictions]

        predicted_itemids = itemid_docid_lookup.query(
            "doc_id in @predicted_docids")["item_id"].tolist()

        return predicted_itemids

    return f


def eval_prediction(actual_purchased_itemds, predicted_itemids):
    if actual_purchased_itemds in predicted_itemids:
        rank = (predicted_itemids.index(actual_purchased_itemds))
    else:
        rank = None

    return rank


# Datasets

In [3]:
train_df = pd.read_parquet("train_df.parquet")
valid_df = pd.read_parquet("valid_df.parquet")
test_df = pd.read_parquet("test_df.parquet")

itemid_docid_lookup = pd.read_parquet("itemid_docid_lookup.parquet")


# Preprocessing

Ignore really long sessions:

In [4]:
max_session_length = 15

is_short_session = train_df["item_views"].apply(len) <= max_session_length

train_df = train_df[is_short_session]


# Corpus

In [5]:
train_corpus = build_training_corpus(train_df, itemid_docid_lookup)


# Train

In [6]:
model = gensim.models.doc2vec.Doc2Vec(
    vector_size=200, window=3, negative=25, sample=1e-3, min_count=20, epochs=80)
model.build_vocab(train_corpus)


In [7]:
model.train(train_corpus, total_examples=model.corpus_count,
            epochs=model.epochs)


# Upload

Upload the document vectors to weaviate. Since the data is so simple, we can use auto schema:

In [38]:
client = weaviate.Client("http://weaviate:8080")
client.schema.delete_all()


In [40]:
client.batch(
    batch_size=100,
    dynamic=True,
    num_workers=2,
)

with client.batch as batch:

    for docid in range(len(model.dv)):
        vector = model.dv[docid]
        itemid = itemid_docid_lookup.query(
            "doc_id == @docid")["item_id"].values[0]

        batch.add_data_object(
            data_object={
                "docid": docid,
                "itemid": itemid
            },
            class_name="Item",
            vector=vector
        )


Double check the object count is correct:

In [41]:
len(model.dv)


19020

In [43]:
!curl weaviate: 8080/v1/nodes


{"nodes":[{"gitHash":"37d3b17","name":"node1","shards":[{"class":"Item","name":"bjZQwoKvmmO1","objectCount":19020}],"stats":{"objectCount":19020,"shardCount":1},"status":"HEALTHY","version":"1.17.0"}]}


# Evaluation

Evaluating on the training set should give decent results:

In [62]:
n = 100

vectorizer_fn = make_vectorizer_fn(model)
weaviate_inference_fn = make_weaviate_inference_fn(client, topn=n)
gensim_inference_fn = make_gensim_inference_fn(
    model, itemid_docid_lookup, topn=n)


To save time, evaluate only on a subset of the training set:

In [None]:
sample_df = train_df.sample(100_000)


Evaluate using weaviate:

In [65]:
sample_df["predicted_purchase"] = sample_df["item_views"].apply(
    lambda x: weaviate_inference_fn(vectorizer_fn(x)))
compute_mrr(sample_df)


0.00023322698464674509

Evaluate using gensim:

In [54]:
sample_df["predicted_purchase"] = sample_df["item_views"].apply(
    gensim_inference_fn)
compute_mrr(sample_df)


0.000328890214163869

Results are terrible! 😭