### Import and global variables

In [None]:
import os
import csv
from datetime import datetime
import string
import json
import random
import numpy as np
import matplotlib.pyplot as plt # for viz purposes
from sklearn.manifold import TSNE  # for viz purposes
import redis  # to communicate with redis
import gensim # to talk to gensim
from sklearn.decomposition import PCA # run PCA
from IPython.display import Image  # to display URL in noteboook for visual debug
from IPython.core.display import display # to display URL in noteboook for visual debug
from elasticsearch import Elasticsearch, helpers # remember to !pip install elasticsearch

In [None]:
# Notebook-wide configuration: input files, embedding size and test queries.
CATALOGUE_FILE = './data/catalog.csv'  # put here your catalog file
SESSION_FILE = './data/sessions.txt' # file with session data (pre-filtered for length and pre-formatted)
TEXT_FILE = '/tmp/corpus.txt'  # texts from 1BN words dataset
EMBEDDING_DIMS = 50 # embedding dimensions stored in ES (product vectors are reduced to this size with PCA -> see below)
PRODUCTS_IN_SESSION = [] # list of product IDs the user visited in the present session (fill in before building the session vector)
LANGUAGE = 'english'  # put here the ES compatible language string (depending on the language of your catalog/search queries)
QUERY1 = 'shoes' # put here the first query to test
QUERY2 = 'pants' # put here the second query to test

### Python clients for Redis and ES

In [None]:
# redis credentials here!
REDIS_HOST = 'redis'  # put your redis host here
REDIS_PORT = 6379
REDIS_DB = 0
REDIS_PWD = ''  # password goes here
# redis data structure: name of the single hash holding SKU -> JSON-encoded product
REDIS_HASH_FORMAT = 'product_h'
# start redis client (shared by all the cache helpers below)
redis_client = redis.StrictRedis(host=REDIS_HOST, 
                                 port=REDIS_PORT, 
                                 db=REDIS_DB, 
                                 password=REDIS_PWD)

In [None]:
# name of the ES index that will hold the catalogue
INDEX_NAME = 'catalog'
ES_HOST = {"host": "elasticsearch", "port": 9200}  # change here if you're not using a local ES
# ES client shared by the upload and query helpers below
es_client = Elasticsearch(hosts=[ES_HOST])

### Products

_First of all, get products from the catalogue dump into a usable form_

In [None]:
def get_products_from_catalogue(catalog_file):
    """
    Parse the catalogue CSV into a map SKU -> row (as returned by csv.DictReader).

    Only rows with a non-empty 'sku' and an 'image' value ending in '.jpg' are kept;
    if the same SKU appears more than once, the last row wins.

    :param catalog_file: path to a CSV file whose header contains 'sku' and 'image'
    :return: dict mapping each SKU to its full row dictionary
    :raises KeyError: if the header has no 'sku' or 'image' column
    """
    products = {}
    # newline='' is the mode recommended by the csv module, so that newlines
    # embedded in quoted fields are handled by the parser and not by open()
    with open(catalog_file, newline='') as csvfile:
        for row in csv.DictReader(csvfile):
            # DictReader fills the columns missing from a short row with None:
            # treat a missing image like an empty one instead of crashing
            image = row['image'] or ''
            if row['sku'] and image.endswith('.jpg'):
                products[row['sku']] = row

    return products

In [None]:
# load the catalogue once; `products` (SKU -> row) is reused and enriched by the cells below
products = get_products_from_catalogue(CATALOGUE_FILE)
print('{} products in catalog!'.format(len(products)))

### Embeddings

#### First, word embeddings, where it all started

In [None]:
def train_embedding_model(training_data):
    """
    Train a Word2Vec model on token sequences and return only its vectors.

    training_data is a list of lists (list of words, products, etc.).
    The returned object is the model's `wv` attribute, i.e. a TOKEN -> VECTOR map;
    the full model is not kept around.

    NOTE(review): `size` and `iter` look like the gensim 3.x keyword names -
    confirm the installed gensim version before upgrading.
    """
    # standard params; min_count is capped by the corpus size so that
    # very small training sets do not end up with an empty vocabulary
    hyperparams = dict(min_count=min(len(training_data), 10),
                       size=100,
                       workers=4,
                       window=3,
                       iter=20)

    return gensim.models.Word2Vec(training_data, **hyperparams).wv


def run_pca(items, dims):
    """
    Project `items` onto their first `dims` principal components.

    Prints the cumulative explained variance ratio of the kept components
    and returns the transformed data as produced by PCA.fit_transform.
    """
    reducer = PCA(n_components=dims)
    projected = reducer.fit_transform(items)
    cumulative_variance = np.sum(reducer.explained_variance_ratio_)
    print('Cum. exp. var. for {} principal components: {}'.format(dims, cumulative_variance))

    return projected

In [None]:
def get_sentences_from_corpus(corpus_file, max_sentences=None):
    """
        Read the text file and process it as a list of lists, where each list is 
        the tokens in a sentence. Don't care too much about pre-processing,
        just get stuff done.

        Each line is one sentence: ASCII punctuation is removed, the text is
        lower-cased and split on whitespace. Lines left without tokens are skipped.

        :param corpus_file: path to a text file with one sentence per line
        :param max_sentences: stop after this many sentences (None or 0 = no limit)
        :return: list of token lists
    """
    # build the translation table once instead of once per line
    strip_punctuation = str.maketrans('', '', string.punctuation)
    sentences = []
    with open(corpus_file) as c_file:
        for line in c_file:
            # split() already strips the line and normalizes spaces
            tokens = line.translate(strip_punctuation).lower().split()
            if not tokens:
                continue
            sentences.append(tokens)
            # check if we reached a max number of sentences for training
            if max_sentences and len(sentences) == max_sentences:
                break

    return sentences

In [None]:
# read at most 2M sentences from the corpus, show the first one as a sanity check,
# then train the word embeddings on them
training_sentences_data = get_sentences_from_corpus(TEXT_FILE, max_sentences=2000000)
print('Total sentences: {}, first is: {}'.format(len(training_sentences_data), training_sentences_data[0]))
word_embeddings = train_embedding_model(training_sentences_data)

_Playing with similarities and analogies here_

In [None]:
# print the nearest neighbours of a couple of probe words
for _ in ['paris', 'france']:
    print('###{}\n{}\n'.format(_, word_embeddings.most_similar_cosmul(positive=[_])))

In [None]:
def solve_vector_analogy(vectors, man, king, women):
    """
    Solve the analogy MAN : KING = WOMAN : ? (-> QUEEN).

    `king` and `women` are passed as the positive terms and `man` as the negative term
    to `vectors.most_similar_cosmul`, whose result is returned unchanged.
    """
    attract = [king, women]
    repel = [man]

    return vectors.most_similar_cosmul(positive=attract, negative=repel)

In [None]:
# the label mirrors the terms actually passed to the solver ('boy', 'king', 'girl')
print("BOY : KING = GIRL : {}\n".format(solve_vector_analogy(word_embeddings, 'boy', 'king', 'girl')[0][0]))
print("PARIS : FRANCE = BERLIN : {}\n".format(solve_vector_analogy(word_embeddings, 'paris', 'france', 'berlin')[0][0]))

#### Now, one more time, with product data this time

In [None]:
def get_products_from_sessions(session_file):
    """
        Read the analytics dump and return one list of product IDs per line.

        Every line is:

        LINE_ID (as INT) TAB PRODUCT 1 TAB PRODUCT 2 ...

        The leading LINE_ID is dropped; what follows is the session, in file order.

        P.s.: our file has been processed to include only session with length >= 3 and < 200
    """
    with open(session_file) as session_f:
        # strip the line, split on tabs and discard the first field (the line id)
        return [row.strip().split('\t')[1:] for row in session_f]

In [None]:
# sessions play the role of sentences and product IDs the role of words:
# the same training routine used for word embeddings gives us product embeddings
training_session_data = get_products_from_sessions(SESSION_FILE)
print('Total sessions: {}, first is: {}'.format(len(training_session_data), training_session_data[0]))
product_embeddings = train_embedding_model(training_session_data)

_Check item-item similarity by looking at product vectors close together in the space_

In [None]:
TEST_PRODUCT = ''  # fill here with the product ID you want to test

matches = product_embeddings.most_similar_cosmul(positive=[TEST_PRODUCT])
# display top N
for m in matches[:3]:
    # neighbours come from the session data, so they may be missing from the
    # (image-filtered) catalogue: skip those instead of raising a KeyError
    if m[0] in products:
        display(Image(products[m[0]]['image'], width=150, unconfined=True))

_Playing with some analogies here_

In [None]:
# fill here with your product IDs to test analogies
PRODUCT1 = ''
PRODUCT1_MATCH = ''
PRODUCT2 = ''

assert all(_ in product_embeddings.vocab for _ in [PRODUCT1, PRODUCT1_MATCH, PRODUCT2])

In [None]:
# solve PRODUCT1 : PRODUCT1_MATCH = PRODUCT2 : ? in the product space
matches = solve_vector_analogy(product_embeddings, PRODUCT1, PRODUCT1_MATCH, PRODUCT2)
# first show products
for _ in [PRODUCT1, PRODUCT1_MATCH, PRODUCT2]:
    display(Image(products[_]['image'], width=100, unconfined=True))
# then display matches (only the best one, and only if it is in the catalogue)
for m in matches[:1]:
    if m[0] in products:
        display(Image(products[m[0]]['image'], width=100, unconfined=True))

_Finally, we apply PCA to reduce the vector size and then add the reduced vectors to our product dictionary_

In [None]:
# reduce every product embedding to EMBEDDING_DIMS dimensions with PCA
all_products = list(product_embeddings.vocab)
pca_vectors = run_pca([product_embeddings[p] for p in all_products], dims=EMBEDDING_DIMS)
# .tolist() turns every component into a plain Python float, which json.dumps
# (used below for the Redis cache) can always serialise; list(row) would keep
# NumPy scalars, and float32 ones are rejected by the json module
sku2vector = {sku: vector.tolist() for sku, vector in zip(all_products, pca_vectors)}
# add vector to products (None when the product never appears in the sessions)
for sku, p in products.items():
    p['vector'] = sku2vector.get(sku)
    p['popularity'] = random.randint(0, 100)  # add a popularity field to fake popularity data for later retrieval
# debug (prints None instead of failing if PRODUCT1 is not in the catalogue)
print(products.get(PRODUCT1, {}).get('vector'))
# remove products without vectors for simplicity
products = {k: v for k, v in products.items() if v['vector']}
len(products)

### Personalizing search

#### Load data into ES

In [None]:
def upload_docs_to_es(index_name, docs):
    """
    (Re)create an index and bulk upload the documents into it.

    index_name is a string 
    docs is a map doc id -> doc as a Python dictionary (in our case SKU -> product)

    Returns whatever helpers.bulk returns for the upload.
    """
    # first we delete an index with the same name if any 
    # ATTENTION: IF YOU USE THIS CODE IN THE REAL WORLD THIS LINE WILL DELETE THE INDEX
    # (index is passed by keyword, like in every other client call in this file)
    if es_client.indices.exists(index=index_name):
        print("Deleting {}".format(index_name))
        es_client.indices.delete(index=index_name)
    # next we define our index: text fields use the LANGUAGE analyzer, while the
    # product embedding is stored as a dense vector of EMBEDDING_DIMS dimensions;
    # fields not listed here (e.g. sku, popularity) are left to ES dynamic mapping
    body = {
        'settings': {
            "number_of_shards": 1,
            "number_of_replicas": 0
        },
        "mappings": {
            "properties": {
                "name": {"type": "text", "analyzer": LANGUAGE},
                "target": {"type": "text", "analyzer": LANGUAGE},
                "image": {"type": "text", "analyzer": LANGUAGE},
                "vector": {
                    "type": "dense_vector",
                    "dims": EMBEDDING_DIMS
                }
            }
        }
    }
    # create index
    es_client.indices.create(index=index_name, body=body)
    # finally, we bulk upload the documents, using the SKU as document id
    actions = [
        {"_index": index_name, "_id": sku, "_source": doc}
        for sku, doc in docs.items()
    ]

    return helpers.bulk(es_client, actions)

def query_and_display_results(index_name, search_query, docs, n=5):
    """
    Run `search_query` against `index_name`, print the total number of hits and,
    for the first `n` hits returned, print the SKU and display the product image
    when the SKU is a key of `docs`.
    """
    hits = es_client.search(index=index_name, body=search_query)['hits']
    print("Total hits: {}\n".format(hits['total']['value']))
    for hit in hits['hits'][:n]:
        sku = hit["_source"]['sku']
        print('{}\n'.format(sku))
        if sku not in docs:
            continue
        display(Image(docs[sku]['image'], width=150, unconfined=True))

In [None]:
# index the (vector-enriched) catalogue; the last expression shows the bulk result in the notebook
upload_result = upload_docs_to_es(INDEX_NAME, products)
upload_result

In [None]:
# refresh the index, then fetch one known document back as a sanity check
# (index is passed by keyword, like in every other client call in this file)
es_client.indices.refresh(index=INDEX_NAME)
resp = es_client.get(index=INDEX_NAME, id=PRODUCT1)
resp

#### Load data into our redis cache, to simulate a real-time use case

In [None]:
def redis_upload(redis_client, rows):
    """
    Store every row in the REDIS_HASH_FORMAT hash, keyed by its 'sku' and
    serialised as JSON. All the writes are queued on one pipeline and sent
    with a single execute() call. Returns None.
    """
    with redis_client.pipeline() as pipeline:
        for row in rows:
            pipeline.hset(REDIS_HASH_FORMAT, row['sku'], json.dumps(row))
        pipeline.execute()

def load_vectors_to_cache(products, batch_size):
    """
    Flush Redis and upload all the products (values of the SKU -> product map)
    in batches of `batch_size` rows. Prints a progress line per batch, reads back
    the first three rows as a smoke test and returns the number of rows uploaded.
    """
    # first we flush the cache
    # ATTENTION: IF YOU USE THIS CODE IN THE REAL WORLD THIS LINE WILL DELETE ALL DATA
    redis_client.flushall()
    # upload data in bulk with pipeline, one slice of rows at a time
    rows = list(products.values())
    total_rows = len(rows)
    for offset in range(0, total_rows, batch_size):
        print("Uploading {} rows {} at {}...".format(total_rows, offset, datetime.utcnow()))
        batch = rows[offset: offset + batch_size]
        redis_upload(redis_client, batch)
    # do some test: fetch the first rows back from the hash
    sample_skus = [r['sku'] for r in rows[:3]]
    print(redis_client.hmget(REDIS_HASH_FORMAT, sample_skus))

    return total_rows

In [None]:
# fill the Redis cache with the catalogue, 2000 products per pipeline round trip
load_vectors_to_cache(products, batch_size=2000)

#### Query ES

_First, we query ES for a "vanilla" search_

In [None]:
# "vanilla" search: match QUERY1 on the product name and score the matching
# documents by the (fake) popularity field only
# NOTE(review): both operands of the division look like integers, so the score
# is presumably truncated - confirm whether `/ 10.0` was intended
search_query = {
    "query" : {
        "script_score" : {
            "query": {
                    "match" : {
                        "name" : {
                            "query" : QUERY1
                        }
                    }
                },
            "script": {
              "source" : "doc['popularity'].value / 10"
            }
        }
     }
}
query_and_display_results(INDEX_NAME, search_query, products, n=5)

_Now, we retrieve from Redis the vectors for products in the session_

In [None]:
def build_average_vector(vectors, v_shape):
    """
    Return the element-wise average of `vectors`.

    not exactly fancy, but... 
    see for example https://districtdatalabs.silvrback.com/modern-methods-for-sentiment-analysis

    :param vectors: iterable of NumPy arrays, all of shape `v_shape`
    :param v_shape: shape of the vectors (any rank, e.g. (50,) or (2, 3))
    :return: float array of shape `v_shape`; all zeros when `vectors` is empty
    :raises AssertionError: if a vector does not have shape `v_shape`
    """
    # allocate with the full shape, so that vectors of any rank are supported
    category_vec = np.zeros(v_shape)
    count = 0
    for v in vectors:
        # a mismatching shape would otherwise be silently broadcast (or fail obscurely)
        assert v.shape == category_vec.shape
        category_vec += v
        count += 1
    # leave the zero vector untouched when there is nothing to average
    if count:
        category_vec /= count

    return category_vec

In [None]:
def retrieve_session_vector_from_redis(products_in_session):
    """
    Build the session vector as the average of the cached product vectors.

    :param products_in_session: non-empty list of product IDs (SKUs) seen in the session
    :return: NumPy array, average of the vectors found in the cache
    :raises ValueError: if the session is empty or none of its products is cached
    """
    if not products_in_session:
        raise ValueError('products_in_session is empty: add at least one product ID')
    cached_products = redis_client.hmget(REDIS_HASH_FORMAT, products_in_session)
    # hmget yields None for the fields that are not in the hash: skip them
    session_vectors = [np.array(json.loads(cached)["vector"])
                       for cached in cached_products
                       if cached is not None]
    if not session_vectors:
        raise ValueError('no product in session was found in the cache')

    return build_average_vector(session_vectors, session_vectors[0].shape)

In [None]:
# average the cached vectors of the products listed in PRODUCTS_IN_SESSION
session_vector = retrieve_session_vector_from_redis(PRODUCTS_IN_SESSION)
# debug
len(session_vector)

_(if you don't want to set up Redis, just use the in-memory map to retrieve the vectors - uncomment the last line of the cell below)_

In [None]:
def retrieve_session_vector_from_memory(products_in_session):
    """
    Build the session vector as the average of the product vectors held in the
    in-memory `products` map (alternative to the Redis-based retrieval).

    :param products_in_session: non-empty list of product IDs (SKUs) seen in the session
    :return: NumPy array, average of the vectors of the products found in `products`
    :raises ValueError: if no product in the session is in `products`
    """
    # skip the SKUs that are not in the catalogue instead of raising a KeyError
    session_vectors = [np.array(products[p]['vector'])
                       for p in products_in_session
                       if p in products]
    if not session_vectors:
        raise ValueError('no product in session was found in the catalogue')

    return build_average_vector(session_vectors, session_vectors[0].shape)

# session_vector = retrieve_session_vector_from_memory(PRODUCTS_IN_SESSION)

_Finally use the session vector to query_

In [None]:
# personalized search: same match on the product name as before, but the matching
# documents are scored by the cosine similarity between their vector and the
# session vector (+ 1.0 is added to the similarity in the script)
vector_query = {
    "query" : {
        "script_score" : {
            "query": {
                    "match" : {
                        "name" : {
                            "query" : QUERY1
                        }
                    }
                },
            "script": {
              "source": "cosineSimilarity(params.query_vector, doc['vector']) + 1.0",
              "params": {"query_vector": session_vector.tolist()}
            }
        }
     }
}
query_and_display_results(INDEX_NAME, vector_query, products, n=5)

_Try some other query_

In [None]:
# vanilla query: same popularity-based scoring as above, this time for QUERY2
search_query = {
    "query" : {
        "script_score" : {
            "query": {
                    "match" : {
                        "name" : {
                            "query" : QUERY2
                        }
                    }
                },
            "script": {
              "source" : "doc['popularity'].value / 10"
            }
        }
     }
}
query_and_display_results(INDEX_NAME, search_query, products, n=5)
# now personalized: QUERY2 matches re-scored by similarity with the session vector
vector_query = {
    "query" : {
        "script_score" : {
            "query": {
                    "match" : {
                        "name" : {
                            "query" : QUERY2
                        }
                    }
                },
            "script": {
              "source": "cosineSimilarity(params.query_vector, doc['vector']) + 1.0",
              "params": {"query_vector": session_vector.tolist()}
            }
        }
     }
}
query_and_display_results(INDEX_NAME, vector_query, products, n=5)

### Appendix: how to visualize vectors and impress friends

In [None]:
def visualize_word_embeddings_tsne(word_embeddings):
    """
    Plot a 2D t-SNE projection of a few hand-picked word groups, plus 100 words
    sampled at random from the vocabulary, using one color per group.

    Hand-picked words that are not in the vocabulary are left out of the plot.

    :param word_embeddings: trained vectors exposing `.vocab` and lookup by word
    :return: None (the plot is shown with matplotlib)
    """
    # colors (one per group, in the same order as the groups below)
    colors = ['red', 'green', 'blue', 'purple', 'yellow', 'black']
    # materialize the vocabulary once: it is needed for sampling and for filtering
    vocabulary = list(word_embeddings.vocab)
    interesting_word_groups = [
        (['he', 'she', 'it', 'they', 'i', 'you', 'we'], 'pronouns'),
        (['london', 'paris', 'berlin', 'budapest', 'amsterdam', 'prague', 'rome'], 'cities'),
        (['italy', 'germany', 'spain', 'romania', 'finland', 'poland', 'norway', 'sweden', 'austria', 'brazil'], 'countries'),
        (['pasta', 'pizza', 'steak', 'pie', 'fries', 'burger', 'salmon'], 'food'),
        (['john', 'mark', 'jane', 'jessica', 'donald', 'simon'], 'names'),
        ([random.choice(vocabulary) for _ in range(0, 100)], 'other')
    ]
    # a set gives constant-time membership tests while scanning the vocabulary
    wanted_words = {w for words, _ in interesting_word_groups for w in words}
    all_keys = [w for w in vocabulary if w in wanted_words]
    print(len(all_keys))
    all_vectors = np.array([word_embeddings[e] for e in all_keys])
    # get projection
    X_embedded = TSNE(n_components=2).fit_transform(all_vectors)
    word_2_emb = dict(zip(all_keys, X_embedded))
    print(len(all_vectors), X_embedded.shape)
    # divide groups
    data = []
    groups = []
    for words, group in interesting_word_groups:
        groups.append(group)
        # hand-picked words may be missing from the vocabulary: skip them
        # instead of failing with a KeyError
        data.append([word_2_emb[w] for w in words if w in word_2_emb])
    print(groups, data[0])
    # create plot
    fig = plt.figure(figsize=(8, 6))
    ax = fig.add_subplot(1, 1, 1)
    # add groups
    for d, color, group in zip(data, colors, groups):
        x = [_[0] for _ in d]
        y = [_[1] for _ in d]
        ax.scatter(x, y, alpha=0.8, c=color, edgecolors='none', s=30, label=group)
    # show plot
    plt.title('Plot color-coded embeddings')
    plt.legend(loc=2)
    plt.show()

    return

# plot the word embeddings trained at the beginning of the notebook
visualize_word_embeddings_tsne(word_embeddings)

In [None]:
def export_vectors_for_projector_visualization(product_2_vectors,
                                               product_2_label,
                                               target_folder):
    """
    Export vectors (and, optionally, labels) as TSV files in `target_folder`.

    'vectors.tsv' gets one line per product, with the vector components
    tab-separated and formatted with 5 decimals. If a label mapping is given,
    only the products with a label are exported and 'meta.tsv' (header
    'sku<TAB>label', then one line per product, in the same order as the
    vectors) is written as well.

    :param product_2_vectors: vectors exposing `.vocab` and lookup by product ID
    :param product_2_label: map SKU -> label, or None / empty to export every
                            vector without a meta file
    :param target_folder: existing folder where the files are written
    :return: None
    """
    # build the list of products once, to preserve order when exporting
    if product_2_label:
        all_p = [p for p in product_2_vectors.vocab if p in product_2_label]
    else:
        all_p = list(product_2_vectors.vocab)
    # write vectors
    with open(os.path.join(target_folder, 'vectors.tsv'), 'w') as v_f:
        for p in all_p:
            v_f.write('{}\n'.format('\t'.join('{:.5f}'.format(_) for _ in product_2_vectors[p])))
    # if available, labels can be paired with SKUs for visualization purposes
    # if a mapping is specified, we produce a "meta" file, otherwise we just return
    if not product_2_label:
        return
    # write meta if mapping is available
    with open(os.path.join(target_folder, 'meta.tsv'), 'w') as m_f:
        # header
        m_f.write('sku\tlabel\n')
        for sku in all_p:
            m_f.write('{}\t{}\n'.format(sku, product_2_label[sku]))

    return