### Imports and global variables

In [None]:
import os
import csv
from datetime import datetime
import string
import json
import random
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
import redis  # to communicate with redis
import gensim # to talk to gensim
from IPython.display import Image  # to display URL in noteboook for visual debug
from IPython.core.display import display # to display URL in noteboook for visual debug
from elasticsearch import Elasticsearch, helpers # remember to !pip install elasticsearch

_Input file variables_

In [None]:
# Locations of the three input files read by this notebook; all of them are resolved against DATA_FOLDER.
DATA_FOLDER = ''  # local folder here (left empty, the file names below are relative to the working directory)
# product catalogue as CSV: it is parsed with csv.DictReader and must provide at least the `sku` and `image` columns
CATALOGUE_FILE = os.path.join(DATA_FOLDER, 'catalog.csv')
TEXT_FILE = os.path.join(DATA_FOLDER, 'corpus.txt')  # texts from 1BN words dataset, read line by line (one sentence per line)
SESSION_FILE = os.path.join(DATA_FOLDER, 'sessions.txt') # file with session data (pre-filtered for length and pre-formatted): one tab-separated session per line

_Query variables_

In [None]:
# LANGUAGE is used as the "analyzer" of the text fields in the ES index mapping
LANGUAGE = ''  # put here the ES compatible language string (depending on the language of your catalog/search queries)
QUERY1 = '' # put here the first query to test
QUERY2 = '' # put here the second query to test
TOP_N = 50 # top N results to re-rank (passed as the number of hits requested from the search engine)

_Model variables_

In [None]:
# shared by the Word2Vec training (`size`) and by the ES `dense_vector` mapping (`dims`)
EMBEDDING_DIMS = 50 # specify embedding size

_Product variables_

In [None]:
# the vectors of these products are averaged into the session vector used for re-ranking
PRODUCTS_IN_SESSION = [''] # list of product ID the user visited in the present session
TEST_PRODUCT = ''  # fill here with the product ID you want to test for similarities
# fill here with your product IDs to test for analogies
# the analogy solved below is PRODUCT1 : PRODUCT1_MATCH = PRODUCT2 : ?
PRODUCT1 = ''
PRODUCT1_MATCH = ''
PRODUCT2 = ''

### Python clients for Redis and ES

In [None]:
# redis credentials here!
REDIS_HOST = 'localhost'
REDIS_PORT = 6379
REDIS_DB = 0
REDIS_PWD = None
# redis data structure
# name of the single Redis hash holding all products: field = SKU, value = the product serialized as JSON
REDIS_HASH_FORMAT = 'product_h'
# start redis client
# module-level client shared by the cache upload and retrieval functions below
redis_client = redis.StrictRedis(host=REDIS_HOST, 
                                 port=REDIS_PORT, 
                                 db=REDIS_DB, 
                                 password=REDIS_PWD)

In [None]:
# name of the search index that upload_docs_to_es deletes and re-creates
INDEX_NAME = 'catalog'
# connection settings for the Elasticsearch node
ES_HOST = {"host": "localhost", "port": 9200}
# module-level client shared by the indexing and query functions below
es_client = Elasticsearch(hosts=[ES_HOST])

### Products

_First of all, get products from the catalogue dump into a usable form_

In [None]:
def get_products_from_catalogue(catalog_file):
    """
    Parse the catalogue CSV into a map SKU -> row.

    Each value is the row dictionary produced by csv.DictReader, so it carries
    every column of the file. Only rows with a non-empty `sku` and an `image`
    value ending in '.jpg' are kept; if a SKU appears more than once, the last
    row wins.

    :param catalog_file: path of the CSV file (its header must include `sku` and `image`)
    :return: dictionary SKU -> row dictionary
    :raises KeyError: if a row with a non-empty sku is checked and the file has no `image` column,
        or if the file has no `sku` column
    """
    products = {}
    # newline='' is the mode the csv module asks for: line breaks embedded in
    # quoted fields reach the parser untouched
    with open(catalog_file, newline='') as csvfile:
        for row in csv.DictReader(csvfile):
            # DictReader fills the missing trailing columns of a short row with None:
            # treat a missing image like an empty one instead of crashing on it
            if row['sku'] and (row['image'] or '').endswith('.jpg'):
                products[row['sku']] = row

    return products

In [None]:
# parse the catalogue once: `products` (SKU -> row) is reused and enriched by the cells below
products = get_products_from_catalogue(CATALOGUE_FILE)
print('{} products in catalog!'.format(len(products)))

### Embeddings

#### First, word embeddings, where it all started

In [None]:
def train_embedding_model(training_data):
    """
    Fit a gensim Word2Vec model on `training_data` and return its vectors only.

    training_data is a list of lists: each inner list is one sequence of tokens
    (the words of a sentence, the products of a session, etc.).

    The returned object is the `wv` attribute of the trained model, which the
    rest of the notebook uses as a TOKEN -> VECTOR lookup.
    """
    # all training hyper-parameters in one place
    word2vec_params = {
        'min_count': 10,
        'size': EMBEDDING_DIMS,
        'workers': 4,
        'window': 3,
        'iter': 20,
    }
    trained_model = gensim.models.Word2Vec(training_data, **word2vec_params)
    # keep only the vectors and drop the local reference to the full model
    token_vectors = trained_model.wv
    del trained_model

    return token_vectors

In [None]:
def get_sentences_from_corpus(corpus_file, max_sentences=None):
    """
        Load a text corpus as a list of tokenized sentences, one per line of the file.

        Pre-processing is deliberately minimal: ASCII punctuation is removed, the
        line is lower-cased and split on whitespace. Lines left without tokens
        are skipped.

        corpus_file: path of the text file
        max_sentences: when truthy, reading stops as soon as that many sentences
        have been collected

        Returns a list of lists of tokens.
    """
    # translation table deleting every character in string.punctuation
    punctuation_remover = str.maketrans('', '', string.punctuation)
    tokenized = []
    with open(corpus_file) as c_file:
        for raw_line in c_file:
            tokens = raw_line.translate(punctuation_remover).strip().lower().split()
            if tokens:
                tokenized.append(tokens)
                # stop early once the requested number of sentences is reached
                if max_sentences and len(tokenized) == max_sentences:
                    break

    return tokenized

In [None]:
# read at most 2,000,000 sentences from the corpus, show the first one as a sanity check,
# then train the word embeddings on them
training_sentences_data = get_sentences_from_corpus(TEXT_FILE, max_sentences=2000000)
print('Total sentences: {}, first is: {}'.format(len(training_sentences_data), training_sentences_data[0]))
word_embeddings = train_embedding_model(training_sentences_data)

_Playing with similarities and analogies here_

In [None]:
# print the nearest neighbours of a few words
# (explicit loop variable: binding `_` at the top level of a cell shadows IPython's last-output variable)
for word in ['paris', 'france']:
    print('###{}\n{}\n'.format(word, word_embeddings.most_similar_cosmul(positive=[word])))

In [None]:
def solve_vector_analogy(vectors, man, king, women):
    """
    Solve the analogy MAN : KING = WOMAN : ? (-> QUEEN in the classic example).

    `king` and `women` are passed as the positive terms and `man` as the negative
    term of `vectors.most_similar_cosmul`, whose result is returned as is.
    """
    added_terms = [king, women]
    removed_terms = [man]
    return vectors.most_similar_cosmul(positive=added_terms, negative=removed_terms)

In [None]:
# each label spells out the analogy actually being solved, i.e. it mirrors the three arguments of the call
print("BOY : KING = GIRL : {}\n".format(solve_vector_analogy(word_embeddings, 'boy', 'king', 'girl')[0][0]))
print("PARIS : FRANCE = BERLIN : {}\n".format(solve_vector_analogy(word_embeddings, 'paris', 'france', 'berlin')[0][0]))

#### Now, one more time, with product data this time

In [None]:
def get_products_from_sessions(session_file):
    """
        Read the session dump produced by the analytics service and return it
        as a list of lists: one list of products per line of the file.

        Every line is:

        LINE_ID (as INT) TAB PRODUCT 1 TAB PRODUCT 2 ...

        The leading line id is dropped, the products are kept in order.

        P.s.: our file has been pre-processed to include only session with length >= 3 and < 200
    """
    with open(session_file) as session_f:
        # first column is the line id, everything after it is the session
        return [line.strip().split('\t')[1:] for line in session_f]

In [None]:
# same recipe as for words: every session is a "sentence" whose "words" are product IDs
training_session_data = get_products_from_sessions(SESSION_FILE)
print('Total sessions: {}, first is: {}'.format(len(training_session_data), training_session_data[0]))
product_embeddings = train_embedding_model(training_session_data)

_Check item-item similarity by looking at product vectors close together in the space_

In [None]:
matches = product_embeddings.most_similar_cosmul(positive=[TEST_PRODUCT])
# display the top 3 matches we have an image for: the embeddings are trained on sessions,
# so a match can be a product that is not in the parsed catalogue
for m in [match for match in matches if match[0] in products][:3]:
    display(Image(products[m[0]]['image'], width=150, unconfined=True))

_Playing with some analogies here_

In [None]:
# sanity check: the analogy below needs all three products to be in the embedding vocabulary
assert all(_ in product_embeddings.vocab for _ in [PRODUCT1, PRODUCT1_MATCH, PRODUCT2])

In [None]:
matches = solve_vector_analogy(product_embeddings, PRODUCT1, PRODUCT1_MATCH, PRODUCT2)
# first show products
# (explicit loop variable: binding `_` at the top level of a cell shadows IPython's last-output variable)
for sku in [PRODUCT1, PRODUCT1_MATCH, PRODUCT2]:
    display(Image(products[sku]['image'], width=100, unconfined=True))
# then display the best match, provided it is in the catalogue
for m in matches[:1]:
    if m[0] in products:
        display(Image(products[m[0]]['image'], width=100, unconfined=True))

_Finally, we add the vectors to our product dictionary_

In [None]:
# add vector to products
for sku, p in products.items():
    p['vector'] = product_embeddings[p['sku']].tolist() if p['sku'] in product_embeddings else None
    p['popularity'] = random.randint(0, 100)  # add a popularity field to fake popularity data for later retrieval
# remove products without vectors for simplicity
products = {k: v for k,v in products.items() if v['vector'] is not None}
len(products)

### Personalizing search

In [None]:
def re_rank_results(session_vector, skus):
    """
    Re-order `skus` by decreasing cosine similarity between the vector of each
    product and `session_vector`.

    Product vectors are read from the Redis hash REDIS_HASH_FORMAT. SKUs that
    are not in the cache cannot be scored: they keep their relative order and
    are placed after the scored ones, so no result is lost and every score
    stays paired with the SKU it belongs to.

    :param session_vector: numpy array, reshaped to a single row before the comparison
    :param skus: list of SKUs in their original ranking
    :return: list of SKUs, most similar to the session first
    """
    skus = list(skus)
    if not skus:
        return []
    # HMGET answers with one entry per requested field, empty for missing ones:
    # zip it with the skus before filtering, so that vectors and skus stay aligned
    cached_rows = redis_client.hmget(REDIS_HASH_FORMAT, skus)
    scored_skus, vectors, unscored_skus = [], [], []
    for sku, cached_row in zip(skus, cached_rows):
        if cached_row:
            scored_skus.append(sku)
            vectors.append(np.array(json.loads(cached_row)["vector"]))
        else:
            unscored_skus.append(sku)
    if not vectors:
        return unscored_skus
    similarities = cosine_similarity(session_vector.reshape(1, -1), vectors)[0]
    # stable sort on the negated scores: best match first, ties keep the original ranking
    best_first = np.argsort(-similarities, kind='stable')
    return [scored_skus[i] for i in best_first] + unscored_skus

#### Load data into ES

In [None]:
def upload_docs_to_es(index_name, docs):
    """
    (Re)create the index `index_name` and bulk upload `docs` into it.

    index_name is a string 
    docs is a map doc id -> doc as a Python dictionary (in our case SKU -> product)

    Returns the result of helpers.bulk.
    """
    # first we delete an index with the same name if any 
    # ATTENTION: IF YOU USE THIS CODE IN THE REAL WORLD THIS LINE WILL DELETE THE INDEX
    # (the index is passed by keyword: this works on every client version, while
    # recent versions of the Python client reject positional arguments)
    if es_client.indices.exists(index=index_name):
        print("Deleting {}".format(index_name))
        es_client.indices.delete(index=index_name)
    # next we define our index
    body = {
        'settings': {
            "number_of_shards" : 1,
            "number_of_replicas" : 0
        },
        "mappings": {
          "properties": {
                "name": { "type": "text", "analyzer": LANGUAGE },
                "target": { "type": "text", "analyzer": LANGUAGE },
                "image": { "type": "text", "analyzer": LANGUAGE } ,
                "vector": {
                      "type": "dense_vector",
                      "dims": EMBEDDING_DIMS
                    }
            }
        }
    }
    # create index
    es_client.indices.create(index=index_name, body=body)
    # finally, we bulk upload the documents: one action per document, using the SKU as document id
    actions = ({
                   "_index": index_name,
                   "_id": sku,
                   "_source": doc
               } for sku, doc in docs.items())

    return helpers.bulk(es_client, actions)

def query_with_es(index_name, search_query, n=5):
    """
    Search `index_name` for products whose name matches `search_query`.

    The match query is wrapped in a `script_score` query whose script is
    `doc['popularity'].value / 10`. At most `n` hits are requested.

    Prints the total number of hits and the number of hits returned, then
    returns a list of tuples (sku, image url), one per hit, in the order
    given by the search engine.
    """
    # full-text match on the product name
    name_match = {"match": {"name": {"query": search_query}}}
    popularity_script = {"source": "doc['popularity'].value / 10"}
    request_body = {
        "from": 0,
        "size": n,
        "query": {
            "script_score": {
                "query": name_match,
                "script": popularity_script
            }
        }
    }
    response = es_client.search(index=index_name, body=request_body)
    hits = response['hits']['hits']
    print("Total hits: {}, returned {}\n".format(response['hits']['total']['value'], len(hits)))
    return [(hit["_source"]['sku'], hit["_source"]['image']) for hit in hits]

def query_and_display_results_with_es(index_name, search_query, n=5):
    """
    Query ES for up to `n` results and hand them to display_image, which is
    called with its own default limit on the number of images shown.
    """
    return display_image(query_with_es(index_name, search_query, n=n))

def display_image(skus, n=5):
    """
    Print and render the first `n` entries of `skus`.

    skus is a list of tuples (sku, image url); for each of the first `n` the
    function prints "sku - url" and displays the image in the notebook.
    """
    for pair in skus[:n]:
        sku, image_url = pair
        print('{} - {}\n'.format(sku, image_url))
        display(Image(image_url, width=150, unconfined=True))
            
def query_and_rerank_and_display_results_with_es(index_name, search_query, n, session_vector):
    """
    Query ES for up to `n` results, re-rank them against `session_vector`
    with re_rank_results and display the re-ranked results with display_image.
    """
    hits = query_with_es(index_name, search_query, n=n)
    original_order = [hit[0] for hit in hits]
    reordered = []
    for sku in re_rank_results(session_vector, original_order):
        # recover the image url from the hit this sku came from
        position = original_order.index(sku)
        reordered.append((sku, hits[position][1]))

    return display_image(reordered)

In [None]:
# (re)create the index and upload the products; the last line shows what the bulk helper returned
upload_result = upload_docs_to_es(INDEX_NAME, products)
upload_result

In [None]:
# refresh the index after the bulk upload, then read one product back as a sanity check
# (the index is passed by keyword: recent versions of the Python client reject positional arguments)
es_client.indices.refresh(index=INDEX_NAME)
resp = es_client.get(index=INDEX_NAME, id=PRODUCT1)
print(resp)

#### Load data into Coveo

In [None]:
def upload_docs_to_coveo(index_name, docs):
    """
    Placeholder for the Coveo counterpart of upload_docs_to_es.

    Not implemented yet: nothing is uploaded and None is returned.
    """
    # TODO: upload data to coveo
    return

def query_with_coveo(index_name, search_query, n=5):
    """
    Placeholder for the Coveo counterpart of query_with_es.

    Once implemented it should return a list of tuples (sku, image_url),
    i.e. [(sku1, image1), (sku2, image2), ...]; for now it returns None.
    """
    # TODO: query coveo and return a list of tuple (sku, image_url)
    return None

def query_and_display_results_with_coveo(index_name, search_query, n=5):
    """
    Query Coveo for up to `n` results and hand them to display_image, which is
    called with its own default limit on the number of images shown.
    """
    return display_image(query_with_coveo(index_name, search_query, n=n))
       
def query_and_rerank_and_display_results_with_coveo(index_name, search_query, n, session_vector):
    """
    Query Coveo for up to `n` results, re-rank them against `session_vector`
    with re_rank_results and display the re-ranked results with display_image.

    Mirrors query_and_rerank_and_display_results_with_es: query_with_coveo
    returns tuples (sku, image url), while re_rank_results works on SKUs only,
    so the SKUs are extracted first and the image urls are paired back afterwards.
    """
    res = query_with_coveo(index_name, search_query, n=n)
    skus = [r[0] for r in res]
    re_ranked_sku = re_rank_results(session_vector, skus)

    return display_image([(sku, res[skus.index(sku)][1]) for sku in re_ranked_sku])

In [None]:
# uncomment here if you want to upload data to coveo as well
# upload_result = upload_docs_to_coveo(INDEX_NAME, products)

#### Load data into Redis, to simulate a real-time use case

In [None]:
def redis_upload(redis_client, rows):
    """
    Write `rows` into the Redis hash REDIS_HASH_FORMAT using one pipeline.

    Each row is a dictionary with a 'sku' key: the SKU is the hash field and
    the whole row, serialized as JSON, is the value. Returns None.
    """
    with redis_client.pipeline() as pipeline:
        for product in rows:
            pipeline.hset(REDIS_HASH_FORMAT, product['sku'], json.dumps(product))
        # send all the queued commands
        pipeline.execute()

def load_vectors_to_cache(products, batch_size):
    """
    Flush Redis and upload all `products` (map SKU -> product) to the cache,
    `batch_size` products at a time.

    Prints a progress line per batch and, at the end, the cached value of the
    first product as a quick check. Returns the total number of rows uploaded.
    """
    # ATTENTION: IF YOU USE THIS CODE IN THE REAL WORLD THIS LINE WILL DELETE ALL DATA
    redis_client.flushall()
    rows = list(products.values())
    total_rows = len(rows)
    # one pipeline round-trip per batch
    for batch_start in range(0, total_rows, batch_size):
        print("Uploading {} rows {} at {}...".format(total_rows, batch_start, datetime.utcnow()))
        batch_end = batch_start + batch_size
        redis_upload(redis_client, rows[batch_start:batch_end])
    # read back the first product to check the upload
    first_sku = [row['sku'] for row in rows[:1]]
    print(redis_client.hmget(REDIS_HASH_FORMAT, first_sku))

    return total_rows

In [None]:
# flush Redis and upload the products in batches of 2000; the cell output is the number of rows uploaded
load_vectors_to_cache(products, batch_size=2000)

#### Query ES

_First, we query ES for a "vanilla" search_

In [None]:
# non-personalized baseline for QUERY1: results are shown in the order returned by the search engine
query_and_display_results_with_es(INDEX_NAME, QUERY1, TOP_N)

In [None]:
# uncomment here if you like to use Coveo index instead
# query_and_display_results_with_coveo(INDEX_NAME, QUERY1, n=TOP_N)

_Now, we retrieve from Redis the vectors for products in the session_

In [None]:
def build_average_vector(vectors, v_shape):
    """
    Return the element-wise mean of `vectors`.

    not exactly fancy, but... 
    see for example https://districtdatalabs.silvrback.com/modern-methods-for-sentiment-analysis

    :param vectors: iterable of numpy arrays, all of shape `v_shape`
    :param v_shape: shape of the vectors, as a tuple (any number of dimensions)
    :return: numpy array of shape `v_shape`
    :raises ValueError: if `vectors` is empty (the mean would be undefined)
    :raises AssertionError: if a vector does not have shape `v_shape`
    """
    # materialize the input, so that iterators and generators can be measured with len()
    vectors = list(vectors)
    if not vectors:
        raise ValueError('cannot build the average of an empty collection of vectors')
    # allocate the accumulator directly with the target shape, whatever its rank
    category_vec = np.zeros(v_shape)
    for v in vectors:
        assert v.shape == category_vec.shape
        category_vec += v

    return category_vec / len(vectors)

In [None]:
def retrieve_vectors_from_redis(skus):
    """
    Read the products `skus` from the Redis hash REDIS_HASH_FORMAT and return
    their vectors as a list of numpy arrays.

    SKUs with no value in the cache are skipped, so the result can be shorter
    than `skus`.
    """
    vectors = []
    for cached_row in redis_client.hmget(REDIS_HASH_FORMAT, skus):
        if not cached_row:
            continue
        # the cached value is the product serialized as JSON
        vectors.append(np.array(json.loads(cached_row)["vector"]))

    return vectors

def retrieve_session_vector_from_redis(products_in_session):
    """
    Build the session vector: the average of the cached vectors of the
    products in `products_in_session`.
    """
    vectors = retrieve_vectors_from_redis(products_in_session)
    # the first vector gives the shape used for the average
    reference_shape = vectors[0].shape

    return build_average_vector(vectors, reference_shape)

In [None]:
# average of the cached vectors of the products visited in the session
session_vector = retrieve_session_vector_from_redis(PRODUCTS_IN_SESSION)
# debug
# length of the session vector and its first 10 components
print(len(session_vector), session_vector[:10])

_Finally, use the session vector to re-rank the query results_

In [None]:
# personalized version of QUERY1: same TOP_N results, re-ranked against the session vector
query_and_rerank_and_display_results_with_es(INDEX_NAME, QUERY1, TOP_N, session_vector)
# uncomment here if you like to use Coveo index instead
# query_and_rerank_and_display_results_with_coveo(INDEX_NAME, QUERY1, TOP_N, session_vector)

_Try another query_

In [None]:
# vanilla query
# (QUERY2, results in the order returned by the search engine)
query_and_display_results_with_es(INDEX_NAME, QUERY2, TOP_N)
# now personalized
# (same query, results re-ranked against the session vector)
query_and_rerank_and_display_results_with_es(INDEX_NAME, QUERY2, TOP_N, session_vector)

### Appendix: how to visualize vectors and impress friends

In [None]:
def visualize_word_embeddings_tsne(word_embeddings):
    """
    Project a selection of word vectors to 2 dimensions with t-SNE and show
    them in a scatter plot, one color per group of words.

    The selection is made of a few hand-picked groups (pronouns, cities,
    countries, food, names) plus 100 words drawn at random from the vocabulary.
    Hand-picked words that are not in the vocabulary are left out of the plot.

    :param word_embeddings: trained vectors, exposing `vocab` and lookup by word
    :return: None
    """
    # colors
    colors = ['red', 'green', 'blue', 'purple', 'yellow', 'black']
    # build the vocabulary list once: it is needed for the random draws and for the lookups below
    vocabulary = list(word_embeddings.vocab)
    interesting_word_groups = [
        (['he', 'she', 'it', 'they', 'i', 'you', 'we'], 'pronouns'),
        (['london', 'paris', 'berlin', 'budapest', 'amsterdam', 'prague', 'rome'], 'cities'),
        (['italy', 'germany', 'spain', 'romania', 'finland', 'poland', 'norway', 'sweden', 'austria', 'brazil'], 'countries'),
        (['pasta', 'pizza', 'steak', 'pie', 'fries', 'burger', 'salmon'], 'food'),
        (['john', 'mark', 'jane', 'jessica', 'donald', 'simon'], 'names'),
        ([random.choice(vocabulary) for _ in range(0, 100)], 'other')
    ]
    # the hand-picked words are not guaranteed to be in the vocabulary:
    # keep only the known ones, otherwise the lookup of their projection would fail
    known_words = set(vocabulary)
    word_groups = [([w for w in words if w in known_words], group)
                   for words, group in interesting_word_groups]
    selected_words = {w for words, _ in word_groups for w in words}
    all_keys = [w for w in vocabulary if w in selected_words]
    all_vectors = np.array([word_embeddings[e] for e in all_keys])
    # get projection
    X_embedded = TSNE(n_components=2).fit_transform(all_vectors)
    word_2_emb = {k: e for k, e in zip(all_keys, X_embedded)}
    # divide groups
    data = []
    groups = []
    for words, group in word_groups:
        groups.append(group)
        data.append([word_2_emb[w] for w in words])
    print(groups, data[0])
    # create plot
    fig = plt.figure(figsize=(8, 6))
    ax = fig.add_subplot(1, 1, 1)
    # add groups
    for d, color, group in zip(data, colors, groups):
        x = [point[0] for point in d]
        y = [point[1] for point in d]
        ax.scatter(x, y, alpha=0.8, c=color, edgecolors='none', s=30, label=group)
    # show plot
    plt.title('Plot color-coded embeddings')
    plt.legend(loc=2)
    plt.show()

    return

# plot the word embeddings trained at the beginning of the notebook
visualize_word_embeddings_tsne(word_embeddings)

In [None]:
def export_vectors_for_projector_visualization(product_2_vectors,
                                               product_2_label,
                                               target_folder):
    """
    Export vectors (and optionally their labels) as TSV files in `target_folder`.

    'vectors.tsv' gets one line per product, with the components of its vector
    formatted with 5 decimals and separated by tabs. When `product_2_label` is
    given, only the products it contains are exported and a second file,
    'meta.tsv', is written with a header and one "sku TAB label" line per
    product, in the same order as 'vectors.tsv'.

    :param product_2_vectors: trained vectors, exposing `vocab` and lookup by product
    :param product_2_label: map SKU -> label, or a falsy value to export all products without labels
    :param target_folder: existing folder where the files are written
    :return: None
    """
    # map dictionary to list to preserve order when exporting
    all_p = [p for p in list(product_2_vectors.vocab) if (not product_2_label or p in product_2_label)]
    # write vectors
    with open(os.path.join(target_folder, 'vectors.tsv'), 'w') as v_f:
        for p in all_p:
            v_f.write('{}\n'.format('\t'.join('{:.5f}'.format(x) for x in product_2_vectors[p])))
    # if available, labels can be paired with SKUs for visualization purposes
    # if a mapping is specified, we produce a "meta" file, otherwise we just return
    if not product_2_label:
        return
    # write meta if mapping is available
    # (the mode 'w' is an argument of open(), not a component of the path)
    with open(os.path.join(target_folder, 'meta.tsv'), 'w') as m_f:
        # header
        m_f.write('sku\tlabel\n')
        for sku in all_p:
            m_f.write('{}\t{}\n'.format(sku, product_2_label[sku]))

    return