### import modules

In [None]:
!pip install annoy  # Locality Sensitive Hashing https://github.com/spotify/annoy
!pip3 install tensorflow_text>=2.0.0

Collecting annoy
[?25l  Downloading https://files.pythonhosted.org/packages/a1/5b/1c22129f608b3f438713b91cd880dc681d747a860afe3e8e0af86e921942/annoy-1.17.0.tar.gz (646kB)
[K     |████████████████████████████████| 655kB 5.4MB/s 
[?25hBuilding wheels for collected packages: annoy
  Building wheel for annoy (setup.py) ... [?25l[?25hdone
  Created wheel for annoy: filename=annoy-1.17.0-cp36-cp36m-linux_x86_64.whl size=390360 sha256=256b6d8d74db10e434ed62e70735a02c4b9b6b407a9c84fd9028abe526e8257e
  Stored in directory: /root/.cache/pip/wheels/3a/c5/59/cce7e67b52c8e987389e53f917b6bb2a9d904a03246fadcb1e
Successfully built annoy
Installing collected packages: annoy
Successfully installed annoy-1.17.0


In [None]:
from annoy import AnnoyIndex
import gensim
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
import numpy as np
import pandas as pd
import re
from scipy import spatial
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import tensorflow_hub as hub
import tensorflow_text
from time import process_time

# Stop-word list (whitespace-separated tokens) passed to the vectorizers and used when
# tokenising Doc2Vec queries. Read through a context manager so the handle is closed, and
# decoded explicitly as UTF-8 like the data files.
with open('/content/drive/MyDrive/Colab Notebooks/russian.txt', encoding='utf-8') as stop_file:
    stops = stop_file.read().split()

# Data & Metrics
Every method is evaluated on its top-10 search results.

In [None]:
def open_data(size, data_dir='/content/drive/MyDrive/Colab Notebooks/data_format'):
    """Load one train/test split from ``<data_dir>/data_<size>.txt``.

    The file holds four sections separated by a line containing only ``&&&``:
    train labels, train texts, test labels, test texts. Each section has one
    entry per line.

    Args:
        size: value interpolated into the file name (``data_{size}.txt``).
        data_dir: directory containing the data files; defaults to the
            location used throughout this notebook.

    Returns:
        Tuple ``(y_train, x_train, y_test, x_test)`` of lists of strings.

    Raises:
        ValueError: if the file does not contain exactly four sections.
    """
    path = f'{data_dir}/data_{size}.txt'
    # Context manager so the file handle is released right after reading.
    with open(path, encoding='utf-8') as handle:
        sections = handle.read().split('\n&&&\n')
    y_train, x_train, y_test, x_test = (section.split('\n') for section in sections)
    return y_train, x_train, y_test, x_test

In [None]:
def ap(relev, k):    # average precision
    """Average precision of one ranked result list.

    Args:
        relev: relevance flags in rank order (1 = relevant, 0 = not relevant).
        k: exclusive upper bound on the rank that is inspected, i.e. ranks
            ``1 .. k-1`` are scored (``evaluation`` passes its cutoff + 1).

    Returns:
        Sum of precision@i over the relevant ranks, divided by the total
        number of relevant flags in ``relev``; 0.0 when nothing is relevant.
    """
    # Never look past the end of the list, even if k is larger than it.
    upper = min(k, len(relev) + 1)
    # Compare by value: ``is 1`` is an identity test that fails for e.g. numpy integers.
    precisions = [sum(relev[:i]) / i
                  for i in range(1, upper)
                  if relev[i - 1] == 1]
    total_relevant = sum(relev)
    if total_relevant == 0:
        return 0.0
    return sum(precisions) / total_relevant

def evaluation(query, relev, index=0, k=10):    # per-query precision@k and average precision
    """Score a single query and return the result as a one-row DataFrame.

    Args:
        query: the query itself; accepted for symmetry with the callers but not used here.
        relev: relevance flags of the retrieved items, in rank order.
        index: row label given to the returned frame.
        k: cutoff used as the precision denominator.

    Returns:
        DataFrame with columns ``precision`` (rank is not taken into account)
        and ``average_precision`` (rank is taken into account).
    """
    precision_at_k = round(sum(relev) / k, 4)
    # ap() treats its second argument as an exclusive bound, hence k + 1.
    average_precision = ap(relev, k + 1)
    metrics = {'precision': precision_at_k,
               'average_precision': average_precision}
    return pd.DataFrame(metrics, index=[index])

def retrieval(fit, predict, y_train, x_train, y_test, x_test):
    """Fit a retrieval model, run every test query and score the top-10 hits.

    Args:
        fit: callable ``fit(x_train)`` returning the fitted representation.
        predict: callable ``predict(x, query, y_train)`` returning an iterable
            of ``(score, label)`` pairs in rank order.
        y_train: labels of the training texts.
        x_train: training texts.
        y_test: expected label for each query.
        x_test: query texts.

    Returns:
        Tuple ``(df, train_time, test_time)``. ``df`` has one row per query
        with columns ``query`` (never populated), ``precision`` and
        ``average_precision``; the times are CPU seconds from ``process_time``.
    """
    columns = ['query', 'precision', 'average_precision']

    train_start = process_time()
    x = fit(x_train)
    train_time = process_time() - train_start

    # Collect per-query frames and concatenate once: DataFrame.append was removed in
    # pandas 2.0 and re-copied the whole frame on every iteration.
    frames = []
    test_start = process_time()
    for index, query in enumerate(x_test):
        predictions = predict(x, query, y_train)
        relev = [0] * 10
        # zip with range(10) caps the scan at ten hits even if predict returns more.
        for i, pred in zip(range(10), predictions):
            if pred[1] == y_test[index]:
                relev[i] = 1
        if any(relev):
            frames.append(evaluation(query, relev, index))
        else:
            frames.append(pd.DataFrame({'precision': 0.0,
                                        'average_precision': 0.0},
                                       index=[index]))
    test_time = process_time() - test_start

    if frames:
        df = pd.concat(frames).reindex(columns=columns)
    else:
        df = pd.DataFrame(columns=columns)
    return df, train_time, test_time
# MAP = sum(ap) / Q    # Q - number of queries
# recall = (number of queries with precision > 0) / Q
# Summary table: the evaluation cells below add one row per embedding / corpus size.
MAP = pd.DataFrame(
    columns=['embedding', 'train size', 'test size',
             'training time', 'inference time',
             'recall', 'MAP'],
)

# Baseline: Bag of Words

In [None]:
# One independent vectorizer per corpus size. ``[obj] * n`` would repeat a single shared
# instance n times, so every size would refit (and overwrite) the same object.
vectorizers = [CountVectorizer(stop_words=stops) for _ in range(2)]  # 100, 500
vectorizers += [CountVectorizer(max_df=0.7, min_df=0.1, max_features=15000, stop_words=stops)
                for _ in range(6)]  # 837, 1000, 5000, 10000, 20000, 30000

# fitting
def CV_fit(x_train):
    """Fit the module-level ``vectorizer`` on ``x_train`` and return the dense document-term matrix."""
    return vectorizer.fit_transform(x_train).toarray()

# similarity
def CV_predict(x, query, y_train):
    """Rank the training documents by cosine similarity to ``query``.

    Args:
        x: dense document-term matrix returned by ``CV_fit`` (one row per document).
        query: raw query text, vectorised with the module-level ``vectorizer``.
        y_train: label of each row of ``x``.

    Returns:
        Up to ten ``[similarity, label]`` pairs, most similar first.
    """
    # Flatten to 1-D: the cosine helper works on vectors, not on a (1, n) matrix.
    vec = vectorizer.transform([query]).toarray().ravel()
    query_is_zero = not np.any(vec)
    simil = []
    for vector, film in zip(x, y_train):
        if query_is_zero or not np.any(vector):
            # Cosine is undefined for an all-zero vector (it evaluates to NaN, and NaN
            # makes the sort order meaningless). Count vectors are non-negative, so 0.0
            # ranks such documents last.
            score = 0.0
        else:
            score = 1 - spatial.distance.cosine(vector, vec)
        simil.append([score, film])
    simil.sort(reverse=True)
    return simil[:10]

# Evaluate the bag-of-words baseline on every corpus size.
# NOTE: the loop variable ``vectorizer`` is read as a global by CV_fit / CV_predict.
for i, (size, vectorizer) in enumerate(zip((100, 500, 837, 1000, 5000, 10000, 20000, 30000),
                                           vectorizers)):
    y_train, x_train, y_test, x_test = open_data(size)
    df, train_time, test_time = retrieval(CV_fit, CV_predict, y_train, x_train, y_test, x_test)
    print(f'done {size}')
    df2 = pd.DataFrame({'embedding': 'CountVectorizer',
                        'train size': len(y_train),
                        'test size': len(y_test),
                        'training time': round(train_time, 2),
                        'inference time': round(test_time / len(y_test), 2),
                        # "recall" here is the share of queries with at least one relevant hit
                        'recall': round(sum(1 for prec in df.precision if prec != 0) / len(y_test), 4),
                        'MAP': round(sum(df.average_precision) / len(y_test), 4)},
                       index=[i])
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
    MAP = pd.concat([MAP, df2])
    # Saved after every size so partial results survive an interrupted run.
    MAP.to_csv('/content/drive/MyDrive/Colab Notebooks/MAP.csv', index=False)

done 100
done 500


  dist = 1.0 - uv / np.sqrt(uu * vv)


done 837
done 1000
done 5000
done 10000
done 20000
done 30000


# TF-IDF

In [None]:
# One independent vectorizer per corpus size. ``[obj] * n`` would repeat a single shared
# instance n times, so every size would refit (and overwrite) the same object.
vectorizers = [TfidfVectorizer(stop_words=stops) for _ in range(2)]  # 100, 500
vectorizers += [TfidfVectorizer(max_df=0.7, min_df=0.1, max_features=15000, stop_words=stops)
                for _ in range(6)]  # 837, 1000, 5000, 10000, 20000, 30000

# fitting
def tfidf_fit(x_train):
    """Fit the module-level ``vectorizer`` on ``x_train`` and return the dense TF-IDF matrix."""
    return vectorizer.fit_transform(x_train).toarray()

# similarity
def tfidf_predict(x, query, y_train):
    """Rank the training documents by cosine similarity to ``query``.

    Args:
        x: dense TF-IDF matrix returned by ``tfidf_fit`` (one row per document).
        query: raw query text, vectorised with the module-level ``vectorizer``.
        y_train: label of each row of ``x``.

    Returns:
        Up to ten ``[similarity, label]`` pairs, most similar first.
    """
    # Flatten to 1-D: the cosine helper works on vectors, not on a (1, n) matrix.
    vec = vectorizer.transform([query]).toarray().ravel()
    query_is_zero = not np.any(vec)
    simil = []
    for vector, film in zip(x, y_train):
        if query_is_zero or not np.any(vector):
            # Cosine is undefined for an all-zero vector (it evaluates to NaN, and NaN
            # makes the sort order meaningless). TF-IDF weights are non-negative, so 0.0
            # ranks such documents last.
            score = 0.0
        else:
            score = 1 - spatial.distance.cosine(vector, vec)
        simil.append([score, film])
    simil.sort(reverse=True)
    return simil[:10]


# Evaluate TF-IDF on every corpus size; rows 8-15 of the summary table.
# NOTE: the loop variable ``vectorizer`` is read as a global by tfidf_fit / tfidf_predict.
for i, (size, vectorizer) in enumerate(zip((100, 500, 837, 1000, 5000, 10000, 20000, 30000),
                                           vectorizers),
                                       start=8):
    y_train, x_train, y_test, x_test = open_data(size)
    df, train_time, test_time = retrieval(tfidf_fit, tfidf_predict, y_train, x_train, y_test, x_test)
    print(f'done {size}')
    df2 = pd.DataFrame({'embedding': 'TF-IDF',
                        'train size': len(y_train),
                        'test size': len(y_test),
                        'training time': round(train_time, 2),
                        'inference time': round(test_time / len(y_test), 2),
                        # "recall" here is the share of queries with at least one relevant hit
                        'recall': round(sum(1 for prec in df.precision if prec != 0) / len(y_test), 4),
                        'MAP': round(sum(df.average_precision) / len(y_test), 4)},
                       index=[i])
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
    MAP = pd.concat([MAP, df2])
    # Saved after every size so partial results survive an interrupted run.
    MAP.to_csv('/content/drive/MyDrive/Colab Notebooks/MAP.csv', index=False)

# Fasttext (pre-trained) & Locality Sensitive Hashing

In [None]:
def calc_map(query, film, predictions, k, index, train_labels=None):
    """Score one query whose predictions are indices into the training labels.

    Args:
        query: the query (forwarded to ``evaluation``).
        film: expected label for this query.
        predictions: indices of the retrieved training items, in rank order.
        k: cutoff; only the first ``k`` predictions are scored.
        index: row label given to the returned frame.
        train_labels: labels addressed by ``predictions``. Defaults to the
            notebook-level ``y_train`` of the split being evaluated.

    Returns:
        One-row DataFrame with columns ``precision`` and ``average_precision``.
    """
    if train_labels is None:
        train_labels = y_train    # global set by the evaluation loops
    relev = [0] * k
    # zip with range(k) ignores anything beyond rank k instead of raising IndexError.
    for i, pred in zip(range(k), predictions):
        if train_labels[pred] == film:
            relev[i] = 1
    if any(relev):
        return evaluation(query, relev, index, k)
    return pd.DataFrame({'precision': 0.0,
                         'average_precision': 0.0},
                        index=[index])

In [None]:
model = gensim.models.KeyedVectors.load('/content/drive/MyDrive/Colab Notebooks/model.model') # tayga_none_fasttextcbow_300_10_2019


def _mean_word_vector(words, keyed_vectors):
    """Average the vectors of ``words``; return None when no word has a vector."""
    vectors = []
    for word in words:
        try:
            vectors.append(keyed_vectors[word])
        except (KeyError, AttributeError):
            # Skip words without a vector. KeyError is what a missing-key lookup is
            # expected to raise; AttributeError is kept because the original skipped it.
            continue
    if not vectors:
        return None
    return np.mean(vectors, axis=0)


for i, size in zip((16, 17, 18, 19, 20, 21, 22, 23),
                   (100, 500, 837, 1000, 5000, 10000, 20000, 30000)):
    f = 300
    t = AnnoyIndex(f, 'angular')  # Length of item vector that will be indexed
    y_train, x_train, y_test, x_test = open_data(size)
    # Raw string: '\w' in a normal literal is an invalid escape sequence.
    x_train = [re.findall(r'\w+', text) for text in x_train]
    x_test = [re.findall(r'\w+', text) for text in x_test]

    train_start = process_time()
    for idx, text in enumerate(x_train):
        v = _mean_word_vector(text, model)
        if v is None:
            # np.mean of an empty list is NaN (with a RuntimeWarning); such a
            # document cannot be indexed, so it is left out explicitly.
            continue
        t.add_item(idx, v)
    train_stop = process_time()
    train_time = train_stop - train_start
    t.build(100)  # 100 trees (built after the timer stops, so not part of train_time)

    frames = []
    test_start = process_time()
    for idx, (query, film) in enumerate(zip(x_test, y_test)):
        v = _mean_word_vector(query, model)
        # A query with no known word retrieves nothing and scores zero.
        top_10 = t.get_nns_by_vector(v, 10) if v is not None else []  # 10 nearest neighbors
        frames.append(calc_map(query, film, top_10, 10, idx))
    test_stop = process_time()
    test_time = test_stop - test_start
    # Concatenate once: DataFrame.append was removed in pandas 2.0.
    df = pd.concat(frames) if frames else pd.DataFrame(columns=['precision', 'average_precision'])

    print(f'done {size}')
    df2 = pd.DataFrame({'embedding': 'Fasttext',
                        'train size': len(y_train),
                        'test size': len(y_test),
                        'training time': round(train_time, 2),
                        'inference time': round(test_time / len(y_test), 2),
                        # "recall" here is the share of queries with at least one relevant hit
                        'recall': round(sum(1 for prec in df.precision if prec != 0) / len(y_test), 4),
                        'MAP': round(sum(df.average_precision) / len(y_test), 4)},
                       index=[i])
    MAP = pd.concat([MAP, df2])
    MAP.to_csv('/content/drive/MyDrive/Colab Notebooks/MAP.csv', index=False)

done 100
done 500
done 837
done 1000


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


done 5000
done 10000
done 20000
done 30000


# Word2Vec (pre-trained) & Locality Sensitive Hashing

In [None]:
def open_data_tag(size):
    """Load one split from ``data_format_tags`` and keep only tokens known to ``model``.

    The file layout matches ``open_data``: four sections separated by a line
    containing only ``&&&`` (train labels, train texts, test labels, test
    texts), one entry per line.

    Args:
        size: value interpolated into the file name (``data_{size}.txt``).

    Returns:
        Tuple ``(y_train, x_train, y_test, x_test)``. The label lists hold
        strings; each text is a list of its whitespace-separated tokens that
        are present in the module-level ``model.vocab``.
    """
    path = f'/content/drive/MyDrive/Colab Notebooks/data_format_tags/data_{size}.txt'
    # Context manager so the file handle is released right after reading.
    with open(path, encoding='utf-8') as handle:
        y_train, x_train, y_test, x_test = handle.read().split('\n&&&\n')
    y_train, y_test = y_train.split('\n'), y_test.split('\n')

    x_train = [[word for word in text.split() if word in model.vocab]
               for text in x_train.split('\n')]
    x_test = [[word for word in text.split() if word in model.vocab]
              for text in x_test.split('\n')]

    return y_train, x_train, y_test, x_test

In [None]:
# ruwikiruscorpora_upos_skipgram_300_2_2019
# Load through a context manager so the binary file handle is closed afterwards
# (previously the handle was bound to ``model`` and never closed).
with open('/content/drive/MyDrive/Colab Notebooks/model.bin', 'rb') as model_file:
    model = gensim.models.KeyedVectors.load_word2vec_format(model_file, binary=True)

for i, size in zip((24, 25, 26, 27, 28, 29, 30, 31),
                   (100, 500, 837, 1000, 5000, 10000, 20000, 30000)):
    f = 300
    t = AnnoyIndex(f, 'angular')  # Length of item vector that will be indexed
    y_train, x_train, y_test, x_test = open_data_tag(size)
    train_start = process_time()
    for idx, text in enumerate(x_train):
        if not text:
            # No in-vocabulary token: there is nothing to average, so skip the document.
            continue
        t.add_item(idx, np.mean(model[text], axis=0))
    train_stop = process_time()
    train_time = train_stop - train_start
    t.build(100)  # 100 trees (built after the timer stops, so not part of train_time)

    frames = []
    test_start = process_time()
    for idx, (query, film) in enumerate(zip(x_test, y_test)):
        if query:
            v = np.mean(model[query], axis=0)
            top_10 = t.get_nns_by_vector(v, 10) # find the 10 nearest neighbors
        else:
            # An empty query has no vector; it retrieves nothing and scores zero.
            top_10 = []
        frames.append(calc_map(query, film, top_10, 10, idx))
    test_stop = process_time()
    test_time = test_stop - test_start
    # Concatenate once: DataFrame.append was removed in pandas 2.0.
    df = pd.concat(frames) if frames else pd.DataFrame(columns=['precision', 'average_precision'])

    print(f'done {size}')
    df2 = pd.DataFrame({'embedding': 'Word2Vec',
                        'train size': len(y_train),
                        'test size': len(y_test),
                        'training time': round(train_time, 2),
                        'inference time': round(test_time / len(y_test), 2),
                        # "recall" here is the share of queries with at least one relevant hit
                        'recall': round(sum(1 for prec in df.precision if prec != 0) / len(y_test), 4),
                        'MAP': round(sum(df.average_precision) / len(y_test), 4)},
                       index=[i])
    MAP = pd.concat([MAP, df2])
    MAP.to_csv('/content/drive/MyDrive/Colab Notebooks/MAP.csv', index=False)

done 100
done 500
done 837
done 1000
done 5000
done 10000
done 20000
done 30000


# Doc2Vec

In [None]:
def feature_constructing(x_train, y_train):
    """Build the ``TaggedDocument`` training corpus for Doc2Vec.

    Args:
        x_train: training texts. A raw string is lower-cased, split on ``\\w+``
            and stripped of stop words, the same way ``doc2vec_retrieval``
            tokenises queries. An already tokenised text is used as given.
        y_train: label of each text.

    Returns:
        List of ``TaggedDocument``. Each document is tagged with the position
        of the first occurrence of its label in ``y_train``, so
        ``y_train[tag]`` recovers the label.
    """
    # First-occurrence position per label; same value as y_train.index(film)
    # without rescanning the list for every document.
    first_index = {}
    for position, film in enumerate(y_train):
        first_index.setdefault(film, position)

    stop_set = set(stops)
    tagged_data = []
    for text, film in zip(x_train, y_train):
        if isinstance(text, str):
            # A bare string would be iterated character by character, so the model
            # would be trained on characters while queries are inferred from words.
            words = [word for word in re.findall(r"\w+", text.lower()) if word not in stop_set]
        else:
            try:
                words = list(text)
            except TypeError:
                # Not a text at all (e.g. None): skip it.
                continue
        tagged_data.append(TaggedDocument(words=words, tags=[first_index[film]]))
    return tagged_data


def doc2vec_retrieval(y_train, y_test, x_test):
    """Score every test query against the module-level Doc2Vec ``model``.

    Args:
        y_train: training labels; document tags returned by the model are
            used as positions in this list.
        y_test: expected label for each query.
        x_test: raw query texts.

    Returns:
        DataFrame with one row per query and columns ``precision`` and
        ``average_precision``.
    """
    stop_set = set(stops)
    frames = []
    for index, query in enumerate(x_test):
        test_data = [word for word in re.findall(r"\w+", query.lower()) if word not in stop_set]
        v1 = model.infer_vector(test_data)
        similar_doc = model.docvecs.most_similar(positive=[v1], topn=10)
        relev = [0] * 10
        # Iterate over what was actually returned; indexing a fixed range(10)
        # raises IndexError when fewer than ten neighbours come back.
        for i, (tag, _similarity) in enumerate(similar_doc):
            if y_train[int(tag)] == y_test[index]:
                relev[i] = 1
        if any(relev):
            frames.append(evaluation(query, relev, index))
        else:
            frames.append(pd.DataFrame({'precision': 0.0,
                                        'average_precision': 0.0},
                                       index=[index]))
    # Concatenate once: DataFrame.append was removed in pandas 2.0.
    if frames:
        return pd.concat(frames)
    return pd.DataFrame(columns=['precision', 'average_precision'])

In [None]:
max_epochs = 10
vec_size = 50

for i, size in zip((32, 33, 34, 35, 36, 37, 38, 39),
                   (100, 500, 837, 1000, 5000, 10000, 20000, 30000)):
    # min_count is an absolute word-frequency threshold, not a fraction. Word counts are
    # whole numbers >= 1, so 1 keeps exactly the same vocabulary as the previous 0.1.
    model = Doc2Vec(vector_size=vec_size,
                    min_count=1,
                    epochs=max_epochs)
    y_train, x_train, y_test, x_test = open_data(size)
    tagged_data = feature_constructing(x_train, y_train)
    model.build_vocab(tagged_data)
    train_start = process_time()
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.epochs)
    train_stop = process_time()
    train_time = train_stop - train_start

    test_start = process_time()
    df = doc2vec_retrieval(y_train, y_test, x_test)
    test_stop = process_time()
    test_time = test_stop - test_start
    print(f'done {size}')
    df2 = pd.DataFrame({'embedding': 'Doc2Vec',
                        'train size': len(y_train),
                        'test size': len(y_test),
                        'training time': round(train_time, 2),
                        'inference time': round(test_time / len(y_test), 2),
                        # "recall" here is the share of queries with at least one relevant hit
                        'recall': round(sum(1 for prec in df.precision if prec != 0) / len(y_test), 4),
                        'MAP': round(sum(df.average_precision) / len(y_test), 4)},
                       index=[i])
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
    MAP = pd.concat([MAP, df2])
    MAP.to_csv('/content/drive/MyDrive/Colab Notebooks/MAP.csv', index=False)

done 100
done 500
done 837
done 1000
done 5000
done 10000
done 20000
done 30000


# Universal Sentence Encoder (pre-trained) & Locality Sensitive Hashing

In [None]:
model = hub.load('https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3')

# Only the two largest corpora are run here; the smaller sizes are commented out.
for i, size in zip((46, 47),  # 40, 41, 42, 43, 44, 45,
                   (20000, 30000)):  # 100, 500, 837, 1000, 5000, 10000,
    f = 512
    t = AnnoyIndex(f, 'angular')  # Length of item vector that will be indexed
    y_train, x_train, y_test, x_test = open_data(size)
    train_start = process_time()
    for idx, text in enumerate(x_train):
        v = model(text)[0]
        try:
            t.add_item(idx, v)
        except ValueError:
            # Best effort kept from the original: a vector the index rejects is skipped.
            continue
    train_stop = process_time()
    train_time = train_stop - train_start

    t.build(100)  # 100 trees (built after the timer stops, so not part of train_time)

    frames = []
    test_start = process_time()
    for idx, (query, film) in enumerate(zip(x_test, y_test)):
        v = model(query)[0]
        top_10 = t.get_nns_by_vector(v, 10) # find the 10 nearest neighbors
        frames.append(calc_map(query, film, top_10, 10, idx))
    test_stop = process_time()
    test_time = test_stop - test_start
    # Concatenate once: DataFrame.append was removed in pandas 2.0.
    df = pd.concat(frames) if frames else pd.DataFrame(columns=['precision', 'average_precision'])

    print(f'done {size}')
    df2 = pd.DataFrame({'embedding': 'Universal Sentence Encoder',
                        'train size': len(y_train),
                        'test size': len(y_test),
                        'training time': round(train_time, 2),
                        'inference time': round(test_time / len(y_test), 2),
                        # "recall" here is the share of queries with at least one relevant hit
                        'recall': round(sum(1 for prec in df.precision if prec != 0) / len(y_test), 4),
                        'MAP': round(sum(df.average_precision) / len(y_test), 4)},
                       index=[i])
    MAP = pd.concat([MAP, df2])
    MAP.to_csv('/content/drive/MyDrive/Colab Notebooks/MAP.csv', index=False)

done 20000
done 30000
