### import stuff

In [None]:
# Locality Sensitive Hashing https://github.com/spotify/annoy
!pip install annoy

In [None]:
!pip install gensim numpy pandas scipy sklearn tensorflow_hub tensorflow tensorflow-text sentence_transformers

In [None]:
import re
from time import process_time

import gensim
import numpy as np
import pandas as pd
import tensorflow_hub as hub
import tensorflow_text
from annoy import AnnoyIndex
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from sentence_transformers import SentenceTransformer
from scipy import spatial
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from tqdm.notebook import tqdm
from transformers import BertTokenizer

# Stop-word list: every whitespace-separated token of russian.txt.
# The context manager closes the file handle once the list is built.
with open(r'../input/data-format/russian.txt', encoding='utf-8') as stops_file:
    stops = stops_file.read().split()

# Data & Metrics
Every model is evaluated on its top-10 search results.

In [None]:
def open_data(size, data_dir='../input/data-test-100', raw_dir='../input/raw-data'):
    """Load the split of the given size and map each text to its raw version.

    The split file ``data_{size}.txt`` holds four sections separated by a
    line containing ``&&&``: train titles, train texts, test titles and
    test texts, one item per line.

    Args:
        size: Split size used in the file name ``data_{size}.txt``.
        data_dir: Directory holding the split files.
        raw_dir: Directory holding ``film_plots.txt``, ``wiki_titles.txt``,
            ``test_data.txt`` and ``test_titles.txt``.

    Returns:
        ``(y_train, x_train, y_test, x_test, mapping_dict)`` where
        ``mapping_dict`` maps every split text to the raw text stored at the
        position of its title in the matching raw title file.

    Raises:
        ValueError: If the split file does not contain exactly four sections
            or a title is absent from the raw title file.
    """
    def read_text(path):
        # Context manager so that no file handle is left open.
        with open(path, encoding='utf-8') as handle:
            return handle.read()

    def first_positions(titles):
        # title -> position of its first occurrence, i.e. what list.index()
        # returns, but built once instead of scanning the list per lookup.
        positions = {}
        for position, title in enumerate(titles):
            positions.setdefault(title, position)
        return positions

    def raw_text_for(title, positions, raw_texts):
        try:
            return raw_texts[positions[title]]
        except KeyError:
            # Keep the exception type list.index() raised for a missing title.
            raise ValueError(f'{title!r} is not in list') from None

    sections = read_text(f'{data_dir}/data_{size}.txt').split('\n&&&\n')
    y_train, x_train, y_test, x_test = (section.split('\n') for section in sections)

    x_train_map = read_text(f'{raw_dir}/film_plots.txt').split('\n')
    train_positions = first_positions(read_text(f'{raw_dir}/wiki_titles.txt').split('\n'))
    x_test_map = read_text(f'{raw_dir}/test_data.txt').split('\n')
    test_positions = first_positions(read_text(f'{raw_dir}/test_titles.txt').split('\n'))

    mapping_dict = {}
    for text, title in zip(x_train, y_train):
        mapping_dict[text] = raw_text_for(title, train_positions, x_train_map)
    for text, title in zip(x_test, y_test):
        mapping_dict[text] = raw_text_for(title, test_positions, x_test_map)

    return y_train, x_train, y_test, x_test, mapping_dict

In [None]:
def ap(relev, k):
    """Return the average precision of a 0/1 relevance list over its top k.

    For every rank (1-based) up to k whose entry equals 1, the precision at
    that rank is collected; the collected precisions are summed and divided
    by the total of ``relev``. A total of zero yields 0.0.
    """
    precisions = [
        sum(relev[:rank]) / rank    # precision at this rank
        for rank in range(1, k + 1)
        if relev[rank - 1] == 1     # only relevant positions contribute
    ]
    relevant_total = sum(relev)
    if relevant_total == 0:
        return 0.0
    return sum(precisions) / relevant_total

def evaluation(relev, index=0, k=10):
    """Build a one-row DataFrame with the metrics of a single query.

    The 'recall' column holds ``sum(relev) / k`` rounded to 4 digits (rank is
    ignored); 'average_precision' holds ``ap(relev, k)`` (rank matters).
    The row is labelled with ``index``.
    """
    metrics = {
        'recall': round(sum(relev) / k, 4),
        'average_precision': ap(relev, k),
    }
    return pd.DataFrame(metrics, index=[index])

def retrieval(fit, predict, y_train, x_train, y_test, x_test):
    """Time a fit/predict retriever and score every test query on its top 10.

    Args:
        fit: Callable taking ``x_train``; its result is handed to ``predict``.
        predict: Callable ``(x, query, y_train)`` returning ranked
            predictions; element ``[1]`` of each prediction is compared with
            the relevant titles of the query.
        y_train: Training titles, passed through to ``predict``.
        x_train: Training texts, passed to ``fit``.
        y_test: Per query, a ';'-separated string of relevant titles.
        x_test: Queries.

    Returns:
        ``(df, train_time, test_time)``: ``df`` has one row per query with
        the columns 'recall' and 'average_precision'; the two times are
        ``process_time()`` differences around the ``fit`` call and around
        the query loop.
    """
    relevant_titles = [film.split(';') for film in y_test]

    train_start = process_time()
    x = fit(x_train)
    train_time = process_time() - train_start

    # Rows are collected and concatenated once: concatenating inside the loop
    # copies the whole frame for every query.
    rows = []
    test_start = process_time()
    for index, query in enumerate(x_test):
        predictions = predict(x, query, y_train)
        relev = [0] * 10
        # zip(range(10), ...) ignores anything beyond the tenth prediction
        # instead of indexing past the end of relev.
        for i, pred in zip(range(10), predictions):
            if pred[1] in relevant_titles[index]:
                relev[i] = 1
        # An all-zero relev evaluates to 0.0 for both metrics, so no
        # separate branch is needed for queries without hits.
        rows.append(evaluation(relev, index))
    test_time = process_time() - test_start

    if rows:
        df = pd.concat(rows)
    else:
        df = pd.DataFrame(columns=['recall', 'average_precision'])
    return df, train_time, test_time
# Aggregation over the Q queries (Q - number of queries):
#   map    = sum(ap)/Q
#   recall = sum(p)/Q
# MAP is the shared results table; each experiment appends one row per split.
MAP_COLUMNS = (
    'embedding',
    'train size',
    'test size',
    'training time',
    'inference time',
    'recall',
    'MAP',
)
MAP = pd.DataFrame(columns=list(MAP_COLUMNS))

In [None]:
def calc_map(films, predictions, k, index, y_train):
    """Score one query from the indices of its nearest neighbours.

    Args:
        films: ';'-separated string of the titles relevant to the query.
        predictions: Ranked positions into ``y_train``.
        k: Number of ranks that are scored.
        index: Label of the returned row.
        y_train: Training titles, indexed by the entries of ``predictions``.

    Returns:
        A one-row DataFrame with the columns 'precision' and
        'average_precision'.
    """
    relevant = films.split(';')
    relev = [0] * k
    # zip(range(k), ...) ignores predictions beyond rank k instead of
    # indexing past the end of relev.
    for i, pred in zip(range(k), predictions):
        if y_train[pred] in relevant:
            relev[i] = 1
    if any(relev):
        # evaluation() names its first column 'recall'; rename it so that hit
        # and miss rows share one schema and callers reading `df.precision`
        # get real values instead of NaN.
        return evaluation(relev, index, k).rename(columns={'recall': 'precision'})
    return pd.DataFrame({'precision': 0.0,
                         'average_precision': 0.0},
                        index=[index])

# Baseline: Bag of Words

In [None]:
# One vectorizer per split size. The first four entries (100, 500, 1000, 5000)
# share one instance without a vocabulary cap; the last three
# (10000, 20000, 30000) share one instance capped at 30000 features.
vectorizers = []
vectorizers.extend([CountVectorizer(stop_words=stops)] * 4)
vectorizers.extend([CountVectorizer(max_features=30000, stop_words=stops)] * 3)

# fitting
def CV_fit(x_train):
    """Fit the module-level ``vectorizer`` on x_train and return the dense matrix."""
    return vectorizer.fit_transform(x_train).toarray()

# similarity
def CV_predict(x, query, y_train):
    """Return the 10 training documents most cosine-similar to ``query``.

    Args:
        x: Dense document-term matrix, one row per training document.
        query: Query text; vectorised with the module-level ``vectorizer``.
        y_train: Titles aligned with the rows of ``x``.

    Returns:
        Up to 10 ``[similarity, title]`` lists ordered from most to least
        similar (ties broken by title, descending).
    """
    # toarray() yields a (1, n_features) row; flatten it to a 1-D vector.
    query_vec = vectorizer.transform([query]).toarray().ravel()
    doc_matrix = np.asarray(x, dtype=float)

    # Cosine similarity for all documents at once.
    dots = doc_matrix @ query_vec
    norms = np.linalg.norm(doc_matrix, axis=1) * np.linalg.norm(query_vec)
    # A zero vector has no direction; score it 0.0 rather than NaN, because
    # NaN values make the ordering produced by a sort unreliable.
    simil = np.divide(dots, norms, out=np.zeros_like(dots), where=norms > 0)

    ranked = sorted(zip(simil.tolist(), y_train), reverse=True)
    return [[score, film] for score, film in ranked[:10]]

# Evaluate the bag-of-words baseline on every split; results go to rows 0-6
# of MAP. `vectorizer` must stay a module-level loop variable because CV_fit
# and CV_predict read it as a global.
train_sizes = (100, 500, 1000, 5000, 10000, 20000, 30000)
for i, (size, vectorizer) in enumerate(zip(train_sizes, vectorizers)):
    y_train, x_train, y_test, x_test, mapping_dict = open_data(size)
    df, train_time, test_time = retrieval(CV_fit, CV_predict, y_train, x_train, y_test, x_test)
    print(f'done {size}')

    n_queries = len(y_test)
    # Number of queries with at least one relevant title in the top 10.
    answered = sum(1 for prec in df.recall if prec != 0)
    summary_row = pd.DataFrame(
        {
            'embedding': 'CountVectorizer',
            'train size': len(y_train),
            'test size': n_queries,
            'training time': round(train_time, 2),
            'inference time': round(test_time / n_queries, 2),
            'recall': round(answered / n_queries, 4),
            'MAP': round(sum(df.average_precision) / n_queries, 4),
        },
        index=[i],
    )
    MAP = pd.concat([MAP, summary_row])
    # Written after every split so partial results survive an interruption.
    MAP.to_csv('MAP_new.csv', index=False)

# TF-IDF

In [None]:
# One vectorizer per split size. The first four entries (100, 500, 1000, 5000)
# share one instance without a vocabulary cap; the last three
# (10000, 20000, 30000) share one instance capped at 30000 features.
vectorizers = []
vectorizers.extend([TfidfVectorizer(stop_words=stops)] * 4)
vectorizers.extend([TfidfVectorizer(max_features=30000, stop_words=stops)] * 3)

# fitting
def tfidf_fit(x_train):
    """Fit the module-level ``vectorizer`` on x_train and return the dense matrix."""
    return vectorizer.fit_transform(x_train).toarray()

# similarity
def tfidf_predict(x, query, y_train):
    """Return the 10 training documents most cosine-similar to ``query``.

    Args:
        x: Dense TF-IDF matrix, one row per training document.
        query: Query text; vectorised with the module-level ``vectorizer``.
        y_train: Titles aligned with the rows of ``x``.

    Returns:
        Up to 10 ``[similarity, title]`` lists ordered from most to least
        similar (ties broken by title, descending).
    """
    # toarray() yields a (1, n_features) row; flatten it to a 1-D vector.
    query_vec = vectorizer.transform([query]).toarray().ravel()
    doc_matrix = np.asarray(x, dtype=float)

    # Cosine similarity for all documents at once.
    dots = doc_matrix @ query_vec
    norms = np.linalg.norm(doc_matrix, axis=1) * np.linalg.norm(query_vec)
    # A zero vector has no direction; score it 0.0 rather than NaN, because
    # NaN values make the ordering produced by a sort unreliable.
    simil = np.divide(dots, norms, out=np.zeros_like(dots), where=norms > 0)

    ranked = sorted(zip(simil.tolist(), y_train), reverse=True)
    return [[score, film] for score, film in ranked[:10]]

# Evaluate TF-IDF on every split; results go to rows 7-13 of MAP.
# `vectorizer` must stay a module-level loop variable because tfidf_fit and
# tfidf_predict read it as a global.
train_sizes = (100, 500, 1000, 5000, 10000, 20000, 30000)
for i, (size, vectorizer) in enumerate(zip(train_sizes, vectorizers), start=7):
    y_train, x_train, y_test, x_test, mapping_dict = open_data(size)
    df, train_time, test_time = retrieval(tfidf_fit, tfidf_predict, y_train, x_train, y_test, x_test)
    print(f'done {size}')

    n_queries = len(y_test)
    # Number of queries with at least one relevant title in the top 10.
    answered = sum(1 for prec in df.recall if prec != 0)
    summary_row = pd.DataFrame(
        {
            'embedding': 'TF-IDF',
            'train size': len(y_train),
            'test size': n_queries,
            'training time': round(train_time, 2),
            'inference time': round(test_time / n_queries, 2),
            'recall': round(answered / n_queries, 4),
            'MAP': round(sum(df.average_precision) / n_queries, 4),
        },
        index=[i],
    )
    MAP = pd.concat([MAP, summary_row])
    # Written after every split so partial results survive an interruption.
    MAP.to_csv('MAP_new.csv', index=False)

# Fasttext (pre-trained) & Locality Sensitive Hashing

In [None]:
import re

# tayga_none_fasttextcbow_300_10_2019
# Forward slash instead of '\m': a portable separator and no invalid escape.
model = gensim.models.KeyedVectors.load('tayga_none_fasttextcbow_300_10_2019/model.model')


def _mean_word_vector(keyed_vectors, tokens):
    """Average the vectors of the tokens found in keyed_vectors; None if none is found."""
    vectors = []
    for word in tokens:
        try:
            vectors.append(keyed_vectors[word])
        except KeyError:  # token has no vector in the model
            continue
    if not vectors:
        return None
    return np.mean(vectors, axis=0)


# Only the two smallest splits are run here. The full schedule would be
# rows (14, 15, 16, 17, 18, 19, 20) for sizes
# (100, 500, 1000, 5000, 10000, 20000, 30000).
for i, size in zip((14, 15), (100, 500)):
    f = 300
    t = AnnoyIndex(f, 'angular')  # Length of item vector that will be indexed
    # open_data returns five values; the raw-text mapping is not used here.
    y_train, x_train, y_test, x_test, mapping_dict = open_data(size)
    x_train = [re.findall(r'\w+', text) for text in x_train]
    x_test = [re.findall(r'\w+', text) for text in x_test]

    train_start = process_time()
    for idx, text in enumerate(x_train):
        v = _mean_word_vector(model, text)
        if v is None:
            # No token of this text has a vector: nothing to index.
            continue
        try:
            t.add_item(idx, v)
        except ValueError:
            # Kept from the original: skip vectors the index rejects.
            continue
    train_stop = process_time()
    train_time = train_stop - train_start
    t.build(100)  # 100 trees

    df = pd.DataFrame(columns=['precision', 'average_precision'])
    test_start = process_time()
    for idx, (query, films) in enumerate(zip(x_test, y_test)):
        v = _mean_word_vector(model, query)
        if v is None:
            # A query without any known token cannot be searched: score it 0.
            df2 = pd.DataFrame({'precision': 0.0,
                                'average_precision': 0.0},
                               index=[idx])
        else:
            top_10 = t.get_nns_by_vector(v, 10)  # find the 10 nearest neighbors
            # calc_map needs y_train to turn neighbour indices into titles.
            df2 = calc_map(films, top_10, 10, idx, y_train)
        df = pd.concat([df, df2])
    test_stop = process_time()
    test_time = test_stop - test_start

    print(f'done {size}')
    df2 = pd.DataFrame({'embedding': 'Fasttext',
                        'train size': len(y_train),
                        'test size': len(y_test),
                        'training time': round(train_time, 2),
                        'inference time': round(test_time / len(y_test), 2),
                        'recall': round(len([1 for prec in df.precision if prec != 0])/len(y_test), 4),
                        'MAP': round(sum(df.average_precision)/len(y_test), 4)},
                        index=[i])
    MAP = pd.concat([MAP, df2])
    MAP.to_csv('MAP_new.csv', index=False)

# Word2Vec (pre-trained) & Locality Sensitive Hashing

In [None]:
def open_data_tag(size):
    """Load the tagged split of the given size, keeping only in-vocabulary tokens.

    The split file holds four sections separated by a line containing
    ``&&&``: train titles, train texts, test titles and test texts, one item
    per line. Every text is split on whitespace and reduced to the tokens
    present in ``model.key_to_index`` (``model`` is the module-level model).

    Returns:
        ``(y_train, x_train, y_test, x_test)``; the two ``x_*`` lists hold
        token lists, which may be empty.
    """
    # Context manager so that the file handle is closed after reading.
    with open(f'../input/data-test-100-tags/data_{size}.txt', encoding='utf-8') as data_file:
        sections = data_file.read().split('\n&&&\n')
    y_train, x_train, y_test, x_test = (section.split('\n') for section in sections)

    vocabulary = model.key_to_index
    x_train = [[word for word in text.split() if word in vocabulary] for text in x_train]
    x_test = [[word for word in text.split() if word in vocabulary] for text in x_test]

    return y_train, x_train, y_test, x_test

In [None]:
# ruwikiruscorpora_upos_cbow_300_10_2021
# The path is passed directly, so no file handle is left open by this cell.
model = gensim.models.KeyedVectors.load_word2vec_format(
    '../input/ruwikiruscorpora-upos-cbow-300-10-2021/model.bin', binary=True)

for i, size in zip((21, 22, 23, 24, 25, 26, 27),
                   (100, 500, 1000, 5000, 10000, 20000, 30000)):
    f = 300
    t = AnnoyIndex(f, 'angular')  # Length of item vector that will be indexed
    y_train, x_train, y_test, x_test = open_data_tag(size)
    train_start = process_time()
    for idx, text in enumerate(x_train):
        if not text:
            # open_data_tag left no in-vocabulary token: nothing to average.
            continue
        t.add_item(idx, np.mean(model[text], axis=0))
    train_stop = process_time()
    train_time = train_stop - train_start
    t.build(100)  # 100 trees

    df = pd.DataFrame(columns=['precision', 'average_precision'])
    test_start = process_time()
    for idx, (query, films) in enumerate(zip(x_test, y_test)):
        if query:
            v = np.mean(model[query], axis=0)
            top_10 = t.get_nns_by_vector(v, 10)  # find the 10 nearest neighbors
            df2 = calc_map(films, top_10, 10, idx, y_train)
        else:
            # Only a query without in-vocabulary tokens gets the zero row.
            # It used to overwrite every calc_map result, zeroing all metrics.
            df2 = pd.DataFrame({'precision': 0.0,
                                'average_precision': 0.0},
                               index=[idx])
        df = pd.concat([df, df2])
    test_stop = process_time()
    test_time = test_stop - test_start

    print(f'done {size}')
    df2 = pd.DataFrame({'embedding': 'Word2Vec',
                        'train size': len(y_train),
                        'test size': len(y_test),
                        'training time': round(train_time, 2),
                        'inference time': round(test_time / len(y_test), 2),
                        'recall': round(len([1 for prec in df.precision if prec != 0])/len(y_test), 4),
                        'MAP': round(sum(df.average_precision)/len(y_test), 4)},
                       index=[i])
    MAP = pd.concat([MAP, df2])
    MAP.to_csv('MAP_new.csv', index=False)

# Doc2Vec

In [None]:
def feature_constructing(x_train, y_train):
    """Turn the training texts into TaggedDocument objects for Doc2Vec.

    Args:
        x_train: Training texts. A string is tokenised the same way
            doc2vec_retrieval tokenises queries (lower-cased ``\\w+`` tokens
            minus ``stops``); any other iterable is taken as ready tokens.
        y_train: Titles aligned with ``x_train``.

    Returns:
        A list of TaggedDocument. Each is tagged with the position of the
        first occurrence of its title in ``y_train``, so texts sharing a
        title share a tag.
    """
    stop_words = set(stops)
    # title -> first position, i.e. what y_train.index(title) returns, built
    # once instead of scanning the list for every document.
    first_position = {}
    for position, film in enumerate(y_train):
        first_position.setdefault(film, position)

    tagged_data = []
    for text, film in zip(x_train, y_train):
        if isinstance(text, str):
            # `words` must be a list of tokens; a bare string would be
            # consumed character by character.
            words = [word for word in re.findall(r"\w+", text.lower())
                     if word not in stop_words]
        else:
            words = list(text)
        tagged_data.append(TaggedDocument(words=words, tags=[first_position[film]]))
    return tagged_data


def doc2vec_retrieval(y_train, y_test, x_test):
    """Score every test query against the module-level Doc2Vec ``model``.

    Args:
        y_train: Training titles; the document tags returned by the model are
            used as positions into this list.
        y_test: Per query, a ';'-separated string of relevant titles.
        x_test: Query texts.

    Returns:
        A DataFrame with one row per query and the columns 'precision' and
        'average_precision'.
    """
    columns = ['precision', 'average_precision']
    stop_words = set(stops)  # built once; set membership is O(1)
    relevant_titles = [films.split(';') for films in y_test]

    rows = []
    for index, query in enumerate(x_test):
        test_data = [word for word in re.findall(r"\w+", query.lower())
                     if word not in stop_words]
        v1 = model.infer_vector(test_data)
        similar_doc = model.dv.most_similar(positive=[v1], topn=10)

        relev = [0] * 10
        # zip(range(10), ...) also copes with fewer than 10 neighbours.
        for i, (tag, _similarity) in zip(range(10), similar_doc):
            if y_train[int(tag)] in relevant_titles[index]:
                relev[i] = 1
        # evaluation() names its first column 'recall'; rename it so that
        # every row carries the 'precision' column the callers read. An
        # all-zero relev evaluates to 0.0 for both metrics.
        rows.append(evaluation(relev, index).rename(columns={'recall': 'precision'}))

    if not rows:
        return pd.DataFrame(columns=columns)
    # One concatenation instead of one per query.
    return pd.concat(rows)

In [None]:
max_epochs = 10
vec_size = 50

# Train a fresh Doc2Vec model per split and store the results in rows 28-34
# of MAP. `model` must stay a module-level name: doc2vec_retrieval reads it
# as a global.
for i, size in enumerate((100, 500, 1000, 5000, 10000, 20000, 30000), start=28):
    # min_count is an integer frequency threshold. 1 keeps every word, which
    # is what the former value 0.1 amounted to.
    model = Doc2Vec(vector_size=vec_size,
                    min_count=1,
                    epochs=max_epochs)
    y_train, x_train, y_test, x_test, mapping_dict = open_data(size)
    tagged_data = feature_constructing(x_train, y_train)
    model.build_vocab(tagged_data)  # not part of the measured training time
    train_start = process_time()
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.epochs)
    train_stop = process_time()
    train_time = train_stop - train_start

    test_start = process_time()
    df = doc2vec_retrieval(y_train, y_test, x_test)
    test_stop = process_time()
    test_time = test_stop - test_start
    print(f'done {size}')
    df2 = pd.DataFrame({'embedding': 'Doc2Vec',
                        'train size': len(y_train),
                        'test size': len(y_test),
                        'training time': round(train_time, 2),
                        'inference time': round(test_time / len(y_test), 2),
                        'recall': round(len([1 for prec in df.precision if prec != 0])/len(y_test), 4),
                        'MAP': round(sum(df.average_precision)/len(y_test), 4)},
                        index=[i])
    MAP = pd.concat([MAP, df2])
    MAP.to_csv('MAP_new.csv', index=False)

# Universal Sentence Encoder (pre-trained Transformer) & Locality Sensitive Hashing

In [None]:
model = hub.load('https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3')

# Index the encoder output of every training text with Annoy, query the index
# with every test text and store the results in rows 35-41 of MAP.
use_sizes = (100, 500, 1000, 5000, 10000, 20000, 30000)
for i, size in enumerate(use_sizes, start=35):
    embedding_dim = 512  # length of the item vectors that will be indexed
    annoy_index = AnnoyIndex(embedding_dim, 'angular')
    y_train, x_train, y_test, x_test, mapping_dict = open_data(size)

    train_start = process_time()
    for idx, text in enumerate(x_train):
        embedding = model(text)[0]
        try:
            annoy_index.add_item(idx, embedding)
        except ValueError:
            # Vectors the index rejects are left out.
            continue
    train_time = process_time() - train_start

    annoy_index.build(100)  # 100 trees
    df = pd.DataFrame(columns=['precision', 'average_precision'])

    test_start = process_time()
    for idx, (query, films) in enumerate(zip(x_test, y_test)):
        embedding = model(query)[0]
        top_10 = annoy_index.get_nns_by_vector(embedding, 10)  # 10 nearest neighbors
        df = pd.concat([df, calc_map(films, top_10, 10, idx, y_train)])
    test_time = process_time() - test_start

    print(f'done {size}')
    n_queries = len(y_test)
    # Number of queries whose precision entry is non-zero.
    answered = len([1 for prec in df.precision if prec != 0])
    summary_row = pd.DataFrame(
        {
            'embedding': 'Universal Sentence Encoder',
            'train size': len(y_train),
            'test size': n_queries,
            'training time': round(train_time, 2),
            'inference time': round(test_time / n_queries, 2),
            'recall': round(answered / n_queries, 4),
            'MAP': round(sum(df.average_precision) / n_queries, 4),
        },
        index=[i],
    )
    MAP = pd.concat([MAP, summary_row])
    MAP.to_csv('MAP_new.csv', index=False)

# Hugging Face MiniLM (Multilingual Transformer Siamese BERT-Network)
*sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2*

* zero-shot
* domain adaptation does not work because of the difference between the train and test sets

In [None]:
def find_negatives(true_labels, predictions):
    """Pair every true title with each predicted title that differs from it.

    ``true_labels`` is a ';'-separated string of titles and ``predictions``
    holds positions into the module-level ``y_train``. Returns a list of
    ``(true_title, predicted_title)`` tuples, ordered by prediction and then
    by true title.
    """
    films = true_labels.split(';')
    return [
        (film, y_train[pred])
        for pred in predictions
        for film in films
        if y_train[pred] != film
    ]

In [None]:
def run_sbert(model_name, embedding_name, indices, sizes):
    """Evaluate one SentenceTransformer checkpoint on the given splits.

    For every ``(index, size)`` pair the raw training texts are encoded and
    stored in an Annoy index, the raw test texts are encoded and looked up,
    and one summary row labelled ``index`` is appended to the global ``MAP``
    table, which is also written to 'MAP_new.csv'.

    Args:
        model_name: Name or path handed to SentenceTransformer.
        embedding_name: Value written to the 'embedding' column.
        indices: Row labels, one per entry of ``sizes``.
        sizes: Split sizes passed to open_data.

    Returns:
        The updated ``MAP`` DataFrame.
    """
    global MAP
    encoder = SentenceTransformer(model_name)

    for i, size in zip(indices, sizes):
        embedding_dim = 384  # length of the item vectors that will be indexed
        annoy_index = AnnoyIndex(embedding_dim, 'angular')
        y_train, x_train, y_test, x_test, mapping_dict = open_data(size)
        # Replace every split text with the raw text it maps to.
        x_train = [mapping_dict[text] for text in x_train]
        x_test = [mapping_dict[text] for text in x_test]

        train_start = process_time()
        for idx, vector in enumerate(encoder.encode(x_train)):
            try:
                annoy_index.add_item(idx, vector)
            except ValueError:
                # Vectors the index rejects are left out.
                continue
        train_time = process_time() - train_start

        annoy_index.build(100)  # 100 trees
        df = pd.DataFrame(columns=['precision', 'average_precision'])

        test_start = process_time()
        for idx, (vector, films) in enumerate(zip(encoder.encode(x_test), y_test)):
            top_10 = annoy_index.get_nns_by_vector(vector, 10)  # 10 nearest neighbors
            df = pd.concat([df, calc_map(films, top_10, 10, idx, y_train)])
        test_time = process_time() - test_start

        print(f'done {size}')
        n_queries = len(y_test)
        # Number of queries whose precision entry is non-zero.
        answered = len([1 for prec in df.precision if prec != 0])
        summary_row = pd.DataFrame(
            {
                'embedding': embedding_name,
                'train size': len(y_train),
                'test size': n_queries,
                'training time': round(train_time, 2),
                'inference time': round(test_time / n_queries, 2),
                'recall': round(answered / n_queries, 4),
                'MAP': round(sum(df.average_precision) / n_queries, 4),
            },
            index=[i],
        )
        MAP = pd.concat([MAP, summary_row])
        MAP.to_csv('MAP_new.csv', index=False)
    return MAP

In [None]:
# Evaluate three SBert checkpoints on every split size. Each run fills the
# next seven rows of MAP: 42-48, 49-55 and 56-62.
sbert_sizes = (100, 500, 1000, 5000, 10000, 20000, 30000)
sbert_runs = (
    # zero-shot SBert
    ('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2',
     'Siamese BERT (MiniLM)'),
    # domain-adapted SBert (2 epochs for embeddings)
    ('../input/tsdae-model-epoch-2/output/tsdae-model_epoch_2',
     'Siamese BERT (MiniLM) domain adopted'),
    # SBert adapted to the sources (similarity between VK and Wiki texts)
    ('../input/minilm-domain-adoptation/adopted_model',
     'Siamese BERT (MiniLM) similarity'),
)
for run_number, (model_name, embedding_name) in enumerate(sbert_runs):
    first_row = 42 + len(sbert_sizes) * run_number
    MAP = run_sbert(model_name,
                    embedding_name,
                    tuple(range(first_row, first_row + len(sbert_sizes))),
                    sbert_sizes)