In [1]:
import os
import re
import json
import jsonlines as jl
import joblib
from pathlib import Path
import itertools
from collections import defaultdict

import pickle

import numpy as np
from numpy.linalg import norm

from tqdm import tqdm

import torch
from torch import nn

from keras.preprocessing.text import Tokenizer

  from .autonotebook import tqdm as notebook_tqdm
2022-10-18 11:45:31.543832: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-10-18 11:45:31.543865: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


# Useful Constants

In [16]:
#################
# Paths and files
#################

# Data paths
INPUT_PATH = Path("..")/"input"
TEST_TRIPLES = Path(INPUT_PATH)/"test_triples.jsonl"

# Model paths
OUTPUT_PATH = Path("..")/"result"
FASTTEXT = "./ft_all_phr_2022_30_03_top_100000.pkl"
FIT_TOKENIZER = Path(OUTPUT_PATH)/"ft_tokenizer.pkl"
MODEL = Path(OUTPUT_PATH)/"triplet_loss_model.sav"

###########
# Сonstants
###########

# Data fields
ANCHOR = "anchor"
POSITIVE = "positive"
NEGATIVE = "negative"
TYPE2IDX = dict(anchor=0, positive=1, negative=2)

# Devices and threads
GPU_NUM = torch.cuda.device_count()
GPU_IDS = [f'cuda:{_id}' for _id in range(GPU_NUM)]

# Model parameters
VOCAB_SIZE = 20000
EMBEDDING_SIZE = 300
LSTM_SIZE = 128
LSTM_NUM_LAYERS = 3
OUT_EMBEDDING_SIZE = 768
MAX_SEQ_LEN = 32

ANCHOR_NUM = 500

# Loading Data

In [3]:
def load_triples(_file):
    with jl.open(_file, mode="r") as infile:
        triple_lst = [item for item in infile]
    return triple_lst

In [4]:
with open(FIT_TOKENIZER, "rb") as infile:
    TOKENIZER = pickle.load(infile)

In [5]:
test_triples = load_triples(TEST_TRIPLES)

In [6]:
def fetch_unique_texts(_file):
    with jl.open(_file, mode="r") as infile:
        unique_texts = [{item[ANCHOR], item[POSITIVE], item[NEGATIVE]} for item in infile]
        unique_texts = list(set().union(*unique_texts))
    return unique_texts

In [7]:
# Texts from the test corpus
unique_texts = fetch_unique_texts(TEST_TRIPLES)
print("\n\n".join(unique_texts[:5]))

Итоги радиоспортивной олимпиады "Первенство России среди молодежи 2007 года".

И я хотел для себя секретное место.

Через несколько секунд происходит третья стадия высвобождения, и прозрачное водяное вещество, которое является секретом простаты, выпускается снова.

Публичное администрирование – политические науки» (программа двойных дипломов).

Министерством обороны было принято решение об организации серийного производства автомата Калашникова на Ижевском машиностроительном заводе, к тому времени более 140 лет выпускавшем стрелковое оружие.


In [8]:
text2idx = {
    text: str(_id + 1) for _id, text in enumerate(unique_texts)
}

idx2text = {
    _id: text for text, _id in text2idx.items()
}

# Model Artifacts

In [9]:
# Utilities

class _dict(dict):
    __getattr__ = dict.get
    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__
    
    def __getstate__(self):
        return self.__dict__
    
    def __setstate__(self, d):
        return self.__dict__.update(d)

In [10]:
class TripletLossModel(nn.Module):
    def __init__(self, criterion=None,
                 init_weights=None,
                 emb_size=EMBEDDING_SIZE, vocab_size=VOCAB_SIZE,
                 lstm_num_layers=LSTM_NUM_LAYERS,
                 lstm_hidden_size=LSTM_SIZE, is_bi=True,
                 out_emb_size=OUT_EMBEDDING_SIZE,
                 batch_first=True, dropout=0.2, norm_eps=1e-12):
        super(TripletLossModel, self).__init__()
        
        # Parameters
        self.emb_size = emb_size
        self.vocab_size = vocab_size
        self.lstm_num_layers = lstm_num_layers
        self.lstm_hidden_size = lstm_hidden_size
        self.directions = int(is_bi) + 1
        self.out_emb_size = out_emb_size
        
        # Layers
        self.embedding = nn.Embedding(self.vocab_size + 1, self.emb_size)
        self.lstm = nn.LSTM(input_size=self.emb_size, hidden_size=self.lstm_hidden_size,
                            num_layers=self.lstm_num_layers, bidirectional=is_bi,
                            batch_first=batch_first, dropout=dropout)
        self.cosine_similarity = nn.CosineSimilarity(dim=1, eps=1e-6)
        
        self.best = defaultdict(int)
        
    def forward(self, _input):
        anchor_input = self.fetch_triplet_item(_input, ANCHOR)
        positive_input = self.fetch_triplet_item(_input, POSITIVE)
        negative_input = self.fetch_triplet_item(_input, NEGATIVE)
        
        encoded_anchor = self.encode(anchor_input)
        encoded_positive = self.encode(positive_input)
        encoded_negative = self.encode(negative_input)
        
        cosine_pos = self.cos(encoded_anchor, encoded_positive)
        cosine_neg = self.cos(encoded_anchor, encoded_negative)
        
        return _dict(anchor=encoded_anchor,
                    positive=encoded_positive,
                    negative=encoded_negative,
                    cos_sim=(cosine_pos, cosine_neg))
        
    def encode(self, tokenized_seq, _const=0.5):
        encoded_seq = self.embedding(tokenized_seq)
        encoded_seq, _ = self.lstm(encoded_seq)
        encoded_seq = encoded_seq[:, -1, :] * _const
        return encoded_seq
        
    @staticmethod
    def fetch_triplet_item(_input, item_type):
        if TYPE2IDX.get(item_type) is None:
            raise Exception(f"No item type {item_type}")
        return _input[:, TYPE2IDX[item_type], :]
    
    def cos(self, u, v):
        return self.cosine_similarity(u, v)

# MRR Estimation

In [11]:
_model = TripletLossModel()
_model.load_state_dict(torch.load(MODEL, map_location=torch.device("cpu")))

<All keys matched successfully>

In [12]:
def encode_arbitrary_texts(texts):
    indexed_data = TOKENIZER.texts_to_sequences(texts)
    max_seq_len = MAX_SEQ_LEN
        
    # Filling the padded matrix
    _size = len(texts)
    out_matrix = torch.zeros((_size, max_seq_len), dtype=torch.int64)
    for idx in range(_size):
        bound = min(len(indexed_data[idx]), max_seq_len)
        out_matrix[idx, :bound] = torch.from_numpy(
            np.array(indexed_data[idx][:bound]))
        
    text_embeddings = _model.encode(out_matrix)
    return text_embeddings

In [13]:
idx2embedding = {
    _id: encode_arbitrary_texts([text]) for _id, text in tqdm(idx2text.items(), total=282930)
}

100%|██████████| 282930/282930 [1:48:42<00:00, 43.38it/s]  


In [17]:
ranks = []

for item in tqdm(test_triples[:ANCHOR_NUM], total=ANCHOR_NUM):
    anchor_id = text2idx[item[ANCHOR]]
    anchor_embedding = idx2embedding[anchor_id]
    idx2cos = {
        sample_id: _model.cos(anchor_embedding, sample_embedding).item()
        for sample_id, sample_embedding in idx2embedding.items() if sample_id != anchor_id
    }
    idx2cos = {_id: _cos for _id, _cos in sorted(list(idx2cos.items()), key=lambda _: _[1], reverse=True)}
    sampled_ids = list(idx2cos.keys())[:20]
    
    positive_id = text2idx[item[POSITIVE]]
    if positive_id not in sampled_ids:
        ranks.append(0.0)
    else:
        ranks.append(1 / (sampled_ids.index(positive_id) + 1))
        
print("MRR@20 on test:", sum(ranks) / len(ranks))

100%|██████████| 500/500 [1:38:01<00:00, 11.76s/it]

MRR@20 on test: 0.449986342536033



