In [None]:
from pathlib import Path
import re
import pickle
import numpy as np
import pandas as pd
import fasttext
from transformers import BertModel, BertTokenizer
import torch

In [None]:
def _read_tsv_without_nans(tsv_path):
    """Read a tab-separated file and replace every missing value with ''."""
    frame = pd.read_csv(tsv_path, sep='\t')
    return frame.fillna('')


# Load the three turnover splits; missing cells become empty strings.
cleared_train_df = _read_tsv_without_nans('../../resources/dataset/turnover/cleared_train.tsv')
cleared_test_df = _read_tsv_without_nans('../../resources/dataset/turnover/cleared_test.tsv')
original_test_df = _read_tsv_without_nans('../../resources/dataset/turnover/original_test.tsv')

cleared_train_df.head()

In [None]:
def clear_text(text):
    """Normalize a phrase for use as an embedding lookup key.

    The text is lower-cased, every character that is not a Latin letter,
    a Russian Cyrillic letter (including 'ё'), a digit or a space is deleted,
    runs of spaces are collapsed to a single space and the result is stripped.

    Args:
        text: Raw phrase (str).

    Returns:
        The normalized phrase; may be an empty string.
    """
    lower_cased = text.lower()
    # 'ё' (U+0451) lies outside the contiguous а-я range (U+0430-U+044F), so it
    # has to be listed explicitly or it would be removed from Russian words.
    # The text is already lower-cased, so upper-case ranges are not needed.
    without_special_chars = re.sub(r"[^a-zа-яё0-9 ]", '', lower_cased)
    without_excess_spaces = re.sub(r" {2,}", ' ', without_special_chars)
    return without_excess_spaces.strip()

clear_text('Hello World, A4 "Привет мир": 8394! » | ¶ 42')

In [None]:
# Gather every distinct raw phrase from the nomenclature and description
# columns of all three data frames.
raw_phrases = set()
for frame in (cleared_train_df, cleared_test_df, original_test_df):
    raw_phrases.update(frame.nomenclature.unique())
    raw_phrases.update(frame.description.unique())

# Different raw phrases can clean to the same text (including ''), so the
# de-duplication must happen after cleaning. discard() drops the empty phrase
# without raising when it is absent.
cleaned_phrases = {clear_text(raw_phrase) for raw_phrase in raw_phrases}
cleaned_phrases.discard('')

# Sorted so that the phrase order is reproducible between kernel restarts.
phrases = sorted(cleaned_phrases)

len(phrases)

In [None]:
# Make sure the cache directory exists before any embedding map is written to it.
cache_dir = Path("../../resources/cache")
cache_dir.mkdir(parents=True, exist_ok=True)

## Embed phrases

In [None]:
def get_embedding_map(to_vector_fn, phrase_list=None, embedding_dim=None):
    """Build a phrase -> embedding mapping.

    Args:
        to_vector_fn: Callable taking a phrase (str) and returning its
            embedding, expected to be a 2-D array-like of shape
            (tokens, embedding_dim).
        phrase_list: Phrases to embed. Defaults to the notebook-level
            ``phrases`` list.
        embedding_dim: Width of the zero placeholder stored under the empty
            phrase ''. When omitted it is inferred from the first non-empty
            2-D embedding produced, falling back to 300 if none is available.

    Returns:
        dict mapping every non-empty phrase to ``to_vector_fn(phrase)`` and
        '' to a zero array of shape (1, embedding_dim).
    """
    if phrase_list is None:
        phrase_list = phrases

    total = len(phrase_list)
    # Reserve the '' key first so it stays the first entry of the mapping.
    result = {'': None}

    for i, phrase in enumerate(phrase_list):
        if i % 1000 == 0:
            print(f"Embedded {round(i / total * 100)}%")

        # The empty phrase is represented by the zero placeholder below;
        # embedding it would overwrite that placeholder.
        if phrase == '':
            continue

        vector = to_vector_fn(phrase)
        result[phrase] = vector

        if embedding_dim is None:
            shape = np.shape(vector)
            if len(shape) == 2 and shape[-1] > 0:
                embedding_dim = shape[-1]

    if embedding_dim is None:
        embedding_dim = 300  # historical default of this notebook

    result[''] = np.array([np.zeros(embedding_dim)])

    print("Completed")

    return result

### fastText

In [None]:
# Load the pretrained fastText binary model used for word-level embeddings.
ft_model_path = "../../resources/embedding/dp-fasttext.bin"
ft_model = fasttext.load_model(ft_model_path)

In [None]:
def to_fasttext_vector(phrase):
    """Embed a phrase as one fastText word vector per whitespace-separated token.

    Returns a numpy array built from the per-token vectors, in token order.
    """
    token_vectors = [ft_model.get_word_vector(token) for token in phrase.split()]
    return np.array(token_vectors)

to_fasttext_vector('Привет мир').shape

In [None]:
# Embed every phrase with fastText and cache the resulting map on disk.
ft_embedding_map = get_embedding_map(to_fasttext_vector)

ft_cache_path = Path('../../resources/cache/fasttext_embedding_map.pkl')
with ft_cache_path.open('wb') as cache_file:
    pickle.dump(ft_embedding_map, cache_file, protocol=pickle.HIGHEST_PROTOCOL)

### BERT

In [None]:
# Tokenizer and model are loaded from the same local pretrained directory.
rubert_dir = '../../resources/embedding/rubert'
bert_tokenizer = BertTokenizer.from_pretrained(rubert_dir)
bert_model = BertModel.from_pretrained(rubert_dir)

In [None]:
# TODO: Vectorize

def to_bert_vector(phrase):
    """Embed a phrase as one BERT vector per token.

    The phrase is tokenized without special tokens and passed through
    ``bert_model`` as a batch of one.

    Args:
        phrase: Text to embed (str).

    Returns:
        numpy array with one row per token, taken from the first element of
        the model output for the single batch item. A phrase that yields no
        tokens returns an empty (0, hidden_size) float32 array.
    """
    token_ids = bert_tokenizer.encode(phrase, add_special_tokens=False)
    if not token_ids:
        # torch.tensor([[]]) would be an empty float tensor, which cannot be
        # used as input ids, so return an empty embedding instead.
        return np.zeros((0, bert_model.config.hidden_size), dtype=np.float32)

    input_ids = torch.tensor([token_ids])
    # Inference only: skip building the autograd graph for every phrase.
    with torch.no_grad():
        outputs = bert_model(input_ids)
    # Index access works for both the legacy tuple output and the newer
    # ModelOutput object, whereas tuple-unpacking a ModelOutput yields its keys.
    return outputs[0][0].numpy()

to_bert_vector('Привет мир').shape

In [None]:
# Embed every phrase with BERT and cache the resulting map on disk.
bert_embedding_map = get_embedding_map(to_bert_vector)

bert_cache_path = Path('../../resources/cache/bert_embedding_map.pkl')
with bert_cache_path.open('wb') as cache_file:
    pickle.dump(bert_embedding_map, cache_file, protocol=pickle.HIGHEST_PROTOCOL)

## Choose optimal max embedding length

### fastText

In [None]:
# Reload the cached fastText map (a pickle written by this notebook; never
# unpickle files from untrusted sources).
with Path('../../resources/cache/fasttext_embedding_map.pkl').open('rb') as cache_file:
    ft_embedding_map = pickle.load(cache_file)

# Number of embedding rows (tokens) per phrase.
ft_phrase_lengths = [len(ft_embedding_map[p]) for p in phrases]
ft_embedding_len_df = pd.DataFrame({'phrase': phrases, 'len': ft_phrase_lengths})
ft_len_quantiles = ft_embedding_len_df['len'].quantile([.5, .9, .95, .99, .999])

print(f'''
fastText length quantile:

{ft_len_quantiles}

=> fastText optimal max length is 30
''')

In [None]:
# Reload the cached BERT map (a pickle written by this notebook; never
# unpickle files from untrusted sources).
with Path('../../resources/cache/bert_embedding_map.pkl').open('rb') as cache_file:
    bert_embedding_map = pickle.load(cache_file)

# Number of embedding rows (tokens) per phrase.
bert_phrase_lengths = [len(bert_embedding_map[p]) for p in phrases]
bert_embedding_len_df = pd.DataFrame({'phrase': phrases, 'len': bert_phrase_lengths})
bert_len_quantiles = bert_embedding_len_df['len'].quantile([.5, .9, .95, .99, .999])

print(f'''
BERT length quantile:

{bert_len_quantiles}

=> BERT optimal max length is 40
''')