In [None]:
# Root directory holding the datasets, pretrained models and embedding caches used below.
# It is a relative path, so it is resolved against the kernel's working directory.
RESOURCES_PATH = '../../resources'

In [None]:
from pathlib import Path
import re
import pickle
import numpy as np
import pandas as pd
import fasttext
from transformers import BertModel, BertTokenizer
import torch

In [None]:
# Read the three turnover splits. Missing cells are replaced with empty strings
# right after loading, so every text column can be handled as plain `str` later on.
cleared_train_df = pd.read_csv(
    f'{RESOURCES_PATH}/dataset/turnover/cleared_train.tsv', sep='\t'
).fillna('')
cleared_test_df = pd.read_csv(
    f'{RESOURCES_PATH}/dataset/turnover/cleared_test.tsv', sep='\t'
).fillna('')
original_test_df = pd.read_csv(
    f'{RESOURCES_PATH}/dataset/turnover/original_test.tsv', sep='\t'
).fillna('')

cleared_train_df.head()

In [None]:
def clear_text(text):
    """Normalize a phrase before it is embedded.

    The text is lower-cased, every character that is not a Latin letter,
    a Russian Cyrillic letter (including 'ё'), a digit or a space is removed,
    runs of spaces are collapsed to a single space and the result is stripped.

    Args:
        text: The raw phrase (``str``).

    Returns:
        The normalized phrase; an empty string if nothing survives the filter.
    """
    lower_cased = text.lower()
    # 'ё' (U+0451) lies outside the contiguous а-я range (U+0430-U+044F), so it
    # must be listed explicitly; otherwise it is silently deleted from words.
    # The input is already lower-cased, so upper-case ranges are not needed.
    without_special_chars = re.sub(r"[^a-zа-яё0-9 ]", '', lower_cased)
    # Removing characters can leave several consecutive spaces behind.
    without_excess_spaces = re.sub(r" {2,}", ' ', without_special_chars)
    stripped = without_excess_spaces.strip()
    return stripped

clear_text('Hello World, A4 "Привет мир": 8394! » | ¶ 42')

In [None]:
# Distinct nomenclature strings across all three splits, as a list so they can be indexed.
nomenclatures = list(
    set().union(
        *(
            split.nomenclature.unique()
            for split in (cleared_train_df, cleared_test_df, original_test_df)
        )
    )
)

len(nomenclatures)

In [None]:
# Collect every distinct description across the three splits.
descriptions = set().union(
    cleared_train_df.description.unique(),
    cleared_test_df.description.unique(),
    original_test_df.description.unique(),
)

# The empty description gets a fixed placeholder in `get_embedding_map`, so it is
# left out of the list to embed. `discard` (rather than `remove`) keeps this cell
# from raising KeyError when none of the splits contains an empty description.
descriptions.discard('')

descriptions = list(descriptions)

len(descriptions)

In [None]:
# Make sure the cache directory exists before the embedding maps are pickled into it.
# `exist_ok=True` makes the cell safe to re-run.
Path(f'{RESOURCES_PATH}/cache').mkdir(parents=True, exist_ok=True)

## Embed phrases

In [None]:
def get_embedding_map(to_vector_fn):
    """Embed every known nomenclature and description with ``to_vector_fn``.

    Reads the module-level ``nomenclatures`` and ``descriptions`` lists. Each
    phrase is normalized with ``clear_text`` before being embedded, but the
    ORIGINAL (un-normalized) phrase is used as the dictionary key.

    Args:
        to_vector_fn: Callable taking a normalized phrase and returning its embedding.

    Returns:
        ``{'nomenclature': {phrase: embedding}, 'description': {phrase: embedding}}``.
        The description map always contains ``''`` mapped to an empty array.
    """
    nomenclature_map = {
        phrase: to_vector_fn(clear_text(phrase)) for phrase in nomenclatures
    }

    print("Nomenclature embedding is complete")

    # The empty description is never passed to the embedder; it gets a fixed placeholder.
    description_map = {'': np.array([])}
    for i, phrase in enumerate(descriptions):
        # Report progress every 1000 phrases.
        if i % 1000 == 0:
            print(f"Description embedded {round(i / len(descriptions) * 100)}%")
        description_map[phrase] = to_vector_fn(clear_text(phrase))

    print("Description embedding is complete")

    return {
        'nomenclature': nomenclature_map,
        'description': description_map,
    }

### fastText

In [None]:
# Load the pretrained fastText binary; `to_fasttext_vector` looks up word vectors through this global.
ft_model = fasttext.load_model(f'{RESOURCES_PATH}/pretrained/dp-fasttext.bin')

In [None]:
def to_fasttext_vector(phrase):
    """Embed a phrase word by word with the loaded fastText model.

    The phrase is split on whitespace and every word is looked up with
    ``ft_model.get_word_vector``.

    Args:
        phrase: The (already normalized) phrase to embed.

    Returns:
        A NumPy array built from the list of per-word vectors, one entry per
        word; an empty array when the phrase contains no words.
    """
    word_vectors = [ft_model.get_word_vector(word) for word in phrase.split()]
    return np.array(word_vectors)

to_fasttext_vector('Привет мир').shape

In [None]:
# Embed every nomenclature/description with fastText, then persist the map to the cache directory.
ft_embedding_map = get_embedding_map(to_fasttext_vector)

with open(f'{RESOURCES_PATH}/cache/fasttext_embedding_map.pkl', 'wb') as cache_file:
    pickle.dump(ft_embedding_map, cache_file, protocol=pickle.HIGHEST_PROTOCOL)

### BERT

In [None]:
# Tokenizer and encoder are both loaded from the same local `rubert` checkpoint directory.
rubert_dir = f'{RESOURCES_PATH}/pretrained/rubert'
bert_tokenizer = BertTokenizer.from_pretrained(rubert_dir)
bert_model = BertModel.from_pretrained(rubert_dir)

In [None]:
def to_bert_vector(phrase):
    """Embed a phrase with the loaded BERT model.

    The phrase is encoded with ``bert_tokenizer`` into a batch of one sequence
    and passed through ``bert_model``.

    Args:
        phrase: The (already normalized) phrase to embed.

    Returns:
        A NumPy array holding the first model output for the single batch item
        (per the transformers docs this is the last-layer hidden state, one row
        per token position).
    """
    # NOTE(review): the encoded sequence is not truncated here; phrases longer than
    # the model's maximum sequence length would presumably fail - confirm inputs are short.
    input_ids = torch.tensor([bert_tokenizer.encode(phrase)])
    # Inference only: without no_grad an autograd graph is built for every phrase,
    # which wastes memory and time when embedding the whole vocabulary.
    with torch.no_grad():
        # Index the output instead of unpacking it into exactly two names: indexing
        # works for plain tuple returns and for `ModelOutput` objects alike, and
        # does not depend on how many fields the model returns.
        hidden_states = bert_model(input_ids)[0]
    # Drop the batch dimension (batch size is always 1 here).
    return hidden_states.numpy()[0]

to_bert_vector('Привет мир').shape

In [None]:
# Embed every nomenclature/description with BERT, then persist the map to the cache directory.
bert_embedding_map = get_embedding_map(to_bert_vector)

with open(f'{RESOURCES_PATH}/cache/bert_embedding_map.pkl', 'wb') as cache_file:
    pickle.dump(bert_embedding_map, cache_file, protocol=pickle.HIGHEST_PROTOCOL)

## Choose optimal max embedding length

### fastText

In [None]:
# Reload the fastText embedding map from the cache file written by the fastText embedding cell.
# NOTE(review): pickle.load can execute arbitrary code - only load cache files produced by this notebook.
with open(f'{RESOURCES_PATH}/cache/fasttext_embedding_map.pkl', 'rb') as fin:
    ft_embedding_map = pickle.load(fin)

# len() of each stored array is its number of entries along the first axis, i.e. the
# number of words in the phrase as produced by `to_fasttext_vector` (the '' description
# placeholder counts as length 0). The quantiles show how long the phrases get.
# The "optimal max length" figures at the bottom are hard-coded text, not computed here.
print(f'''
fastText length quantile:

Nomenclature:
{pd.Series([len(ft_embedding_map['nomenclature'][k]) for k in ft_embedding_map['nomenclature']]).quantile([.5, .9, .95, .99, .999])}

Description:
{pd.Series([len(ft_embedding_map['description'][k]) for k in ft_embedding_map['description']]).quantile([.5, .9, .95, .99, .999])}

=> fastText optimal max length:
    nomenclature: 17
    description: 30
''')

In [None]:
# Reload the BERT embedding map from the cache file written by the BERT embedding cell.
# NOTE(review): pickle.load can execute arbitrary code - only load cache files produced by this notebook.
with open(f'{RESOURCES_PATH}/cache/bert_embedding_map.pkl', 'rb') as fin:
    bert_embedding_map = pickle.load(fin)

# len() of each stored array is the size of its first axis - presumably the number of
# BERT token positions, including any special tokens the tokenizer adds (TODO confirm).
# The '' description placeholder counts as length 0.
# The "optimal max length" figures at the bottom are hard-coded text, not computed here.
print(f'''
BERT length quantile:

Nomenclature:
{pd.Series([len(bert_embedding_map['nomenclature'][k]) for k in bert_embedding_map['nomenclature']]).quantile([.5, .9, .95, .99, .999])}

Description:
{pd.Series([len(bert_embedding_map['description'][k]) for k in bert_embedding_map['description']]).quantile([.5, .9, .95, .99, .999])}

=> BERT optimal max length:
    nomenclature: 23
    description: 45
''')