In [None]:
# Root folder (relative path) under which the datasets, pretrained models and caches used below live.
RESOURCES_PATH = '../../resources'

In [None]:
from pathlib import Path
import re
import pickle
import numpy as np
import pandas as pd
import fasttext

In [None]:
# Read both versions of the budget dataset (tab-separated) and replace
# missing cells with empty strings right away instead of mutating in place.
cleared_df = pd.read_csv(f'{RESOURCES_PATH}/dataset/budget/cleared.tsv', sep='\t').fillna('')
orig_df = pd.read_csv(f'{RESOURCES_PATH}/dataset/budget/original.tsv', sep='\t').fillna('')

cleared_df.head()

In [None]:
def clear_text(text):
    """Normalize a phrase before it is tokenized and embedded.

    Steps, in order:
      1. lower-case the text;
      2. turn every run of whitespace (tabs, newlines, non-breaking spaces, ...)
         into a single plain space so neighbouring words are not glued together;
      3. drop every character that is not a Latin letter ``a-z``, a Cyrillic
         letter ``а-я`` / ``ё``, an ASCII digit or a space;
      4. collapse repeated spaces and strip the ends.

    Args:
        text: the raw phrase (``str``).

    Returns:
        The normalized phrase; an empty string if nothing survives the cleanup.
    """
    lower_cased = text.lower()
    # For str patterns `\s` is Unicode-aware, so this also covers U+00A0 and friends.
    with_plain_spaces = re.sub(r"\s+", ' ', lower_cased)
    # 'ё' (U+0451) sits outside the contiguous а-я range (U+0430-U+044F),
    # so it has to be listed explicitly or it would be stripped from words.
    without_special_chars = re.sub(r"[^a-zа-яё0-9 ]", '', with_plain_spaces)
    # Removing punctuation that stood between spaces can leave several spaces in a row.
    without_excess_spaces = re.sub(r" {2,}", ' ', without_special_chars)
    return without_excess_spaces.strip()


clear_text('Hello World, A4 "Привет мир": 8394! » | ¶ 42')

In [None]:
def _pooled_unique(column):
    """Distinct values of `column` found in either the cleared or the original frame."""
    return list(set(cleared_df[column].unique()).union(orig_df[column].unique()))


objects = _pooled_unique('object')
projects = _pooled_unique('project')

len(objects), len(projects)

## Embed phrases

In [None]:
# Create the cache folder (and any missing parents); do nothing if it already exists.
Path(f'{RESOURCES_PATH}/cache/budget').mkdir(parents=True, exist_ok=True)

In [None]:
def get_embedding_map(to_vector_fn):
    """Embed every known object and project phrase with `to_vector_fn`.

    Reads the module-level `objects` and `projects` lists. Each phrase is
    cleaned with `clear_text` before being passed to `to_vector_fn`, but the
    returned dictionaries are keyed by the raw, uncleaned phrase.

    Args:
        to_vector_fn: callable taking a cleaned phrase and returning its embedding.

    Returns:
        ``{'object': {phrase: embedding}, 'project': {phrase: embedding}}``.
        The ``'project'`` mapping always contains the empty phrase ``''``; it
        defaults to an empty array unless `projects` itself contains ``''``.
    """
    def embed(phrase):
        return to_vector_fn(clear_text(phrase))

    # Seed the empty project first so it is present even if `projects` lacks it.
    project_vectors = {'': np.array([])}
    object_vectors = {obj: embed(obj) for obj in objects}
    project_vectors.update((project, embed(project)) for project in projects)

    return {'object': object_vectors, 'project': project_vectors}

### fastText

In [None]:
# Load the pretrained fastText binary model; to_fasttext_vector reads it through the global `ft_model`.
ft_model = fasttext.load_model(f'{RESOURCES_PATH}/pretrained/dp-fasttext.bin')

In [None]:
def to_fasttext_vector(phrase):
    """Embed a phrase word by word with the global fastText model.

    The phrase is split on whitespace and every word is looked up with
    `ft_model.get_word_vector`.

    Args:
        phrase: the (already cleaned) phrase to embed.

    Returns:
        A numpy array built from the per-word vectors, in word order.
        A phrase without any words gives an empty array.
    """
    word_vectors = [ft_model.get_word_vector(word) for word in phrase.split()]
    return np.array(word_vectors)


to_fasttext_vector('привет мир').shape

In [None]:
# Embed every object and project phrase with fastText; the map is keyed by the raw phrases.
ft_embedding_map = get_embedding_map(to_fasttext_vector)

In [None]:
# Persist the embedding map so it can be reloaded later without re-embedding.
with open(f'{RESOURCES_PATH}/cache/budget/fasttext_embedding_map.pkl', 'wb') as cache_file:
    pickle.dump(ft_embedding_map, cache_file, protocol=pickle.HIGHEST_PROTOCOL)

## Choose optimal max embedding length

### fastText

In [None]:
# NOTE: pickle.load can execute arbitrary code; only load cache files produced by this notebook.
with open(f'{RESOURCES_PATH}/cache/budget/fasttext_embedding_map.pkl', 'rb') as cache_file:
    ft_embedding_map = pickle.load(cache_file)


def _length_quantiles(embeddings):
    """Quantiles of `len(embedding)` over all values of the `embeddings` dict."""
    lengths = pd.Series([len(vectors) for vectors in embeddings.values()])
    return lengths.quantile([.5, .9, .95, .99, .999, 1])


print(f'''
fastText length quantile:

Objects:
{_length_quantiles(ft_embedding_map['object'])}

Description:
{_length_quantiles(ft_embedding_map['project'])}

=> phrase length isn't too long so choose 15 to cover future cases
''')