In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModelForPreTraining
import numpy
import pandas as pd
import pymorphy2
from razdel import sentenize
import nltk
import py7zr
from razdel import tokenize
from nltk import sent_tokenize
from tqdm import tqdm
from nltk.corpus import stopwords
import re

In [None]:
# Build an inverted index over the corpus: key is a lemma, value is a list of
# tuples (paragraph number, sentence number, word number), one per occurrence.
# The paragraph number is the row position among rows without missing values,
# counted continuously across all chunks.
vocabulary = {}
# Surface form -> first pymorphy2 parse, so each distinct token is analysed once.
cache = {}
# Tokens whose tag starts with one of these grammemes are not indexed.
banned_tags = {"NPRO", "PRED", "PREP","CONJ", "PRCL", "INTJ", "PNCT", "UNKN", "NUMB"}
analyzer = pymorphy2.MorphAnalyzer()
# `stopwords` imported from nltk.corpus is a corpus reader, not a collection of
# words, so membership must be tested against its word list. A set keeps the
# per-token lookup cheap. Requires the NLTK "stopwords" corpus to be installed.
russian_stopwords = set(stopwords.words('russian'))
idx = 0
chunksize = 10 ** 5
with pd.read_csv('news2017/1.csv', chunksize=chunksize) as reader:
    for chunk in reader:
        chunk = chunk.dropna()
        # The text to parse is taken from the second column; iterating the
        # column avoids materialising a whole row object for every record.
        for string_to_parse in tqdm(chunk.iloc[:, 1]):
            # Blank lines become sentence breaks, then remaining runs of
            # spaces, newlines and non-breaking spaces collapse to one space.
            string_to_parse = re.sub(' *\n\n *', '. ', string_to_parse)
            new_str = re.sub(' +|\n *\n| *\xa0| *\n', ' ', string_to_parse)
            for j, sent in enumerate(sentenize(new_str)):
                for k, word in enumerate(tokenize(sent.text.lower())):
                    w = word.text
                    w_syn = cache.get(w)
                    if w_syn is None:
                        w_syn = cache[w] = analyzer.parse(w)[0]
                    if str(w_syn.tag).split(',')[0] in banned_tags:
                        continue
                    normal_form = w_syn.normal_form
                    if normal_form in russian_stopwords:
                        continue
                    vocabulary.setdefault(normal_form, []).append((idx, j, k))
            idx += 1

In [None]:
# Restrict the corpus index to the lemmas of the words listed in the
# "Word1" / "Word2" columns of the WS353 file.
ws = pd.read_csv("WS353-russian-sim.txt")
words = set(ws["Word1"]) | set(ws["Word2"])
pm = pymorphy2.MorphAnalyzer()
# Lemma -> list of corpus positions copied from `vocabulary`. It has to be
# created here: nothing earlier in the notebook defines it.
ws353_words = {}
# Lemmas that never occur in the indexed corpus; they are reported instead of
# aborting the whole cell with a KeyError.
missing_words = []
for word in words:
    w = pm.parse(word.lower().strip())[0].normal_form
    if w in vocabulary:
        ws353_words[w] = vocabulary[w]
    else:
        missing_words.append(w)
if missing_words:
    print(f'{len(missing_words)} WS353 lemma(s) not found in the corpus: {sorted(missing_words)}')

In [None]:
import pickle

In [None]:
# Persist the WS353 position index so later cells can reload it from disk
# instead of re-indexing the corpus.
with open('ws353.pkl', 'wb') as pkl_out:
    pickle.dump(ws353_words, pkl_out)

In [None]:
# Reload the WS353 position index from disk.
# Caution: unpickling executes arbitrary code, so only load a file that this
# notebook (or another trusted source) produced.
with open('ws353.pkl', 'rb') as pkl_in:
    ws_353 = pickle.load(pkl_in)

In [None]:
# Load the whole corpus and drop rows with missing values, so that positional
# lookups by the stored paragraph number hit the same rows that were counted
# while the index was being built.
df = pd.read_csv("news2017/1.csv").dropna()

In [None]:
# Tokenizer and model are loaded from the same checkpoint name so that their
# vocabularies agree.
checkpoint = "cointegrated/rubert-tiny"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForPreTraining.from_pretrained(checkpoint)

In [None]:
def cosine_sim(x, y):
    """Return the cosine similarity of ``x`` and ``y``.

    The product ``x @ y`` is divided by the product of the two norms computed
    with ``numpy.linalg.norm``. There is no guard against a zero norm.
    """
    inner_product = x @ y
    norm_product = numpy.linalg.norm(x) * numpy.linalg.norm(y)
    return inner_product / norm_product

In [None]:
# Word -> averaged vector; filled in by the vector-building loop.
vectors = dict()

In [None]:
# vectors - dictionary of static vectors: for every WS353 lemma, the model
# output is averaged over the word's tokens in each sampled context, and then
# averaged over the contexts.
# NOTE(review): with AutoModelForPreTraining the first model output is
# presumably the masked-LM prediction logits rather than hidden states -
# confirm that this is the representation intended here.
n_contexts = 2000
for i, (key, value) in enumerate(ws_353.items(), start=1):
    print(f'Iteration {i} / {len(ws_353)}')
    vector = []
    n_skipped = 0
    # Only the first n_contexts occurrences are attempted; slicing the list up
    # front also lets tqdm show the real number of iterations.
    for idx, n_sent, w_idx in tqdm(value[:n_contexts]):
        # Repeat exactly the normalisation used while indexing, so that
        # (n_sent, w_idx) still address the same sentence and token.
        string_to_parse = df.iloc[idx].values[1]
        string_to_parse = re.sub(' *\n\n *', '. ', string_to_parse)
        new_str = re.sub(' +|\n *\n| *\xa0| *\n', ' ', string_to_parse)
        sent = list(sentenize(new_str))[n_sent]
        d = tokenizer(sent.text, return_offsets_mapping=True, return_tensors='pt')
        # The model does not accept offsets, so they are removed from the
        # encoding before the forward pass.
        offset_mapping = d.pop('offset_mapping').detach().numpy()[0]
        try:
            sent_vect = model(**d)[0].detach().numpy()[0]
            word = list(tokenize(sent.text.lower()))[w_idx]
        except Exception:
            # Best effort: a context the model or tokenizer cannot handle is
            # skipped, but KeyboardInterrupt is no longer swallowed.
            n_skipped += 1
            continue
        # Sub-word tokens whose character span overlaps the target word.
        # Empty spans are ignored (special tokens are expected to carry (0, 0)
        # offsets - TODO confirm for this tokenizer).
        token_positions = [
            pos
            for pos, (tok_start, tok_stop) in enumerate(offset_mapping)
            if tok_stop > tok_start and tok_start < word.stop and tok_stop > word.start
        ]
        if not token_positions:
            # No token lines up with the word: skip instead of averaging an
            # unrelated token.
            n_skipped += 1
            continue
        vector.append(sent_vect[token_positions].mean(axis=0))
    if n_skipped:
        print(f'{key}: skipped {n_skipped} context(s)')
    if vector:
        vectors[key] = numpy.mean(vector, axis=0)
    else:
        # Keep the key present, as before, but make the missing value explicit
        # instead of relying on numpy's "mean of empty slice" warning.
        print(f'{key}: no usable contexts, storing NaN')
        vectors[key] = numpy.nan