In [6]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import numpy as np
import re
import pickle
import bz2
import json
from gensim.models import FastText
from pyemd import emd
import pymorphy2
morph = pymorphy2.MorphAnalyzer()

In [32]:
import fasttext
import gensim

In [29]:
# Directory that holds the downloaded embedding files (note the trailing slash).
EMB_PATH = '/home/kuptservol/.deeppavlov/downloads/embeddings/'

In [30]:
! ls {EMB_PATH}

dstc2_fastText_model.bin
ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize.bin
ft_native_300_ru_wiki_lenta_nltk_word_tokenize-char.vec
ft_native_300_ru_wiki_lenta_nltk_word_tokenize.vec
wiki.en.bin


In [20]:
# Load the native fastText binary model from the embeddings directory.
# EMB_PATH has to be interpolated with braces: without them the f-string is
# the literal relative path 'EMB_PATH/...' and the file is not found.
model = fasttext.load_model(f'{EMB_PATH}/ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize.bin')



In [34]:
# Load the word2vec-format text (.vec) embeddings as gensim KeyedVectors.
# This rebinds `model`. EMB_PATH already ends with '/', so the resulting path
# contains a double slash.
model = gensim.models.KeyedVectors.load_word2vec_format(f'{EMB_PATH}/ft_native_300_ru_wiki_lenta_nltk_word_tokenize.vec')

In [36]:
# `vectors` is the word-vector lookup used by every function below.
# A full gensim model exposes its vectors as `.wv`, but a KeyedVectors object
# (what load_word2vec_format returns) *is* the vectors: its `.wv` alias is
# deprecated in gensim 3.x and no longer exists in gensim 4. Accept both.
vectors = getattr(model, 'wv', model)

In [37]:
# Sanity check of the loaded embeddings: nearest neighbours of one word.
vectors.most_similar('надежда')

[('надежды…', 0.7496052384376526),
 ('надежду…', 0.7383682727813721),
 ('надежды', 0.6953627467155457),
 ('надежду', 0.6936578154563904),
 ('надеждой', 0.6807190179824829),
 ('надеждах', 0.6601127982139587),
 ('надежд', 0.6483222246170044),
 ('вера', 0.641467273235321),
 ('уверенность', 0.613905131816864),
 ('радость', 0.613206684589386)]

In [22]:
# Read the bz2-compressed JSON accent dictionary into `accents_dict`
# (looked up by word in accent_syllable).
with bz2.open('words_accent.json.bz2', 'rb') as accents_file:
    accents_dict = json.loads(accents_file.read())

In [23]:
def sound_distance(word1, word2):
    """Phonetic distance between two words, measured on their endings.

    Compares the last three characters of each word (shorter words are
    left-padded with spaces) and returns how many aligned positions differ.
    """
    # This and some of the other phonetic helpers are taken from
    # https://github.com/sberbank-ai/classic-ai/tree/master/examples/phonetic-baseline
    tail_size = 3
    tail_a = word1.rjust(tail_size)[-tail_size:]
    tail_b = word2.rjust(tail_size)[-tail_size:]

    mismatches = 0
    for left, right in zip(tail_a, tail_b):
        if left != right:
            mismatches += 1
    return mismatches

In [24]:
def accent_syllable(word):
    """Return the number of the stressed syllable in ``word``.

    The value comes from ``accents_dict`` when the word is listed there;
    otherwise the middle syllable (rounded up) is used as a guess.
    """
    fallback = (syllables_count(word) + 1) // 2
    if word in accents_dict:
        return accents_dict[word]
    return fallback

In [25]:
def syllables_count(word):
    """Count the syllables of ``word`` as the number of lowercase Russian vowel letters in it."""
    vowels = 'уеыаоэёяию'
    return len([letter for letter in word if letter in vowels])

In [105]:
def get_most_similar(word_topic, word_to_replace):
    """Choose a topic-related word to substitute for ``word_to_replace``.

    Candidates are the nearest embedding neighbours of ``word_topic`` plus the
    topic word itself. Every candidate gets a penalty score (lower is better)
    built from: embedding dissimilarity, a part-of-speech mismatch with
    ``word_to_replace``, the phonetic distance between the word endings, the
    difference in syllable counts and the difference in stressed-syllable
    position. The best candidate is lower-cased and, when pymorphy2 can do it,
    inflected to the case/number/gender/voice/person/tense of
    ``word_to_replace``.

    Args:
        word_topic: topic word whose neighbours are considered.
        word_to_replace: word from the poem that is going to be replaced.

    Returns:
        The replacement word. Falls back to ``word_topic`` when it is missing
        from the embedding vocabulary, and to the uninflected candidate when
        it cannot be inflected.
    """
    # Parsed once: needed both for the POS comparison and for the inflection.
    origmorph = morph.parse(word_to_replace)[0]

    result = word_topic

    try:
        similar = vectors.most_similar(word_topic)
    except KeyError:
        # Out-of-vocabulary topic: nothing to rank, keep the topic word itself.
        similar = None

    if similar is not None:
        candidates = list(similar) + [(word_topic, 1)]

        # Properties of the replaced word do not depend on the candidate.
        target_syllables = syllables_count(word_to_replace)
        target_accent = accent_syllable(word_to_replace)

        scored = []
        for candidate, similarity in candidates:
            same_pos = origmorph.tag.POS == morph.parse(candidate)[0].tag.POS
            # All terms are penalties. The original referenced an undefined
            # name (`keyword`) here, so scoring always failed silently; the
            # word being replaced is what the candidate must be compared to.
            # The POS term penalises a *mismatch*: the original added the
            # penalty when the POS matched, which is backwards for a
            # lowest-score-wins ranking.
            # int() is kept from the original: it truncates the fractional
            # similarity term, so ties go to the earlier (closer) neighbour.
            score = int(
                1 - similarity
                + (0 if same_pos else 3)
                + sound_distance(candidate, word_to_replace) * 4
                + abs(syllables_count(candidate) - target_syllables)
                + abs(accent_syllable(candidate) - target_accent)
            )
            scored.append((score, candidate))

        # min() returns the first of equally scored candidates.
        result = min(scored, key=lambda pair: pair[0])[1].lower()

    tag = origmorph.tag
    inflection = {
        grammeme
        for grammeme in (tag.case, tag.number, tag.gender, tag.voice, tag.person, tag.tense)
        if grammeme is not None
    }

    try:
        inflected = morph.parse(result)[0].inflect(inflection)
    except Exception:
        # Inflection is best-effort: any analyser failure keeps the base form.
        return result
    # inflect() gives None when the requested form does not exist.
    return inflected.word if inflected is not None else result

In [106]:
# Try the replacement picker: topic word 'детство', word being replaced 'страх'.
get_most_similar('детство', 'страх')

'детство'

In [67]:
# Load the poem corpus; the functions below read its `poet_id` and `content` columns.
poems = pd.read_json('classic_poems.json')
# Sorted array of the distinct poet ids present in the corpus.
poets = np.unique(poems['poet_id'])

In [68]:
# Peek at the first rows of the corpus.
poems.head()

Unnamed: 0,poet_id,title,content
0,pushkin,К Наталье,Pourquoi craindrais-j'e de ie dire?\nC'est Mar...
1,pushkin,Монах,"Песнь первая\nСвятой монах, грехопадение, юбка..."
2,pushkin,Несчастие клита,Внук Тредьяковского Клит гекзаметром песенки п...
3,pushkin,К другу стихотворцу,Арист! и ты в толпе служителей Парнасса!\nТы х...
4,pushkin,Кольна (Подражание Occèану),(Фингал послал Тоскара воздвигнуть на берегах ...


In [69]:
# Show which poet ids are available.
poets

array(['blok', 'esenin', 'mayakovskij', 'pushkin', 'tyutchev'],
      dtype=object)

In [72]:
def cosine_similarity(a, b):
    """Similarity between two texts; higher always means more similar.

    When both ``a`` and ``b`` are single in-vocabulary words the result is the
    cosine of the angle between their vectors. Otherwise (a multi-word string
    or an out-of-vocabulary word) the texts are split on whitespace and the
    Word Mover's Distance ``d`` between the token lists is mapped to
    ``1 / (1 + d)``, so identical texts give 1 and an infinite distance
    gives 0.

    The original fallback returned the raw distance, which callers then
    sorted as if it were a similarity, and it passed bare strings to
    ``wmdistance``, which iterates a string character by character.

    Args:
        a: first word or whitespace-separated text.
        b: second word or whitespace-separated text.

    Returns:
        A float similarity value.
    """
    try:
        vec_a = vectors.get_vector(a)
        vec_b = vectors.get_vector(b)
    except KeyError:
        # At least one argument is not a single known word.
        distance = vectors.wmdistance(a.split(), b.split())
        return 1.0 / (1.0 + distance)

    return np.dot(vec_a, vec_b) / (np.linalg.norm(vec_a) * np.linalg.norm(vec_b))

In [73]:
# Word Mover's Distance between two words.
# NOTE(review): gensim documents wmdistance as taking two lists of tokens; a
# bare string appears to be iterated character by character — confirm intent.
vectors.wmdistance('страх','ненависть')

3.0783263927493056

In [74]:
# Same distance for a second word pair, for comparison.
# NOTE(review): the arguments are bare strings, not token lists — confirm intent.
vectors.wmdistance('страх','любовь')

5.025126295627562

In [197]:
def get_poem(poet, topics):
    """Generate a short poem on the given topics from a randomly sampled classic poem.

    A poem of at least six lines is sampled (from ``poet`` when that id is in
    ``poets``, otherwise from the whole corpus), stripped of everything except
    Cyrillic letters, spaces and newlines, lower-cased and cut to its first 4
    or 6 lines (chosen at random). For every topic word of three or more
    characters, the poem word (longer than three characters) most similar to
    the topic is replaced with the word picked by ``get_most_similar``.

    Args:
        poet: poet id; any value not present in ``poets`` means "any poet".
        topics: whitespace-separated topic words.

    Returns:
        The generated poem: lines joined with newlines, each line truncated to
        117 characters and capitalised.
    """
    non_cyrillic = re.compile('[^а-яА-ЯёЁ \n]')

    # The sampling pool does not change between retries.
    pool = poems[poems['poet_id'] == poet] if poet in poets else poems

    poem = ''
    while len(poem.strip().split('\n')) < 6:
        poem = pool.sample()['content'].values[0]

    poem = non_cyrillic.sub('', poem).strip().lower()
    lines = poem.split('\n')[:np.random.choice([4, 6])]
    poem = '\n'.join(lines)

    # Replaceable words are the same for every topic, so collect them once.
    # Punctuation has already been removed by the regex above.
    words = [word for line in lines for word in line.split() if len(word) > 3]

    replacements = []
    for topic in topics.split():
        if len(topic) < 3 or not words:
            # Too short a topic, or nothing in the poem that can be replaced.
            continue

        def similarity_to_topic(word, topic=topic):
            value = cosine_similarity(topic, word)
            # NaN cannot be ordered; rank it last.
            return float('-inf') if np.isnan(value) else value

        source_word = max(words, key=similarity_to_topic)
        replacements.append((source_word, get_most_similar(topic, source_word)))

    txt = poem
    for source_word, target_word in replacements:
        # Replace whole words only: plain str.replace would also rewrite the
        # same letters inside longer words.
        pattern = r'(?<!\w)' + re.escape(source_word) + r'(?!\w)'
        # A callable replacement keeps the target from being parsed as a template.
        txt = re.sub(pattern, lambda _match, repl=target_word: repl, txt)

    # 117 keeps every line below the 120-character limit checked by validate_poem.
    return '\n'.join(line[:117].capitalize() for line in txt.split('\n')).strip()

In [123]:
def validate_poem(poem):
    """Check that a poem has an acceptable shape.

    A poem is valid when no line is longer than 120 characters and it has
    between 3 and 8 lines (inclusive) that contain more than one character
    after stripping whitespace.

    Returns:
        True when the poem is valid, False otherwise.
    """
    rows = poem.split('\n')

    # split() always yields at least one element, so max() is safe here.
    longest = max(len(row) for row in rows)
    meaningful = sum(1 for row in rows if len(row.strip()) > 1)

    return longest <= 120 and 3 <= meaningful <= 8

In [181]:
def get_most_similar_poem(poet, topic):
    """Generate ten candidate poems and return the one that best matches the topic.

    Each candidate from ``get_poem`` is scored with
    ``cosine_similarity(topic, poem_text)``; candidates rejected by
    ``validate_poem`` get that value multiplied by zero. The highest-scoring
    poem is printed and returned.

    Args:
        poet: poet id passed through to ``get_poem``.
        topic: whitespace-separated topic words.

    Returns:
        The text of the selected poem.
    """
    candidates = []

    for _ in range(10):
        poem = get_poem(poet, topic)
        # Join lines with a space so the last word of one line is not fused
        # with the first word of the next.
        poem_string = ' '.join(poem.split('\n'))
        # Built-in float: the np.float alias was removed from NumPy.
        score = float(validate_poem(poem)) * cosine_similarity(topic, poem_string)
        candidates.append({"score": score, "poem": poem})

    # Build the frame once; DataFrame.append no longer exists in pandas 2.
    poems_df = pd.DataFrame(candidates)

    txt = poems_df.sort_values('score', ascending=False)['poem'].values[0]
    print(txt)
    return txt

In [204]:
# Generate a poem on two topic words. The empty poet id is not in `poets`,
# so get_poem samples from the whole corpus.
get_most_similar_poem('', "вертолет самолет")

Под шум и звон однообразный
Под городскую суету
Я ухожу душою праздный
В вертолет во мрак и в пустоту


'Под шум и звон однообразный\nПод городскую суету\nЯ ухожу душою праздный\nВ вертолет во мрак и в пустоту'