In [77]:
%load_ext autoreload
%autoreload 2

# Standard-library modules used for path handling and directory listing.
import sys
import os
from os.path import join as pjoin
# Put the parent directory on the import search path. Presumably this is what
# makes the `tst` package imported below resolvable when the notebook is run
# from its own folder -- TODO confirm.
sys.path.append('../')

# gensim containers for trained word embeddings.
from gensim.models import Word2Vec, KeyedVectors

# Helpers from the `tst` package (presumably the project's own code -- confirm).
from tst.io import AUTHORS, translation_embedding_dir
from tst.preprocess.parsing import find_anker_words
from tst.preprocess.markov import pos_emission_prob, pos_markov_chain, vocabulary, beam_search, \
    load_emission_probs, load_chain, load_pos_chain, beam_search2
from tst.preprocess.translate import translate_to_author, translate
# NOTE(review): wildcard import -- the origin of names is hidden. `normalize_dict`,
# used further down, is not imported explicitly anywhere above, so it presumably
# comes from this module; verify before replacing the `*` with explicit names.
from tst.preprocess.w2v_extensions import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Create

In [None]:
# Build, for every individual author, the chain pair returned by
# `pos_markov_chain` and the result of `pos_emission_prob`.
#
# The previous version iterated over `os.listdir(AUTHORS)[2:]`. `os.listdir`
# returns entries in arbitrary order, so that slice dropped two unpredictable
# authors, and the dictionary cell below then fails with a KeyError when it
# looks up `chains[author]` for them. Every entry is processed now, in sorted
# order so repeated runs behave the same way.
#
# The 'all' entry is skipped: the cells below never read a chain for it (they
# special-case or skip 'all').
chains, pos_chains, emission_probs = {}, {}, {}
for author in sorted(os.listdir(AUTHORS)):
    if author == 'all':
        continue
    chains[author], pos_chains[author] = pos_markov_chain(author, state_size=3)
    emission_probs[author] = pos_emission_prob(author)

In [None]:
# For each entry under AUTHORS: derive a vocabulary, pass its keys (ordered by
# descending vocabulary value) through `normalize_dict`, keep the result in
# `dicts`, and write it to <entry>/parsed/dict.txt as "key value" lines.
dicts = {}
for author in os.listdir(AUTHORS):
    parsed_dir = pjoin(AUTHORS, author, 'parsed')

    if author != 'all':
        # Individual author: vocabulary is taken from that author's entry in `chains`.
        vocab = vocabulary(chains[author])
    else:
        # Combined corpus: per-word `count` values read from the saved Word2Vec model.
        wv = Word2Vec.load(pjoin(parsed_dir, 'gutenberg_w2v_5e.model')).wv
        vocab = {token: entry.count for token, entry in wv.vocab.items()}

    ranked_words = sorted(vocab, key=vocab.get, reverse=True)
    dicts[author] = normalize_dict(ranked_words)

    with open(pjoin(parsed_dir, 'dict.txt'), 'w') as f:
        for key, value in dicts[author].items():
            f.write('{} {}\n'.format(key, value))

In [None]:
# For every entry except 'all', write <entry>/parsed/ankers.txt containing one
# "word word" line per key returned by `find_anker_words` for the combined
# dictionary and that entry's dictionary.
for author in os.listdir(AUTHORS):
    if author != 'all':
        anker_path = pjoin(AUTHORS, author, 'parsed', 'ankers.txt')
        with open(anker_path, 'w') as f:
            anker_words = find_anker_words(dicts['all'], dicts[author])
            f.writelines(f'{word} {word}\n' for word in anker_words.keys())

## Load

In [16]:
# Populate the per-author lookup tables through the `load_*` helpers and read
# two word2vec-format vector files from each author's translation-embedding
# directory. The 'all' entry is left out.
chains, pos_chains, emission_probs, embeddings = {}, {}, {}, {}
for author in (entry for entry in os.listdir(AUTHORS) if entry != 'all'):
    chains[author] = load_chain(author)
    pos_chains[author] = load_pos_chain(author)
    emission_probs[author] = load_emission_probs(author)

    folder = translation_embedding_dir(author)
    # Order matters: index 0 holds 'vectors-all.txt', index 1 the author's own file.
    vector_files = ['vectors-all.txt', f'vectors-{author}.txt']
    embeddings[author] = [
        KeyedVectors.load_word2vec_format(pjoin(folder, file_name))
        for file_name in vector_files
    ]

## Evaluate

In [107]:
# Run `translate` on a sample sentence with the two embeddings loaded for the
# chosen author, keep the first element of its result, and display it.
author = 'darwin'
input_sent = 'It is a truth universally acknowledged that a single man in possession of a good fortune must be in want of a wife.'

# Index 0 was loaded from 'vectors-all.txt', index 1 from the author's vector file.
all_vectors, author_vectors = embeddings[author]
# The fourth positional argument (0.2) is passed through unchanged; its meaning
# is defined by `translate` and is not visible in this notebook.
words = translate(input_sent, all_vectors, author_vectors, 0.2)[0]
words

array(['it', 'is', 'a', 'truth', 'universally', 'expressed', 'that', 'a',
       'any', 'neison', 'into', 'appearance', 'of', 'a', 'pleasant',
       'chance', 'will', 'be', 'into', 'expect', 'of', 'a',
       'black-necked', '.'], dtype='<U12')

In [96]:
# Call `beam_search2` on the translated `words` with the selected author's
# chain, POS chain and emission probabilities. The keyword settings are
# collected in one dict so they can be read and tweaked in one place.
beam_options = dict(
    beam_size=5,
    word_trans_weight=1,
    emission_weight=10,
    context_weight=0,
    eos_norm_weight=0.8,
    len_norm_weight=0.1,
    smoothing_prob=0.1,
    variable_length=True,
)
probs = beam_search2(
    chains[author],
    pos_chains[author],
    emission_probs[author],
    words,
    **beam_options,
)

['___BEGIN__', '___BEGIN__', '___BEGIN__']
31196
['___BEGIN__', '___BEGIN__', 'PRP_1_0_2_ROOT_0']
0
['___BEGIN__', '___BEGIN__', 'PRP_0_0_1_ROOT_0.']
0
['___BEGIN__', '___BEGIN__', 'UH_0_0_1_ROOT_0.']
0
['___BEGIN__', '___BEGIN__', 'UH_0_0_1_ROOT_0', ',_0_0_0_punct_1']
0


  sorted(layer_candidates, key=layer_candidates.get, reverse=True)[:beam_size]}


['___BEGIN__', 'UH_0_0_1_intj_0', ',_0_0_0_punct_1', 'PRP_1_0_2_ROOT_2']
0
['___BEGIN__', 'UH_0_0_1_intj_0', ',_0_0_0_punct_1', 'PRP_1_0_2_ROOT_2']
0
['___BEGIN__', 'UH_0_0_1_intj_0', ',_0_0_0_punct_1', 'VBD_0_0_1_ROOT_2']
0
['UH_0_0_1_intj_0', ',_0_0_0_punct_0', 'PRP_1_0_2_nsubj_1', 'VBP_0_0_1_ROOT_2']
0
['UH_0_0_1_intj_0', ',_0_0_0_punct_0', 'VBD_0_0_1_ROOT_1', 'PRP_0_0_1_intj_1.']
0
['VBD_0_0_1_ROOT_0', 'PRP_0_0_1_intj_1.', 'NN_0_1_0_ROOT_1', ',_0_0_0_punct_2']
0
['VBD_0_0_1_ROOT_0', 'PRP_0_0_1_intj_1.', 'NN_0_1_0_ROOT_1.']
0
['VBD_0_0_1_ROOT_0', 'NN_0_0_1_intj_1.', 'NN_0_1_1_ROOT_1', ',_0_0_0_punct_2']
0
['PRP_0_0_1_ROOT_0.', 'NN_0_1_0_dep_1', ',_0_0_0_punct_1', 'PRP_1_0_2_ROOT_2']
0
['PRP_0_0_1_ROOT_0.', 'NN_0_1_0_ROOT_1', ',_0_0_0_punct_1', 'WRB_1_0_1_advmod_2']
0
['NN_0_1_0_ROOT_0', ',_0_0_0_punct_0', 'WRB_1_0_1_prep_1', 'PRP_1_0_2_conj_2']
0
['NN_0_1_0_npadvmod_0', ',_0_0_0_punct_0', 'PRP_1_0_2_nsubj_1', 'VBP_0_2_1_ROOT_1.']
0
['NN_0_1_0_dep_0', ',_0_0_0_punct_0', 'PRP_1_0_2_ns