In [13]:
%load_ext autoreload
%autoreload 2

import sys
import os
from os.path import join as pjoin
sys.path.append('../')

from gensim.models import Word2Vec

from tst.io import AUTHORS
from tst.preprocess.parsing import find_anker_words
from tst.preprocess.markov import pos_emission_prob, pos_markov_chain, vocabulary, beam_search, \
    load_emission_probs, load_chain, load_pos_chain
from tst.preprocess.translate import translate_to_author

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Create

In [None]:
# Build the per-author models from scratch: a word-level chain, a POS-level
# chain (both returned together by pos_markov_chain) and the POS emission
# probabilities, each stored under the author's directory name.
chains = {}
pos_chains = {}
emission_probs = {}
# NOTE(review): the [2:] slice drops the first two directory entries, but
# os.listdir makes no ordering guarantee — confirm which entries are meant
# to be skipped (the "Read" section filters on the name 'all' instead).
for author in os.listdir(AUTHORS)[2:]:
    chain, pos_chain = pos_markov_chain(author, state_size=3)
    chains[author] = chain
    pos_chains[author] = pos_chain
    emission_probs[author] = pos_emission_prob(author)

In [None]:
# Build one relative-frequency dictionary per corpus directory and persist it
# to <AUTHORS>/<author>/parsed/dict.txt as "<word> <relative frequency>"
# lines, ordered from the most to the least frequent word.
dicts = {}
for author in os.listdir(AUTHORS):
    if author == 'all':
        # The combined corpus takes its raw counts from the saved Word2Vec model.
        w2v = Word2Vec.load("../data/all/parsed/gutenberg_w2v_5e.model")
        vocab = {k: v.count for k, v in w2v.wv.vocab.items()}
    else:
        # Index by the loop variable. The previous string literal 'author'
        # looked up the same (non-existent) key for every corpus instead of
        # the chain belonging to the author being processed.
        chain = chains[author]
        # presumably a word -> count mapping; verify against tst.preprocess.markov
        vocab = vocabulary(chain)
    # Normalise the counts by their total, keeping descending-count order.
    size = sum(vocab.values())
    sortvocab = {k: vocab[k]/size for k in sorted(vocab, key=vocab.get, reverse=True)}
    dicts[author] = sortvocab
    with open(pjoin(AUTHORS, author, 'parsed', 'dict.txt'), 'w') as f:
        f.writelines(map(lambda x: '{} {}\n'.format(x[0], x[1]), sortvocab.items()))

In [102]:
# For every single-author corpus, write parsed/ankers.txt with one
# "<word> <word>" line per key returned by find_anker_words, which is given
# the combined-corpus dictionary and the author's own dictionary.
for author in os.listdir(AUTHORS):
    if author != 'all':
        ankers_path = pjoin(AUTHORS, author, 'parsed', 'ankers.txt')
        with open(ankers_path, 'w') as f:
            anker_words = find_anker_words(dicts['all'], dicts[author])
            # Each anker word is paired with itself on its line.
            f.writelines(f'{word} {word}\n' for word in anker_words.keys())

## Read

In [3]:
# Reload the previously stored per-author models instead of rebuilding them.
chains = {}
pos_chains = {}
emission_probs = {}
for author in os.listdir(AUTHORS):
    # The 'all' directory is excluded: nothing is loaded for it here.
    if author != 'all':
        chains[author] = load_chain(author)
        pos_chains[author] = load_pos_chain(author)
        emission_probs[author] = load_emission_probs(author)

## Evaluate

In [50]:
# Author key used for the translation below.
author = 'wells'

input_sent = 'It is a truth universally acknowledged that a single man in possession of a good fortune must be in want of a wife.'
# Only the first element of translate_to_author's result is kept as `words`.
# NOTE(review): the meaning of the third argument (0.3) is not visible in this
# notebook — confirm against tst.preprocess.translate.
words = translate_to_author(input_sent, author, 0.3)[0]
# Manual override for quick experiments:
# words = ['war', 'men', 'created']
words

array(['it', 'is', 'a', 'truth', 'universally', 'admitted', 'that', 'a',
       'single', 'man', 'in', 'possession', 'of', 'a', 'good', 'success',
       'must', 'be', 'in', 'want', 'of', 'a', 'wife', '.'], dtype='<U11')

In [44]:
# Run the beam search for the selected author over the translated `words`,
# using that author's word chain, POS chain and emission probabilities.
probs = beam_search(
    chains[author],
    pos_chains[author],
    emission_probs[author],
    words,
    beam_size=10,
    # Score-mixing weights; these three sum to 1.0.
    word_trans_weight=0.4,
    emission_weight=0.3,
    context_weight=0.3,
    # Normalisation and smoothing settings.
    eos_norm_weight=0,
    len_norm_weight=0.6,
    smoothing_prob=1e-6,
    variable_length=False,
)

Finding a sentence took 200 tries.
29


  


a . any be chance diminution expect head-quarters expressed into it of is it
