In [2]:
%load_ext autoreload
%autoreload 2

import logging
import sys
import os
from os.path import join as pjoin
sys.path.append('../../')

from sklearn.pipeline import Pipeline
from gensim.models import Word2Vec
import markovify

from tst.io import AUTHORS
from tst.preprocess.parsing import find_anker_words
from tst.preprocess.transformers import TextCleaner, TextFeatureExtractor
from tst.preprocess.helper import configure_logging
from tst.preprocess.markov import lexical_freq, pos_emission_prob, pos_markov_chain, vocabulary, beam_search, \
    LowerMarkovifyText, load_emission_probs, load_chain, load_pos_chain

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
def mapping(x):
    """Turn one token into the feature tuple handed to ``TextFeatureExtractor``.

    Defined as a named function instead of a lambda bound to a name, so the
    callable carries a real ``__name__`` in tracebacks and reprs.

    Args:
        x: token-like object exposing ``tag_``, ``is_stop``, ``i``, ``sent``,
            ``text`` and ``len()`` -- presumably a spaCy ``Token``; confirm
            against ``TextFeatureExtractor``.

    Returns:
        tuple: ``(tag_, is_stop, i - sent.start, sent.end - i, len(x),
        lexical_freq(text))`` in exactly that order.
    """
    return (
        x.tag_,                # category
        x.is_stop,             # bool (a starts_with_vowel(x.text) feature is currently disabled)
        x.i - x.sent.start,    # number: token index relative to the sentence start
        x.sent.end - x.i,      # number: sentence end index minus the token index
        len(x),                # number: length of the token
        lexical_freq(x.text),  # number: value returned by lexical_freq for the token text
    )

class TextParser(Pipeline):
    """Two-step pipeline: ``TextCleaner`` followed by ``TextFeatureExtractor``.

    The feature extractor is constructed with the module-level ``mapping``
    callable.
    """

    def __init__(self):
        stages = [
            ("TextCleaner", TextCleaner()),
            ("TextFeatureExtractor", TextFeatureExtractor(mapping)),
        ]
        super().__init__(stages)

## Create

In [None]:
# For each author directory, store the two results of pos_markov_chain() in
# `chains` / `pos_chains` and the result of pos_emission_prob() in
# `emission_probs`, all keyed by the directory name.
chains, pos_chains, emission_probs = {}, {}, {}
# os.listdir() returns entries in arbitrary order, so sort before slicing:
# otherwise the two skipped entries can differ between runs and platforms.
# NOTE(review): confirm which two entries `[2:]` is meant to skip; the later
# cells expect `chains[author]` for every directory other than 'all'.
for author in sorted(os.listdir(AUTHORS))[2:]:
    chains[author], pos_chains[author] = pos_markov_chain(author, state_size=3)
    emission_probs[author] = pos_emission_prob(author)

In [None]:
# Build a relative-frequency dictionary per author directory and write it to
# <AUTHORS>/<author>/parsed/dict.txt as "<word> <relative frequency>" lines,
# most frequent word first.
dicts = {}
for author in os.listdir(AUTHORS):
    if author == 'all':
        # The 'all' entry takes its counts from the saved Word2Vec model
        # rather than from a Markov chain.
        w2v = Word2Vec.load("../data/all/parsed/gutenberg_w2v_5e.model")
        vocab = {k: v.count for k, v in w2v.wv.vocab.items()}
    else:
        # Fix: index by the loop variable. The original used the string
        # literal 'author', which is not a key of `chains` (KeyError).
        chain = chains[author]
        vocab = vocabulary(chain)
    size = sum(vocab.values())
    # Normalise counts by the total and keep the entries ordered by
    # descending count.
    sortvocab = {k: vocab[k]/size for k in sorted(vocab, key=vocab.get, reverse=True)}
    dicts[author] = sortvocab
    with open(pjoin(AUTHORS, author, 'parsed', 'dict.txt'), 'w') as f:
        f.writelines('{} {}\n'.format(word, freq) for word, freq in sortvocab.items())

In [102]:
# Write <AUTHORS>/<author>/parsed/ankers.txt for every author directory
# except the aggregate 'all' entry.
for author in os.listdir(AUTHORS):
    if author == 'all':
        continue

    # Compute the anchor words BEFORE opening the file: opening with 'w'
    # truncates it, so a failure in find_anker_words() or a missing
    # dicts[author] would otherwise leave an empty ankers.txt behind.
    anker_words = find_anker_words(dicts['all'], dicts[author])
    with open(pjoin(AUTHORS, author, 'parsed', 'ankers.txt'), 'w') as f:
        # Each line repeats the word twice ("word word").
        # NOTE(review): presumably an identity word-pair format -- confirm
        # with whatever consumes this file.
        f.writelines(f'{word} {word}\n' for word in anker_words.keys())

## Read

In [None]:
# Load the stored artefacts for every entry under AUTHORS into three dicts
# keyed by directory name.
chains = {}
pos_chains = {}
emission_probs = {}

# (target dict, loader) pairs, applied in this order for each author.
_stores_and_loaders = (
    (chains, load_chain),
    (pos_chains, load_pos_chain),
    (emission_probs, load_emission_probs),
)
for author in os.listdir(AUTHORS):
    for _store, _loader in _stores_and_loaders:
        _store[author] = _loader(author)

## Evaluate

In [None]:
input_sent = 'It is a truth universally acknowledged that a single man in possession of a good fortune must be in want of a wife.'
# The translation step is currently disabled; a fixed word list is used below.
# words = translate(input_sent, src_emb, target_emb)

words = ['war', 'men', 'created']

# Keyword options forwarded unchanged to beam_search().
search_options = dict(
    beam_size=10,
    word_trans_weight=1,
    emission_weight=1,
    context_weight=.2,
    eos_norm_weight=0,
    len_norm_weight=.05,
    smoothing_prob=1e-6,
    variable_length=True,
)

# NOTE(review): in the cells visible here, `pos_chain` is never assigned and
# `chain` is only bound inside the dictionary-building loop; the "Read" cell
# fills the dicts `chains` / `pos_chains` instead. Confirm which author's
# models are intended before running on a fresh kernel.
probs = beam_search(chain, pos_chain, words, **search_options)