In [10]:
import wordfreq
import markovify
import numpy as np
import spacy
import pandas as pd
import logging

from functools import reduce
from collections import defaultdict, Counter
from sklearn.preprocessing import OrdinalEncoder

from keras.layers import LSTM, Bidirectional, Embedding, Input, Concatenate, Reshape, Dense, Flatten
from keras.models import Model
from keras.utils import to_categorical
from keras_tqdm import TQDMNotebookCallback
from keras.preprocessing.sequence import TimeseriesGenerator
from networkx import shortest_path_length, from_dict_of_dicts, shortest_path

from transformers import *
from helper import *
from corpus_helper import CorpusStreamer, CorpusFileHandler

# Load spaCy's English pipeline through the "en" shortcut name.
# NOTE(review): sp_parser is not referenced by any cell visible in this notebook — confirm it is still needed.
sp_parser = spacy.load("en")
# configure_logging is not defined in this notebook; presumably it comes from one of the
# star imports above. It is called with a log file name and the DEBUG level.
configure_logging('markov.log', logging.DEBUG)

Using TensorFlow backend.


In [11]:
def starts_with_vowel(word):
    """Return True when ``word`` begins with one of the vowels a, e, i, o, u.

    The comparison is case-insensitive. An empty string has no first
    character and yields False instead of raising ``IndexError``.
    """
    # Slicing (rather than indexing) keeps the empty string safe.
    return word.lower()[:1] in ("a", "e", "i", "o", "u")

def lexical_freq(word):
    """Return the English Zipf frequency that ``wordfreq`` reports for ``word``."""
    zipf_score = wordfreq.zipf_frequency(word, lang="en")
    return zipf_score

In [12]:
def mapping(x):
    """Turn one token into a flat feature tuple.

    ``x`` is read through ``tag_``, ``is_stop``, ``i``, ``sent.start``,
    ``sent.end``, ``text`` and ``len(x)``. The tuple holds, in order: the tag
    (category), the stop-word flag (bool), then four numbers: offset from the
    sentence start, distance to the sentence end, token length and lexical
    frequency. ``starts_with_vowel(x.text)`` was considered as a further
    boolean feature but is not emitted.
    """
    offset_from_start = x.i - x.sent.start
    distance_to_end = x.sent.end - x.i
    return (
        x.tag_,                  # category
        x.is_stop,               # bool
        offset_from_start,       # number
        distance_to_end,         # number
        len(x),                  # number
        lexical_freq(x.text),    # number
    )

class TextParser(Pipeline):
    """Pipeline chaining a ``TextCleaner`` with a ``TextFeatureExtractor``.

    The extractor is configured with the module-level ``mapping`` that is
    bound at the time the parser is instantiated.
    """

    def __init__(self):
        cleaning_step = ("TextCleaner", TextCleaner())
        extraction_step = ("TextFeatureExtractor", TextFeatureExtractor(mapping))
        super().__init__([cleaning_step, extraction_step])

In [13]:
class FeatureEncoder(Pipeline):
    """Pipeline with a single ``FeatureUnion`` step that encodes columns by dtype.

    Three branches are concatenated in this order:

    * object/category columns -> ``OrdinalEncoder``
    * bool columns            -> ``MinMaxScaler`` with range [0, 1]
    * number columns          -> ``MinMaxScaler`` with range [0, 1]
    """

    def __init__(self):
        # The step labels are kept exactly as they were. Note that the label
        # "OneHotEncoder" names an OrdinalEncoder, and the boolean branch reuses
        # the "TypeSelector_Continuous" label.
        discrete_branch = Pipeline([
            ("TypeSelector_Discrete", TypeSelector(["object", "category"])),
            ("OneHotEncoder", OrdinalEncoder()),
        ])
        boolean_branch = Pipeline([
            ("TypeSelector_Continuous", TypeSelector(["bool"])),
            ("MinMaxScaler", MinMaxScaler(feature_range=[0,1])),
        ])
        continuous_branch = Pipeline([
            ("TypeSelector_Continuous", TypeSelector(["number"])),
            ("MinMaxScaler", MinMaxScaler(feature_range=[0,1])),
        ])

        union = FeatureUnion([
            ("_discrete_features", discrete_branch),
            ("_boolean_features", boolean_branch),
            ("_continous_features", continuous_branch),
        ])
        super().__init__([("_union", union)])
        

In [5]:
class CustomGenerator(TimeseriesGenerator):
    """``TimeseriesGenerator`` whose batches are dicts keyed by layer name.

    Relies on a module-level ``nr_tags`` being defined before a batch is drawn.
    """

    def __getitem__(self, index):
        raw_batch = super().__getitem__(index)
        return self.transformed_out(raw_batch)

    def transformed_out(self, out):
        """Split an ``(x, y)`` batch into named input and target dicts.

        Column 0 of ``x`` becomes ``pos_in`` and the remaining columns
        ``rest_in``. From ``y``, column 0 is one-hot encoded into ``pos_out``,
        column 1 is ``stop_out``, the second-to-last column is ``freq_out``
        and the last column is ``tf_out``.
        """
        x, y = out

        inputs = {
            "pos_in": x[:,:,0:1],
            "rest_in": x[:,:,1:],
        }
        targets = {
            "pos_out": to_categorical(y[:,0:1], num_classes=nr_tags),
            "stop_out": y[:,1:2],
            "freq_out": y[:,-2:-1],
            "tf_out": y[:,-1:],
        }
        return inputs, targets

In [7]:
# Parse the raw corpus text into a feature table, then encode it for the model.
# NOTE: `text` is not defined anywhere in the visible notebook; the recorded output of
# this cell is a NameError, so a cell that loads `text` has to run first.
features = TextParser().fit_transform(text)
encoder = FeatureEncoder()
data = encoder.fit_transform(features)

NameError: name 'text' is not defined

In [777]:
# Reach into the fitted encoder: first pipeline step (the FeatureUnion) -> first branch
# (discrete features) -> second step (the OrdinalEncoder) -> categories of its first column.
tags = encoder.steps[0][1].transformer_list[0][1].steps[1][1].categories_[0]

# Number of distinct categories in that first discrete column.
nr_tags = len(tags)
# Embedding width: the fourth root of the number of tags, rounded up.
tag_emb_size = int(np.ceil(nr_tags**(1/4)))
# Number of timesteps in each model input window.
seq_input_len = 10

In [778]:
# Input 1: one value per timestep, fed to the embedding below (column 0 of the encoded data).
pos_in = Input((seq_input_len,1,), name="pos_in")

emb = Embedding(nr_tags, tag_emb_size)(pos_in)
# Reshape to (seq_input_len, tag_emb_size) so the embedding can be concatenated with rest_in.
emb = Reshape((seq_input_len, tag_emb_size))(emb)
# Input 2: every remaining encoded feature column per timestep.
rest_in = Input((seq_input_len, data.shape[1] - 1, ), name="rest_in")

# NOTE: from here on `x` names a Keras tensor, not data.
x = Concatenate()([emb,rest_in])
x = Bidirectional(LSTM(100))(x)
x = Dense(50)(x)

# Four output heads on the shared representation; the names match the target dict
# keys produced by CustomGenerator.transformed_out.
pos_out = Dense(nr_tags, activation="softmax", name="pos_out")(x)
stop_out = Dense(1, activation="sigmoid", name="stop_out")(x)
freq_out = Dense(1, activation="relu", name="freq_out")(x)
tf_out = Dense(1, activation="relu", name="tf_out")(x)

model = Model([pos_in, rest_in], [pos_out, stop_out, freq_out, tf_out])
# One loss per head, listed in the same order as the outputs above.
model.compile("adam", ["categorical_crossentropy", "binary_crossentropy", "mse", "mse"], metrics=['accuracy'])

In [779]:
# The window length must match the model's input shape, so reuse seq_input_len
# instead of repeating the literal 10.
gen = CustomGenerator(data, data, seq_input_len, batch_size=32, shuffle=True)

In [781]:
# Train for 10 epochs on batches from `gen`; verbose=0 turns off Keras' own progress
# output and the TQDM notebook callback is attached instead.
model.fit_generator(gen, epochs=10, callbacks=[TQDMNotebookCallback(leave_inner=True)], verbose=0)

HBox(children=(IntProgress(value=0, description='Training', max=10, style=ProgressStyle(description_width='ini…

HBox(children=(IntProgress(value=0, description='Epoch 0', max=3924, style=ProgressStyle(description_width='in…

HBox(children=(IntProgress(value=0, description='Epoch 1', max=3924, style=ProgressStyle(description_width='in…

HBox(children=(IntProgress(value=0, description='Epoch 2', max=3924, style=ProgressStyle(description_width='in…

HBox(children=(IntProgress(value=0, description='Epoch 3', max=3924, style=ProgressStyle(description_width='in…

HBox(children=(IntProgress(value=0, description='Epoch 4', max=3924, style=ProgressStyle(description_width='in…

HBox(children=(IntProgress(value=0, description='Epoch 5', max=3924, style=ProgressStyle(description_width='in…

HBox(children=(IntProgress(value=0, description='Epoch 6', max=3924, style=ProgressStyle(description_width='in…

HBox(children=(IntProgress(value=0, description='Epoch 7', max=3924, style=ProgressStyle(description_width='in…

HBox(children=(IntProgress(value=0, description='Epoch 8', max=3924, style=ProgressStyle(description_width='in…

HBox(children=(IntProgress(value=0, description='Epoch 9', max=3924, style=ProgressStyle(description_width='in…




<keras.callbacks.History at 0x27388913d30>

In [791]:
# `x` was rebound to a Keras tensor while the model was being built, so it cannot be
# used as prediction input. Predict on a real batch drawn from the generator instead.
batch_inputs, _ = gen[0]
y_pred = model.predict_on_batch(batch_inputs)
# y_pred[0] is the first model output (pos_out); map each argmax back to its tag label.
tags[np.argmax(y_pred[0], axis=1)]

### MARKOV

In [14]:
def bining(number, nbins):
    """Assign ``number`` to one of ``nbins`` equal-width bins over [0, 1].

    Args:
        number: value to bin; values at or above 1 fall into the last bin,
            negative values fall into the first bin.
        nbins: number of bins.

    Returns:
        Integer bin index in ``[0, nbins - 1]``.
    """
    raw_bin = int(number / (1 / nbins))
    # Clamp on both sides: the upper clamp was already present, the lower clamp
    # keeps negative inputs from producing negative bin indices.
    return max(0, min(raw_bin, nbins - 1))

def mapping(word):
    """Turn one token into a tuple of discrete style features.

    ``word`` is read through ``tag_``, ``is_stop``, ``dep_``, ``text``, ``i``,
    ``sent.start``, ``sent.end`` and ``len(word)``.

    Returns:
        ``(tag, is_stop as 0/1, length bin, frequency bin, dependency label,
        position bin)``, where every bin is one of three values.
    """
    sentence = word.sent
    # Position of the token inside its own sentence, in [0, 1). Dividing the
    # document-level index by the document-level sentence end (as before) drifts
    # towards 1 for every sentence after the first, so the bin would saturate.
    relative_position = (word.i - sentence.start) / (sentence.end - sentence.start)

    return (word.tag_,
            word.is_stop*1,
            bining(len(word)/15, 3),
            bining(lexical_freq(word.text)/10, 3),
            word.dep_,
            bining(relative_position, 3))

class TextCleaner(BaseEstimator, TransformerMixin):
    """Stateless transformer that normalises a raw text string.

    Args:
        punct_spacing: when True (default) a space is inserted in front of
            each of ``. ! ? ,``; when False a space directly in front of those
            characters is removed.
    """

    def __init__(self, punct_spacing=True):
        self.punct_spacing = punct_spacing

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X):
        """Return the cleaned version of the string ``X``.

        Steps, in order: runs of line breaks become one space; every character
        other than ASCII letters, ``, . ? !`` and the space is dropped;
        punctuation spacing is adjusted according to ``punct_spacing``; runs of
        spaces collapse to a single space.
        """
        # Raw strings keep the regex escapes (notably \g<1>) out of Python's own
        # string-escape processing, which warns about unknown escapes like "\g".
        X = re.sub(r"(\r?\n)+", " ", X)
        X = re.sub(r"[^A-Za-z,.?! ]", "", X)
        if not self.punct_spacing:
            X = re.sub(r" ([.!?,])", r"\g<1>", X)
        else:
            X = re.sub(r"([.!?,])", r" \g<1>", X)
        X = re.sub(r" +", " ", X)
        return X

class TextParser(Pipeline):
    """Cleaning step followed by feature extraction with the current ``mapping``."""

    def __init__(self):
        steps = [
            ("TextCleaner", TextCleaner()),
            ("TextFeatureExtractor", TextFeatureExtractor(mapping)),
        ]
        super().__init__(steps)
        
def to_style_tokens(text):
    """Convert ``text`` into a space-separated string of style tokens.

    Each row produced by ``TextParser`` is joined with ``_`` into one token.
    Afterwards any token that starts with ``.``, ``!`` or ``?`` is reduced to
    that single character and attached to the preceding token.
    """
    features = TextParser().fit_transform(text)
    tokens_text = " ".join(features.apply(lambda x: "_".join(map(str, x)), axis=1))
    # Raw strings: "\g" is not a valid Python string escape and triggers a warning.
    return re.sub(r" ([.!?])[^ ]+", r"\g<1>", tokens_text)

def pos_emission_prob(folder_path):
    """Count, per style token, how often each lower-cased word form occurs.

    Every text streamed from ``folder_path`` is cleaned and parsed, and each
    parsed token is keyed by its ``mapping`` features joined with ``_``. The
    counts are pickled to ``<folder_path>/parsed/emission_probs.pkl`` and also
    returned.

    Returns:
        ``defaultdict(Counter)``: style token -> Counter of lower-cased words.
    """
    corpus = CorpusStreamer(folder_path, False)
    counts = defaultdict(Counter)

    for text in corpus:
        doc = nlp()(TextCleaner().fit_transform(text))
        for token in doc:
            style_token = '_'.join(str(part) for part in mapping(token))
            counts[style_token][token.orth_.lower()] += 1

    save_folder = os.path.join(folder_path, 'parsed')
    os.makedirs(save_folder, exist_ok=True)

    target_path = os.path.join(save_folder, 'emission_probs.pkl')
    with open(target_path, 'wb') as f:
        pkl.dump(counts, f)

    return counts


class LowerMarkovifyText(markovify.Text):
    """``markovify.Text`` variant that lower-cases a sentence before splitting it."""

    def word_split(self, sentence):
        """Split the lower-cased ``sentence`` on the parent class' word pattern."""
        split_pattern = super().word_split_pattern
        lowered = sentence.lower()
        return re.split(split_pattern, lowered)
    

def pos_markov_chain(folder_path, state_size=2):
    """Build a word-level and a style-token-level Markov model for a corpus.

    Each streamed text yields one model of each kind, and these are merged
    into running totals with ``markovify.combine``. Both totals are written as
    JSON into ``<folder_path>/parsed`` (``word_mm.pkl`` and ``pos_mm.pkl``;
    despite the extension the content is JSON text, not a pickle).

    Args:
        folder_path: corpus folder handed to ``CorpusStreamer``.
        state_size: Markov state size used for both models.

    Returns:
        ``(word_model, style_model)``.
    """
    corpus = CorpusStreamer(folder_path, False)
    word_model = None
    style_model = None

    for parsed_count, text in enumerate(corpus, start=1):
        logging.debug('POS_MARKOV_CHAIN: {: >5} texts parsed'.format(parsed_count))

        # Word-level model on the cleaned, lower-cased text.
        text_words = LowerMarkovifyText(TextCleaner(False).transform(text), state_size=state_size, retain_original=False)
        if word_model:
            word_model = markovify.combine([word_model, text_words])
        else:
            word_model = text_words

        # Style-level model on the token encoding of the same text.
        text_styles = markovify.Text(to_style_tokens(text), state_size=state_size, retain_original=False)
        if style_model:
            style_model = markovify.combine([style_model, text_styles])
        else:
            style_model = text_styles

    save_folder = os.path.join(folder_path, 'parsed')
    os.makedirs(save_folder, exist_ok=True)

    for file_name, model in (('word_mm.pkl', word_model), ('pos_mm.pkl', style_model)):
        with open(os.path.join(save_folder, file_name), 'w') as f:
            f.write(model.to_json())

    return word_model, style_model

def markov_to_graph(markovchain):
    """Build a weighted word graph from a markovify model.

    Nodes are words; an edge ``a -> b`` carries ``weight = -log P(b | a)``,
    where the probability is estimated from the follower counts of every
    state whose last word is ``a``.

    Args:
        markovchain: object exposing ``chain.model``, a mapping from state
            tuples to ``{next_word: count}``.

    Returns:
        The graph built by ``networkx.from_dict_of_dicts``.
    """
    # With a state size above 1 several states end in the same word. Merge their
    # follower counts instead of letting the last state seen overwrite the others.
    followers_by_word = defaultdict(Counter)
    for state, followers in markovchain.chain.model.items():
        followers_by_word[state[-1]].update(followers)

    markovmodel = {}
    for word, followers in followers_by_word.items():
        total = sum(followers.values())
        markovmodel[word] = {nxt: {"weight": -np.log(cnt / total)} for nxt, cnt in followers.items()}

    # NOTE(review): without create_using, from_dict_of_dicts builds an undirected
    # Graph, so transition direction is not preserved — confirm this is intended.
    return from_dict_of_dicts(markovmodel)

def limit_style_tokens(word, length=4):
    """Keep only the first ``length`` underscore-separated fields of ``word``."""
    fields = word.split('_')
    kept_fields = fields[:length]
    return '_'.join(kept_fields)

def find_token_limiting(nodes, word):
    """Pick a random node whose shortened style token matches that of ``word``.

    Args:
        nodes: sequence of full style tokens; it is left untouched.
        word: style token to match after shortening with ``limit_style_tokens``.

    Returns:
        One element of ``nodes``; when several match, which one is returned
        depends on the random shuffle.

    Raises:
        ValueError: if no node matches.
    """
    # Shuffle a private copy so the caller's list keeps its order.
    shuffled = list(nodes)
    np.random.shuffle(shuffled)
    short_nodes = [limit_style_tokens(node) for node in shuffled]
    return shuffled[short_nodes.index(limit_style_tokens(word))]

def make_sentence_containing(markovchain, words, tokenize=True, strict=False):
    """Chain weighted shortest paths through ``words`` into one sentence.

    Args:
        markovchain: model passed to ``markov_to_graph``.
        words: words (or, with ``tokenize=False``, graph nodes) to visit.
            The argument itself is never modified.
        tokenize: when True each word is converted to a style token and
            replaced by a matching graph node.
        strict: when True the words are visited in the given order,
            otherwise in random order.

    Returns:
        The visited nodes joined by spaces, without the begin/end markers.
    """
    g = markov_to_graph(markovchain)

    if tokenize:
        words = [find_token_limiting(list(g.nodes), to_style_tokens(word)) for word in words]
    else:
        # Work on a copy: the shuffle below must not reorder the caller's list.
        words = list(words)

    if not strict:
        np.random.shuffle(words)

    tokens = ['___BEGIN__'] + words + ['___END__']
    sentence = []
    for source, target in zip(tokens, tokens[1:]):
        # Drop the first node of each leg; it is the last node of the previous leg.
        path = shortest_path(g, source, target, weight="weight")[1:]
        sentence.extend(path)

    # The final element is the end marker.
    return ' '.join(sentence[:-1])

def find_sentence_containing(markovchain, words, tokenize=True, max_tries=10000, max_words=25):
    """Sample sentences until one contains every entry of ``words``.

    Containment is a substring test; each entry must be matched by its own
    occurrence, so a word listed twice has to appear twice.

    Args:
        markovchain: object with ``make_sentence(tries=..., max_words=...)``.
        words: required words, or style tokens when ``tokenize`` is False.
        tokenize: when True the words are converted to shortened style tokens.
        max_tries: maximum number of sentences to sample.
        max_words: forwarded to ``make_sentence``.

    Returns:
        The first matching sentence, or None if none was found in ``max_tries``.
    """
    if tokenize:
        words = list(map(limit_style_tokens, map(to_style_tokens, words)))

    for i in range(max_tries):
        sent = markovchain.make_sentence(tries=100, max_words=max_words)
        if not sent:
            # make_sentence may give up and return None; just sample again.
            continue

        remaining = sent
        for word in words:
            if word not in remaining:
                break
            # Consume the match so a repeated word needs a second occurrence.
            remaining = remaining.replace(word, '', 1)
        else:
            print('Finding a sentence took {} tries.'.format(i))
            return sent

    return None


def safe_find(array, item):
    """Return the index of ``item`` in ``array``, or -1 when it is absent.

    ``array`` must provide an ``index`` method (list, tuple, str, ...).
    """
    try:
        return array.index(item)
    except ValueError:
        # Only "not found" is translated to -1; other errors (and interrupts)
        # are no longer swallowed.
        return -1

def log_normalize_dict(d):
    """Return the log of each value's share of the total.

    A list is treated as a mapping that gives every distinct element weight 1.
    """
    weights = dict.fromkeys(d, 1) if isinstance(d, list) else d
    total = sum(weights.values())

    log_shares = {}
    for key, value in weights.items():
        log_shares[key] = np.log(value / total)
    return log_shares

def normalize_dict(d):
    """Scale the values of ``d`` so that they sum to 1.

    A list is treated as a mapping that gives every distinct element weight 1.
    """
    if isinstance(d, list):
        weights = dict.fromkeys(d, 1)
    else:
        weights = d

    total = sum(weights.values())
    return {key: value / total for key, value in weights.items()}

def len_norm(n, t, weight=.7):
    """Length-normalisation factor for a hypothesis of ``n`` words.

    Args:
        n: current hypothesis length; 0 always yields 1.
        t: target length the hypothesis is compared against.
        weight: strength of the squared-distance term.

    Returns:
        ``n / (1/n - weight * (n - t)**2 / (1 - t)**2)``.
    """
    if n == 0:
        return 1

    squared_gap = (n - t) ** 2
    squared_span = (1 - t) ** 2
    penalty = weight * squared_gap / squared_span
    # Earlier variants that were tried: (5 + n)**weight / (5 + 1)**weight,
    # and n**weight for n > 5 (plain n otherwise).
    return n / (1 / n - penalty)

def eos_norm(n, t, weight=0.2, log=True):
    """End-of-sentence term based on the ratio ``(n + 1) / t``.

    With ``log=True`` the result is ``weight * log(ratio)``; otherwise the
    bare ratio is returned and ``weight`` is not used.
    """
    ratio = (n + 1) / t
    if log:
        return weight * np.log(ratio)
    return ratio

def beam_search(word_mm, pos_mm, context_words, beam_size=5, smoothing_prob=1e-6, 
                word_trans_weight=.5, emission_weight=.5, context_weight=.05, eos_norm_weight=.2, len_norm_weight=.7, 
                begin_token='___BEGIN__', end_token='___END__',
                variable_length=True, max_length=30):
    """Generate a sentence with beam search over a style-token template.

    A template of style tokens containing all ``context_words`` is sampled
    from ``pos_mm``. The sentence is then grown word by word; every candidate
    word is scored by a weighted mix of three distributions: the word-level
    transition counts of ``word_mm``, the emission counts of the template's
    current style token, and a uniform distribution over ``context_words``.

    Relies on a module-level ``emission_probs`` mapping (style token ->
    word counts) being defined before the call.

    Args:
        word_mm: word-level markovify model (``chain.model``, ``chain.state_size``).
        pos_mm: style-token markovify model used to sample the template.
        context_words: list of words that should show up in the sentence.
        beam_size: number of hypotheses kept per step and expanded per hypothesis.
        smoothing_prob: probability given to words missing from a distribution
            and to words that are being suppressed.
        word_trans_weight, emission_weight, context_weight: mixing weights;
            they are rescaled to sum to 1.
        eos_norm_weight: forwarded to ``eos_norm`` (unused there in non-log mode).
        len_norm_weight: forwarded to ``len_norm``.
        begin_token, end_token: sentence boundary markers of ``word_mm``.
        variable_length: when True, hypotheses from the previous step compete
            with their own extensions.
        max_length: hypotheses of this length are not extended further.

    Returns:
        ``(sentence, score)`` of the best hypothesis.

    Raises:
        ValueError: if no template containing all context words could be sampled.
    """
    weights = [word_trans_weight, emission_weight, context_weight]
    word_trans_weight, emission_weight, context_weight = np.array(weights) / sum(weights)

    pos_template = find_sentence_containing(pos_mm, context_words, max_words=max_length)
    if pos_template is None:
        # Without this guard the failure surfaced as an AttributeError on None.
        raise ValueError('No style-token sentence containing all context words could be sampled.')
    pos_sent = pos_template.split()
    n_t = len(pos_sent)
    print(n_t)

    context_set = set(context_words)
    queue = {tuple(): 0}
    i = 0

    while True:
        # Past the end of the template there is no style token to emit from.
        cur_tag = pos_sent[i] if i < len(pos_sent) else ''
        layer_candidates = queue if variable_length and i != 0 else {}

        for prev_words, prev_score in queue.items():
            # Finished or maximal hypotheses are not extended.
            if (prev_words and prev_words[-1] == end_token) or len(prev_words) == max_length:
                if not variable_length:
                    layer_candidates = {**layer_candidates, **{prev_words: prev_score}}
                continue

            n = len(prev_words)
            # Left-pad with begin tokens until the state has state_size entries.
            prev_state = [begin_token] * (word_mm.chain.state_size - n) + list(prev_words[-word_mm.chain.state_size:])

            transition_candidates = normalize_dict(word_mm.chain.model.get(tuple(prev_state), {}))
            emission_candidates = normalize_dict(emission_probs[cur_tag])
            context_candidates = normalize_dict(context_words)

            # One row per distribution, one column per word; missing entries are smoothed
            # and each column is collapsed to its weighted sum.
            merged_probs = pd.DataFrame([transition_candidates, emission_candidates, context_candidates])\
                                .fillna(smoothing_prob)\
                                .apply(lambda x: sum(x * [word_trans_weight, emission_weight, context_weight]))

            merged_probs[end_token] = merged_probs.get(end_token, smoothing_prob) + eos_norm(n, n_t, eos_norm_weight, False)

            # Suppress the last three words and every context word already used.
            # This needs set union/intersection: `or`/`and` only test truthiness and
            # never applied the context-word part.
            reduction_words = set(prev_words[-3:]) | (set(prev_words) & context_set)
            for word in reduction_words:
                merged_probs[word] = smoothing_prob if word in merged_probs else 0

            merged_probs /= sum(merged_probs)
            merged_log_probs = np.log(merged_probs)

            selected_candidates = merged_log_probs.nlargest(beam_size)
            # Undo the length normalisation of the previous score, add the new
            # log-probability and normalise again for the extended length.
            selected_candidates = {tuple(prev_words) + (word,): (prev_score * len_norm(n, n_t, len_norm_weight) + score) / len_norm(n + 1, n_t, len_norm_weight)
                                   for word, score in selected_candidates.items()}
            layer_candidates = {**layer_candidates, **selected_candidates}

        old_queue = queue
        queue = {k: layer_candidates[k] for k in sorted(layer_candidates, key=layer_candidates.get, reverse=True)[:beam_size]}

        # Stop once a step no longer changes the set of hypotheses in the beam.
        if set(queue.keys()) == set(old_queue.keys()):
            break

        i += 1

    # Prefer hypotheses that reached the end token; otherwise fall back to the whole beam.
    results = dict(filter(lambda x: end_token in x[0], queue.items()))
    if len(results) == 0:
        results = queue
        best = max(results, key=results.get)
        sent = ' '.join(best)
    else:
        best = max(results, key=results.get)
        sent = ' '.join(best[:safe_find(best, end_token)])

    print(sent)
    return sent, queue[best]

In [5]:
# Build the Markov models and emission counts for the author folders under folder_path.
# NOTE(review): `os` is not imported explicitly in this notebook; presumably it arrives
# through one of the star imports — confirm.
folder_path = '../DataAcquisition/data/'
# The first two directory entries are skipped. NOTE(review): which entries those are
# depends on the order os.listdir returns — confirm the intent.
for author in os.listdir(folder_path)[2:]:
    path = os.path.join(folder_path, author)
    print(path)
    
    # chain, pos_chain and emission_probs are rebound on every iteration, so after
    # the loop they hold the results for the last author only.
    chain, pos_chain = pos_markov_chain(path, state_size=3)
    emission_probs = pos_emission_prob(path)
    #g = markov_to_graph(pos_chain)
    #g = markov_to_graph(pos_chain)

../DataAcquisition/data/dickens
2019-04-16 21:47:39,117:INFO:CorpusStreamer: Loading 63 files.
2019-04-16 21:48:16,490:INFO:  6% parsed
2019-04-16 21:48:49,499:INFO: 12% parsed
2019-04-16 21:52:39,025:INFO: 19% parsed
2019-04-16 21:57:22,637:INFO: 25% parsed
2019-04-16 22:00:59,272:INFO: 31% parsed
2019-04-16 22:07:22,788:INFO: 38% parsed
2019-04-16 22:07:54,290:INFO: 44% parsed
2019-04-16 22:18:57,738:INFO: 50% parsed
2019-04-16 22:20:09,884:INFO: 57% parsed
2019-04-16 22:24:28,413:INFO: 63% parsed
2019-04-16 22:25:17,335:INFO: 69% parsed
2019-04-16 22:26:17,288:INFO: 76% parsed
2019-04-16 22:27:14,821:INFO: 82% parsed
2019-04-16 22:35:06,931:INFO: 88% parsed
2019-04-16 22:43:36,353:INFO: 95% parsed
2019-04-16 22:44:03,431:INFO:100% parsed
2019-04-16 22:44:18,306:INFO:CorpusStreamer: Loading 63 files.
2019-04-16 22:45:00,025:INFO:  6% parsed
2019-04-16 22:45:32,697:INFO: 12% parsed
2019-04-16 22:48:01,181:INFO: 19% parsed
2019-04-16 22:50:21,822:INFO: 25% parsed
2019-04-16 22:51:20,00

In [10]:
# Sample a style-token sentence, strip the '.' characters, and replace every style token
# by the single most frequent word counted for it in emission_probs.
' '.join([emission_probs[tag].most_common(1)[0][0] for tag in pos_chain.make_sentence().replace('.', '').split()])

'thing is of so particular man , those of these world i think i am an mediaeval man of the man of saying from spectacles and the other , which stood so so is the thing and one of the man that he has not called to be right of the man that when of world , the thing had been flung or that there is an thing'

In [386]:
# Context words handed to beam_search in the next cell.
words = ['war', 'men', 'created']

In [430]:
# beam_search takes (word_mm, pos_mm, context_words, ...) and reads the module-level
# emission_probs itself. Passing emission_probs as a third positional argument shifted
# `words` into the beam_size slot and collided with the beam_size keyword below.
probs = beam_search(chain, pos_chain, words,
                    beam_size=10,
                    word_trans_weight=1,
                    emission_weight=1,
                    context_weight=.2,
                    eos_norm_weight=0,
                    len_norm_weight=.05,
                    smoothing_prob=1e-6,
                    variable_length=True)

Finding a sentence took 891 tries.
18




i the prince silenced him by a simultaneous movement towards him, each , i have been


In [437]:
# Total follower counts over all states of the word-level chain. Updating one Counter
# in place avoids building a new, ever larger Counter for every state, which is what
# sum(..., Counter()) does.
cnts = Counter()
for followers in chain.chain.model.values():
    cnts.update(followers)