In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from spacy.en import English
import numpy as np
import functools

In [2]:
#
def batch_until(lines, condition=''):
    # batch paragraphs
    batch = ''
    for l in lines:
        _l = l.strip()
        if _l == condition and batch != '':
            yield batch
            batch = ''
        if _l == condition:
            continue
        batch += ' ' + _l


def sentence_gen(lines):
    # yield sentences
    for doc in parser.pipe(batch_until(lines), batch_size=10, n_threads=2):
        for s in doc.sents:
            # sentinel values for begin/end sentence
            yield '%s %s %s' % ('xsb xsb', s.text.strip(), 'xse')
            
            
def tst_idx(tst_share=.05):
    # indexes for test data
    return np.random.choice(100, int(100*tst_share), replace=False)
    
    
def trn_tst_split_sentence_gen(filename, tst_idx=None):
    # yeild train/test data
    with open(filename, 'r') as lines:
        for idx, s in enumerate(sentence_gen(lines)):
            if tst_idx is not None and (idx%100) in tst_idx:
                yield s
            elif tst_idx is not None:
                continue
            else:
                yield s

In [3]:
the_trial = './data/the_trial.txt'

# sentence recognition can be non-trivial, so use spaCy
parser = English()

trn_sentence_gen = trn_tst_split_sentence_gen(the_trial)

tstidx = tst_idx()

In [5]:
# INITIALIZE WORD COUNT MATRIX
window_size = 3
ngram = CountVectorizer(strip_accents='ascii', ngram_range=(1, window_size), 
                        lowercase=True, token_pattern='(?u)\\b\\w+\\b')

ngram_dtm = ngram.fit_transform(trn_sentence_gen)

ngram_vocab = np.array(ngram.get_feature_names())
ngram_word_count = np.array(ngram_dtm.sum(axis=0))[0]

words = [w for w in ngram.vocabulary_.keys() if ' ' not in w]

vocab_size = len(words)

_bp = ngram.build_preprocessor()
_bt = ngram.build_tokenizer()
# data transformer for test data
sentence_transformer = lambda s: _bt(_bp(ngram.decode(s)))

In [6]:
ngram_count = lambda w: vocab_size if w=='' else 0.0 if ngram.vocabulary_.get(w) is None else ngram_word_count[ngram.vocabulary_.get(w)] 

def ngram_prob(ngram_str, n_words_doc):
    if ngram_str == '':
        # 0-gram
        return 1/n_words_doc
    conditioning = ngram_str.split()[:-1]
    numer = ngram_count(ngram_str)
    if len(conditioning) == 0:
        # uni-gram
        return numer/n_words_doc
    conditioning = ngram_count(' '.join(conditioning))
    if conditioning == 0.0:
        # out of vocabulary
        return conditioning
    # n-gram
    return ngram_count(ngram_str)/conditioning

In [7]:
# LEARN LAMBDA VALUES FROM EM ALGORITHM

n_words_doc = functools.reduce(lambda a, b: a+b, [ngram_word_count[ngram.vocabulary_.get(w)] for w in words])

# [('xsb xsb now', 'xsb now', 'now', ''),
#  ('xsb now they', 'now they', 'they', ''), ...]
to_ngrams = lambda s: [tuple(' '.join(s[(idx-3+_idx):idx]) for _idx in range(4)) for idx in range(3, len(s)+1)]

# [[0.010, 0.005, 0.065, 0.000], ...]
to_ngram_vals = lambda ngrams: [[ngram_prob(ng, n_words_doc) for ng in ngs] for ngs in ngrams]

# ls: initial lambda values
ls = np.array([.25,.25,.25,.25])

# 5 iterations over EM
for _ in range(5):
    # ngram values
    ng_arr = np.zeros(4)
    for str_hld in trn_tst_split_sentence_gen(the_trial, tstidx):
        transformed = sentence_transformer(str_hld)
        ngrams = to_ngrams(transformed)        
        ngvals = to_ngram_vals(ngrams)
        # Expectation
        expctations = [ls*ngval for ngval in ngvals]
        for e in expctations:
            # z normalizing constant
            z = e.sum()
            ng_arr += e/z

    # Maximization        
    ls = ng_arr/ng_arr.sum()
    print(ls)

[  7.78524401e-01   1.87363737e-01   3.39541878e-02   1.57673601e-04]
[  9.31495186e-01   6.60160085e-02   2.48875833e-03   4.69064058e-08]
[  9.78093474e-01   2.17345508e-02   1.71974955e-04   1.27148778e-11]
[  9.92925219e-01   7.06308529e-03   1.16954631e-05   3.36252720e-15]
[  9.97711739e-01   2.28746918e-03   7.91531641e-07   8.82578036e-19]


In [8]:
# GERNATE NEW TEXT

def gen_word(context):
    r = np.random.random()
    acc_sum = 0.0
    for w in words:
        snt = '%s %s' % (context, w)
        transformed = sentence_transformer(snt)
        ngrams = to_ngrams(transformed)        
        ngvals = to_ngram_vals(ngrams)[0]
        acc_sum += (ls*ngvals).sum()
        if acc_sum>=r:
            if w == 'xsb':
                continue
            if w == 'xse':
                return '.'
            return w

def gen_sentence():
    context = ['xsb', 'xsb']
    sent = []
    while True:
        w = gen_word(' '.join(context))
        if w == context[1]:
            continue
        sent.append(w)
        context[0] = context[1]
        context[1] = w
        if w == '.':
            return ' '.join(sent)


for _ in range(5):
    print(gen_sentence() + '\n')

why did the job i ve attacked you added k .

the businessman and anyway i want to miss montag made a sign from her fault or not .

block listened closely with his hand and then in the country had not seen the two of them if i m concerned i ve heard about your case there was nowhere to be dismissed from his work when he had been unable to ask her advice .

there were however also to have a few bland phrases which the court offices .

all of them had gathered together down in the waiting room here it would never have arisen in the pockets of his collar .



In [9]:
# PERPLEXITY MODEL EVALUATION 

tst_n = 0 # length normalization
accum_prob = 0
for str_hld in trn_tst_split_sentence_gen(the_trial, tstidx):
    tstrsplt=sentence_transformer(str_hld)
    ngrams=[tuple(' '.join(tstrsplt[(idx-3+_idx):idx]) for _idx in range(4)) for idx in range(3, len(tstrsplt)+1)]
    tst_n += len(ngrams)
    ngvals=[[ngram_prob(ng, n_words_doc) for ng in ngs] for ngs in ngrams]
    x = [ls*ngval for ngval in ngvals]
    for _x in x:
        accum_prob -= np.log(_x.sum())
       
np.exp(accum_prob/tst_n)

4.4592283199900509