In [23]:
import os
import numpy as np
import re
import pandas as pd
import string
import codecs
from sklearn.utils import shuffle

In [2]:
def log_progress(sequence, every=None, size=None, name='Items'):
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    is_iterator = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            is_iterator = True
    if size is not None:
        if every is None:
            if size <= 200:
                every = 1
            else:
                every = int(size / 200)     # every 0.5%
    else:
        assert every is not None, 'sequence is iterator, set every'

    if is_iterator:
        progress = IntProgress(min=0, max=1, value=1)
        progress.bar_style = 'info'
    else:
        progress = IntProgress(min=0, max=size, value=0)
    label = HTML()
    box = VBox(children=[label, progress])
    display(box)

    index = 0
    try:
        for index, record in enumerate(sequence, 1):
            if index == 1 or index % every == 0:
                if is_iterator:
                    label.value = '{name}: {index} / ?'.format(
                        name=name,
                        index=index
                    )
                else:
                    progress.value = index
                    label.value = u'{name}: {index} / {size}'.format(
                        name=name,
                        index=index,
                        size=size
                    )
            yield record
    except:
        progress.bar_style = 'danger'
        raise
    else:
        progress.bar_style = 'success'
        progress.value = index
        label.value = "{name}: {index}".format(
            name=name,
            index=str(index or '?')
        )

In [15]:
FILE_UNIGRAMS = '1grams-3.txt'
N_WORDS = 192689044
TOL = 1e-20
FILE_TEXT_p = pd.read_pickle("df")
FILE_TEXT_p = np.asarray(FILE_TEXT_p.text)

In [18]:
FILE_TEXT_arr = shuffle(FILE_TEXT_p)

In [19]:
FILE_TEXT = FILE_TEXT_arr[0]
FILE_TEXT

'Denver, Colorado (CNN) Magic, it seemed, was possible any time Dorien Bryant touched the ball.\n\nElectric runs like the 98-yard opening kickoff return for a touchdown in front of 108,000 at Penn State, or the 95-yard score at home against Northwestern, or the touchdown catches of 53 and 41 yards against Indiana State peppered his college football career.\n\nBryant left Purdue University in 2007 as the school\'s career leader in all-purpose yards, setting multiple Big Ten conference records before signing with the Pittsburgh Steelers.\n\nYet just a year later, the playmaker was out of football, walking away from an offer from the Tennessee Titans and reportedly ignoring a call from the Dallas Cowboys.\n\n"I came out when I was 24 or 25," Bryant tells CNN Sport of his decision to live openly as a gay athlete about a year after ending his NFL pursuit. "It was more of my choice, I didn\'t try and push it."\n\n"I never wanted to be a figurehead for a movement; I don\'t want to be that guy

In [20]:
EXCLUDE_SYMBOLS_STR = u''.join(['№', '«', 'ђ', '°', '±', '‚', 'ћ', '‰', '…', '»', 'ѓ', 'µ', '·', 'ґ', 'њ', 'ї', 'џ', 'є',
                                '‹', '‡', '†', '¶', 'ќ', '€', '“', 'ў', '§', '„', '”', '\ufeff', '’', 'љ', '›', '•', '—',
                                '‘', '\x7f', '\xad', '¤', '\xa0'])

In [24]:
frequencies = {}

with codecs.open(FILE_UNIGRAMS, 'r', 'utf_8_sig') as f:
    for line in f:
        split = line.strip().split('\t')
        try:
            freq, word = split
        except:
            print(split)
        frequencies[word] = float(freq) / N_WORDS

In [25]:
len(frequencies)

1054210

In [26]:
regex_puncts = re.compile('[%s]' % re.escape(string.punctuation))
regex_symbs = re.compile('[%s]' % re.escape(EXCLUDE_SYMBOLS_STR))
regex_digits = re.compile('[%s]' % re.escape(string.digits))
regex_spaces = re.compile('[%s]' % string.printable + string.whitespace)

In [28]:
txt_prep = regex_puncts.sub('', FILE_TEXT)

In [29]:
txt_prep = regex_symbs.sub('', txt_prep)


In [30]:
txt_prep = regex_digits.sub('', txt_prep)

In [31]:
txt_prep = regex_spaces.sub('', txt_prep)

In [32]:
txt_prep = txt_prep.strip().strip('\t').replace('\n', '').lower()

In [33]:
txt_prep

'denver colorado cnn magic it seemed was possible any time dorien bryant touched the ballelectric runs like the yard opening kickoff return for a touchdown in front of  at penn state or the yard score at home against northwestern or the touchdown catches of  and  yards against indiana state peppered his college football careerbryant left purdue university in  as the schools career leader in allpurpose yards setting multiple big ten conference records before signing with the pittsburgh steelersyet just a year later the playmaker was out of football walking away from an offer from the tennessee titans and reportedly ignoring a call from the dallas cowboysi came out when i was  or  bryant tells cnn sport of his decision to live openly as a gay athlete about a year after ending his nfl pursuit it was more of my choice i didnt try and push iti never wanted to be a figurehead for a movement i dont want to be that guydorien bryant set records in four years at purdue university but walked away

In [34]:
words = txt_prep.split()

print(len(words))

3013


In [35]:
frequencies_empirical = {}

for word in log_progress(words):
    if frequencies_empirical.get(word, None) is None:
        frequencies_empirical[word] = 1
    else:
        frequencies_empirical[word] += 1

VBox(children=(HTML(value=''), IntProgress(value=0, max=3013)))

In [37]:
def perplexity(docs):
    docs = np.asarray(docs)
    out_of_dict = []
    
    tmp_sum_docs = 0.0
    N = 0.0
    for doc in log_progress(docs):
        tmp_sum_words = 0.0
        for word in log_progress(words):
            freq = frequencies.get(word, TOL)
            freq_empirical = frequencies_empirical.get(word, 0.0)
            N += freq_empirical
            if freq == TOL:
                out_of_dict.append(word)
            tmp_sum_words += freq_empirical * np.log(freq)
        tmp_sum_docs += tmp_sum_words
        
    return np.exp(-tmp_sum_docs / N), out_of_dict

In [38]:
perp, ood = perplexity([words])

print(perp)

VBox(children=(HTML(value=''), IntProgress(value=0, max=1)))

VBox(children=(HTML(value=''), IntProgress(value=0, max=3013)))

967264.2410476746


In [39]:
print(np.unique(ood))

['aback' 'acceptableduring' 'accepted' 'actiona' 'adding' 'adds' 'admits'
 'afraidthe' 'afterthoughtit' 'afterward' 'agowere' 'agrees' 'airlines'
 'allencompassing' 'allison' 'allotted' 'allowed' 'allpurpose' 'amazing'
 'angeles' 'anthem' 'anthems' 'anyone' 'arizona' 'armsi' 'army' 'aszur'
 'athlete' 'athletes' 'athletic' 'athleticism' 'atlanta' 'austin'
 'avoided' 'backbryant' 'backseat' 'ballelectric' 'bartender' 'basically'
 'battled' 'battling' 'became' 'beforebryant' 'began' 'belti' 'benjamin'
 'berlin' 'besides' 'bidding' 'bids' 'billy' 'boasting' 'boilermaker'
 'boost' 'boston' 'bowl' 'bowleveryone' 'bowli' 'bowls' 'boyfriend'
 'broncos' 'bryant' 'budding' 'buses' 'buzinski' 'byrant' 'camaraderie'
 'canada' 'canadas' 'canadian' 'capped' 'captain' 'caption' 'careerbryant'
 'caresthese' 'carolina' 'carolna' 'catches' 'catching' 'celebration'
 'ceremony' 'championship' 'checked' 'chicago' 'chippy' 'chris'
 'churchgoing' 'closing' 'cnn' 'coachingim' 'cofounded' 'cofounder'
 'collisi