In [1]:
import re
import nltk
import pandas as pd
import numpy as np
from IPython.display import display, Markdown, HTML

## Divide into training and testing corpus

In [2]:
training_files = [
    '../files/allan_poe/berenice.txt',
    '../files/allan_poe/decenso_al_maelstron.txt',
    '../files/allan_poe/el_barril_de_amontillado.txt',
    '../files/allan_poe/el_diablo_en_el_campanario.txt',
    '../files/allan_poe/el_engaño_del_globo.txt',
    '../files/allan_poe/el_entierro_prematuro.txt',
    '../files/allan_poe/el_hombre_de_la_multitud.txt',
    '../files/allan_poe/el_retrato_oval.txt',
    '../files/allan_poe/gato_negro.txt',
    '../files/allan_poe/la_caída_de_la_casa_usher.txt',
    '../files/allan_poe/la_esfinje.txt',
    '../files/allan_poe/la_máscara_de_la_muerte_roja.txt',
    '../files/allan_poe/lady_ligeia.txt',

    '../Files/cien_soledad.txt',
    '../Files/el_principito.txt',
    '../Files/la_isla_del_tesoro.txt',
]
testing_files = [
    '../files/allan_poe/metzengerstein.txt',
    '../files/allan_poe/morerlla.txt',
    '../files/allan_poe/william_wilson.txt',
]

In [3]:
# Create training corpus
texts = []
for file_name in training_files:
    file = open(file_name, 'r')
    texts += [file.read()]
    file.close()
texts = pd.Series(texts, index=training_files)
training_text = ''.join(texts)

texts

../files/allan_poe/berenice.txt                        La desdicha es diversa. La desgracia cunde mul...
../files/allan_poe/decenso_al_maelstron.txt            Habíamos alcanzado la cumbre del despeñadero m...
../files/allan_poe/el_barril_de_amontillado.txt        Lo mejor que pude había soportado las mil inju...
../files/allan_poe/el_diablo_en_el_campanario.txt      Todos saben de una manera vaga que el lugar má...
../files/allan_poe/el_engaño_del_globo.txt             ¡Asombrosas noticias por expreso, vía Norfolk!...
../files/allan_poe/el_entierro_prematuro.txt           Hay ciertos temas de interés absorbente, pero ...
../files/allan_poe/el_hombre_de_la_multitud.txt        Con razón se ha dicho de cierto libro alemán q...
../files/allan_poe/el_retrato_oval.txt                 El castillo en el cual mi criado se le había o...
../files/allan_poe/gato_negro.txt                      No espero ni pido que alguien crea en el extra...
../files/allan_poe/la_caída_de_la_casa_usher.txt       

In [4]:
# Create testing corpus
texts = []
for file_name in testing_files:
    file = open(file_name, 'r')
    texts += [file.read()]
    file.close()
texts = pd.Series(texts, index=testing_files)
testing_text = ''.join(texts)
texts

../files/allan_poe/metzengerstein.txt    El horror y la fatalidad han estado al acecho ...
../files/allan_poe/morerlla.txt          Consideraba yo a mi amiga Morella con un senti...
../files/allan_poe/william_wilson.txt    Permitan que, por el momento, me presente como...
dtype: object

## Tokenization

In [5]:
END_SENTENCE = '<br>'

text = training_text.lower()
text = re.sub(r'[-)(\s«»"]+', ' ', text) # Ignored characters
text = re.sub(r'\.+\s*', END_SENTENCE, text) # Replace end of sentences
text = re.sub(r'\d+', '<n>', text) # Unify numbers

tokens = re.finditer(r'[\wáéíóúñ]+|<(n|br)>|[,¿?!¡;:]', text)
tokens = pd.Series([m.group(0) for m in tokens])

tokens

0               la
1         desdicha
2               es
3          diversa
4             <br>
            ...   
309510        ocho
309511      piezas
309512          de
309513           á
309514        ocho
Length: 309515, dtype: object

# 
## Unigrams

In [6]:
unigrams, counts = np.unique(tokens, return_counts=True)
unigrams = pd.DataFrame(counts, index=unigrams, columns=['count'])
unigrams = unigrams.sort_values(by='count', ascending=False)

unigrams['prob'] = unigrams['count'] / unigrams['count'].sum()
unigrams['cumsum'] = unigrams['prob'].cumsum()

unigrams

Unnamed: 0,count,prob,cumsum
de,16470,0.053212,0.053212
",",12800,0.041355,0.094567
la,10804,0.034906,0.129474
que,9581,0.030955,0.160428
y,9006,0.029097,0.189526
...,...,...,...
infundado,1,0.000003,0.999987
infundieron,1,0.000003,0.999990
infundir,1,0.000003,0.999994
infundirle,1,0.000003,0.999997


## Bigrams

In [7]:
bigrams = np.fromiter(nltk.ngrams(tokens, 2), dtype=('<U18, <U18'))
bigrams, counts = np.unique(bigrams, return_counts=True, axis=0)

bigrams = pd.MultiIndex.from_tuples(list(bigrams), names=['w1', 'w2'])

bigrams = pd.DataFrame(counts, index=bigrams, columns=['count'])
bigrams = bigrams.sort_values(by='count', ascending=False)
bigrams['prfix-sum'] = bigrams['count'].groupby(level='w1').transform(np.sum)

bigrams['prob'] = bigrams['count'] / bigrams['prfix-sum']
bigrams['cumsum'] = bigrams.prob.groupby(level=['w1']).cumsum()

bigrams

Unnamed: 0_level_0,Unnamed: 1_level_0,count,prfix-sum,prob,cumsum
w1,w2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
de,la,2369,16470,0.143837,0.143837
",",y,2067,12800,0.161484,0.161484
en,el,1258,7421,0.169519,0.169519
en,la,1118,7421,0.150654,0.320172
de,los,856,16470,0.051973,0.195811
...,...,...,...,...,...
klim,",",1,1,1.000000,1.000000
kircher,y,1,1,1.000000,1.000000
kilómetros,más,1,6,0.166667,1.000000
casi,desmayo,1,196,0.005102,1.000000


## Trigrams

In [8]:
trigrams = np.fromiter(nltk.ngrams(tokens, 3), dtype=('<U18, <U18, <U18'))
trigrams, counts = np.unique(trigrams, return_counts=True, axis=0)

trigrams = pd.DataFrame(trigrams)
trigrams['w1'] = trigrams[['f0', 'f1']].apply(" ".join, axis=1)
trigrams['w2'] = trigrams[['f2']]
trigrams = pd.MultiIndex.from_frame(trigrams[['w1', 'w2']])

trigrams = pd.DataFrame(counts, index= trigrams, columns=['count'])
trigrams = trigrams.sort_values(by='count', ascending=False)
trigrams['prfix-sum'] = trigrams['count'].groupby(level='w1').transform(np.sum)

trigrams['prob'] = trigrams['count'] / trigrams['prfix-sum']
trigrams['cumsum'] = trigrams.prob.groupby(level=['w1']).cumsum()

trigrams

Unnamed: 0_level_0,Unnamed: 1_level_0,count,prfix-sum,prob,cumsum
w1,w2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
coronel aureliano,buendía,191,198,0.964646,0.964646
josé arcadio,buendía,162,374,0.433155,0.433155
sin embargo,",",137,210,0.652381,0.652381
", y",que,133,2067,0.064344,0.064344
el coronel,aureliano,130,196,0.663265,0.663265
...,...,...,...,...,...
encerró con,josé,1,5,0.200000,0.400000
encerró con,llave,1,5,0.200000,0.600000
encerró con,su,1,5,0.200000,0.800000
encerró con,tranca,1,5,0.200000,1.000000


## 4-grams

In [9]:
fourgrams = np.fromiter(nltk.ngrams(tokens, 4), dtype=('<U18, <U18, <U18, <U18'))
fourgrams, counts = np.unique(fourgrams, return_counts=True, axis=0)

fourgrams = pd.DataFrame(fourgrams)
fourgrams['w1'] = fourgrams[['f0', 'f1', 'f2']].apply(" ".join, axis=1)
fourgrams['w2'] = fourgrams[['f3']]
fourgrams = pd.MultiIndex.from_frame(fourgrams[['w1', 'w2']])

fourgrams = pd.DataFrame(counts, index= fourgrams, columns=['count'])
fourgrams = fourgrams.sort_values(by='count', ascending=False)
fourgrams['prfix-sum'] = fourgrams['count'].groupby(level='w1').transform(np.sum)

fourgrams['prob'] = fourgrams['count'] / fourgrams['prfix-sum']
fourgrams['cumsum'] = fourgrams.prob.groupby(level=['w1']).cumsum()

fourgrams

Unnamed: 0_level_0,Unnamed: 1_level_0,count,prfix-sum,prob,cumsum
w1,w2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
el coronel aureliano,buendía,126,130,0.969231,0.969231
", sin embargo",",",71,72,0.986111,0.986111
<br> sin embargo,",",60,60,1.000000,1.000000
"remedios , la",bella,53,54,0.981481,0.981481
santa sofía de,la,53,53,1.000000,1.000000
...,...,...,...,...,...
el pelo hacia,adentro,1,1,1.000000,1.000000
el pelo hirsuto,y,1,1,1.000000,1.000000
el pelo mojado,y,1,1,1.000000,1.000000
el pelo pintado,de,1,1,1.000000,1.000000


# Generate text
## Unigrams

In [10]:
n_sentences = 5
words = []
while n_sentences > 0:
    p = np.random.rand()
    row_struct = unigrams[unigrams['cumsum'] > p].iloc[0]
    words += [row_struct.name]
    if row_struct.name == '<br>': 
        n_sentences-=1

HTML(' '.join(words))

## Bigrams

In [11]:
n_sentences = 3
words = [END_SENTENCE]
while n_sentences > 0:
    p = np.random.rand()
    prev_word = words[-1]
    row_struct = bigrams[bigrams['cumsum'] > p].loc[prev_word].iloc[0]
    words += [row_struct.name]
    if row_struct.name == END_SENTENCE: 
        n_sentences-=1

HTML(' '.join(words))

## Trigrams

In [12]:
n_sentences = 5

# Generarte from bigrams
words = words[:2]
while n_sentences > 0:
    p = np.random.rand()
    prev_word = ' '.join(words[-2:])
    row_struct = trigrams[trigrams['cumsum'] > p].loc[prev_word].iloc[0]
    words += [row_struct.name]
    if row_struct.name == END_SENTENCE: 
        n_sentences-=1

HTML(' '.join(words))

## Four-Grams

In [13]:
n_sentences = 3

# Generarte from trigrams
words = words[:3]
while n_sentences > 0:
    p = np.random.rand()
    prev_word = ' '.join(words[-3:])
    row_struct = fourgrams[fourgrams['cumsum'] > p].loc[prev_word].iloc[0]
    words += [row_struct.name]
    if row_struct.name == END_SENTENCE: 
        n_sentences-=1

HTML(' '.join(words))

# Zero words and smoothing

For this step we are going to take anotherr aprroach. using numpy arrays of dimesion n, wherer n is the size of the vocabularry including $<UNK>$ token


In [14]:
UNK = '<unk>'
vocabulary = np.unique(tokens)
vocabulary = np.append(vocabulary, UNK)
voc_len = len(vocabulary)

display(Markdown(f'''
lenght of vocabulary is **{voc_len}** including ```{UNK}``` token
'''))


lenght of vocabulary is **25641** including ```<unk>``` token


## Unigrams

In [15]:

_, frecs = np.unique(tokens, return_counts=True)
frecs = np.append(frecs, [0])
unigrams = pd.DataFrame(frecs, index=vocabulary, columns=['frec'], dtype=np.int16)

# Laplace smoothing
unigrams = unigrams + 1
unigrams['prob'] = unigrams / unigrams.sum()

# unigrams = unigrams / unigrams.sum()
unigrams

Unnamed: 0,frec,prob
!,237,0.000707
",",12801,0.038194
:,241,0.000719
;,259,0.000773
<br>,7479,0.022315
...,...,...
úrsula,515,0.001537
útero,2,0.000006
útil,12,0.000036
útiles,6,0.000018


## Bigrams

In [16]:
bigrams = np.fromiter(nltk.ngrams(tokens, 2), dtype=('<21U, <21U'))
frecs = np.unique(bigrams, return_counts=True)
bigrams = pd.DataFrame(.5, index=vocabulary, columns=vocabulary)

for (w1, w2), frec in zip(*frecs):
    bigrams[w1].loc[w2] += frec
bigrams = bigrams.divide(bigrams.sum(axis=1), axis=0)

bigrams

Unnamed: 0,!,",",:,;,<br>,<n>,?,a,aaaay,abad,...,única,únicamente,únicas,único,únicos,úrsula,útero,útil,útiles,<unk>
!,0.000038,0.000038,0.000038,0.000038,0.000115,0.000038,0.000038,0.000038,0.000038,0.000038,...,0.000038,0.000038,0.000038,0.000038,0.000038,0.000038,0.000038,0.000038,0.000038,0.000038
",",0.000488,0.000059,0.000020,0.000020,0.000332,0.000449,0.000254,0.000059,0.000059,0.000020,...,0.000059,0.000020,0.000020,0.000137,0.000020,0.003962,0.000020,0.000020,0.000020,0.000020
:,0.000038,0.000038,0.000038,0.000038,0.000038,0.000038,0.000038,0.000038,0.000038,0.000038,...,0.000038,0.000038,0.000038,0.000038,0.000038,0.000038,0.000038,0.000038,0.000038,0.000038
;,0.000038,0.000038,0.000038,0.000038,0.000038,0.000115,0.000038,0.000038,0.000038,0.000038,...,0.000038,0.000038,0.000038,0.000038,0.000038,0.000038,0.000038,0.000038,0.000038,0.000038
<br>,0.000123,0.000025,0.000025,0.000025,0.000025,0.000123,0.000123,0.000123,0.000025,0.000025,...,0.000025,0.000025,0.000025,0.000025,0.000025,0.002242,0.000025,0.000074,0.000074,0.000025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
úrsula,0.000037,0.003337,0.000112,0.000037,0.007387,0.000037,0.000037,0.003037,0.000037,0.000037,...,0.000037,0.000037,0.000037,0.000037,0.000037,0.000037,0.000037,0.000037,0.000037,0.000037
útero,0.000039,0.000039,0.000039,0.000039,0.000039,0.000039,0.000039,0.000039,0.000039,0.000039,...,0.000039,0.000039,0.000039,0.000039,0.000039,0.000039,0.000039,0.000039,0.000039,0.000039
útil,0.000039,0.000039,0.000039,0.000039,0.000039,0.000039,0.000039,0.000039,0.000039,0.000039,...,0.000039,0.000039,0.000039,0.000039,0.000039,0.000039,0.000039,0.000039,0.000039,0.000039
útiles,0.000039,0.000039,0.000039,0.000039,0.000039,0.000039,0.000039,0.000039,0.000039,0.000039,...,0.000039,0.000039,0.000039,0.000039,0.000039,0.000039,0.000039,0.000039,0.000039,0.000039


## Trigrams

Intensive memory usage 

## 4-grams
Intensive memory usage

# Perplexity Evalution

## Tokenization testing corpus 

In [17]:
text = testing_text.lower()
text = re.sub(r'[-)(\s«»"]+', ' ', text) # Ignored characters
text = re.sub(r'\.+\s*', END_SENTENCE, text) # Replace end of sentences
text = re.sub(r'\d+', '<n>', text) # Unify numbers

test_tokens = re.finditer(r'[\wáéíóúñ]+|<(n|br)>|[,¿?!¡;:]', text)
test_tokens = pd.Series([m.group(0) for m in test_tokens])

test_tokens

0               el
1           horror
2                y
3               la
4        fatalidad
           ...    
14298          has
14299    asesinado
14300           tú
14301        mismo
14302            !
Length: 14303, dtype: object

## Unigrams test
Due number arer biger than the datatype does not support big numbers here we arer using log scale to multiply inverse token probability
$$
    \prod^N_{i=0} p_i = exp \left ( \sum^N_{i=0} log(p_i) \right )
$$
In the case of the unigrams example 
$$
    \sqrt[14303]{\prod^{14302}_{i = 0} log \left (\frac{1}{factors_i} \right )} = \sqrt[14303]{exp(89969.27206274212)}
$$
$$
    \sqrt[14303]{exp(89969.27206274212)} = 539.281585507262
$$

In [18]:
factors = []
N = len(test_tokens)

for w1 in test_tokens:
    _, p = unigrams.loc[w1] if w1 in unigrams.index else unigrams.loc[UNK]

    factors += [
        np.log(1 / p)
    ]

factors = np.array(factors)
perplexity = factors.sum()

display(Markdown('''$$
    \\sqrt[%i]{\\prod^{%i}_{i = 0} log \\left (\\frac{1}{factors_i} \\right )} = \\sqrt[%i]{exp(%s)}
$$''' % (N, N-1, N, perplexity)))

$$
    \sqrt[14303]{\prod^{14302}_{i = 0} log \left (\frac{1}{factors_i} \right )} = \sqrt[14303]{exp(101186.8022213538)}
$$

## Bigrams
In the case of the bigrams example 
$$
    \sqrt[14302]{\prod^{14301}_{i = 0} log \left (\frac{1}{factors_i} \right )} = \sqrt[14302]{exp(119494.189417991)}
$$
$$
    \sqrt[14302]{exp(119494.189417991)} = 4251.6757621947
$$

In [19]:
test_bigrams = np.fromiter(nltk.ngrams(test_tokens, 2), dtype=('<U21, <U21'))
N = len(test_bigrams)

factors = []
for w1, w2 in test_bigrams:
    w1 = w1 if w1 in bigrams.index else UNK
    w2 = w2 if w2 in bigrams else UNK
    p = bigrams[w1].loc[w2]
    factors += [
        np.log(1/p)
    ]

factors = np.array(factors)
perplexity = factors.sum()

display(Markdown('''$$
    \\sqrt[%i]{\\prod^{%i}_{i = 0} log \\left (\\frac{1}{factors_i} \\right )} = \\sqrt[%i]{exp(%s)}
$$''' % (N, N-1, N, perplexity)))

$$
    \sqrt[14302]{\prod^{14301}_{i = 0} log \left (\frac{1}{factors_i} \right )} = \sqrt[14302]{exp(119494.189417991)}
$$