In [1]:
import re
import nltk
import pandas as pd
import numpy as np

# Taller 2
## Token extraction

In [34]:
files = [
    '../files/allan_poe/berenice.txt',
    '../files/allan_poe/decenso_al_maelstron.txt',
    '../files/allan_poe/el_barril_de_amontillado.txt',
    '../files/allan_poe/el_diablo_en_el_campanario.txt',
    '../files/allan_poe/el_engaño_del_globo.txt',
    '../files/allan_poe/el_entierro_prematuro.txt',
    '../files/allan_poe/el_hombre_de_la_multitud.txt',
    '../files/allan_poe/el_retrato_oval.txt',
    '../files/allan_poe/gato_negro.txt',
    '../files/allan_poe/la_caída_de_la_casa_usher.txt',
    '../files/allan_poe/la_esfinje.txt',
    '../files/allan_poe/la_máscara_de_la_muerte_roja.txt',
    '../files/allan_poe/lady_ligeia.txt',
    '../files/allan_poe/metzengerstein.txt',
    '../files/allan_poe/morerlla.txt',
    '../files/allan_poe/william_wilson.txt',

    # '../Files/cien_soledad.txt',
    # '../Files/el_principito.txt',
    # '../Files/la_isla_del_tesoro.txt',
]
texts = []
for file_name in files:
    file = open(file_name, 'r')
    texts += [file.read()]
    file.close()

texts = pd.Series(texts, index=files)
text = ''.join(texts)
texts

../files/allan_poe/berenice.txt                        La desdicha es diversa. La desgracia cunde mul...
../files/allan_poe/decenso_al_maelstron.txt            Habíamos alcanzado la cumbre del despeñadero m...
../files/allan_poe/el_barril_de_amontillado.txt        Lo mejor que pude había soportado las mil inju...
../files/allan_poe/el_diablo_en_el_campanario.txt      Todos saben de una manera vaga que el lugar má...
../files/allan_poe/el_engaño_del_globo.txt             ¡Asombrosas noticias por expreso, vía Norfolk!...
../files/allan_poe/el_entierro_prematuro.txt           Hay ciertos temas de interés absorbente, pero ...
../files/allan_poe/el_hombre_de_la_multitud.txt        Con razón se ha dicho de cierto libro alemán q...
../files/allan_poe/el_retrato_oval.txt                 El castillo en el cual mi criado se le había o...
../files/allan_poe/gato_negro.txt                      No espero ni pido que alguien crea en el extra...
../files/allan_poe/la_caída_de_la_casa_usher.txt       

In [36]:
text = text.lower()
text = re.sub(r'[-)(\s«»"]+', ' ', text) # Ignored characters
text = re.sub(r'\.\s*', '<s>', text) # Replace end of sentences

tokens = re.finditer(r'[\wáéíóúñ]+|<s>|[,¿?!¡;:]', text)
tokens = pd.Series([m.group(0) for m in tokens])

tokens

0               la
1         desdicha
2               es
3          diversa
4              <s>
           ...    
71088          has
71089    asesinado
71090           tú
71091        mismo
71092            !
Length: 71093, dtype: object

# 
## Unigrams

In [40]:
unigrams, counts = np.unique(tokens, return_counts=True)
unigrams = pd.DataFrame(counts, index=unigrams, columns=['count'])

unigrams['prob'] = unigrams['count'] / unigrams['count'].sum()
unigrams['cumsum'] = unigrams['prob'].cumsum()

unigrams

Unnamed: 0,count,prob,cumsum
!,224,0.003151,0.003151
",",4834,0.067995,0.071146
000,4,0.000056,0.071203
010,1,0.000014,0.071217
10,1,0.000014,0.071231
...,...,...,...
única,13,0.000183,0.999719
únicamente,1,0.000014,0.999733
únicas,1,0.000014,0.999747
único,16,0.000225,0.999972


## Bigrams

In [43]:
bigrams = np.fromiter(nltk.ngrams(tokens, 2), dtype=('<U18, <U18'))
bigrams, counts = np.unique(bigrams, return_counts=True, axis=0)

bigrams = pd.MultiIndex.from_tuples(list(bigrams), names=['w1', 'w2'])

bigrams = pd.DataFrame(counts, index=bigrams, columns=['count'])
bigrams['prfix-sum'] = bigrams['count'].groupby(level='w1').transform(np.sum)

bigrams['prob'] = bigrams['count'] / bigrams['prfix-sum']
bigrams['cumsum'] = bigrams.prob.groupby(level=['w1']).cumsum()

bigrams

Unnamed: 0_level_0,Unnamed: 1_level_0,count,prfix-sum,prob,cumsum
w1,w2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
!,",",11,223,0.049327,0.049327
!,<s>,2,223,0.008969,0.058296
!,?,1,223,0.004484,0.062780
!,a,2,223,0.008969,0.071749
!,al,3,223,0.013453,0.085202
...,...,...,...,...,...
único,refugio,1,16,0.062500,0.875000
único,sonido,1,16,0.062500,0.937500
único,tema,1,16,0.062500,1.000000
únicos,presentes,1,2,0.500000,0.500000


## Trigrams

In [45]:
trigrams = np.fromiter(nltk.ngrams(tokens, 3), dtype=('<U18, <U18, <U18'))
trigrams, counts = np.unique(trigrams, return_counts=True, axis=0)

trigrams = pd.DataFrame(trigrams)
trigrams['w1'] = trigrams[['f0', 'f1']].apply(" ".join, axis=1)
trigrams['w2'] = trigrams[['f2']]
trigrams = pd.MultiIndex.from_frame(trigrams[['w1', 'w2']])

trigrams = pd.DataFrame(counts, index= trigrams, columns=['count'])
trigrams['prfix-sum'] = trigrams['count'].groupby(level='w1').transform(np.sum)

trigrams['prob'] = trigrams['count'] / trigrams['prfix-sum']
trigrams['cumsum'] = trigrams.prob.groupby(level=['w1']).cumsum()

trigrams

Unnamed: 0_level_0,Unnamed: 1_level_0,count,prfix-sum,prob,cumsum
w1,w2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"! ,",a,1,11,0.090909,0.090909
"! ,",acaso,2,11,0.181818,0.272727
"! ,",fueron,1,11,0.090909,0.363636
"! ,",la,1,11,0.090909,0.454545
"! ,",los,1,11,0.090909,0.545455
...,...,...,...,...,...
único refugio,asequible,1,1,1.000000,1.000000
único sonido,en,1,1,1.000000,1.000000
único tema,de,1,1,1.000000,1.000000
únicos presentes,a,1,1,1.000000,1.000000


## 4-grams

In [9]:
fourgrams = np.fromiter(nltk.ngrams(tokens, 4), dtype=('<U18, <U18, <U18, <U18'))
fourgrams, counts = np.unique(fourgrams, return_counts=True, axis=0)

fourgrams = pd.DataFrame(fourgrams)
fourgrams['w1'] = fourgrams[['f0', 'f1', 'f2']].apply(" ".join, axis=1)
fourgrams['w2'] = fourgrams[['f3']]
fourgrams = pd.MultiIndex.from_frame(fourgrams[['w1', 'w2']])

fourgrams = pd.DataFrame(counts, index= fourgrams, columns=['count'])
fourgrams['prfix-sum'] = fourgrams['count'].groupby(level='w1').transform(np.sum)

fourgrams['prob'] = fourgrams['count'] / fourgrams['prfix-sum']
fourgrams['cumsum'] = fourgrams.prob.groupby(level=['w1']).cumsum()

fourgrams

Unnamed: 0_level_0,Unnamed: 1_level_0,count,prfix-sum,prob,cumsum
w1,w2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"! , a",propósito,1,1,1.0,1.0
"! , exclamó",<s>,1,1,1.0,1.0
"! , fueron",los,1,1,1.0,1.0
"! , gritó",una,1,1,1.0,1.0
"! , le",gritó,1,1,1.0,1.0
...,...,...,...,...,...
útiles <s> fue,entonces,1,1,1.0,1.0
útiles como hubiera,podido,1,1,1.0,1.0
útiles domésticos habían,sido,1,1,1.0,1.0
útiles domésticos y,el,1,1,1.0,1.0


In [46]:
unigrams[unigrams['cumsum'] > 0.97].iloc[-1]

count     2.000000
prob      0.000028
cumsum    1.000000
Name: únicos, dtype: float64