In [99]:
#!pip install pyLDAvis
import gensim
import pyLDAvis
from pyLDAvis import gensim as gensimvis
from pyLDAvis import sklearn
import spacy

from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.preprocessing import Normalizer
import pandas as pd
import numpy as np
import pprint as pp
import pickle
import tqdm
from collections import Counter
from nltk import ngrams, bigrams

In [5]:
with open('./datasets/borges_full_texts.pkl', 'rb') as f:
   data = pickle.load(f)

In [6]:
data.head()

Unnamed: 0,link,text_metadata,text
0,https://ciudadseva.com/texto/abel-y-cain-borges/,"{'title': 'Abel y Caín', 'metadata': '[Minicue...",Abel y Caín se encontraron después de la muert...
1,https://ciudadseva.com/texto/adrogue/,"{'title': 'Adrogué', 'metadata': '[Minicuento ...","Era muy lindo, un pueblo laberíntico. A veces,..."
2,https://ciudadseva.com/texto/alguien-sonara/,"{'title': 'Alguien soñará', 'metadata': '[Mini...",¿Qué soñará el indescifrable futuro? Soñará qu...
3,https://ciudadseva.com/texto/andres-armoa/,"{'title': 'Andrés Armoa', 'metadata': '[Minicu...",Los años le han dejado unas palabras en guaran...
4,https://ciudadseva.com/texto/argumentum-ornith...,"{'title': 'Argumentum ornithologicum', 'metada...",Cierro los ojos y veo una bandada de pájaros. ...


In [69]:
corpus_vectorizado = CountVectorizer( max_features = 40000, ngram_range=(1,3), min_df=3, max_df=0.4)

In [15]:
corpus_vectorizado

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=0.4, max_features=40000, min_df=3,
                ngram_range=(1, 3), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [70]:
lda = LatentDirichletAllocation(n_components = 30,learning_method='batch',max_iter=100)

In [71]:
corpus_para_lda = corpus_vectorizado.fit_transform(data.text)

In [54]:
def imprimir_palabras_mas_relevantes(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]+" || "
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

In [72]:
lda.fit(corpus_para_lda)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=100,
                          mean_change_tol=0.001, n_components=30, n_jobs=None,
                          perp_tol=0.1, random_state=None,
                          topic_word_prior=None, total_samples=1000000.0,
                          verbose=0)

In [73]:
print("\nTopics in LDA model:")
tf_feature_names = corpus_vectorizado.get_feature_names()
n_top_words = 3
imprimir_palabras_mas_relevantes(lda, tf_feature_names, n_top_words)


Topics in LDA model:
Topic #0: juan ||  mujer ||  iba || 
Topic #1: sueño ||  desierto ||  luego || 
Topic #2: negro ||  que yo ||  cara || 
Topic #3: pedro ||  coronel ||  tan || 
Topic #4: que las ||  para el ||  el mundo || 
Topic #5: mapa ||  imperio ||  del imperio || 
Topic #6: serpiente ||  será ||  arca || 
Topic #7: vi ||  aleph ||  carlos || 
Topic #8: biblioteca ||  libros ||  la biblioteca || 
Topic #9: habían ||  padre ||  muchacha || 
Topic #10: que me ||  cuatro ||  palabra || 
Topic #11: infamia ||  nosotros ||  crucifixión de || 
Topic #12: interlocutor ||  hacia el alba ||  de la frontera || 
Topic #13: único ||  entrar ||  entonces la || 
Topic #14: jardín ||  iba ||  usted || 
Topic #15: único ||  entrar ||  entonces la || 
Topic #16: obra ||  han ||  segundo || 
Topic #17: máquina ||  sueño ||  cuerpo || 
Topic #18: infinito ||  azar ||  el libro || 
Topic #19: que me ||  te ||  gente || 
Topic #20: luego ||  recordó ||  pensó || 
Topic #21: ciudad ||  la ciudad |

# Lematizando

In [84]:
nlp = spacy.load("es_core_news_sm")

data_base = data.copy()

def lemmafy(doc):
    lemmas = []
    for token in doc:
        if not token.is_stop and token.is_alpha:
             lemmas.append(token.lemma_)
    return ' '.join(lemmas)

texts = []
for text in tqdm.tqdm(data_base["text"]):
    doc = nlp(text)
    lemmed = lemmafy(doc)
    texts.append(lemmed)

100%|██████████| 62/62 [00:10<00:00,  5.68it/s]


In [87]:
df_texts = pd.DataFrame({"texts":texts})
df_texts["texts"]

0     Abel y Caín encontrar muerte Abel Caminaban de...
1     lindar poblar laberíntico A noche verano salir...
2     soñar indescifrable futuro Soñará Alonso Quija...
3     año dejar palabra guaraní ocasión requerir tra...
4     Cierro ojo y ver bandada pájaro visión duro o ...
                            ...                        
57    I Debo a conjunción espejar y enciclopedia des...
58    Asia Menor o Alejandría siglo fe Basílides pub...
59    Imaginemos Toledo descubrir papel texto arábig...
60    desierto Irán alto torrar piedra puerta ventan...
61    cerro igualar tierra llanura y ir caminar llan...
Name: texts, Length: 62, dtype: object

In [94]:
lda_lemmed = LatentDirichletAllocation(n_components = 30,learning_method='batch',max_iter=100)

In [95]:
corpus_lemmed = corpus_vectorizado.fit_transform(df_texts.texts)

In [96]:
lda_lemmed.fit(corpus_lemmed)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=100,
                          mean_change_tol=0.001, n_components=30, n_jobs=None,
                          perp_tol=0.1, random_state=None,
                          topic_word_prior=None, total_samples=1000000.0,
                          verbose=0)

In [98]:
print("\nTopics in LDA model AFTER LEMMING:")
tf_feature_names = corpus_vectorizado.get_feature_names()
n_top_words = 3
imprimir_palabras_mas_relevantes(lda_lemmed, tf_feature_names, n_top_words)


Topics in LDA model AFTER LEMMING:
Topic #0: universo ||  tigre ||  caminar || 
Topic #1: azar ||  número ||  luna || 
Topic #2: laberinto ||  jardín ||  bifurcar || 
Topic #3: único ||  enumerar ||  entró || 
Topic #4: biblioteca ||  libro ||  letra || 
Topic #5: quijote ||  novelar ||  capítulo || 
Topic #6: carlos ||  aleph ||  argentino || 
Topic #7: juan ||  tío ||  madre || 
Topic #8: cuerpo ||  máquina ||  ulises || 
Topic #9: cruz ||  juan ||  negro || 
Topic #10: judas ||  publicar ||  jesús || 
Topic #11: tomo ||  objeto ||  realidad || 
Topic #12: recuerdo ||  treinta ||  latín || 
Topic #13: único ||  enumerar ||  entró || 
Topic #14: monedar ||  pedro ||  enciclopedia || 
Topic #15: único ||  enumerar ||  entró || 
Topic #16: obrar ||  acto ||  autor || 
Topic #17: padre ||  fábrica ||  temor || 
Topic #18: padre ||  hijo ||  indio || 
Topic #19: único ||  enumerar ||  entró || 
Topic #20: abrir ||  señor ||  sacar || 
Topic #21: nombre ||  crimen ||  letra || 
Topic #22:

In [119]:
counts = []
for text in df_texts["texts"]:
    uni_tokens = ngrams(text.split(" "),1)
    #bi_tokens = bigrams(df_texts)
    #bi_counts = Counter(bi_tokens)
    uni_counts = Counter(uni_tokens)
    counts.append(uni_counts.most_common(1))
#print([(item, counts.count(item)) for item in sorted(set(bi_tokens))])
print("Unigrams:", counts)
#print("\n")
#print("Bigrams:", bi_counts.most_common(5))
print("\n")
#df_texts["texts"]

Unigrams: [[(('Abel',), 5)], [(('lindar',), 1)], [(('Soñará',), 9)], [(('y',), 5)], [(('ver',), 5)], [(('a',), 28)], [(('a',), 28)], [(('y',), 6)], [(('y',), 3)], [(('y',), 51)], [(('reprobar',), 2)], [(('y',), 98)], [(('y',), 12)], [(('y',), 49)], [(('y',), 27)], [(('y',), 80)], [(('y',), 129)], [(('y',), 101)], [(('y',), 34)], [(('y',), 51)], [(('y',), 63)], [(('y',), 4)], [(('y',), 4)], [(('y',), 6)], [(('y',), 94)], [(('y',), 68)], [(('y',), 9)], [(('y',), 35)], [(('y',), 68)], [(('a',), 55)], [(('y',), 55)], [(('y',), 119)], [(('y',), 3)], [(('y',), 42)], [(('y',), 74)], [(('y',), 31)], [(('y',), 24)], [(('y',), 53)], [(('y',), 34)], [(('y',), 39)], [(('o',), 7)], [(('y',), 37)], [(('y',), 89)], [(('y',), 68)], [(('a',), 6)], [(('y',), 24)], [(('y',), 8)], [(('y',), 60)], [(('tanto',), 1)], [(('y',), 19)], [(('y',), 17)], [(('y',), 71)], [(('y',), 11)], [(('y',), 7)], [(('y',), 7)], [(('y',), 70)], [(('y',), 28)], [(('y',), 159)], [(('y',), 44)], [(('y',), 15)], [(('y',), 3)], [((

## Remove stopwords!
https://stackoverflow.com/questions/5486337/how-to-remove-stop-words-using-nltk-or-python