In [1]:
# !pip install spacy

In [2]:
# !python -m spacy download es_core_news_md

In [3]:
# !pip install spacy-affixes
# !python -m spacy_affixes download es 4.1

In [4]:
from pathlib import Path
import spacy

##### Load spacy-affixes for spanish

In [5]:
from spacy_affixes import AffixesMatcher
nlp = spacy.load("es_core_news_md")
affixes_matcher = AffixesMatcher(nlp, split_on=["VERB"])
nlp.add_pipe(affixes_matcher, name="affixes", before="tagger")

#### Clean text

In [6]:
spacy_stopwords = spacy.lang.es.stop_words.STOP_WORDS
# print('Number of stop words: %d' % len(spacy_stopwords))
# print('First ten stop words: %s' % list(spacy_stopwords)[:10])
customize_stop_words = ["\n"]
for w in customize_stop_words:
    nlp.vocab[w].is_stop = True

In [7]:
def prepare_text_for_lda(text):
    # tokens = tokenize(text)
    doc = nlp(text)
    tokens =  []
    tokens = []
    tokens = [token for token in doc if len(token.text) > 3]
    tokens = [token.lemma_ for token in tokens if not token.is_stop]
    # tokens = [token for token in tokens if token.pos_ in  ['NOUN', 'VERB', 'ADJ']]
    return tokens

In [8]:
text_data = []
for filename in (Path("corpus/es/")).rglob('*.txt'):
    with open(filename, "r") as poem_txt:
        poem = poem_txt.read()
        poem = poem.replace("[^a-zA-Z#]", " ")
        poem = " ".join(poem.split())
        tokens = prepare_text_for_lda(poem.lower())
        text_data.append(tokens)

In [9]:
# text_data[1]

###### LDA with Gensim

In [10]:
# !pip install gensim

In [11]:
from gensim import corpora
import pickle
import gensim

unable to import 'smart_open.gcs', disabling that module


In [12]:
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]

pickle.dump(corpus, open('corpus.pkl', 'wb'))
dictionary.save('dictionary.gensim')

In [13]:
len(corpus)

44

In [14]:
NUM_TOPICS = 5
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=30)
ldamodel.save('model1.gensim')

In [15]:
topics = ldamodel.print_topics(num_words=8)
for topic in topics:
    print(topic)

(0, '0.013*"sentir" + 0.011*"amor" + 0.011*"volver" + 0.009*"esperar" + 0.009*"creer" + 0.008*"distanciar" + 0.008*"miedo" + 0.008*"mundo"')
(1, '0.018*"resistir" + 0.014*"vida" + 0.010*"piel" + 0.008*"vivir" + 0.008*"manir" + 0.008*"silenciar" + 0.008*"calor" + 0.008*"volver"')
(2, '0.093*"quedo" + 0.069*"casar" + 0.018*"coronavirus" + 0.015*"soñar" + 0.013*"mirar" + 0.012*"entender" + 0.012*"meter" + 0.012*"lío"')
(3, '0.022*"gro" + 0.017*"vida" + 0.010*"aire" + 0.010*"fuerte" + 0.010*"cantar" + 0.010*"volver" + 0.010*"amor" + 0.008*"volar"')
(4, '0.042*"coronavirus" + 0.010*"salir" + 0.009*"poner" + 0.009*"vida" + 0.009*"sentir" + 0.009*"mundo" + 0.009*"cumbia" + 0.009*"mano"')


##### Visualizating with pyLDAvis

In [16]:
# !pip install pyLDAvis

In [17]:
import pyLDAvis.gensim

In [18]:
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
corpus = pickle.load(open('corpus.pkl', 'rb'))

lda = gensim.models.ldamodel.LdaModel.load('model1.gensim')
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)