### Required libraries and downloads

- Spacy: 
    ```
    pip install spacy
    python -m spacy download es_core_news_md

    ```
- nltk: 
    ```
    pip install nltk

    ```
- Spacy-affixes: 
    ```
    pip install spacy-affixes
    python -m spacy_affixes download es 4.1

    ```
- Gensim:
    ```
    pip install gensim

    ```



### Load spacy and spacy-affixes for spanish

In [16]:
from pathlib import Path

import spacy 
from spacy_affixes import AffixesMatcher
from spacy_affixes.utils import AFFIXES_SUFFIX
from spacy_affixes.utils import load_affixes

nlp = spacy.load("es_core_news_md")

suffixes = {k: v for k, v in load_affixes().items()
            if k.startswith(AFFIXES_SUFFIX)} 
affixes_matcher = AffixesMatcher(nlp, split_on=["VERB"], rules=suffixes)
nlp.add_pipe(affixes_matcher, name="affixes", before="tagger")

# Preprocessing functions


### Load nltk stopword list

In [17]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/aitor/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [18]:
from nltk.corpus import stopwords 
stop_words = set(stopwords.words('spanish')) 
print('First ten stop words: %s' % list(stop_words)[:10])
print('Last ten stop words: %s' % list(stop_words)[-10:])
# Add new stopwords
customize_stop_words = ["\n"]
for w in customize_stop_words:
    nlp.vocab[w].is_stop = True

First ten stop words: ['hubieseis', 'como', 'fuisteis', 'algunos', 'qué', 'tenéis', 'ni', 'habré', 'estéis', 'les']
Last ten stop words: ['fueses', 'estaba', 'vuestra', 'tendrán', 'estuvieses', 'las', 'estas', 'habrán', 'mis', 'estarás']


### Lemmatize each token in text

In [19]:
def lemmatizer(doc):
    doc = [token.lemma_ for token in doc]
    doc = u' '.join(doc)
    return nlp.make_doc(doc)

In [20]:
def remove_stopwords(doc):
    doc = [token.text for token in doc if not token.is_stop and not token.is_punct]
    return doc

In [21]:
# Add the functions to default pipeline
nlp.add_pipe(lemmatizer, name="lemmatizer")
nlp.add_pipe(remove_stopwords, name="stopwords")

# Data entry

In [22]:
doc_list = []
# Iterate through each poem in folder
for filename in (Path("corpus/es/")).rglob('*.txt'):
    with open(filename, "r") as poem_txt:
        poem = poem_txt.read()
        poem = poem.replace("[^a-zA-Z#]", " ")
        poem = " ".join(poem.split())
        # quitar stopwords
        
        # devolver dataframe/listas con tokens / lemma_ / lexema / nº de documento / POS 
        
        #tokens = prepare_text_for_lda(poem.lower())
        tokens = nlp(poem)
        doc_list.append(tokens)
        

In [23]:
doc_list[3]

['Y',
 'volver',
 'sabor',
 'Mr',
 'Cumbia',
 'cumbia',
 'coronavirus',
 'mundo',
 'espantar',
 'enfermedad',
 'llamar',
 'coronavirus',
 'Y',
 'alarmar',
 'mundial',
 'nacer',
 'China',
 'muerto',
 'allá',
 'ponernos',
 'atento',
 'cuidar',
 'ponernos',
 'pilar',
 'Pa',
 'canción',
 'vida',
 'meme',
 'cumbia',
 'avión',
 'prevenirnos',
 'dato',
 'do',
 'deber',
 'cuidarnos',
 'Pongan',
 'atención',
 'Coronavirus',
 'coronavirus',
 'lavar',
 'mano',
 'háganlo',
 'seguir',
 'Coronavirus',
 'coronavirus',
 'pilar',
 'concurrir',
 'Coronavirus',
 'coronavirus',
 'tocar',
 'caro',
 'evítenlo',
 'amigo',
 'Coronavirus',
 'coronavirus',
 'Usen',
 'desinfectante',
 'efectivo',
 'Y',
 'rey',
 'cumbia',
 'virar',
 'Mr',
 'Cumbia',
 'mundo',
 'espantar',
 'enfermedad',
 'llamar',
 'coronavirus',
 'Y',
 'alarmar',
 'mundial',
 'nacer',
 'China',
 'muerto',
 'allá',
 'ponernos',
 'atento',
 'cuidar',
 'ponernos',
 'pilar',
 'Pa',
 'canción',
 'vida',
 'meme',
 'cumbia',
 'avión',
 'prevenirnos',
 

### LDA with Gensim

In [25]:
from gensim import corpora
import pickle
import gensim

unable to import 'smart_open.gcs', disabling that module


We first create the dictionary element, which is a list with unique tokens from text, and the corpus element, which is the component that reads documents from the file.

In [26]:
# Mapping of word IDs to words
dictionary = corpora.Dictionary(doc_list) # lista de listas, por ejemplo, diccionario con las palabras que sean x POS

# Turn each document into a bag of words
corpus = [dictionary.doc2bow(doc) for doc in doc_list]

pickle.dump(corpus, open('corpus.pkl', 'wb'))
dictionary.save('dictionary.gensim')

In [27]:
len(corpus)

44

Now we can train the LDA model. We use the gensim class LdaModel.
- num_topics: the number of topics we'd like to use. We set this to 3 here, but if you want you can experiment with a larger number of topics.
- passes: the number of iterations to use in the training algorithm.

This training step will take a few minutes

In [28]:
NUM_TOPICS = 3
ldamodel = gensim.models.ldamodel.LdaModel(corpus, 
                                           num_topics = NUM_TOPICS, 
                                           id2word=dictionary, 
                                           passes=10,
                                           alpha='auto')
ldamodel.save('model2.gensim')

Show top 8 words in all the topics

In [30]:
topics = ldamodel.print_topics(num_words=8)
for topic in topics:
    print(topic)

(0, '0.048*"a" + 0.035*"y" + 0.013*"Y" + 0.010*"entrar" + 0.008*"comer" + 0.008*"vida" + 0.006*"querer" + 0.006*"o"')
(1, '0.024*"y" + 0.020*"Y" + 0.018*"coronavirus" + 0.014*"Coronavirus" + 0.011*"comer" + 0.009*"seguir" + 0.008*"o" + 0.008*"Resistiré"')
(2, '0.045*"quedo" + 0.034*"casar" + 0.027*"a" + 0.026*"y" + 0.017*"Y" + 0.011*"salir" + 0.009*"coronavirus" + 0.007*"vida"')


##### Visualizating with pyLDAvis

In [31]:
# !pip install pyLDAvis

In [32]:
import pyLDAvis.gensim

In [33]:
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
corpus = pickle.load(open('corpus.pkl', 'rb'))

lda = gensim.models.ldamodel.LdaModel.load('model2.gensim')
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)