In [1]:
from pathlib import Path

from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.test.utils import datapath
import pandas as pd
import spacy
from spacy.language import Language

In [2]:
# Start from an empty stop-word list so the name always exists.
stopword_list = list()

In [3]:
# Read the custom stop words from vacias.txt, one entry per line.
# The encoding is pinned so accented words decode the same way on every
# platform instead of depending on the locale default.
# NOTE(review): assumes vacias.txt is UTF-8 encoded — confirm.
with open("vacias.txt", 'r', encoding="utf-8") as f:
    # Trim surrounding whitespace and skip blank lines so that no empty
    # entry (e.g. from a trailing newline) ends up in the list.
    stopword_list = [word for word in (line.strip() for line in f) if word]

In [4]:
# print(stopword_list)

In [None]:
@Language.component("remove_stopwords")
def remove_stopwords(doc):
    """Keep only the tokens that are not flagged as stop words.

    Args:
        doc: An iterable of tokens; each item must expose ``is_stop``.

    Returns:
        A plain Python list with the surviving tokens, in their original order.

    NOTE(review): the result is a list, not a ``Doc``. spaCy components are
    normally expected to return a ``Doc`` — confirm every way this pipeline
    is invoked tolerates a list here.
    """
    kept = []
    for tok in doc:
        if tok.is_stop:
            continue
        kept.append(tok)
    return kept

In [None]:
@Language.component("remove_punct")
def remove_punct(doc):
    """Keep only the tokens that are not flagged as punctuation.

    Args:
        doc: An iterable of tokens; each item must expose ``is_punct``.

    Returns:
        A plain Python list with the surviving tokens, in their original order.

    NOTE(review): the result is a list, not a ``Doc``. spaCy components are
    normally expected to return a ``Doc`` — confirm every way this pipeline
    is invoked tolerates a list here.
    """
    kept = []
    for tok in doc:
        if tok.is_punct:
            continue
        kept.append(tok)
    return kept

In [None]:
@Language.component("remove_spaces")
def remove_spaces(doc):
    """Keep only the tokens that are not flagged as whitespace.

    Args:
        doc: An iterable of tokens; each item must expose ``is_space``.

    Returns:
        A plain Python list with the surviving tokens, in their original order.

    NOTE(review): the result is a list, not a ``Doc``. spaCy components are
    normally expected to return a ``Doc`` — confirm every way this pipeline
    is invoked tolerates a list here.
    """
    kept = []
    for tok in doc:
        if tok.is_space:
            continue
        kept.append(tok)
    return kept

In [None]:
@Language.component("get_text")
def get_text(doc):
    """Replace every token with its ``text`` attribute.

    Args:
        doc: An iterable of tokens; each item must expose ``text``.

    Returns:
        A plain Python list holding the ``text`` of each token, in order.

    NOTE(review): the result is a list, not a ``Doc``. spaCy components are
    normally expected to return a ``Doc`` — confirm every way this pipeline
    is invoked tolerates a list here.
    """
    texts = []
    for tok in doc:
        texts.append(tok.text)
    return texts

In [None]:
# Load the es_core_news_lg pipeline, leaving the "ner" component out.
excluded_components = ["ner"]
nlp = spacy.load('es_core_news_lg', exclude=excluded_components)

In [None]:
# Size of the default stop-word set at this point in the notebook.
stop_words_before = len(nlp.Defaults.stop_words)
print(stop_words_before)

In [None]:
# Merge the custom stop words into the language defaults (duplicates collapse).
nlp.Defaults.stop_words |= {*stopword_list}

In [None]:
# Mark the vocabulary entry of every custom stop word as a stop word.
for custom_word in stopword_list:
    lexeme = nlp.vocab[custom_word]
    lexeme.is_stop = True

In [None]:
# Size of the default stop-word set at this point in the notebook.
stop_words_after = len(nlp.Defaults.stop_words)
print(stop_words_after)

In [None]:
# Append the four custom components to the end of the pipeline, in order:
# stop-word filter, punctuation filter, whitespace filter, text extraction.
# Note: the final target rebinds the module-level name `get_text` to the
# value returned by add_pipe.
rm_stopwords, rm_punct, rm_spaces, get_text = (
    nlp.add_pipe(component, name=pipe_name, last=True)
    for component, pipe_name in (
        ("remove_stopwords", "rm_stopwords"),
        ("remove_punct", "rm_punct"),
        ("remove_spaces", "rm_spaces"),
        ("get_text", "get_text"),
    )
)

In [None]:
# Load only the two columns this notebook reads from the dataset.
df = pd.read_parquet(
    path="./digital_identity_token_dataset.parquet",
    columns=['spotify_id', 'lyrics'],
)

In [None]:
# Run every lyric through the pipeline and store the per-row result in "doc".
lyrics_stream = nlp.pipe(df["lyrics"])
df["doc"] = [processed for processed in lyrics_stream]

In [None]:
# Build the gensim token <-> id mapping from the processed documents.
dictionary = Dictionary(documents=df["doc"])

In [None]:
# Convert each processed document into its bag-of-words representation.
corpus = list(map(dictionary.doc2bow, df["doc"]))

In [None]:
# Train a 10-topic LDA model on the bag-of-words corpus.
# random_state is fixed so that re-running the notebook on the same data
# reproduces the same topics instead of a fresh random initialisation.
lda = LdaModel(
    corpus=corpus,
    num_topics=10,
    id2word=dictionary,
    random_state=42,
)

In [None]:
# Path prefix under which the trained model is saved and later reloaded.
model_dir = "./ldamodels"
model_name = f"{model_dir}/lda_text_no_stopwords"

In [None]:
# Make sure the target directory exists so saving also works on a fresh
# checkout, then persist the trained model under model_name.
Path(model_name).parent.mkdir(parents=True, exist_ok=True)
lda.save(model_name)

In [None]:
# Reload the saved model, and the dictionary from the "<model_name>.id2word" file.
dictionary_path = model_name + ".id2word"
lda = LdaModel.load(model_name)
dictionary = Dictionary.load(dictionary_path)