In [1]:
import os

docs = os.listdir('news_downloaded')
docs = [doc for doc in docs if 'error' not in doc]

In [2]:
import pandas as pd

word_key = docs[6]

df = pd.read_csv(f'news_downloaded/{word_key}')

word_key

'Intervencion+extranjera+en+elecciones.csv'

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99 entries, 0 to 98
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   busqueda      99 non-null     int64 
 1   link          99 non-null     object
 2   source        99 non-null     object
 3   publish_date  78 non-null     object
 4   title         99 non-null     object
 5   article       98 non-null     object
 6   summary       98 non-null     object
 7   keywords      99 non-null     object
dtypes: int64(1), object(7)
memory usage: 6.3+ KB


In [4]:
art = df.copy()
art = art[art['article'].notna()]
#art = art[['title', 'article']].reindex()
#art['article'][0]

# Preprocesamiento

In [5]:
from nltk.corpus import stopwords
from tqdm import tqdm
import concurrent.futures
import multiprocessing
import spacy
import nltk
import re

nltk.download('stopwords')
stopwords_es = set(stopwords.words('spanish'))
nlp = spacy.load("es_core_news_sm")

def remove_punctuation(text):
    text = re.sub(r'([^\w\s+]|[¿?\.\:\,\;\-\_\"\'\(\)\%\$]|[0-9])', ' ', text)
    return text

def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stopwords_es]
    return " ".join(filtered_words)

def lemmatize(text):
    doc = nlp(text)
    lemmatized_text = [token.lemma_ for token in doc]
    return lemmatized_text

def clean_text(text):
    text_preprocessed = text.lower()
    text_preprocessed = remove_punctuation(text_preprocessed)
    text_preprocessed = remove_stopwords(text_preprocessed)
    text_preprocessed = lemmatize(text_preprocessed)
    return text_preprocessed

def preprocess(text_list):
    
    texts_preprocess = []
    concurrency = multiprocessing.cpu_count()

    with concurrent.futures.ThreadPoolExecutor(max_workers=concurrency) as executor:
        articles = executor.map(clean_text, text_list)
    
        for article in tqdm(articles, total=len(text_list)):
            try:
                texts_preprocess.append(article)
            except:
                print('Error al procesar')
    return texts_preprocess

def _count_words(word_list):
    words = {}
    for word in word_list:
        if word in words:
            words[word] += 1
        else:
            words[word] = 1
    return words

def count_words(text_list):
    
    word_count = []
    concurrency = multiprocessing.cpu_count()

    with concurrent.futures.ThreadPoolExecutor(max_workers=concurrency) as executor:
        texts_list = executor.map(_count_words, text_list)
    
        for text in texts_list:
            try:
                word_count.append(text)
            except:
                print('Error al procesar')
    return word_count
        


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/kamilo44/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
result = preprocess(art['article'])

100%|█████████████████████████████████████| 98/98 [00:07<00:00, 12.50it/s]


In [55]:
#result[0]

# Gensim doc2bow

In [7]:
import gensim

# Crear un diccionario a partir de los documentos preprocesados
dictionary = gensim.corpora.Dictionary(result)

dictionary.filter_extremes(no_below=3, no_above=0.5, keep_n=10)

bow_corpus = [dictionary.doc2bow(doc) for doc in result]


# TF-IDF

In [8]:
from gensim import corpora, models

tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break


[(0, 0.8792792626630263),
 (1, 0.3317656263522414),
 (2, 0.17081738618284215),
 (3, 0.1658828131761207),
 (4, 0.17081738618284215),
 (5, 0.17585585253260524)]


In [9]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)

In [10]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.329*"millón" + 0.221*"tras" + 0.106*"cambio" + 0.085*"política" + 0.069*"dar" + 0.060*"acuerdo" + 0.057*"internacional" + 0.056*"todo" + 0.011*"último" + 0.004*"seguir"
Topic: 1 
Words: 0.419*"política" + 0.093*"cambio" + 0.091*"internacional" + 0.091*"último" + 0.081*"acuerdo" + 0.065*"todo" + 0.062*"tras" + 0.049*"seguir" + 0.040*"dar" + 0.009*"millón"
Topic: 2 
Words: 0.402*"millón" + 0.147*"último" + 0.100*"acuerdo" + 0.075*"cambio" + 0.073*"dar" + 0.071*"internacional" + 0.065*"seguir" + 0.028*"todo" + 0.027*"política" + 0.011*"tras"
Topic: 3 
Words: 0.504*"internacional" + 0.102*"política" + 0.082*"tras" + 0.079*"millón" + 0.070*"último" + 0.046*"seguir" + 0.041*"todo" + 0.036*"dar" + 0.028*"acuerdo" + 0.013*"cambio"
Topic: 4 
Words: 0.386*"seguir" + 0.123*"último" + 0.109*"política" + 0.103*"acuerdo" + 0.086*"tras" + 0.077*"cambio" + 0.059*"dar" + 0.043*"todo" + 0.012*"internacional" + 0.002*"millón"
Topic: 5 
Words: 0.481*"millón" + 0.160*"acuerdo" + 0.106*"c

In [11]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)


In [12]:
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.289*"millón" + 0.200*"internacional" + 0.121*"último" + 0.075*"acuerdo" + 0.066*"seguir" + 0.065*"política" + 0.064*"cambio" + 0.043*"tras" + 0.040*"dar" + 0.036*"todo"
Topic: 1 Word: 0.170*"internacional" + 0.160*"millón" + 0.121*"cambio" + 0.112*"seguir" + 0.106*"dar" + 0.094*"acuerdo" + 0.085*"política" + 0.063*"todo" + 0.062*"tras" + 0.027*"último"
Topic: 2 Word: 0.260*"millón" + 0.234*"todo" + 0.145*"cambio" + 0.083*"dar" + 0.070*"acuerdo" + 0.060*"último" + 0.047*"política" + 0.047*"internacional" + 0.030*"tras" + 0.024*"seguir"
Topic: 3 Word: 0.215*"todo" + 0.199*"política" + 0.185*"acuerdo" + 0.095*"último" + 0.095*"tras" + 0.070*"internacional" + 0.055*"cambio" + 0.043*"dar" + 0.035*"seguir" + 0.007*"millón"
Topic: 4 Word: 0.283*"millón" + 0.149*"tras" + 0.103*"acuerdo" + 0.101*"todo" + 0.095*"dar" + 0.086*"cambio" + 0.075*"último" + 0.051*"seguir" + 0.047*"internacional" + 0.009*"política"
Topic: 5 Word: 0.339*"tras" + 0.313*"último" + 0.138*"dar" + 0.048*"ca