## Python modules

In [2]:
import os
import ast
import pandas as pd
import numpy as np

from typing import List, Union
import plotly.graph_objs as go
import plotly.io as pio
import plotly.express as px
from sklearn.preprocessing import normalize
import seaborn as sns
import matplotlib.pyplot as plt

## Data

In [3]:
# Build the dataset path relative to the current working directory, so the notebook
# has to be started from the directory that contains `data/processed/`.
directorio_actual = os.getcwd()
file_cl           = os.path.join(directorio_actual, 'data', 'processed', 'dataset_cl.csv')

def parse_array_string(array_string):
    """Convert the text of a CSV cell into the Python literal it spells out.

    Used as a ``converters`` callback for ``pd.read_csv``.

    Args:
        array_string: Cell content, expected to look like a Python literal
            such as ``"['a', 'b']"``.

    Returns:
        The evaluated literal, or an empty list when the text is not a valid
        Python literal (including an empty cell).
    """
    try:
        return ast.literal_eval(array_string)
    except (ValueError, SyntaxError):
        # literal_eval raises SyntaxError for text that does not parse at all
        # (e.g. an empty cell) and ValueError for text that parses but is not
        # a plain literal; both mean "no usable list here".
        return []
    
# Load the processed dataset; Header_Tags cells are parsed with parse_array_string.
dataset = pd.read_csv(file_cl, converters={'Header_Tags': parse_array_string})
dataset['Fecha_Publicacion'] = pd.to_datetime(dataset['Fecha_Publicacion'])

# NOTE(review): astype(str) presumably turns missing documents into the literal
# text "nan" rather than dropping them — confirm there are no empty documents.
dataset['data_preprocess'] = dataset['data_preprocess'].astype(str)

print(dataset.head())

  Titulo_Seccion   Fecha_Publicacion  \
0       Economía 2024-01-20 12:32:00   
1      Seguridad 2024-01-05 05:59:00   
2       Economía 2023-12-11 15:26:00   
3        Sucesos 2023-08-26 16:59:00   
4       Política 2023-05-09 16:41:00   

                                         Header_Tags  \
0  [#Apagón, #crisis energética, #Ecuador, #Energ...   
1  [#Atentado, #Caso Metástasis, #Diana Salazar, ...   
2      [#Agrocalidad, #alimentos, #ARCSA, #empresas]   
3  [#droga, #Ecuador, #Guillermo Lasso, #narcotrá...   
4  [#Caso Odebrecht, #Jorge Glas, #reparación int...   

                                     data_preprocess  \
0   anuncio invertirar   516  obra infraestructur...   
1  conversacion narcotraficante leandro norero fo...   
2  agencia regulacion control fito zoosanitario a...   
3    incautar 501 tonelada droga ultimo 27 mes pe...   
4  despu cinco  dictar sentencia caso odebrecht n...   

                                      data_tokenized  
0  ['anuncio', 'invertirar', '

In [4]:
# Select the rows in which every column is null (fully empty records).
empty_rows = dataset[dataset.isnull().all(axis=1)]
print(empty_rows)

Empty DataFrame
Columns: [Titulo_Seccion, Fecha_Publicacion, Header_Tags, data_preprocess, data_tokenized]
Index: []


## Model BERT

In [5]:
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer

## PARAMS

In [None]:
# Components and settings referenced by the BERTopic model cells.
vectorizer_model = CountVectorizer(ngram_range=(1, 2))  # unigrams and bigrams
ctfidf_model = ClassTfidfTransformer()
representation_model = KeyBERTInspired()
min_topic_size = 10  # forwarded to BERTopic(min_topic_size=...)

## Model distiluse-base-multilingual-cased-v2 512 DIM

In [None]:
from copy import deepcopy

# Topic model built on the distiluse-base-multilingual-cased-v2 sentence embeddings.
embedding_a = SentenceTransformer("distiluse-base-multilingual-cased-v2")
model_a = BERTopic(
    embedding_model=embedding_a,
    # The components defined in the parameters cell are also used by the other
    # model. BERTopic keeps the objects it is given and fits them during
    # fit_transform, so each model gets its own private copy; otherwise fitting
    # one model would overwrite the fitted state held by the other.
    vectorizer_model=deepcopy(vectorizer_model),
    ctfidf_model=deepcopy(ctfidf_model),
    representation_model=deepcopy(representation_model),
    min_topic_size=min_topic_size,
    verbose=True,
    language="multilingual",
    calculate_probabilities=True
)
# Fit on the preprocessed documents; returns one topic id per document plus the
# probabilities requested through calculate_probabilities=True.
topics_a, probs_a = model_a.fit_transform(dataset["data_preprocess"])

## Model paraphrase-multilingual-MiniLM-L12-v2 384 DIM

In [None]:
from copy import deepcopy

# Topic model built on the paraphrase-multilingual-MiniLM-L12-v2 sentence embeddings.
embedding_b = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
model_b = BERTopic(
    embedding_model=embedding_b,
    # The components defined in the parameters cell are also used by the other
    # model. BERTopic keeps the objects it is given and fits them during
    # fit_transform, so each model gets its own private copy; otherwise fitting
    # this model would overwrite the fitted state held by the other.
    vectorizer_model=deepcopy(vectorizer_model),
    ctfidf_model=deepcopy(ctfidf_model),
    representation_model=deepcopy(representation_model),
    min_topic_size=min_topic_size,
    verbose=True,
    language="multilingual",
    calculate_probabilities=True
)
# Fit on the preprocessed documents; returns one topic id per document plus the
# probabilities requested through calculate_probabilities=True.
topics_b, probs_b = model_b.fit_transform(dataset["data_preprocess"])

In [None]:
# Locations of the persisted models, relative to the current working directory.
directorio_actual = os.getcwd()
model_pwd_a = os.path.join(directorio_actual, 'models', 'model_a_v3.12')
model_pwd_b = os.path.join(directorio_actual, 'models', 'model_b_v3.12')

# Saving is disabled; uncomment after a fresh fit to refresh the stored models.
# model_a.save("./models/model_a_v3.12")
# model_b.save("./models/model_b_v3.12")

# Replace whatever model_a / model_b currently hold with the versions stored on disk.
model_a = BERTopic.load(model_pwd_a)
model_b = BERTopic.load(model_pwd_b)

## Coherence

In [7]:
import gensim
from gensim.models.coherencemodel import CoherenceModel

def calcular_coherencia(model, docs):
    """Compute the c_v coherence of a fitted topic model against ``docs``.

    Args:
        model: Fitted topic model exposing ``get_topic_info()`` (a table with a
            ``Topic`` column) and ``get_topic(topic)`` (pairs of word and score).
        docs: Iterable of whitespace-tokenizable document strings.

    Returns:
        The coherence value reported by gensim's ``CoherenceModel``, or
        ``float("nan")`` when no topic has any word that occurs in ``docs``.
    """
    # Topic -1 is left out of the evaluation.
    topic_ids = [topic for topic in model.get_topic_info().Topic if topic != -1]
    topic_words = [[word for word, _ in model.get_topic(topic)] for topic in topic_ids]
    if not topic_words:
        return float("nan")

    texts = [doc.split() for doc in docs]
    dictionary = gensim.corpora.Dictionary(texts)
    token2id = dictionary.token2id

    # Keep only the topic words that exist as single tokens in the documents
    # (multi-word terms and padding entries are dropped), then discard topics
    # that end up empty.
    topic_words_ids = [
        [token2id[word] for word in words if word in token2id]
        for words in topic_words
    ]
    topic_words_ids = [ids for ids in topic_words_ids if len(ids) > 0]
    if not topic_words_ids:
        return float("nan")

    coherence_model = CoherenceModel(topics=topic_words_ids, texts=texts, dictionary=dictionary, coherence="c_v")
    return coherence_model.get_coherence()

# Score both models against the same preprocessed documents.
coh_a = calcular_coherencia(model_a, dataset["data_preprocess"])
coh_b = calcular_coherencia(model_b, dataset["data_preprocess"])

# NOTE(review): the recorded output shows the same score for both models —
# confirm that model_a and model_b really are different fitted models.
print(f"distiluse-base-multilingual-cased-v2): {coh_a:.4f}")
print(f"\nparaphrase-multilingual-MiniLM-L12-v2): {coh_b:.4f}")

distiluse-base-multilingual-cased-v2): 0.8552

paraphrase-multilingual-MiniLM-L12-v2): 0.8552


In [8]:
# Select model_a as the working model; the cells that follow all go through `model`.
model = model_a

In [None]:
# Topic table of the working model.
# NOTE(review): len(freq) counts every row of the table, so the printed number
# includes topic -1 if the model has one.
freq = model.get_topic_info()
print("Number of topics: {}".format( len(freq)))
freq.head()

In [None]:
# Take the topic id found in the first row of the topic table and show its terms.
a_topic = freq.iloc[0]["Topic"]
model.get_topic(a_topic) # Show the words and their c-TF-IDF scores

In [None]:
# Bar-chart view of the working model, limited to 20 topics.
model.visualize_barchart(top_n_topics=20)

In [None]:
# Topic overview plot of the working model with default settings.
model.visualize_topics()

In [None]:
# Hierarchy view of the working model, limited to 20 topics.
model.visualize_hierarchy(top_n_topics=20)

In [None]:
# Ask the working model for the 5 topics closest to the search term "quito".
similar_topics, similarity = model.find_topics("quito", top_n = 5)

# Only the first returned topic and its score are reported.
most_similar = similar_topics[0]
print("Most Similar Topic Info: \n{}".format(model.get_topic(most_similar)))
print("Similarity Score: {}".format(similarity[0]))

## Weak topics

In [25]:
# Weak-signal keywords produced by an earlier step of the pipeline.
file_weak_signals = os.path.join(directorio_actual, 'data', 'processed', 'weak_signals_kem_kim.csv')
weak_signals_df = pd.read_csv(file_weak_signals)

# Rank keywords by DoV, then DoD, highest first.
weak_signals_sorted = weak_signals_df.sort_values(by=['DoV', 'DoD'], ascending=False)

# Keep only the top-ranked keywords.
top_n = 2000
weak_keywords = weak_signals_sorted.head(top_n)['Keyword'].tolist()
#weak_keywords = weak_signals_df['Keyword'].tolist()
# Set form of the same keywords: membership checks below are constant time
# instead of scanning the whole list for every topic word.
weak_keyword_set = set(weak_keywords)


# Keyword list of every topic of the working model.
freq = model.get_topic_info()
topics_keywords = {}
for topic in freq['Topic']:
    if topic != -1:  # Skip the unclassified documents
        topics_keywords[topic] = [word for word, _ in model.get_topic(topic)]

# A topic is "weak" when at least one of its keywords is a weak-signal keyword.
weak_topics = [
    topic
    for topic, keywords in topics_keywords.items()
    if not weak_keyword_set.isdisjoint(keywords)
]

print(f"Weak topics: {weak_topics}")

Weak topics: [41, 52, 54, 61, 64, 126, 143, 160]


In [None]:
# Hand-picked topic ids to inspect.
# NOTE(review): hard-coded ids — confirm they exist in the currently loaded model.
specific_topics = [84, 139, 90]

def get_keywords_for_topics(model, topics):
    """Collect the keyword list of each requested topic.

    Args:
        model: Topic model exposing ``get_topic(topic)``, which yields pairs of
            word and score for a known topic.
        topics: Iterable of topic ids.

    Returns:
        Dict mapping topic id to its list of words. Topic -1 and ids the model
        does not know are left out.
    """
    topics_keywords = {}
    for topic in topics:
        if topic == -1:
            continue
        representation = model.get_topic(topic)
        # BERTopic answers False (not an empty list) for an id it does not
        # have; iterating over that would raise TypeError, so skip it.
        if representation is False or representation is None:
            continue
        topics_keywords[topic] = [word for word, _ in representation]
    return topics_keywords


# Print the keyword list of each selected topic.
keywords = get_keywords_for_topics(model, specific_topics)
for topic, words in keywords.items():
    print(f"Tópico {topic}: {words}")

"You are an expert in summarizing topics based on keywords. Given the following keywords, provide a concise and descriptive name for the topic, and a short description of what the topic covers."

Keywords: ['tecnología', 'innovación', 'startup', 'inversión', 'emprendimiento', 'fintech', 'disrupción', 'digital', 'ecosistema', 'mercado']

In [27]:
# model.visualize_documents(dataset['data_preprocess'])

In [None]:
#https://radimrehurek.com/gensim/models/coherencemodel.html

In [None]:
#https://ar5iv.labs.arxiv.org/html/2208.09299
#https://ar5iv.labs.arxiv.org/html/1403.6397#S3.T2

## Topics over Time

In [35]:
# Number of distinct publication months in the dataset.
# NOTE(review): num_nr_bins is computed but never used — the call below passes a
# hard-coded nr_bins=24. Confirm which value is intended.
num_nr_bins = dataset['Fecha_Publicacion'].dt.to_period("M").nunique()
news = dataset.data_preprocess.to_list()   
print(type(news))

timestampsd = dataset.Fecha_Publicacion.to_list()
print(type(timestampsd))


# Long-running step: the recorded progress bar shows roughly 35 minutes for 24 iterations.
topics_over_timet = model.topics_over_time(
    news, 
    timestampsd, 
    nr_bins=24,  
    global_tuning=False,
    evolution_tuning=True
)

<class 'list'>
<class 'list'>


24it [35:39, 89.16s/it] 


In [None]:
# Topics-over-time plot limited to 10 topics, with normalized frequencies.
model.visualize_topics_over_time(topics_over_timet, top_n_topics=10, normalize_frequency=True)

In [44]:
# Same plot restricted to the ids printed as "Weak topics" earlier in the notebook.
# NOTE(review): the ids are hard-coded rather than taken from `weak_topics`.
model.visualize_topics_over_time(topics_over_timet, topics=[41, 52, 54, 61, 64, 126, 143, 160], normalize_frequency=True)

In [None]:
# Term-rank plot of the working model with default settings.
model.visualize_term_rank()

In [None]:
def visualize_term_rank_data(topic_model, topics: Union[List[int], None] = None, log_scale: bool = False) -> List[dict]:
    """Return the raw series behind a term-rank plot instead of a figure.

    Args:
        topic_model: Fitted topic model exposing ``get_topic_info()`` (a table
            with a ``Topic`` column) and ``get_topic(topic)`` (pairs of word and
            score). Every topic must have the same number of terms.
        topics: Accepted for signature compatibility; it does not influence the
            returned data.
        log_scale: When True, the positive scores are replaced by their base-10
            logarithm.

    Returns:
        One dict per retained topic with keys ``'x'`` (1-based term ranks),
        ``'y'`` (term scores) and ``'text'`` (a label cut to 50 characters).
        Topics having any score above 1.5 are left out.
    """
    topic_ids = topic_model.get_topic_info().Topic.unique().tolist()
    topic_words = [topic_model.get_topic(topic) for topic in topic_ids]

    # Row i holds the scores / ranks of topic_ids[i].
    values = np.array([[pair[1] for pair in words] for words in topic_words])
    indices = np.array([[rank + 1 for rank in range(len(words))] for words in topic_words])

    data = []
    for topic, words, x, y in zip(topic_ids, topic_words, indices, values):
        if any(y > 1.5):
            continue

        label = f"<b>Topic {topic}</b>:" + "_".join([pair[0] for pair in words])
        label = label[:50]

        # `y` is a view into `values`, so the two updates below also change
        # `values` and are seen by later iterations.
        if any(y == 0):
            # Replace zero scores with the smallest positive score of the matrix.
            y[y == 0] = min(values[values > 0])
        y = np.log10(y, out=y, where=y > 0) if log_scale else y
        data.append({'x': x, 'y': y, 'text': label})
    return data

In [None]:
# Term-rank series for the working model, using the function's default options.
datos_ndcg = visualize_term_rank_data(model)

In [41]:
def visualize_topics_over_time_data(topic_model,
                               topics_over_time: pd.DataFrame,
                               top_n_topics: Union[int, None] = None,
                               topics: Union[List[int], None] = None,
                               normalize_frequency: bool = True,
                               custom_labels: Union[bool, str] = False):
    """Build a topics-over-time figure and also return the plotted series.

    Args:
        topic_model: Fitted topic model exposing ``get_topic_freq()`` and
            ``topic_labels_``.
        topics_over_time: Table with ``Topic``, ``Timestamp`` and ``Frequency``
            columns. It is not modified.
        top_n_topics: Keep only the first ``top_n_topics`` entries of the
            frequency table (ignored when ``topics`` is given).
        topics: Explicit topic ids to keep; takes precedence over
            ``top_n_topics``.
        normalize_frequency: When True, each topic's frequency series is
            normalized with ``sklearn.preprocessing.normalize``.
        custom_labels: Accepted for signature compatibility; not used.

    Returns:
        Tuple ``(fig, dfs_per_topic)``: the plotly figure with one trace per
        topic, and a list with one DataFrame (columns ``date`` and
        ``frequency``) per topic, in ascending topic-id order.
    """
    freq_df = topic_model.get_topic_freq()
    freq_df = freq_df.loc[freq_df.Topic != -1, :]
    if topics is not None:
        selected_topics = list(topics)
    elif top_n_topics is not None:
        selected_topics = sorted(freq_df.Topic.to_list()[:top_n_topics])
    else:
        selected_topics = sorted(freq_df.Topic.to_list())

    # Labels longer than 40 characters are shortened for display.
    topic_names = {key: value[:40] + "..." if len(value) > 40 else value
                   for key, value in topic_model.topic_labels_.items()}
    # assign() returns a new frame, so the caller's DataFrame does not silently
    # gain (or have overwritten) a "Name" column.
    labelled = topics_over_time.assign(Name=topics_over_time.Topic.map(topic_names))
    data = labelled.loc[labelled.Topic.isin(selected_topics), :].sort_values(["Topic", "Timestamp"])

    dfs_per_topic = []
    data_over_time = []
    for topic in data.Topic.unique():
        trace_data = data.loc[data.Topic == topic, :]
        topic_name = trace_data.Name.values[0]
        if normalize_frequency:
            y = normalize(trace_data.Frequency.values.reshape(1, -1))[0]
        else:
            y = trace_data.Frequency
        # Naming the trace lets the legend identify the topic instead of
        # showing anonymous "trace N" entries.
        data_over_time.append({'x': trace_data.Timestamp, 'y': y, 'name': str(topic_name)})
        df_topic = pd.DataFrame({'date': trace_data.Timestamp.values, 'frequency': y})
        dfs_per_topic.append(df_topic)

    layout = go.Layout(title='Topics Over Time',
                       xaxis=dict(title='Time'),
                       yaxis=dict(title='Frequency'))

    fig = go.Figure(data=data_over_time, layout=layout)
    return fig, dfs_per_topic

In [43]:
# Figure and per-topic series for topics 0-2 plus the ids printed as "Weak topics" earlier.
# NOTE(review): the ids are hard-coded rather than taken from `weak_topics`.
fig, data_over_time_ds = visualize_topics_over_time_data(model, topics_over_timet, topics=[0, 1, 2, 41, 52, 54, 61, 64, 126, 143, 160])
pio.show(fig)

In [45]:
# Inspect the second per-topic DataFrame (column dtypes and non-null counts).
data_over_time_ds[1].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24 entries, 0 to 23
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   date       24 non-null     datetime64[ns]
 1   frequency  24 non-null     float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 512.0 bytes


In [46]:
import pickle

# Store the list of per-topic DataFrames alongside the other processed data.
directorio_actual = os.getcwd()
data_ts           = os.path.join(directorio_actual, 'data', 'processed', 'data_over_time_ds.pickle')

# NOTE(review): pickle files should only ever be loaded back from a trusted source.
with open(data_ts, 'wb') as f:
    pickle.dump(data_over_time_ds, f)