## Python modules

In [1]:
import os
import time
import pandas as pd
from gensim import corpora, models
from gensim.models.coherencemodel import CoherenceModel
import ast

In [2]:

# Load the already-preprocessed dataset (no tokenization needed here).
file_path = os.path.join(os.getcwd(), 'data', 'processed', 'dataset_cl.csv')
dataset = pd.read_csv(file_path, converters={'Header_Tags': ast.literal_eval})
# Empty text cells come back from the CSV as NaN. Replace them with '' before
# casting, otherwise astype(str) would turn them into the literal token "nan"
# and pollute the vocabulary.
dataset['data_preprocess'] = dataset['data_preprocess'].fillna('').astype(str)

# Turn every row into a list of whitespace-separated tokens.
processed_texts = dataset['data_preprocess'].apply(lambda x: x.split())

# Build the gensim dictionary (token -> id) and the bag-of-words corpus.
dictionary = corpora.Dictionary(processed_texts)
corpus = [dictionary.doc2bow(text) for text in processed_texts]

# Fixed seed so that LDA training (and therefore the coherence scores) is
# reproducible across kernel restarts.
SEED = 42

# Topic counts to evaluate. These are the five values listed in the results
# table saved in this notebook (20, 50, 76, 100, 150), so that a full re-run
# regenerates every row of that table.
num_topics_list = [20, 50, 76, 100, 150]
coherence_values = {}   # num_topics -> c_v coherence score
training_times = {}     # num_topics -> training duration in seconds

for num_topics in num_topics_list:
    # perf_counter is monotonic, so the measured duration is not affected by
    # system clock adjustments. Only model training is timed, not the
    # coherence computation.
    start_time = time.perf_counter()
    lda_model = models.LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        passes=10,
        iterations=25,
        chunksize=1000,
        alpha='auto',
        eta='auto',
        eval_every=None,
        random_state=SEED
    )
    end_time = time.perf_counter()
    coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_texts, dictionary=dictionary, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    coherence_values[num_topics] = coherence_lda
    training_times[num_topics] = end_time - start_time



In [3]:
# Collect the sweep results into one table: one row per evaluated topic
# count, in the order the models were trained.
topic_counts = [k for k in coherence_values]
coherence_scores = [coherence_values[k] for k in coherence_values]
elapsed_seconds = [training_times[k] for k in training_times]

results_df = pd.DataFrame(
    {
        "num_topics": topic_counts,
        "coherence_c_v": coherence_scores,
        "training_time_sec": elapsed_seconds,
    }
)

print(results_df)

   num_topics  coherence_c_v  training_time_sec
0          20       0.658837         191.579085
1          50       0.605647         830.773503
2          76       0.610223        1454.036999
3         100       0.603864        1496.691604
4         150       0.565469        1680.720284
