# Topic_Modeling_LDA

Notebook de entrenamiento de LDA para modelado de temas

In [None]:
import pandas as pd
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from gensim.models import CoherenceModel
from gensim.corpora import MmCorpus


# English stop words, held in a set so the per-token membership test used
# during tokenization is O(1).
# The stop-word corpus ships separately from the nltk package; on a fresh
# environment the lookup raises LookupError, so fetch it once and retry.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    import nltk

    nltk.download('stopwords', quiet=True)
    stop_words = set(stopwords.words('english'))
import os

In [None]:
def clean_text(text):
    """Remove markup noise from a raw document and lower-case it.

    The following are deleted, in this order: HTML tags, URLs, @mentions,
    and the ``#`` character (the hashtag word itself is kept).

    Args:
        text: The raw document as a ``str``.

    Returns:
        The cleaned text with surrounding whitespace stripped and all
        characters lower-cased.
    """
    import re

    # Applied sequentially; each pattern's matches are replaced by ''.
    noise_patterns = (
        r'<.*?>',                    # HTML tags (non-greedy, single line)
        r"http\S+|www\S+|https\S+",  # URLs
        r"@\w+",                     # @mentions
        r"#",                        # hashtag symbol only
    )

    cleaned = text
    for pattern in noise_patterns:
        cleaned = re.sub(pattern, '', cleaned)

    return cleaned.strip().lower()

In [None]:
# Assemble the training corpus: three CSV datasets are reduced to one text
# column each, stacked into a single pandas Series, and cleaned.

# News dataset: one document per row, title and body joined by a newline.
df1 = pd.read_csv("/kaggle/input/fake-or-real-news/fake_or_real_news.csv")
# Fill missing cells per column *before* concatenating: a missing value on
# either side of `+` makes the whole result missing, which would throw away
# an article's body just because its title is absent (or vice versa).
df1 = df1['title'].fillna('') + "\n" + df1['text'].fillna('')

# Sentiment dataset (test split), decoded as ISO-8859-1; only 'text' is used.
df2 = pd.read_csv("/kaggle/input/sentiment-analysis-dataset/test.csv", encoding='ISO-8859-1')
df2 = df2['text']

# Sentiment dataset (train split), same encoding and column.
df3 = pd.read_csv("/kaggle/input/sentiment-analysis-dataset/train.csv", encoding='ISO-8859-1')
df3 = df3['text']

# Stack everything into one Series with a fresh 0..n-1 index.
df = pd.concat([df1, df2, df3], ignore_index=True)

# Remaining gaps become empty strings, and astype(str) guarantees that
# clean_text always receives a str.
df = df.fillna('').astype(str).apply(clean_text)

# Tokenize the cleaned corpus: each document becomes a list of tokens
# produced by gensim's simple_preprocess with English stop words removed.
#
# NOTE: earlier experiments (kept here only as a reminder) read a
# pre-cleaned 'CleanText_BERT' column, or trained on a 50% random sample:
#   df.sample(frac=0.5, random_state=42).to_list()
docs = [
    [token for token in simple_preprocess(raw_text) if token not in stop_words]
    for raw_text in df.to_list()
]



In [None]:
# Map every distinct token to an integer id, then encode each tokenized
# document as a bag-of-words list of (token_id, count) pairs.
dictionary = Dictionary(docs)
corpus = [dictionary.doc2bow(doc) for doc in docs]

# Train the LDA model.
# The topic count is spelled out instead of relying on the library default
# (100, so the number of topics is unchanged), and the random seed is fixed
# so the topics, the coherence score and the saved model are repeatable
# from one run to the next.
NUM_TOPICS = 100
RANDOM_STATE = 42
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=NUM_TOPICS,
    random_state=RANDOM_STATE,
)

In [None]:
# Print the 10 highest-weighted (word, probability) pairs of every topic,
# one topic per line, in topic-id order.
for topic_id in range(lda_model.num_topics):
    print(lda_model.show_topic(topic_id, topn=10))


In [None]:
# Score the trained model with the c_v coherence measure, computed from the
# tokenized documents and the dictionary used for training.
coherence_model = CoherenceModel(model=lda_model, texts=docs, dictionary=dictionary, coherence='c_v')
coherence_score = coherence_model.get_coherence()

# Report the score rounded to four decimal places.
print(f"Coherence Score (c_v): {coherence_score:.4f}")

In [None]:
# Persist the trained model, the dictionary and the bag-of-words corpus,
# each in its own sub-directory of a single output root.
OUTPUT_DIR = "/kaggle/working/topic_modeling_lda"
MODEL_DIR = os.path.join(OUTPUT_DIR, "model")
DICT_DIR = os.path.join(OUTPUT_DIR, "dic")
CORPUS_DIR = os.path.join(OUTPUT_DIR, "corpus")

# makedirs creates missing parents, so the root needs no separate call;
# exist_ok=True keeps re-running this cell harmless.
for artifact_dir in (MODEL_DIR, DICT_DIR, CORPUS_DIR):
    os.makedirs(artifact_dir, exist_ok=True)

lda_model.save(os.path.join(MODEL_DIR, "lda_model_gensim.model"))
dictionary.save(os.path.join(DICT_DIR, "diccionario.dict"))
# Matrix Market format for the corpus.
MmCorpus.serialize(os.path.join(CORPUS_DIR, "corpus.mm"), corpus)
