# Topic_Modeling_Bertopic_opt

Optimizador del modelo BERTopic para el modelado de temas

In [None]:
!pip install bertopic

In [None]:
!pip install -qU langdetect

In [None]:
import pandas as pd
from bertopic import BERTopic
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from umap import UMAP

from gensim.models import CoherenceModel
import gensim
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.matutils import corpus2csc

from sentence_transformers import SentenceTransformer

In [None]:
def clean_for_bertopic(text):
    """
    Lightly clean a document for BERTopic (BERT-based models).

    Stopwords and sentence structure are preserved so the embedding model
    still sees natural text. Whitespace left behind by removed spans is not
    collapsed; only the ends of the string are stripped.

    Removes: HTML tags and comments, URLs, @mentions, and the ``#`` symbol
    (the hashtag word itself is kept).

    Args:
        text: The raw document as a string.

    Returns:
        The cleaned, stripped, lower-cased string.
    """
    import re

    # Only strip things that look like real markup. A bare `<.*?>` would also
    # swallow ordinary text between a "<" and a later ">" (e.g. "<3 ... >:(").
    text = re.sub(r"<!--[\s\S]*?-->|</?[A-Za-z][^>]*>", '', text)
    # Require a scheme or a "www." prefix at a word boundary so that words
    # merely containing "www" (e.g. "awwww") are left intact.
    text = re.sub(r"https?://\S+|\bwww\.\S+", '', text, flags=re.IGNORECASE)
    text = re.sub(r"@\w+", '', text)  # Remove mentions
    text = text.replace("#", '')  # Remove hashtag symbol only
    return text.strip().lower()

In [None]:
# Build a single corpus (a pandas Series of strings) from three Kaggle CSVs.

# News articles: title and body are joined into one document per row.
df1 = pd.read_csv("/kaggle/input/fake-or-real-news/fake_or_real_news.csv")
df1 = df1['title'] + "\n" + df1['text']

# Sentiment dataset (test and train splits): only the 'text' column is used.
df2 = pd.read_csv("/kaggle/input/sentiment-analysis-dataset/test.csv", encoding='ISO-8859-1')
df2 = df2['text']

df3 = pd.read_csv("/kaggle/input/sentiment-analysis-dataset/train.csv", encoding='ISO-8859-1')
df3 = df3['text']

df = pd.concat([df1, df2, df3], ignore_index=True)

# Missing values become empty strings so every entry can be cleaned as text.
df = df.fillna('').astype(str).apply(clean_for_bertopic)

# Reproducible random sample holding 50% of the corpus (frac=0.5).
docs_subset = df.sample(frac=0.5, random_state=42).to_list()

# The full corpus as a plain list of strings.
docs = df.to_list()



In [None]:
from sentence_transformers import SentenceTransformer, util
import numpy as np
import torch

# Sentence encoders already loaded by semantic_coherence, keyed by
# (model_name, device). Reusing them avoids reloading the model on every call;
# the cached models stay in memory for the lifetime of the process.
_SEMANTIC_ENCODERS = {}


def semantic_coherence(topic_model, model_name='all-mpnet-base-v2', top_n=10):
    """
    Score a topic model by the embedding similarity of its topic words.

    For each topic other than -1, the first ``top_n`` words are embedded with
    a SentenceTransformer and the mean pairwise cosine similarity between them
    is computed. The result is the mean of those per-topic scores.

    Args:
        topic_model: Object exposing ``get_topic_info()`` (with a "Topic"
            column) and ``get_topic(topic_id)`` returning (word, weight) pairs.
        model_name: SentenceTransformer model used to embed the words.
        top_n: Number of leading words taken from each topic.

    Returns:
        The mean per-topic similarity as a float, or NaN when no topic has at
        least two words to compare.
    """
    word_lists = []
    for topic_id in topic_model.get_topic_info()["Topic"]:
        if topic_id == -1:
            continue
        # A falsy result (nothing stored for this id) is treated as no words.
        ranked = topic_model.get_topic(topic_id) or []
        # BERTopic may pad short topics with empty-string placeholders; those
        # carry no meaning and would distort the similarity average.
        words = [word for word, _ in ranked[:top_n] if word]
        # At least two words are needed to form a pair.
        if len(words) >= 2:
            word_lists.append(words)

    if not word_lists:
        # Nothing to score: return NaN explicitly instead of letting
        # np.mean([]) emit a RuntimeWarning, and skip loading the encoder.
        return float("nan")

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    cache_key = (model_name, device)
    model = _SEMANTIC_ENCODERS.get(cache_key)
    if model is None:
        model = SentenceTransformer(model_name).to(device)
        _SEMANTIC_ENCODERS[cache_key] = model

    scores = []
    for words in word_lists:
        embeddings = model.encode(words, convert_to_tensor=True, device=device)
        sim_matrix = util.pytorch_cos_sim(embeddings, embeddings).cpu().numpy()
        # Strict upper triangle: each unordered word pair once, no diagonal.
        pairwise = sim_matrix[np.triu_indices_from(sim_matrix, k=1)]
        scores.append(pairwise.mean())

    return float(np.mean(scores))

In [None]:
import optuna
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic import BERTopic

def coherence_score(topic_model, docs_tokenized, dictionary):
    """
    Compute the gensim ``c_v`` coherence of a fitted topic model.

    Args:
        topic_model: Object exposing ``get_topics()`` (mapping topic id to
            (word, weight) pairs) and ``get_topic_info()`` (with a "Topic"
            column).
        docs_tokenized: The corpus as a list of token lists.
        dictionary: gensim ``Dictionary`` built over ``docs_tokenized``.

    Returns:
        The value returned by ``CoherenceModel.get_coherence()``.
    """
    # Fetch the topic representations once rather than on every iteration.
    topic_words = topic_model.get_topics()

    topics = []
    for tid in topic_model.get_topic_info()["Topic"]:
        # Skip topic -1 and any id without a stored representation.
        if tid == -1 or tid not in topic_words:
            continue
        # Drop empty-string placeholder words: they are not real tokens, so
        # they cannot be looked up in the dictionary.
        words = [w for w, _ in topic_words[tid] if w]
        if words:
            topics.append(words)

    cm = CoherenceModel(
        topics=topics,
        texts=docs_tokenized,
        dictionary=dictionary,
        coherence="c_v"
    )
    return cm.get_coherence()

def objective(trial):
    """
    Optuna objective: fit BERTopic on ``docs_subset`` and score the result.

    Samples UMAP, HDBSCAN and embedding-model hyperparameters from ``trial``,
    fits a BERTopic model on the module-level ``docs_subset``, and evaluates
    it with ``semantic_coherence``.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        A ``(coherence, n_topics)`` tuple: the semantic coherence of the
        fitted model and the number of distinct topics excluding -1.
    """
    n_nb = trial.suggest_int("n_neighbors", 5, 45)
    nc = trial.suggest_int("n_components", 5, 30)
    mcs = trial.suggest_int("min_cluster_size", 5, 35)
    # The upper bound is tied to min_cluster_size, so min_samples never
    # exceeds it.
    msamp = trial.suggest_int("min_samples", 5, mcs)
    embedding = trial.suggest_categorical(
        "embedding",
        ["all-mpnet-base-v2", "all-MiniLM-L6-v2", "paraphrase-MiniLM-L3-v2"],
    )

    vectorizer_model = CountVectorizer(stop_words="english")
    umap_model = UMAP(n_neighbors=n_nb, n_components=nc)
    hdbscan_model = HDBSCAN(min_cluster_size=mcs, min_samples=msamp, prediction_data=True)

    topic_model = BERTopic(
        vectorizer_model=vectorizer_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        embedding_model=embedding,
        verbose=False,
        language='english'
    )

    topics, _ = topic_model.fit_transform(docs_subset)

    # Count distinct topics, ignoring -1.
    n_topics = len({topic for topic in topics if topic != -1})
    print(n_topics)

    # The corpus is deliberately not tokenized here: the semantic score works
    # on the topic words alone, so a per-trial token list or gensim Dictionary
    # would be built and then thrown away.
    coherence = semantic_coherence(topic_model)

    return coherence, n_topics

# Multi-objective search: both values returned by `objective` are maximized.
study = optuna.create_study(
    directions=["maximize", "maximize"],
)
study.optimize(
    objective,
    n_trials=1000,
)

# Print the pair of objective values recorded for every trial.
for trial in study.trials:
    print(trial.values)