# Final

Notebook en el que se cargan y se prueban todos los modelos.

In [1]:
!pip install bertopic

Collecting bertopic
  Downloading bertopic-0.17.3-py3-none-any.whl.metadata (24 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cusolver_cu1

In [2]:
import joblib

import torch

from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

from gensim.models import LdaModel
from gensim.corpora import Dictionary
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
# English stop words (NLTK corpus); tokens found in this set are discarded
# when a document is preprocessed for the LDA model.
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)

from transformers import BertForSequenceClassification, BertTokenizer

class Models:
    """Load every trained model once and expose one prediction method per task.

    Tasks covered:
      * topic modeling with BERTopic (``predict_bert_topic``) and gensim LDA
        (``predict_lda_topic``); both print their result,
      * sentiment classification with a fine-tuned BERT
        (``predict_bert_sentiment``) or a joblib-serialized classifier
        (``predict_ml_sentiment``),
      * fake-news classification with a fine-tuned BERT
        (``predict_bert_fake_news``) or a joblib-serialized classifier
        (``predict_ml_fake_news``).

    All artifacts are read from fixed ``/kaggle/input/...`` paths, so the
    constructor only works where those datasets are attached.
    """

    # Values accepted by the ``slc_model`` argument of the classical-ML predictors.
    _ML_MODEL_KEYS = ('svm', 'knn', 'lr', 'nb', 'rf')

    def __init__(self):
        """Load all models from disk and place the BERT classifiers on ``self.device``."""
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        self.bert_topic_model = BERTopic.load(
            "/kaggle/input/bertopic/transformers/default/1/modelo_bertopic",
            embedding_model="all-MiniLM-L6-v2",
        )

        self.lda_model = LdaModel.load("/kaggle/input/topic_modeling_lda/other/default/1/topic_modeling_lda/model/lda_model_gensim.model")
        self.lda_dictionary = Dictionary.load("/kaggle/input/topic_modeling_lda/other/default/1/topic_modeling_lda/dic/diccionario.dict")

        # NOTE(security): joblib.load unpickles arbitrary objects; only load
        # files that come from a trusted source.
        self.svm_sentiment_model = joblib.load("/kaggle/input/ml_sentiment/scikitlearn/default/1/bert_sentiment_model/svm/svm.joblib")
        self.knn_sentiment_model = joblib.load("/kaggle/input/ml_sentiment/scikitlearn/default/1/bert_sentiment_model/knn/knn.joblib")
        self.lr_sentiment_model = joblib.load("/kaggle/input/ml_sentiment/scikitlearn/default/1/bert_sentiment_model/lr/lr.joblib")
        self.nb_sentiment_model = joblib.load("/kaggle/input/ml_sentiment/scikitlearn/default/1/bert_sentiment_model/nb/nb.joblib")
        self.rf_sentiment_model = joblib.load("/kaggle/input/ml_sentiment/scikitlearn/default/1/bert_sentiment_model/rf/rf.joblib")

        self.bert_sentiment_tokenizer = BertTokenizer.from_pretrained("/kaggle/input/bert_sentiment/transformers/default/1/bert_sentiment_model/tokenizer/bert_tokenizer")
        # The model must live on the same device the input tensors are moved
        # to in __predict_bert; otherwise inference fails when CUDA is available.
        self.bert_sentiment_model = BertForSequenceClassification.from_pretrained("/kaggle/input/bert_sentiment/transformers/default/1/bert_sentiment_model/model/bert_sentiment_model").to(self.device)
        self.bert_sentiment_model.eval()
        self.id2label_sentiment = {0: "Negativo", 1: "Neutro", 2: "Positivo"}

        self.bert_fake_news_tokenizer = BertTokenizer.from_pretrained("/kaggle/input/bert_fake/transformers/default/1/bert_fake_news/tokenizer/bert_tokenizer")
        self.bert_fake_news_model = BertForSequenceClassification.from_pretrained("/kaggle/input/bert_fake/transformers/default/1/bert_fake_news/model/bert_fake_news_model").to(self.device)
        self.bert_fake_news_model.eval()
        self.id2label_fake_news = {0: "Falso", 1: "Real"}

        self.svm_fake_news_model = joblib.load("/kaggle/input/ml_fake/scikitlearn/default/1/ml_fake_news/svm/svm.joblib")
        self.knn_fake_news_model = joblib.load("/kaggle/input/ml_fake/scikitlearn/default/1/ml_fake_news/knn/knn.joblib")
        self.lr_fake_news_model = joblib.load("/kaggle/input/ml_fake/scikitlearn/default/1/ml_fake_news/lr/lr.joblib")
        self.nb_fake_news_model = joblib.load("/kaggle/input/ml_fake/scikitlearn/default/1/ml_fake_news/nb/nb.joblib")
        self.rf_fake_news_model = joblib.load("/kaggle/input/ml_fake/scikitlearn/default/1/ml_fake_news/rf/rf.joblib")

    def predict_bert_topic(self, doc):
        """Print the BERTopic topic assigned to ``doc`` and its top words.

        The probabilities returned by ``transform`` are printed first. Only
        the first prediction is reported. A prediction of ``-1`` is reported
        as "No asignado".

        Args:
            doc: Document passed as-is to ``BERTopic.transform``.
        """
        topic, probs = self.bert_topic_model.transform(doc)
        print(probs)
        # `topic` is a sequence of predictions; comparing the sequence itself
        # against -1 never detects an outlier, so inspect its first element.
        topic_id = topic[0]
        if topic_id != -1:
            tema = self.bert_topic_model.get_topic(topic_id)
            palabras = [palabra for palabra, _ in tema]
            print(f"Documento: {doc}\n→ Tópico {topic_id}: {', '.join(palabras[:10])}\n")
        else:
            print(f"Documento: {doc}\n→ Tópico: No asignado\n")

    def __get_topics_lda(self, num_words=10):
        """Return ``{topic_id: "word1, word2, ..."}`` for every LDA topic.

        Args:
            num_words: Number of words used to build each label.
        """
        num_topics = self.lda_model.num_topics
        topics = self.lda_model.show_topics(num_topics=num_topics, formatted=False, num_words=num_words)
        return {
            topic_id: ", ".join(word for word, _ in words)
            for topic_id, words in topics
        }

    def __preprocess_lda(self, docs):
        """Tokenize ``docs`` with ``simple_preprocess`` and drop stop words."""
        return [word for word in simple_preprocess(docs) if word not in stop_words]

    def __predict_topic_with_label(self, text, topic_labels):
        """Return ``(topic_id, label)`` of the highest-weighted LDA topic for ``text``.

        Returns ``(None, None)`` when the model yields no topic distribution.
        A topic id missing from ``topic_labels`` is labelled "Tema desconocido".
        """
        bow = self.lda_dictionary.doc2bow(self.__preprocess_lda(text))
        topic_dist = self.lda_model.get_document_topics(bow)
        if not topic_dist:
            return None, None
        # Each entry is indexed as (topic_id, weight); keep the heaviest one.
        main_topic_id = max(topic_dist, key=lambda pair: pair[1])[0]
        return main_topic_id, topic_labels.get(main_topic_id, "Tema desconocido")

    def predict_lda_topic(self, texto):
        """Print the dominant LDA topic id and its word label for ``texto``."""
        topic_labels = self.__get_topics_lda()

        topic_id, topic_name = self.__predict_topic_with_label(texto, topic_labels)

        print(f"Tema predicho: {topic_id} - {topic_name}")

    def __predict_bert(self, texto, model, tokenizer, max_lenght, id2label):
        """Classify ``texto`` with a BERT sequence classifier.

        Args:
            texto: Text to classify.
            model: Classifier; expected to already be on ``self.device``.
            tokenizer: Tokenizer matching ``model``.
            max_lenght: Token length the input is padded/truncated to
                (parameter name kept as-is for compatibility).
            id2label: Mapping from predicted class index to label.

        Returns:
            The label in ``id2label`` for the highest-scoring class.
        """
        inputs = tokenizer(
            texto,
            add_special_tokens=True,
            max_length=max_lenght,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Move the tensors to the selected device (CPU or GPU).
        input_ids = inputs["input_ids"].to(self.device)
        attention_mask = inputs["attention_mask"].to(self.device)

        # Disable gradient tracking for inference.
        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            pred_id = torch.argmax(outputs.logits, dim=1).item()

        return id2label[pred_id]

    def __predict_ml(self, text, slc_model, task, id2label):
        """Run the ``<slc_model>_<task>_model`` classifier on ``text`` and map its output to a label.

        Raises:
            ValueError: If ``slc_model`` is not one of ``_ML_MODEL_KEYS``.
        """
        if slc_model not in self._ML_MODEL_KEYS:
            raise ValueError(
                f"Unknown slc_model {slc_model!r}; expected one of {self._ML_MODEL_KEYS}"
            )
        classifier = getattr(self, f"{slc_model}_{task}_model")
        # The classifiers are called with a one-element batch of raw text.
        return id2label[classifier.predict([text])[0]]

    def predict_bert_sentiment(self, text):
        """Return the sentiment label predicted by the BERT model (inputs limited to 128 tokens)."""
        return self.__predict_bert(text, self.bert_sentiment_model, self.bert_sentiment_tokenizer, 128, self.id2label_sentiment)

    def predict_ml_sentiment(self, text, slc_model='svm'):
        """Return the sentiment label predicted by a classical classifier.

        Args:
            text: Text to classify.
            slc_model: One of 'svm', 'knn', 'lr', 'nb', 'rf'.

        Raises:
            ValueError: If ``slc_model`` is not a supported key.
        """
        return self.__predict_ml(text, slc_model, 'sentiment', self.id2label_sentiment)

    def predict_bert_fake_news(self, text):
        """Return "Falso" or "Real" as predicted by the BERT model (inputs limited to 512 tokens)."""
        return self.__predict_bert(text, self.bert_fake_news_model, self.bert_fake_news_tokenizer, 512, self.id2label_fake_news)

    def predict_ml_fake_news(self, text, slc_model='svm'):
        """Return "Falso" or "Real" as predicted by a classical classifier.

        Args:
            text: Text to classify.
            slc_model: One of 'svm', 'knn', 'lr', 'nb', 'rf'.

        Raises:
            ValueError: If ``slc_model`` is not a supported key.
        """
        return self.__predict_ml(text, slc_model, 'fake_news', self.id2label_fake_news)

2025-07-13 09:48:09.541196: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752400089.767598      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752400089.834385      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [8]:
import time

# Loading every model is slow, so it is done outside the timed region.
model = Models()

texto = "A controversial new study published by the non-peer-reviewed journal Global Wellness Watch claims that major coffee brands have been embedding microscopic tracking chips inside coffee beans since 2021. The alleged goal: to monitor consumer drinking habits and location data in real time."

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() is wall-clock time and can jump if the system
# clock is adjusted.
start = time.perf_counter()
model.predict_bert_topic(texto)
end = time.perf_counter()

elapsed_ms = (end - start) * 1000  # convert to milliseconds
print(f"Tiempo transcurrido: {elapsed_ms:.2f} ms")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-07-13 09:53:12,485 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2025-07-13 09:53:12,495 - BERTopic - Dimensionality - Completed ✓
2025-07-13 09:53:12,496 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2025-07-13 09:53:12,498 - BERTopic - Cluster - Completed ✓


[0.89869101]
Documento: A controversial new study published by the non-peer-reviewed journal Global Wellness Watch claims that major coffee brands have been embedding microscopic tracking chips inside coffee beans since 2021. The alleged goal: to monitor consumer drinking habits and location data in real time.
→ Tópico 33: coffee, tea, starbucks, cup, espresso, iced, cups, good, having, baristas

Tiempo transcurrido: 57.63 ms
