In [1]:
# Librerías para manipulación de datos
import pandas as pd
import numpy as np

# Librerías para procesamiento de texto
import re
import string
import langdetect
from langdetect import detect, DetectorFactory
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords

# Extracción de palabras clave
import yake
from yake import KeywordExtractor
from rake_nltk import Rake

# Similitud semántica
from sentence_transformers import SentenceTransformer, util

# Visualización y análisis
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Modelado y evaluación
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer
import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix

# Utilidades
from tqdm import tqdm
import os
import time
# Make sure the NLTK 'stopwords' corpus is available locally.
# NOTE(review): `stopwords` is imported above but is not used in the visible cells — confirm it is still needed.
nltk.download('stopwords')


  from tqdm.autonotebook import tqdm, trange
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jhonr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# ==================== Seed and device configuration ====================
import random

SEED = 42  # single source of truth so every RNG below gets the same value

# Seed every random number generator the notebook may touch.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# Ask cuDNN for deterministic kernels. The auto-tuner has to be switched off
# as well, otherwise cuDNN may still select different algorithms between runs.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Dispositivo en uso: {device}")

Dispositivo en uso: cuda


In [4]:
def limpiar_yake(texto):
    """Normalise a raw abstract before YAKE keyword extraction.

    Steps, in order: strip HTML markup, flatten line breaks, delete the
    section-heading words (introduction, methods, results, discussion,
    conclusion) wherever they occur, drop every character that is not a word
    character, whitespace, '.' or ',', collapse runs of whitespace and
    lower-case the result.

    Args:
        texto: Raw abstract. Anything that is not a non-blank string
            (None, NaN, numbers, '') is treated as missing.

    Returns:
        str: The cleaned, lower-cased text, or the placeholder 'Texto vacío'
        when the input is missing or blank.
    """
    # NaN (pandas' marker for a missing cell) is truthy and has no .strip(),
    # so the type has to be checked before the emptiness test.
    if not isinstance(texto, str) or not texto.strip():
        return 'Texto vacío'
    texto = BeautifulSoup(texto, "html.parser").get_text()
    texto = texto.replace("\n", " ").replace("\r", " ")
    # Heading words are removed anywhere in the text, not only at line starts.
    texto = re.sub(r'\b(introduction|methods|results|discussion|conclusion)\b', '', texto, flags=re.IGNORECASE)
    # Keep only word characters, whitespace, periods and commas.
    # Hyphens are removed too, so "off-grid" becomes "offgrid".
    texto = re.sub(r'[^\w\s.,]', '', texto)
    texto = re.sub(r'\s+', ' ', texto).strip()
    return texto.lower()



In [9]:
# ==================== Preprocessing for BERT ====================
# Pin langdetect's DetectorFactory seed — presumably so language detection is
# reproducible between runs.
# NOTE(review): neither `detect` nor `DetectorFactory` is used anywhere else in
# the visible cells; `contiene_ingles` relies on a stop-word heuristic instead.
DetectorFactory.seed = 0

# Common English function words used as a cheap language signal.
_PALABRAS_INGLES = frozenset(
    ["the", "is", "are", "this", "with", "and", "from", "to", "of", "that"]
)


def contiene_ingles(texto):
    """Heuristically decide whether a text contains English.

    The text is lower-cased and split on whitespace; it counts as English when
    at least one token is exactly one of a small set of English function
    words. Tokens with attached punctuation (e.g. "the,") do not match.

    Args:
        texto: Text to inspect. Non-string values (None, NaN, numbers) are
            treated as "not English".

    Returns:
        bool: True if any English marker word is present, otherwise False.
    """
    # Explicit type check instead of a bare `except:` — only the non-string
    # case is expected here, and real errors should not be silenced.
    if not isinstance(texto, str):
        return False
    return not _PALABRAS_INGLES.isdisjoint(texto.lower().split())

def limpiar_bert(texto):
    """Normalise a raw abstract before it is passed to the BERT classifiers.

    Steps, in order: strip HTML markup, lower-case, delete section-heading
    words (plus any trailing colon/whitespace), delete a few boilerplate
    openers such as "this study aims" or "we present", drop every character
    outside a-z, 0-9, whitespace, '.', ',' and '-', and collapse whitespace.

    Args:
        texto: Raw abstract. Anything that is not a non-blank string
            (None, NaN, numbers, '') is treated as missing.

    Returns:
        str: The cleaned text, or the placeholder 'Texto vacío' when the input
        is missing or blank. The result may be an empty string if cleaning
        removes everything.
    """
    # NaN (pandas' marker for a missing cell) is truthy and has no .strip(),
    # so the type has to be checked before the emptiness test.
    if not isinstance(texto, str) or not texto.strip():
        return 'Texto vacío'

    texto_limpio = BeautifulSoup(texto, "html.parser").get_text()
    texto_limpio = texto_limpio.lower()

    # Section headings are removed wherever they occur, together with the
    # colon/whitespace that follows them.
    texto_limpio = re.sub(
        r'\b(introduction|aims|objectives|methods|results|discussion|conclusion|background|purpose|scope)\b[:\s]*',
        '', texto_limpio, flags=re.IGNORECASE
    )

    # Boilerplate sentence openers; the text is already lower-case here.
    texto_limpio = re.sub(
        r'\b(this study (aims|explores|presents)|we (present|explore|investigate)|the purpose of this (study|paper))\b',
        '', texto_limpio
    )

    # Keep ASCII letters, digits, whitespace and minimal punctuation only.
    texto_limpio = re.sub(r'[^a-z0-9\s.,-]', '', texto_limpio)
    texto_limpio = re.sub(r'\s+', ' ', texto_limpio).strip()
    return texto_limpio


In [16]:
# ==================== Load dataset ====================
file_path = r"C:\Users\Jhonr\Project_ODS_BERT\Data_total\Data_Inferencia_2018_2024_medicina_ingenieria.xlsx"
df_total = pd.read_excel(file_path)

# YAKE input: every abstract goes through the YAKE-specific cleaner.
df_total['Abstract_YAKE'] = df_total['Abstract'].map(limpiar_yake)

# BERT input: only abstracts that pass the English heuristic are cleaned;
# the rest are stored as None so later stages can skip them.
df_total['Abstract_BERT'] = [
    limpiar_bert(resumen) if contiene_ingles(resumen) else None
    for resumen in df_total['Abstract']
]


In [17]:
# ==================== YAKE inference ====================
from yake import KeywordExtractor

# Two English YAKE extractors, each returning its top 10 candidates:
# one configured for unigrams (n=1) and one for bigrams (n=2).
extractor_n1, extractor_n2 = (
    KeywordExtractor(lan="en", n=tamano, top=10) for tamano in (1, 2)
)

# Trigger terms per SDG ("ODS") label. `clasificar_yake` walks this dict in
# insertion order and returns the first label with a matching term, so the
# order of the keys is significant.
# Terms are compared against keywords extracted from text cleaned by
# `limpiar_yake`, which lower-cases and deletes hyphens; each hyphenated term
# therefore also needs its hyphen-free spelling to be matchable.
ods_keywords = {
    "ODS 6": [
        "clean water", "drinking water", "water sanitation", "water access",
        "safe water", "water purification", "wastewater treatment", "sanitation services",
        "hygiene promotion", "water supply", "water scarcity", "potable water",
        "water", "sanitation", "hygiene", "wastewater"
    ],
    "ODS 7": [
        "renewable energy", "solar energy", "wind power", "energy efficiency",
        "electricity access", "clean energy", "energy grid", "sustainable energy",
        "off-grid systems", "offgrid systems",  # second form is what survives cleaning
        "green energy", "solar panels", "smart grid",
        "energy", "electricity", "solar", "wind"
    ],
    "ODS 13": [
        "climate change", "global warming", "carbon emissions", "climate adaptation",
        "emission reduction", "climate resilience", "greenhouse gases",
        "low carbon", "carbon footprint", "climate mitigation", "net zero",
        "environmental sustainability", "climate", "carbon", "emissions"
    ]
}

def clasificar_yake(texto):
    """Assign an SDG label from YAKE keywords, or None when nothing matches.

    Keywords are extracted with both module-level extractors (`extractor_n1`,
    then `extractor_n2`), lower-cased and compared against the term lists in
    `ods_keywords`. Labels are tried in the dict's insertion order and the
    first one sharing at least one exact term with the keywords is returned.

    Args:
        texto: Cleaned abstract text.

    Returns:
        The matching key of `ods_keywords`, or None for empty/blank input or
        when no term matches.
    """
    if not texto or not texto.strip():
        return None

    # Pool the lower-cased keyword strings of both extractors (item 0 of each result).
    candidatos = {
        resultado[0].lower()
        for extractor in (extractor_n1, extractor_n2)
        for resultado in extractor.extract_keywords(texto)
    }

    # First label (in dict order) with any term among the candidates wins.
    return next(
        (ods for ods, terminos in ods_keywords.items() if not candidatos.isdisjoint(terminos)),
        None,
    )

# Keyword-based SDG label for every cleaned abstract (None when no term matches).
df_total['prediccion_yake'] = df_total['Abstract_YAKE'].map(clasificar_yake)


In [18]:
# ==================== Load BERT models ====================
# Raw strings keep backslashes literal, so they must NOT be doubled: the
# previous r"C:\\Users\\..." form produced a path containing "\\" separators.
modelo_bert_9_otros_path = r"C:\Users\Jhonr\Project_ODS_BERT\Modelos\BERT_9_OTROS_UNIFICADO"
modelo_bert_ods3_path = r"C:\Users\Jhonr\Project_ODS_BERT\Modelos\BERT_ODS3_Medicina"

# General classifier; its labels are read from the checkpoint's id2label mapping.
# .eval() makes the inference-only intent explicit (dropout disabled).
modelo_bert_9_otros = AutoModelForSequenceClassification.from_pretrained(modelo_bert_9_otros_path).to(device).eval()
tokenizer_bert_9_otros = AutoTokenizer.from_pretrained(modelo_bert_9_otros_path)

# Specialised classifier consulted first for the "ODS 3" label.
modelo_bert_ods3 = AutoModelForSequenceClassification.from_pretrained(modelo_bert_ods3_path).to(device).eval()
tokenizer_bert_ods3 = AutoTokenizer.from_pretrained(modelo_bert_ods3_path)

In [19]:
# ==================== Función para predecir con modelo BERT ====================
def inferir_con_modelo(texto, modelo, tokenizer, max_length=256):
    """Classify one text with a sequence-classification model.

    Args:
        texto: Text to classify.
        modelo: Model whose forward call returns an object with `.logits` and
            whose `config.id2label` maps class indices to label strings.
        tokenizer: Callable tokenizer matching `modelo`; must return a mapping
            of tensors when called with `return_tensors="pt"`.
        max_length: Maximum number of tokens kept after truncation.

    Returns:
        The label string `modelo.config.id2label[...]` of the highest logit.
    """
    # Send the inputs to the device the model actually lives on instead of
    # relying on a notebook-global `device` that might not match it.
    destino = getattr(modelo, "device", None)
    if destino is None:
        destino = next(modelo.parameters()).device

    inputs = tokenizer(texto, return_tensors="pt", truncation=True, padding=True, max_length=max_length)
    inputs = {k: v.to(destino) for k, v in inputs.items()}
    with torch.no_grad():  # inference only: no gradient bookkeeping
        logits = modelo(**inputs).logits
    pred = torch.argmax(logits, dim=1).item()
    return modelo.config.id2label[pred]


In [20]:
# ==================== Clasificación final ====================
def clasificar_resumen(row):
    """Produce the final SDG label for one DataFrame row.

    Cascade:
      1. If the YAKE prediction is "ODS 6", "ODS 7" or "ODS 13", return it.
      2. Otherwise, if usable BERT text exists, ask the ODS 3 model; return
         "ODS 3" when that is its prediction.
      3. Otherwise return the prediction of the general model.
      4. Rows without usable BERT text are labelled "Sin Clasificar".

    Args:
        row: Mapping-like row exposing 'prediccion_yake' and 'Abstract_BERT'.

    Returns:
        str: The predicted label, or "Sin Clasificar".
    """
    etiqueta_yake = row['prediccion_yake']
    if etiqueta_yake in ("ODS 6", "ODS 7", "ODS 13"):
        return etiqueta_yake

    texto = row['Abstract_BERT']
    # A plain truthiness test would let NaN through (NaN is truthy) and crash
    # the tokenizer, so require a real non-empty string.
    if not isinstance(texto, str) or not texto:
        return "Sin Clasificar"

    if inferir_con_modelo(texto, modelo_bert_ods3, tokenizer_bert_ods3) == "ODS 3":
        return "ODS 3"

    return inferir_con_modelo(texto, modelo_bert_9_otros, tokenizer_bert_9_otros)



In [21]:
# Run the classification cascade over every row and report the elapsed time.
import time

# perf_counter is a monotonic clock meant for measuring intervals; time.time()
# can jump if the system clock is adjusted while inference is running.
inicio = time.perf_counter()
df_total['ODS_Predicho'] = df_total.apply(clasificar_resumen, axis=1)
fin = time.perf_counter()
print(f"Tiempo total de inferencia: {round((fin - inicio) / 60, 2)} minutos")


Tiempo total de inferencia: 7.07 minutos


In [22]:
# Show how many abstracts ended up under each predicted label.
print(
    "\nDistribución de etiquetas predichas:",
    df_total['ODS_Predicho'].value_counts(),
    sep="\n",
)



Distribución de etiquetas predichas:
ODS_Predicho
ODS 3             15905
ODS 9              4377
OTROS              1692
ODS 7               710
ODS 6               344
ODS 13              197
Sin Clasificar        4
Name: count, dtype: int64


In [24]:
# ==================== SDG distribution by area ====================
# Count rows per (Area, ODS_Predicho) pair, then pivot the labels into columns;
# combinations that never occur are filled with 0.
conteo_area_ods = (
    df_total
    .groupby(by=["Area", "ODS_Predicho"])
    .size()
    .unstack(fill_value=0)
)

# Render the table
display(conteo_area_ods)


ODS_Predicho,ODS 13,ODS 3,ODS 6,ODS 7,ODS 9,OTROS,Sin Clasificar
Area,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Ingenieria,153,1101,264,676,4132,1212,1
Medicina,44,14804,80,34,245,480,3


In [25]:
import os

# Destination folder and workbook for the predictions
output_folder = r"C:\Users\Jhonr\Project_ODS_BERT\Resultados_2018-2024"
output_path = os.path.join(output_folder, "Predicciones_2018_2024_Modelo_Final.xlsx")

# Create the folder when missing (no error if it already exists), then write
# the whole DataFrame without the index column.
os.makedirs(output_folder, exist_ok=True)
df_total.to_excel(output_path, index=False)

print(f"Archivo guardado en: {output_path}")


Archivo guardado en: C:\Users\Jhonr\Project_ODS_BERT\Resultados_2018-2024\Predicciones_2018_2024_Modelo_Final.xlsx
