## ***Modelado News-Clustering***

In [None]:
#!pip install umap-learn==0.5.6

In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import plotly.express as px
from umap import UMAP
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.mixture import GaussianMixture
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score, davies_bouldin_score

url_base = "https://raw.githubusercontent.com/lacamposm/Metodos-Estadisticos/main/data/"

In [2]:
def silhouette_scorer(estimator, X):
    """
    Calcula el Silhouette Score utilizando la métrica de coseno para evaluar la calidad de los clústeres.

    El Silhouette Score mide cómo de similar es un punto de datos a otros puntos de su mismo clúster en comparación
    con los puntos de otros clústeres. El valor oscila entre -1 y 1, donde un valor más alto indica que los puntos
    están mejor agrupados y más separados de otros clústeres.

    :param estimator: Un modelo de clustering que debe tener un método `predict` para obtener las etiquetas de los
    clústeres de las muestras de `X`.
    :param X: Un array-like o matriz de datos (n_samples, n_features) sobre los cuales se calcularán los clústeres.
    :return: El Silhouette Score para el conjunto de datos `X` y las etiquetas de los clústeres obtenidos del
    `estimator`. Se utiliza la métrica de coseno para calcular las distancias.
    """
    if not hasattr(estimator, "predict"):
        raise ValueError(f"El estimator: {estimator} no tiene metodo 'predict'")

    labels = estimator.predict(X)
    return silhouette_score(X, labels, metric="cosine")


def davies_bouldin_scorer(estimator, X):
    """
    Calcula el índice de Davies-Bouldin para evaluar la calidad de los clústeres.

    El índice de Davies-Bouldin mide la media del cociente entre las distancias intra-clúster e inter-clúster para cada
    clúster. Un valor más bajo del índice indica una mejor formación de los clústeres, ya que minimiza la dispersión
    dentro de los clústeres y maximiza la separación entre ellos.

    :param estimator: Un modelo de clustering que debe tener un método `predict` para obtener las etiquetas de los
    clústeres de las muestras de `X`.
    :param X: Un array-like o matriz de datos (n_samples, n_features) sobre los cuales se calcularán los clústeres.
    :return: El valor negativo del índice de Davies-Bouldin para el conjunto de datos `X` y las etiquetas de los
    clústeres obtenidos del `estimator`. Esto invierte la métrica para que pueda maximizarse en las búsquedas
    de hiperparámetros.
    """
    if not hasattr(estimator, "predict"):
        raise ValueError(f"El estimator: {estimator} no tiene metodo 'predict'")

    labels = estimator.predict(X)
    score = -davies_bouldin_score(X, labels)
    return score

In [3]:
def transform_dict_best_model(input_dict):
    """
    Transforma un diccionario de parámetros de un modelo en un nuevo formato más estructurado.

    Esta función toma un diccionario que contiene información sobre un modelo, incluyendo un preprocesador, un método
    de reducción de dimensiones y un agrupador (clusterer), junto con sus parámetros específicos. La función reorganiza
    esta información en un formato más accesible y fácil de usar.

    :param input_dict: Un diccionario que contiene los parámetros del modelo. Debe incluir las claves "preprocessor",
    "dim_reduction", y "clusterer", así como pares clave-valor para los parámetros del agrupador y la reducción de
    dimensiones que siguen el formato "clusterer__<param_name>" y "dim_reduction__<param_name>".
    :return: Un nuevo diccionario estructurado que contiene el preprocesador, la reducción de dimensiones, el agrupador,
    y los parámetros asociados a cada uno. La clave de los parámetros de agrupador y    de reducción de dimensiones
    se simplifica eliminando el prefijo correspondiente.
    """
    return {
        "preprocessor": input_dict["preprocessor"],
        "dim_reduction": input_dict["dim_reduction"],
        "clusterer": input_dict["clusterer"],
        "clusterer_params": {
            k.split("__")[-1]: v for k, v in input_dict.items()
            if k.startswith("clusterer__")
        },
        "dim_reduction_params": {
            k.split("__")[-1]: v for k, v in input_dict.items()
            if k.startswith("dim_reduction__")
        }
    }
    

def tsne_plot_2d(df, estimator=None, scorer=None, metric=None):
    """
    Genera un gráfico 2D utilizando t-SNE para visualizar embeddings en un espacio reducido.

    Esta función aplica un preprocesador a un DataFrame, seguido de una reducción de dimensionalidad mediante t-SNE.
    Se pueden visualizar los embeddings resultantes en un gráfico, con la opción de agregar etiquetas de clúster para
    distinguir diferentes grupos en los datos.

    :param df: DataFrame que contiene los datos a visualizar. Cada fila representa una observación y cada columna
    representa una característica o variable.
    :param estimator:
    :param scorer:
    :param metric
    """
    if estimator is not None:
        params_estimator = transform_dict_best_model(estimator.get_params())
        preprocessor = params_estimator.get("preprocessor")
        dim_reduction = params_estimator.get("dim_reduction")
        clusterer = params_estimator.get("clusterer")
        best_score = scorer(estimator, df)

        if isinstance(dim_reduction, PCA):
            pipeline = Pipeline([
                ("preprocessor", preprocessor),
                ("dim_reduction", dim_reduction),
            ])

            normalized_embeddings = pipeline.fit_transform(df)

        else:
            normalized_embeddings = preprocessor.fit_transform(df)

        tsne = TSNE(n_components=2, random_state=42, metric=metric)
        df_plot = pd.DataFrame(tsne.fit_transform(normalized_embeddings), columns=["tSNE1", "tSNE2"])
        df_plot["cluster"] = estimator.predict(df) + 1
        df_plot = df_plot.sort_values(by=["cluster"])
        df_plot["cluster"] = df_plot["cluster"].astype("string")
        title = (
            f"""<b>Clustering: News-Summary-Embeddings in Low dimension with t-SNE</b><br>"""
            f"""<span style='font-size: 11px;'>Scaler: {preprocessor.__class__.__name__}, """
            f"""Dim-Reduction: {dim_reduction.__class__.__name__}, Estimator: {clusterer.__class__.__name__}, Metric """
            f"""Plot: {metric}, Scorer Metric: {scorer.__name__}={best_score:.3f}</span>"""
        )
        color = "cluster"

    else:
        df_scaled = StandardScaler().fit_transform(df)
        tsne = TSNE(n_components=2, random_state=42, metric=metric)
        df_plot = pd.DataFrame(tsne.fit_transform(df_scaled), columns=["tSNE1", "tSNE2"])
        title = (
            f"""<b>News-Summary-Embeddings in Low dimension with t-SNE</b><br>"""
            f"""<span style='font-size: 10px;'>Scaler: {StandardScaler().__class__.__name__}"""
        )
        color = None

    (
        px.scatter(
            df_plot, x="tSNE1", y="tSNE2", color=color,
            title=title,
            opacity=0.8,
            color_discrete_sequence=px.colors.qualitative.Dark24,
            template="plotly_white"
        )
        .update_traces(marker=dict(size=3))
        .show()
    )
    

def umap_plot_2d(df, estimator=None, scorer=None, metric=None):
    """
    Función que genera un gráfico en 2D utilizando UMAP y muestra los clústeres.

    :param df: pd.DataFrame con los embeddings de los resumenes de las noticias).
    :param estimator:
    :param scorer:
    :param metric
    """
    if estimator is not None:
        params_estimator = transform_dict_best_model(estimator.get_params())
        preprocessor = params_estimator.get("preprocessor")
        dim_reduction = params_estimator.get("dim_reduction")
        clusterer = params_estimator.get("clusterer")
        best_score = scorer(estimator, df)

        if isinstance(dim_reduction, UMAP):
            df_scaled = preprocessor.fit_transform(df)
            dim_reduction.n_components = 2
            df_plot = pd.DataFrame(dim_reduction.fit_transform(df_scaled), columns=["UMAP1", "UMAP2"])
        else:
            pipeline = Pipeline([
                ("preprocessor", preprocessor),
                ("dim_reduction", dim_reduction),
            ])
            df_scaled = pipeline.fit_transform(df)
            reducer = UMAP(n_components=2, random_state=42, metric=metric)
            df_plot = pd.DataFrame(reducer.fit_transform(df_scaled), columns=["UMAP1", "UMAP2"])

        df_plot["cluster"] = estimator.predict(df) + 1
        df_plot = df_plot.sort_values(by=["cluster"])
        df_plot["cluster"] = df_plot["cluster"].astype("string")
        color = "cluster"
        title = (
            f"""<b>Clustering: News-Summary-Embeddings in Low dimension with UMAP</b><br>"""
            f"""<span style='font-size: 11px;'>Scaler: {preprocessor.__class__.__name__}, """
            f"""Dim-Reduction: {dim_reduction.__class__.__name__}, Estimator: {clusterer.__class__.__name__}, Metric:"""
            f""" {metric}, Scorer Metric: {scorer.__name__}={best_score:.3f}</span>"""

        )

    else:
        df_scaled = StandardScaler().fit_transform(df)
        reducer = UMAP(n_components=2, random_state=42)
        df_plot = pd.DataFrame(reducer.fit_transform(df_scaled), columns=["UMAP1", "UMAP2"])
        title = (
            f"""<b>News-Summary-Embeddings in Low dimension with UMAP</b><br>"""
            f"""<span style='font-size: 10px;'>Scaler: {StandardScaler().__class__.__name__}"""
        )
        color = None

    (
        px.scatter(
            df_plot, x="UMAP1", y="UMAP2", color=color,
            title=title,
            opacity=0.8,
            color_discrete_sequence=px.colors.qualitative.Dark24,
            template="plotly_white"
        )
        .update_traces(marker=dict(size=3))
        .show()
    )

In [4]:
n_iter = 10                                     # Número de ajustes en RandomizedSearchCV.
scorers = ["silhouette", "davies-bouldin"]      # Puedes seleccionar diferentes metrica de adecuación de clusters
random_state = 42                               # Semilla aletoria de RandomizedSearchCV para replicar resultados.


scoring_dict = {
    "silhouette": silhouette_scorer,
    "davies-bouldin": davies_bouldin_scorer,
}


list_scalers = [StandardScaler(), MinMaxScaler(), RobustScaler()]

pca_n_components = [num for num in range(5, 80)]
umap_metrics = ["cosine", "correlation"]
umap_n_neighbors = [5, 10, 15]
umap_min_dist = [0.1, 0.3, 0.5]
umap_n_components = [2, 3, 4]

gauss_mix_n_components = [num for num in range(5, 13)]
gauss_mix_covariance_type = ["full", "diag"]
kmeans_n_clusters = [num for num in range(5, 21)]

param_grid = [
    {
        "preprocessor": list_scalers,
        "dim_reduction": [PCA()],
        "dim_reduction__n_components": pca_n_components,
        "clusterer": [KMeans()],
        "clusterer__n_clusters": kmeans_n_clusters,
    },
    {
        "preprocessor": list_scalers,
        "dim_reduction": [PCA()],
        "dim_reduction__n_components": pca_n_components,
        "clusterer": [GaussianMixture()],
        "clusterer__n_components": gauss_mix_n_components,
        "clusterer__covariance_type": gauss_mix_covariance_type
    },
    {
        "preprocessor": list_scalers,
        "dim_reduction": [UMAP()],
        "dim_reduction__n_neighbors": umap_n_neighbors,
        "dim_reduction__min_dist": umap_min_dist,
        "dim_reduction__n_components": umap_n_components,
        "dim_reduction__metric": umap_metrics,
        "clusterer": [KMeans()],
        "clusterer__n_clusters": kmeans_n_clusters,
    },
    {
        "preprocessor": list_scalers,
        "dim_reduction": [UMAP()],
        "dim_reduction__n_neighbors": umap_n_neighbors,
        "dim_reduction__min_dist": umap_min_dist,
        "dim_reduction__n_components": umap_n_components,
        "dim_reduction__metric": umap_metrics,
        "clusterer": [GaussianMixture()],
        "clusterer__n_components": gauss_mix_n_components,
        "clusterer__covariance_type": gauss_mix_covariance_type
    }
]


pipeline = Pipeline([
        ("preprocessor", "passthrough"),
        ("dim_reduction", "passthrough"),
        ("clusterer", "passthrough")
    ])

random_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_grid,
    scoring=scoring_dict,
    cv=2,
    n_jobs=-1,
    verbose=3,
    n_iter=n_iter,
    refit="silhouette",
    random_state=random_state
)

In [5]:
df_embed = pd.read_parquet(url_base + "data/mxbai-embed-large_summary_news_el_tiempo.parquet")
df_embed

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
url_page,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
https://www.eltiempo.com/mundo/latinoamerica/evo-morales-amenaza-con-bloquear-bolivia-si-la-fiscalia-lo-captura-3389757,-0.020917,-0.006987,-0.026591,0.017555,0.032127,-0.046068,-0.048087,0.004656,0.005598,0.049441,...,0.028381,0.019583,0.007692,0.002656,0.050690,0.041930,0.001792,-0.013605,-0.002610,-0.016743
https://www.eltiempo.com/bogota/vehiculo-se-incendia-en-medio-de-la-carretera-en-fusagasuga-cundinamarca-bomberos-atienden-la-emergencia-3389756,-0.031880,0.027921,-0.016712,0.032918,0.024373,-0.010680,-0.058477,-0.027742,0.044773,0.041911,...,0.048250,0.040552,-0.037708,0.000493,0.010013,0.003768,0.026349,0.010262,-0.003502,0.014081
https://www.eltiempo.com/mundo/eeuu-y-canada/esta-es-la-edad-en-la-que-un-adulto-mayor-debe-dejar-de-conducir-harvard-3389502,-0.021267,0.007155,0.034475,0.030611,-0.008942,-0.016378,-0.011049,-0.016557,0.039848,0.030022,...,0.073989,0.016037,-0.018351,-0.029923,0.037829,-0.002450,-0.024424,0.009189,0.001148,-0.009378
https://www.eltiempo.com/mundo/europa/morire-en-prision-escribio-en-sus-memorias-el-opositor-ruso-alexei-navalni-3389755,-0.001724,-0.011231,0.006536,0.007330,0.013462,-0.022181,-0.013968,0.002475,0.010086,0.029852,...,0.043627,0.040211,-0.020006,0.009449,0.054445,0.034426,-0.022466,-0.010860,-0.025343,-0.013328
https://www.eltiempo.com/colombia/otras-ciudades/alcaldia-de-villa-de-leyva-en-boyaca-entrega-balance-tras-el-vendaval-de-ese-sabado-hay-12-personas-heridas-3389752,-0.015240,0.014905,0.014017,0.032451,0.006244,-0.013349,-0.052630,0.047903,0.035785,0.030179,...,-0.000029,0.047785,-0.012244,-0.007176,0.010913,0.012925,0.016821,0.001564,-0.004066,0.014183
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
https://www.eltiempo.com/mas-contenido/el-llamado-desde-el-sur-del-meta-3385783,-0.015249,0.032793,-0.022702,0.048635,-0.002823,-0.039297,-0.053139,0.034664,0.032758,0.025721,...,0.027468,0.045833,-0.016872,-0.021523,0.025576,0.014999,-0.008812,-0.021739,-0.010349,0.009338
https://www.eltiempo.com/mundo/medio-oriente/asi-localizo-israel-al-lider-de-hezbola-los-detalles-del-operativo-de-inteligencia-para-efectuar-el-bombardeo-aereo-3385781,-0.013496,-0.014557,-0.018691,0.047176,0.018366,-0.011003,-0.093697,0.026669,-0.002839,-0.000249,...,0.032180,-0.000164,-0.033964,-0.035739,0.008483,0.055563,-0.048210,-0.003561,0.001201,-0.018963
https://www.eltiempo.com/mas-contenido/del-crudo-al-turismo-3385901,-0.024780,0.041507,0.002410,0.001920,-0.007972,0.006983,-0.026574,0.025248,0.020038,0.037295,...,0.042184,0.043891,-0.024159,-0.011424,0.054588,-0.006380,0.002650,-0.021769,-0.050148,0.003720
https://www.eltiempo.com/mas-contenido/el-imponente-tesoro-verde-3385938,-0.013511,0.009672,-0.010728,0.041266,0.003864,0.014097,-0.032700,0.034680,0.065962,0.033704,...,0.052577,0.008840,0.002960,0.019802,0.011811,0.018858,-0.007259,-0.019385,-0.031452,0.014297


In [6]:
tsne_plot_2d(df_embed, metric="cosine")
umap_plot_2d(df_embed, metric="cosine")

In [7]:
best_model = random_search.fit(df_embed)

Fitting 2 folds for each of 10 candidates, totalling 20 fits


[CV 1/2] END clusterer=GaussianMixture(), clusterer__covariance_type=full, clusterer__n_components=12, dim_reduction=PCA(), dim_reduction__n_components=10, preprocessor=MinMaxScaler(); davies-bouldin: (test=-3.612) silhouette: (test=0.101) total time=   3.5s
[CV 2/2] END clusterer=GaussianMixture(), clusterer__covariance_type=full, clusterer__n_components=12, dim_reduction=PCA(), dim_reduction__n_components=10, preprocessor=MinMaxScaler(); davies-bouldin: (test=-3.754) silhouette: (test=0.067) total time=   4.1s
[CV 2/2] END clusterer=GaussianMixture(), clusterer__covariance_type=diag, clusterer__n_components=6, dim_reduction=PCA(), dim_reduction__n_components=41, preprocessor=MinMaxScaler(); davies-bouldin: (test=-2.578) silhouette: (test=0.015) total time=   3.8s
[CV 1/2] END clusterer=KMeans(), clusterer__n_clusters=7, dim_reduction=PCA(), dim_reduction__n_components=10, preprocessor=MinMaxScaler(); davies-bouldin: (test=-3.536) silhouette: (test=0.096) total time=   2.6s
[CV 1/2] E

In [8]:
best_model.best_estimator_

In [9]:
pd.DataFrame(best_model.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_preprocessor,param_dim_reduction__n_neighbors,param_dim_reduction__n_components,param_dim_reduction__min_dist,param_dim_reduction__metric,param_dim_reduction,...,split0_test_silhouette,split1_test_silhouette,mean_test_silhouette,std_test_silhouette,rank_test_silhouette,split0_test_davies-bouldin,split1_test_davies-bouldin,mean_test_davies-bouldin,std_test_davies-bouldin,rank_test_davies-bouldin
0,51.271346,0.385825,83.826072,1.606028,MinMaxScaler(),15.0,3,0.5,cosine,UMAP(),...,0.083918,0.064168,0.074043,0.009875,5,-3.842623,-4.406712,-4.124667,0.282045,9
1,3.23822,0.020554,1.920007,0.366278,RobustScaler(),,66,,,PCA(),...,0.108217,0.083052,0.095634,0.012583,1,-3.406415,-3.508425,-3.45742,0.051005,3
2,6.458471,0.46638,1.315097,0.194458,RobustScaler(),,76,,,PCA(),...,0.063357,0.04361,0.053483,0.009873,8,-3.680987,-3.84399,-3.762489,0.081502,8
3,2.099224,0.281909,1.69574,0.037647,MinMaxScaler(),,10,,,PCA(),...,0.100876,0.066659,0.083768,0.017109,3,-3.612422,-3.754184,-3.683303,0.070881,7
4,50.825454,0.182033,84.383456,0.54041,StandardScaler(),10.0,3,0.5,cosine,UMAP(),...,0.090422,0.059237,0.074829,0.015593,4,-3.415275,-3.700893,-3.558084,0.142809,6
5,50.813658,0.393914,82.270513,0.918374,MinMaxScaler(),15.0,2,0.3,cosine,UMAP(),...,0.07542,0.06849,0.071955,0.003465,6,-4.075437,-4.213783,-4.14461,0.069173,10
6,2.208401,0.224756,1.821031,0.052197,MinMaxScaler(),,41,,,PCA(),...,0.067496,0.015345,0.04142,0.026075,9,-3.503485,-2.578498,-3.040991,0.462494,1
7,2.459212,0.28047,1.900168,0.350558,MinMaxScaler(),,68,,,PCA(),...,0.05803,-0.031904,0.013063,0.044967,10,-3.612718,-3.139329,-3.376024,0.236694,2
8,1.343566,0.007296,1.300343,0.023179,MinMaxScaler(),,10,,,PCA(),...,0.095963,0.087849,0.091906,0.004057,2,-3.535716,-3.483836,-3.509776,0.02594,5
9,3.110874,0.014511,1.114237,0.07493,MinMaxScaler(),,55,,,PCA(),...,0.07688,0.064583,0.070732,0.006148,7,-3.50199,-3.418062,-3.460026,0.041964,4


In [10]:
labels = best_model.predict(df_embed)
labels

array([0, 6, 3, ..., 1, 1, 2], dtype=int32)

In [11]:
tsne_plot_2d(df_embed, best_model.best_estimator_, silhouette_scorer, "cosine")
umap_plot_2d(df_embed, best_model.best_estimator_, silhouette_scorer, "cosine")

In [12]:
pd.DataFrame(best_model.predict(df_embed) + 1, index=df_embed.index, columns=["cluster"])

Unnamed: 0_level_0,cluster
url_page,Unnamed: 1_level_1
https://www.eltiempo.com/mundo/latinoamerica/evo-morales-amenaza-con-bloquear-bolivia-si-la-fiscalia-lo-captura-3389757,1
https://www.eltiempo.com/bogota/vehiculo-se-incendia-en-medio-de-la-carretera-en-fusagasuga-cundinamarca-bomberos-atienden-la-emergencia-3389756,7
https://www.eltiempo.com/mundo/eeuu-y-canada/esta-es-la-edad-en-la-que-un-adulto-mayor-debe-dejar-de-conducir-harvard-3389502,4
https://www.eltiempo.com/mundo/europa/morire-en-prision-escribio-en-sus-memorias-el-opositor-ruso-alexei-navalni-3389755,3
https://www.eltiempo.com/colombia/otras-ciudades/alcaldia-de-villa-de-leyva-en-boyaca-entrega-balance-tras-el-vendaval-de-ese-sabado-hay-12-personas-heridas-3389752,7
...,...
https://www.eltiempo.com/mas-contenido/el-llamado-desde-el-sur-del-meta-3385783,2
https://www.eltiempo.com/mundo/medio-oriente/asi-localizo-israel-al-lider-de-hezbola-los-detalles-del-operativo-de-inteligencia-para-efectuar-el-bombardeo-aereo-3385781,6
https://www.eltiempo.com/mas-contenido/del-crudo-al-turismo-3385901,2
https://www.eltiempo.com/mas-contenido/el-imponente-tesoro-verde-3385938,2


### ***Caracterizacion de los clusters***

In [13]:
import json
import pandas as pd

with open(url_base + "octubre_news_summary.json", "r") as archivo:
    datos_json = json.load(archivo)


df_predict = pd.DataFrame(best_model.predict(df_embed) +1, index=df_embed.index, columns=["cluster"])
df_summary = pd.DataFrame(data=datos_json.values(), index=datos_json.keys(), columns=["summary"])
df_review = df_predict.merge(df_summary, left_index=True, right_index=True)
df_review

Unnamed: 0,cluster,summary
https://www.eltiempo.com/mundo/latinoamerica/evo-morales-amenaza-con-bloquear-bolivia-si-la-fiscalia-lo-captura-3389757,1,"Evo Morales, expresidente de Bolivia, amenaza ..."
https://www.eltiempo.com/bogota/vehiculo-se-incendia-en-medio-de-la-carretera-en-fusagasuga-cundinamarca-bomberos-atienden-la-emergencia-3389756,7,El Cuerpo de Bomberos de Cundinamarca atendió ...
https://www.eltiempo.com/mundo/eeuu-y-canada/esta-es-la-edad-en-la-que-un-adulto-mayor-debe-dejar-de-conducir-harvard-3389502,4,Los accidentes automovilísticos están en aumen...
https://www.eltiempo.com/mundo/europa/morire-en-prision-escribio-en-sus-memorias-el-opositor-ruso-alexei-navalni-3389755,3,"Alexei Navalni, opositor ruso al presidente Vl..."
https://www.eltiempo.com/colombia/otras-ciudades/alcaldia-de-villa-de-leyva-en-boyaca-entrega-balance-tras-el-vendaval-de-ese-sabado-hay-12-personas-heridas-3389752,7,Un fuerte vendaval sorprendió a los habitantes...
...,...,...
https://www.eltiempo.com/mas-contenido/el-llamado-desde-el-sur-del-meta-3385783,2,El acuerdo de paz con las Farc ha permitido la...
https://www.eltiempo.com/mundo/medio-oriente/asi-localizo-israel-al-lider-de-hezbola-los-detalles-del-operativo-de-inteligencia-para-efectuar-el-bombardeo-aereo-3385781,6,"El viernes, un bombardeo aéreo israelí eliminó..."
https://www.eltiempo.com/mas-contenido/del-crudo-al-turismo-3385901,2,El tema central es la transición energética en...
https://www.eltiempo.com/mas-contenido/el-imponente-tesoro-verde-3385938,2,"El Meta enfrenta la amenaza de deforestación, ..."


In [14]:
def get_text_sample_by_cluster(label_cluster, size_sample):
    """
    """
    df = df_review.query("cluster == @label_cluster").sample(size_sample)
    text_to_llm = df["summary"].to_list()
    text_to_llm = text_to_llm = "\n".join([f"Noticia {i+1}\n{news}\n" for i, news in enumerate(text_to_llm)])
    print(text_to_llm)
    return text_to_llm

print(get_text_sample_by_cluster(2, 30))

Noticia 1
El número de internautas en el mundo ha continuado creciendo desde 2005, con un incremento significativo durante la pandemia, especialmente en Colombia donde se espera alcanzar 44 millones en cinco años. Las redes sociales como Facebook y plataformas locales como Chat Bogota han facilitado la interacción entre ciudadanos de diferentes países. En América Latina, Brasil y México son líderes en valorar el socializar libremente. Moderadores en chats online garantizan una experiencia positiva, revisando y eliminando contenido inapropiado. Salas específicas para ciudades como Bogotá permiten a los usuarios conectarse con personas de su mismo país y ciudad, potenciando la posibilidad de encuentros presenciales.

Noticia 2
El jueves 3 de octubre, la Imprensa Nacional Casa da Moeda de Portugal visitó las instalaciones de la Imprenta Nacional en Colombia tras firmar un memorando de entendimiento para cooperar en la producción y personalización de pasaportes. El acuerdo permitirá a Port

In [15]:
from langchain_ollama import ChatOllama
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate
)

def get_topic_in_cluster(news: str, model_name: str = "qwen2.5:7b") -> str:

    llm_ollama = ChatOllama(
        model=model_name,
        temperature=0.1,
        num_predict=4096,
        num_ctx=16000,
        keep_alive=0
    )

    prompt_summarization = ChatPromptTemplate(
        [
            SystemMessagePromptTemplate.from_template(
                """Eres un asistente especializado en análisis de noticias y clustering semántico. Tu tarea es ayudar a identificar temas, """
                """palabras clave y subtemas dentro de grupos de resúmenes de noticias. Analizas los patrones comunes en los eventos, """
                """lugares, personas y palabras clave para proporcionar una descripción clara y precisa del tema principal de cada grupo """
                """de noticias. Además, eres capaz de detectar subtemas o enfoques recurrentes que puedan enriquecer el análisis."""
            ),
            HumanMessagePromptTemplate.from_template(
                """Los siguientes son resúmenes de noticias que han sido agrupados en un clúster debido a su similitud semántica. """
                """Tu tarea es analizar estos resúmenes y proporcionar una caracterización general del tema principal del clúster. """
                """Identifica patrones comunes en los eventos, lugares, personas y palabras clave mencionadas, y determina un título o """
                """categoría representativa que capture la esencia del clúster."""
                """Además, destaca si hay algún subtema relevante o recurrente que ayude a entender mejor el enfoque de las noticias en """
                """este grupo. Asegúrate de que la descripción sea clara y precisa para que refleje el contenido general de las noticias."""
                """Resúmenes de noticias en el clúster: {news}"""
                """Salida esperada:
                    Tema principal: Una breve descripción del tema general del clúster.
                    Palabras clave: Lista de palabras clave relevantes.
                    Subtemas (si los hay): Cualquier subtema adicional que aparezca en varios resúmenes.
                    Título sugerido: Un título breve que represente el clúster."""
                )
        ]
    )

    llm_chain = prompt_summarization | llm_ollama | StrOutputParser()

    return llm_chain.invoke({"news": news})

In [16]:
label_cluster = 1
print(f"Cluster con label: {label_cluster}\nNoticias de la muestra:\n")
news_summary = get_text_sample_by_cluster(label_cluster, 5)
print(get_topic_in_cluster(news_summary))

Cluster con label: 1
Noticias de la muestra:

Noticia 1
El ministro del Interior, Juan Fernando Cristo, presentó oficialmente el "gran acuerdo nacional" con cinco ejes para promover la reconciliación política en Colombia. Este documento incluye garantizar que no se buscará la reelección y promover el respeto a las reglas democráticas. Entre los puntos clave está erradicar la violencia en la política, respaldar las reglas electorales existentes, fortalecer el acuerdo de paz con las Farc, impulsar un crecimiento económico equitativo y trabajar en reformas sociales. El documento busca establecer espacios de discusión entre partidos, gremios y sociedad civil, aunque la adhesión de estos sectores será crucial para su éxito a largo plazo.

Noticia 2
El alto porcentaje de servidores públicos en provisionalidad en el Gobierno central es un problema crítico, según expertos. Este fenómeno se refleja en entidades como la Registraduría Nacional del Estado Civil y el Ministerio de Defensa, donde má