<a href="https://colab.research.google.com/github/juanfisicobr/Red-coocurrencia-terminos/blob/main/ClusterTem%C3%A1ticos.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pandas nltk python-louvain matplotlib networkx



In [2]:
import pandas as pd
import networkx as nx
import re
from itertools import combinations
import community as community_louvain
import matplotlib.pyplot as plt


def preprocess_text(text, custom_stopwords=None):
    mapa_normalizacao = {
        'investigaciones': 'investigacion',
        'docentes': 'docente',
        'estudiantes': 'estudiante',
        'sociales': 'social',
        'educacionales': 'educacional',
        'nacionales': 'nacional',
        'regionales': 'regional',
        'locales': 'local',
        'institucionales': 'institucional',
        'profesionales': 'profesional',
        'populares': 'popular',
        'municipales': 'municipal',
        'políticas': 'política',
        'acciones': 'accion',
        'redes': 'red',
        'universidades': 'universidad'
    }
    stop_words = set(['de', 'a', 'o', 'que', 'y', 'e', 'el', 'la', 'en', 'un', 'una', 'para',
        'con', 'no', 'los', 'las', 'por', 'mas', 'más', 'se', 'su', 'sus',
        'como', 'pero', 'al', 'del', 'le', 'lo', 'me', 'mi', 'sin', 'son',
        'tambien', 'también', 'este', 'esta', 'estos', 'estas', 'ser', 'es'])
    if custom_stopwords:
        stop_words.update(custom_stopwords)
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    tokens = text.split()
    normalized_tokens = [mapa_normalizacao.get(token, token) for token in tokens]
    filtered_tokens = [word for word in normalized_tokens if word not in stop_words and len(word) > 2]
    return filtered_tokens

def create_cooccurrence_matrix_from_file(filepath):
    with open(filepath, 'r', encoding='utf-8') as file:
        content = file.read()
    documents_raw = content.split('###')
    documents_raw = [doc.strip() for doc in documents_raw if doc.strip()]
    custom_stopwords = ['programa', 'educacion', 'pdi', 'art', 'articulo']
    processed_docs = [preprocess_text(doc, custom_stopwords) for doc in documents_raw]
    vocabulary = sorted(list(set(term for doc in processed_docs for term in doc)))
    M = pd.DataFrame(0, index=vocabulary, columns=vocabulary)
    for doc in processed_docs:
        unique_terms_in_doc = sorted(list(set(doc)))
        for term in unique_terms_in_doc:
            M.loc[term, term] += 1
        for term1, term2 in combinations(unique_terms_in_doc, 2):
            M.loc[term1, term2] += 1
            M.loc[term2, term1] += 1
    return M

def calcular_e_associar_metricas(G, M):
    partition = community_louvain.best_partition(G, weight='weight')
    pagerank = nx.pagerank(G, weight='weight')
    occurrences = {term: M.loc[term, term] for term in G.nodes()}
    clusters_ajustados = {node: cluster_id + 1 for node, cluster_id in partition.items()}
    nx.set_node_attributes(G, clusters_ajustados, 'cluster')
    nx.set_node_attributes(G, pagerank, 'pagerank')
    nx.set_node_attributes(G, occurrences, 'occurrences')
    print("Métricas (Cluster, PageRank, Ocorrencias) calculadas y asociadas a los nodos.")
    return G

def filtrar_rede(G, top_n, min_edge_weight_for_viz):
    if G.number_of_nodes() <= top_n:
        top_nodes = list(G.nodes())
    else:
        pagerank_dict = nx.get_node_attributes(G, 'pagerank')
        sorted_nodes = sorted(pagerank_dict, key=pagerank_dict.get, reverse=True)
        top_nodes = sorted_nodes[:top_n]
    G_sub = G.subgraph(top_nodes).copy()
    G_final = nx.Graph()
    G_final.add_nodes_from(G_sub.nodes(data=True))
    for u, v, data in G_sub.edges(data=True):
        if data['weight'] >= min_edge_weight_for_viz:
            G_final.add_edge(u, v, weight=data['weight'])
    G_final.remove_nodes_from(list(nx.isolates(G_final)))
    print(f"Red final (Top {top_n} nodos, Bordes >= {min_edge_weight_for_viz}): {G_final.number_of_nodes()} nodos, {G_final.number_of_edges()} bordes.")
    return G_final

## ----------------------------------------------------------------
## FUNCIÓN: Visualización de la Red
## ----------------------------------------------------------------
def visualizar_rede(G, title, output_filename):
    if G.number_of_nodes() == 0:
        print("La red está vacía. No es posible generar el gráfico.")
        return

    plt.figure(figsize=(16, 9))
    pos = nx.circular_layout(G)

    # Colores monocromáticos basados ​​en PageRank
    pagerank_values = [data.get('pagerank', 0) for _, data in G.nodes(data=True)]
    if pagerank_values:
        min_pr = min(pagerank_values)
        max_pr = max(pagerank_values)
        if max_pr == min_pr:
            norm_pr_values = [0.5] * len(pagerank_values)
        else:
            norm_pr_values = [(p - min_pr) / (max_pr - min_pr) for p in pagerank_values]
    else:
        norm_pr_values = []

    cmap = plt.cm.get_cmap('Blues_r')
    node_colors = [cmap(p) for p in norm_pr_values]

    # Tamaño del nodo
    min_size = 1500
    max_size = 16000
    if pagerank_values and min_pr == max_pr:
        node_sizes = [min_size] * G.number_of_nodes()
    elif pagerank_values:
        node_sizes = [min_size + ((p - min_pr) / (max_pr - min_pr)) * (max_size - min_size) for p in pagerank_values]
    else:
        node_sizes = []

    custom_labels = {
        node: f"{node.capitalize()}\n({data.get('occurrences', '?')})"
        for node, data in G.nodes(data=True)
    }


    nx.draw_networkx_nodes(
        G,
        pos,
        node_color=node_colors,
        node_size=node_sizes,
        alpha=0.7,
        edgecolors='black',
        linewidths=1.5
    )

    nx.draw_networkx_edges(G, pos, alpha=0.3, width=1.5, edge_color='grey')
    nx.draw_networkx_labels(G, pos, labels=custom_labels, font_size=12, font_color='black', font_weight='bold')

    plt.title(title, size=20)
    plt.tight_layout()
    plt.savefig(output_filename, dpi=300, bbox_inches='tight')
    plt.close()

    print(f"\nGráfico '{output_filename}' guardado con éxito!")

## ----------------------------------------------------------------
## BLOQUE DE EJECUCIÓN PRINCIPAL
## ----------------------------------------------------------------
if __name__ == "__main__":

    # --- Parámetros de Entrada ---
    FILE_PATH = 'extractos.txt'
    TOP_N_NODES = 20
    MIN_EDGE_WEIGHT_VIZ = 4 # Ajuste para controlar la densidad de la línea.

    GRAFICO_TITULO = "Análisis de Red Monocromática"
    GRAFICO_OUTPUT_FILE = "analisis_red_monocromatica.png"

    # --- Ejecución del flujo de trabajo ---
    matriz_M = create_cooccurrence_matrix_from_file(FILE_PATH)
    grafo_base = nx.from_pandas_adjacency(matriz_M)
    grafo_com_metricas = calcular_e_associar_metricas(grafo_base, matriz_M)
    grafo_final = filtrar_rede(grafo_com_metricas, top_n=TOP_N_NODES, min_edge_weight_for_viz=MIN_EDGE_WEIGHT_VIZ)
    visualizar_rede(grafo_final, GRAFICO_TITULO, GRAFICO_OUTPUT_FILE)

Métricas (Cluster, PageRank, Ocorrencias) calculadas y asociadas a los nodos.
Red final (Top 20 nodos, Bordes >= 4): 20 nodos, 74 bordes.


  cmap = plt.cm.get_cmap('Blues_r')



Gráfico 'analisis_red_monocromatica.png' guardado con éxito!
