In [None]:
# Desinstala o pacote python-louvain (e seu nome de importação 'community')
!pip uninstall python-louvain -y
# Tenta desinstalar o módulo 'community' que pode estar causando o conflito
!pip uninstall community -y
# Remove o cache de importação
import sys
if 'community' in sys.modules:
    del sys.modules['community']

In [None]:
# Instala a dependência Louvain com o gerenciador de sistema
!apt-get install liblapack-dev
!pip install python-louvain

In [None]:
!pip install wikipedia requests beautifulsoup4 networkx nxviz python-louvain pandas matplotlib seaborn tqdm lxml


In [5]:
# 1. Importações e configurações
import time
import wikipedia
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from collections import deque
from operator import itemgetter
# import community as community_louvain


In [None]:
# Final project

# General parameters
# Seed page titles for the crawl. crawl_seed() uses the (title-cased) seed
# string itself as the root node label, so a misspelt seed yields a misnamed node.
SEEDS = [
    "Python (programming language)",
    "Football",
    "Global warming",
    "Energy crisis",
    "Eclipse"
]
MAX_DEPTH = 2            # crawl height < 3
MAX_LINKS_PER_PAGE = 100  # heuristic to avoid an explosion of nodes
REQUEST_SLEEP = 0.4       # seconds passed to time.sleep() after each fetched page
# Title-cased link names that are never added to the graph.
STOPS = (
    "International Standard Serial Number",
    "International Standard Book Number",
    "National Diet Library",
    "International Standard Name Identifier",
    "International Standard Book Number (Identifier)",
    "Pubmed Identifier",
    "Pubmed Central",
    "Digital Object Identifier",
    "Arxiv",
    "Proc Natl Acad Sci Usa",
    "Bibcode",
    "Library Of Congress Control Number",
    "Jstor",
    "Doi (Identifier)",
    "Isbn (Identifier)",
    "Pmid (Identifier)",
    "Arxiv (Identifier)",
    "Bibcode (Identifier)"
)

# %%
# 2. Funções de coleta

def crawl_seed(seed, max_depth=MAX_DEPTH, max_links_per_page=MAX_LINKS_PER_PAGE, sleep=REQUEST_SLEEP):
    """Breadth-first crawl of Wikipedia links starting at ``seed``.

    Pages on layers ``0 .. max_depth - 1`` are fetched (the seed is layer 0).
    Every accepted link of a fetched page becomes an edge, so linked pages of
    the last fetched layer appear as nodes without being fetched themselves.

    Args:
        seed: Title of the starting page; it is normalised with ``str.title()``.
        max_depth: Number of layers whose pages are fetched and expanded.
        max_links_per_page: Stop recording links of a page once this many
            accepted links have been added.
        sleep: Seconds to wait after each successfully processed page.

    Returns:
        A ``nx.DiGraph`` with one edge ``page -> linked_page`` per accepted link.
    """
    start = seed.title()
    todo_lst = deque([(0, start)])  # FIFO queue of (layer, title); O(1) pops
    todo_set = {start}              # every title that was ever enqueued
    done_set = set()                # titles already attempted
    g = nx.DiGraph()

    while todo_lst:
        layer, page = todo_lst.popleft()
        if layer > max_depth - 1:
            break
        if page in done_set:
            continue
        # Mark as attempted up front: a failed page must not be retried either.
        done_set.add(page)

        try:
            # `.links` is fetched inside the try as well, so a failure while
            # retrieving the links skips this page instead of aborting the crawl.
            links = wikipedia.page(page).links
        except Exception as e:
            # Best effort: report only the exception type (some messages are very long) and move on.
            print(f"Falha ao carregar '{page}' ({type(e).__name__})")
            continue

        count = 0
        for link in links:
            link = link.title()
            # Skip stop pages, list pages and namespaced titles (those containing ':').
            if (link not in STOPS) and (not link.startswith("List Of")) and (":" not in link):
                g.add_edge(page, link)
                # Only enqueue pages that are new and still within the fetch depth.
                if (link not in todo_set) and (link not in done_set) and (layer + 1 <= max_depth - 1):
                    todo_lst.append((layer + 1, link))
                    todo_set.add(link)
                count += 1
                if count >= max_links_per_page:
                    break
        time.sleep(sleep)

    return g

# %%
# 3. Crawl every seed, then merge the per-seed graphs into a single DiGraph
start_time_coleta = time.time()

graphs = []
for seed in SEEDS:
    print(f"Coletando seed: {seed}")
    g = crawl_seed(seed)
    graphs.append(g)
    print(f"Seed {seed}: nós={len(g.nodes())}, arestas={g.number_of_edges()}")

# Union of all crawls: shared pages collapse into one node because nodes are keyed by title.
G = nx.DiGraph()
for g in graphs:
    G.add_nodes_from(g)
    G.add_edges_from(g.edges)

# The measured time covers both the crawl and the merge.
end_time_coleta = time.time()
coleta_duration = end_time_coleta - start_time_coleta

print(f"Grafo mesclado: nós={len(G.nodes())}, arestas={G.number_of_edges()}")


# 4. Cleaning
# Materialise the self-loop list first so the graph is not mutated while it is being scanned.
G.remove_edges_from(list(nx.selfloop_edges(G)))

# Find and contract simple plural duplicates ('network' and 'networks').
duplicates = [(node, node + "s") for node in list(G) if node + "s" in G]
for keep, drop in duplicates:
    # An earlier contraction may already have removed one side of this pair
    # (e.g. 'A', 'As', 'Ass'); skip it instead of raising on a missing node.
    if keep in G and drop in G:
        # copy=False merges in place, avoiding a full graph copy per pair.
        G = nx.contracted_nodes(G, keep, drop, self_loops=False, copy=False)

# Find and contract duplicates that differ only by hyphen vs. space.
duplicates = [(x, y) for x, y in [(node, node.replace("-", " ")) for node in list(G)] if x != y and y in G]
for keep, drop in duplicates:
    # Two hyphenated variants can map to the same spaced title; the second
    # pair then refers to a node that is already gone.
    if keep in G and drop in G:
        G = nx.contracted_nodes(G, keep, drop, self_loops=False, copy=False)

# nx.contracted_nodes stores a 'contraction' attribute on the surviving node,
# which GraphML export does not support; drop it from every node that has it.
for node in G.nodes():
    G.nodes[node].pop('contraction', None)

print(f"Após limpeza: nós={len(G.nodes())}, arestas={G.number_of_edges()}")

# %%
start_time_metricas = time.time()
# 6. Metric computation
# All metrics are computed on an undirected view of the cleaned graph:
# node attributes are carried over, edge directions and edge data are dropped.
G_und = nx.Graph()
G_und.add_nodes_from(G.nodes(data=True))
G_und.add_edges_from(G.edges())

print("Calculando métricas... isso pode demorar dependendo do tamanho do grafo")

# degree (int)
deg_dict = dict(G_und.degree())

# centralities
deg_c = nx.degree_centrality(G_und)
clos = nx.closeness_centrality(G_und)
betw = nx.betweenness_centrality(G_und, normalized=True)
try:
    eig = nx.eigenvector_centrality(G_und, max_iter=200)
except Exception as e:
    # Best effort: fall back to zeros so the remaining metrics can still be exported.
    print("Eigenvector não convergiu:", e)
    eig = {n: 0.0 for n in G_und.nodes()}

# core number
core_n = nx.core_number(G_und)

# communities (Louvain, from the python-louvain package imported as `community`)
import community
# Louvain is randomised; fixing the seed makes the community labels
# reproducible across runs, in line with the seeded spring layout used for plotting.
LOUVAIN_SEED = 42
partition = community.best_partition(G_und, random_state=LOUVAIN_SEED)

# Attach every metric as a node attribute (insertion order = column order below).
node_metrics = {
    "degree": deg_dict,
    "degree_centrality": deg_c,
    "closeness": clos,
    "betweenness": betw,
    "eigenvector": eig,
    "core_number": core_n,
    "community": partition,
}
for attr_name, metric_values in node_metrics.items():
    nx.set_node_attributes(G_und, metric_values, attr_name)

# Export the metrics: one row per node, read back from the node attributes.
rows = []
for n in G_und.nodes():
    attrs = G_und.nodes[n]
    rows.append({
        "node": n,
        "degree": attrs.get("degree", 0),
        "degree_centrality": attrs.get("degree_centrality", 0),
        "closeness": attrs.get("closeness", 0),
        "betweenness": attrs.get("betweenness", 0),
        "eigenvector": attrs.get("eigenvector", 0),
        "core_number": attrs.get("core_number", 0),
        "community": attrs.get("community", -1)
    })

df_metrics = pd.DataFrame(rows)
df_metrics.to_csv("metrics_wiki_tech.csv", index=False)

end_time_metricas = time.time()
metricas_duration = end_time_metricas - start_time_metricas
print("CSV de métricas salvo: metrics_wiki_tech.csv")

# %%
# 7. Export for Gephi (GEXF) — node attributes are included
# G_und is the undirected graph that received the metric attributes via
# nx.set_node_attributes; the file is written to the current working directory.
nx.write_gexf(G_und, "wiki_tech_merged.gexf")
print("GEXF salvo: wiki_tech_merged.gexf")

# %%
# 8. Basic plots
import matplotlib as mpl

# Minimum degree a node needs in order to get a text label in the plots.
DEGREE_THRESHOLD = 40

# One seeded spring layout, computed once and reused by every plot.
pos = nx.spring_layout(G_und, k=0.15, seed=42)

def plot_metric(metric, fname=None):
    """Draw G_und with nodes coloured by ``metric`` and sized by degree.

    Only nodes whose "degree" attribute is at least DEGREE_THRESHOLD are
    labelled. A colour bar spanning the min/max of the metric is added.

    Args:
        metric: Name of the node attribute used for the node colours
            (nodes without it count as 0).
        fname: Optional output path; when given, the figure is also saved
            as a 300 dpi image with a transparent background.
    """
    fig, ax = plt.subplots(1, 1, figsize=(12, 10))  # figure size

    node_attrs = G_und.nodes
    vals = [node_attrs[n].get(metric, 0) for n in G_und.nodes()]
    # Node area grows with degree (x15), with a floor of 10 so isolated nodes stay visible.
    sizes = [max(10, node_attrs[n].get("degree", 1) * 15) for n in G_und.nodes()]

    # Labels are restricted to high-degree nodes.
    labels_to_show = {}
    for node in G_und.nodes():
        if node_attrs[node].get("degree", 0) >= DEGREE_THRESHOLD:
            labels_to_show[node] = node

    # Edges first, then nodes, then labels, so labels end up on top.
    nx.draw_networkx_edges(G_und, pos=pos, alpha=0.2, ax=ax)
    nx.draw_networkx_nodes(
        G_und,
        pos=pos,
        node_color=vals,
        cmap=plt.cm.jet,
        node_size=sizes,
        ax=ax,
    )
    nx.draw_networkx_labels(
        G_und,
        pos=pos,
        labels=labels_to_show,
        font_size=8,
        font_color='black',
        ax=ax,
    )

    # Colour bar built from a standalone mappable with the same colormap and value range.
    value_range = mpl.colors.Normalize(vmin=min(vals), vmax=max(vals))
    sm = mpl.cm.ScalarMappable(cmap=plt.cm.jet, norm=value_range)
    sm.set_array([])
    cbar = fig.colorbar(sm, ax=ax)
    cbar.set_label(metric, rotation=270, labelpad=15)

    plt.axis('off')
    if fname:
        plt.savefig(fname, dpi=300, bbox_inches='tight', transparent=True)
    plt.show()

# Render and save one figure per metric of interest.
for metric_name, output_png in (
    ('degree', 'degree_wiki.png'),
    ('betweenness', 'betweenness_wiki.png'),
):
    plot_metric(metric_name, fname=output_png)


# End of the notebook
print('Notebook pronto. Revise parâmetros (MAX_LINKS_PER_PAGE, MAX_DEPTH, SEEDS) antes de rodar para ajustar tamanho do grafo.')


In [10]:
# Summary: graph size plus the wall-clock time of the collection and metric stages.
# NOTE(review): the cleaning step rebinds G, so if this cell runs after it the counts
# are those of the cleaned graph even though the label says "mesclado" (merged) — confirm intent.
print(f"Grafo mesclado: nós={len(G.nodes())}, arestas={G.number_of_edges()}")
print(f"Tempo total de coleta: {coleta_duration:.2f} segundos")
print(f"Tempo total de calculo de metricas: {metricas_duration:.2f} segundos")

Grafo mesclado: nós=18184, arestas=41477
Tempo total de coleta: 327.39 segundos
Tempo total de calculo de metricas: 2371.49 segundos
