In [1]:
# Make the "ppgcc-coautorias" directory the current working directory
# (the call reports this in its returned message) before the rest of the notebook runs.
from utils import setrootdir
setrootdir("ppgcc-coautorias")

'Directory ppgcc-coautorias successfully loaded as current working directory.'

In [2]:
import os
import pickle
from pathlib import Path
from itertools import combinations, chain
from collections import Counter

from collections import Counter
from itertools import combinations
from pathlib import Path

from dotenv import load_dotenv

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.patheffects as pe

import seaborn as sns
import plotly.graph_objects as go

import holoviews as hv
from bokeh.io.export import export_png, export_svgs

import networkx as nx
from networkx.algorithms.community import greedy_modularity_communities

from src.utils import get_colors, combine_rgb

sns.set_style("whitegrid")


from src.visualization import Visualizer

In [3]:
# Load variables from a local .env file (if present) before reading them.
load_dotenv()

# Inputs handed to the Visualizer in the next cell.
DATASET_DIRECTORY = os.getenv("DATASET_DIRECTORY")
METADATA_FILE = os.getenv("METADATA_FILE")

# Output folder used by every savefig / to_csv / write_image call in this notebook.
# It was referenced throughout but never defined, so "Restart & Run All" failed with NameError.
# NOTE(review): the default location below is an assumption — set VISUALIZATION_DIRECTORY
# in .env (or edit this line) so it points at the intended folder.
visualization_directory = Path(
    os.getenv("VISUALIZATION_DIRECTORY")
    or Path(DATASET_DIRECTORY or ".", "03-visualization")
)
# The per-professor Sankey images are written into a "sankey" sub-folder; creating it
# with parents=True also creates the output folder itself.
Path(visualization_directory, "sankey").mkdir(parents=True, exist_ok=True)

# 3. Visualization

In [4]:
# Build the project Visualizer from the two environment-provided settings.
visualizer = Visualizer(data_dir=DATASET_DIRECTORY, metadata_file=METADATA_FILE)

In [5]:
# Read the "productions" table written by the "02-preprocessing" step.
df_productions = visualizer.read_parquet(step="02-preprocessing", name="productions")
# NOTE(review): the rendered preview shows a single name in `authors`, while later cells
# iterate over `authors` as a collection of names — confirm the column holds lists.
df_productions

Unnamed: 0,production_id,name,citation,lattes_id,institution,production,authors,location,type,year,issn
0,0,Jose Maria Nazar David,"[Jose Maria Nazar David, JOSE MARIA NAZAR DAVI...",3640497501056163,UFJF,Designing collaboration through a web-based gr...,Jose Maria Nazar David,International Journal of Computer Applications...,PERIODICO,2004,09528091
1,0,Jose Maria Nazar David,"[Jose Maria Nazar David, JOSE MARIA NAZAR DAVI...",3640497501056163,UFJF,Designing collaboration through a web-based gr...,Marcos Roberto da Silva Borges,International Journal of Computer Applications...,PERIODICO,2004,09528091
2,1,Jose Maria Nazar David,"[Jose Maria Nazar David, JOSE MARIA NAZAR DAVI...",3640497501056163,UFJF,Supporting Awareness in Groupware through an A...,Rita Suzana Pitangueira Maciel,JOURNAL OF UNIVERSAL COMPUTER SCIENCE,PERIODICO,2009,0948695X
3,1,Jose Maria Nazar David,"[Jose Maria Nazar David, JOSE MARIA NAZAR DAVI...",3640497501056163,UFJF,Supporting Awareness in Groupware through an A...,Jose Maria Nazar David,JOURNAL OF UNIVERSAL COMPUTER SCIENCE,PERIODICO,2009,0948695X
4,1,Jose Maria Nazar David,"[Jose Maria Nazar David, JOSE MARIA NAZAR DAVI...",3640497501056163,UFJF,Supporting Awareness in Groupware through an A...,Michel Oei,JOURNAL OF UNIVERSAL COMPUTER SCIENCE,PERIODICO,2009,0948695X
...,...,...,...,...,...,...,...,...,...,...,...
936713,7069,Leandro Santiago de Araujo,"[Leandro Santiago de Araujo, LEANDRO S. ARAÚJO...",6358983442870515,UFF,Gerando base de grafos não isomorfos com seus ...,Luís Felipe Ignácio Cunha,SBPO 2023: Simpósio Brasileiro de Pesquisa Ope...,CONFERENCIA,2023,
936714,7070,Leandro Santiago de Araujo,"[Leandro Santiago de Araujo, LEANDRO S. ARAÚJO...",6358983442870515,UFF,Paralelismo e Heurísticas para o problema da m...,Thiago Lopes Nascimento,SBPO 2023: Simpósio Brasileiro de Pesquisa Ope...,CONFERENCIA,2023,
936715,7070,Leandro Santiago de Araujo,"[Leandro Santiago de Araujo, LEANDRO S. ARAÚJO...",6358983442870515,UFF,Paralelismo e Heurísticas para o problema da m...,Fábio Protti,SBPO 2023: Simpósio Brasileiro de Pesquisa Ope...,CONFERENCIA,2023,
936716,7070,Leandro Santiago de Araujo,"[Leandro Santiago de Araujo, LEANDRO S. ARAÚJO...",6358983442870515,UFF,Paralelismo e Heurísticas para o problema da m...,Luís Felipe Ignácio Cunha,SBPO 2023: Simpósio Brasileiro de Pesquisa Ope...,CONFERENCIA,2023,


## 3.1. PPGCC data between 2014-2023

## 3.2. Yearly publications

### 3.2.1. Yearly publications by type

In [None]:
# Rows per (year, type) for 2014-2023, restricted to conference and journal entries,
# reshaped so that each publication type becomes a column indexed by year.
in_period = df_productions["year"].between(2014, 2023)
is_paper = df_productions["type"].isin(["CONFERENCIA", "PERIODICO"])

df_yearly_all = (
    df_productions.loc[in_period & is_paper, ["year", "type"]]
    .groupby(["year", "type"])
    .size()
    .reset_index(name='count')
    .pivot(index="year", columns="type", values="count")
)
df_yearly_all

In [None]:
# Line chart of yearly publication counts, one line per publication type.
fig_yearly_pub_all, ax_yearly_pub_all = plt.subplots(figsize=(8, 5))
df_yearly_all.plot(kind="line", ax=ax_yearly_pub_all, color=["lightcoral", "skyblue"], marker='o', linewidth=2, markersize=8)

ax_yearly_pub_all.set_xlabel("Ano", fontsize=12)
ax_yearly_pub_all.set_ylabel("Total de publicações", fontsize=12)
ax_yearly_pub_all.legend(title="Tipo de publicação", labels=["Conferência", "Periódico"], fontsize=12, title_fontsize=12)

# Annotate every data point with its value, slightly above the marker.
for line in ax_yearly_pub_all.get_lines():
    for x, y in zip(line.get_xdata(), line.get_ydata()):
        ax_yearly_pub_all.text(x, y+50, f"{y}", fontsize=12, ha='center', va='bottom')

ax_yearly_pub_all.yaxis.grid(linestyle='--', which='major', color='grey', alpha=.25)
ax_yearly_pub_all.xaxis.grid(linestyle='--', which='major', color='grey', alpha=.25)

# One tick per year; labels are safe to fix here because the tick positions are fixed first.
ax_yearly_pub_all.set_xticks(df_yearly_all.index)
ax_yearly_pub_all.set_xticklabels(df_yearly_all.index, rotation=0, fontsize=12)
# Only change the font size on the y axis. Freezing the labels with
# set_yticklabels(get_yticks()) left the automatic locator in place, so labels could
# drift away from the ticks (and astype(int) truncated fractional tick values).
ax_yearly_pub_all.tick_params(axis="y", labelsize=12)

plt.tight_layout()

# Save the figure in the three formats used across the notebook.
for suffix, save_options in ((".png", {"dpi": 300}), (".svg", {"format": "svg"}), (".pdf", {"format": "pdf"})):
    plt.savefig(Path(visualization_directory, f"yearly_publications_all{suffix}"), bbox_inches='tight', **save_options)

plt.show()

## 3.3. Yearly coauthorship

In [None]:
# Total number of coauthors (authors other than the row's professor) per year and
# publication type, for 2014-2023.
# Filtering happens first and on an explicit copy: the row-wise work then only touches
# the rows that are kept, and adding a column no longer triggers SettingWithCopyWarning.
df_yearly_coauthorship_all = df_productions.loc[
    df_productions["year"].between(2014, 2023)
    & df_productions["type"].isin(["CONFERENCIA", "PERIODICO"]),
    ["name", "year", "authors", "type"],
].copy()

# Count the entries of `authors` that differ from the professor's own name.
df_yearly_coauthorship_all["n_coauthors"] = [
    sum(1 for author in authors if author != name)
    for name, authors in zip(df_yearly_coauthorship_all["name"], df_yearly_coauthorship_all["authors"])
]

df_yearly_coauthorship_all = (
    df_yearly_coauthorship_all[["year", "type", "n_coauthors"]]
    .groupby(["year", "type"])["n_coauthors"]
    .sum()
    .reset_index()
    .pivot(index="year", columns="type", values="n_coauthors")
)
df_yearly_coauthorship_all

In [None]:
# Line chart of the yearly coauthor totals, one line per publication type.
fig_yearly_coauth_all, ax_yearly_coauth_all = plt.subplots(figsize=(8, 5))

df_yearly_coauthorship_all.plot(
    kind="line",
    ax=ax_yearly_coauth_all,
    color=["lightcoral", "skyblue"], marker='o', linewidth=2, markersize=8
)

ax_yearly_coauth_all.set_xlabel("Ano", fontsize=12)
ax_yearly_coauth_all.set_ylabel("Total de coautores", fontsize=12)
ax_yearly_coauth_all.legend(title="Tipo de publicação", labels=["Conferência", "Periódico"], fontsize=12, title_fontsize=12)

# Value labels: above the conference line, below the journal line, so they do not collide.
for line in ax_yearly_coauth_all.get_lines():
    label = line.get_label()
    for x, y in zip(line.get_xdata(), line.get_ydata()):
        if label == "CONFERENCIA":
            ax_yearly_coauth_all.text(x, y+500, f"{y}", fontsize=12, ha='center', va='bottom')
        elif label == "PERIODICO":
            ax_yearly_coauth_all.text(x, y-1200, f"{y}", fontsize=12, ha='center', va='bottom')

ax_yearly_coauth_all.yaxis.grid(linestyle='--', which='major', color='grey', alpha=.25)
ax_yearly_coauth_all.xaxis.grid(linestyle='--', which='major', color='grey', alpha=.25)

# Same tick handling as the yearly-publications chart: fix one tick per year before
# labelling, instead of re-using whatever get_xticklabels() returns before drawing.
ax_yearly_coauth_all.set_xticks(df_yearly_coauthorship_all.index)
ax_yearly_coauth_all.set_xticklabels(df_yearly_coauthorship_all.index, rotation=0, fontsize=12)
# Only restyle the y labels; freezing them from get_yticks() could mislabel the axis.
ax_yearly_coauth_all.tick_params(axis="y", labelsize=12)

plt.tight_layout()
for suffix, save_options in ((".png", {"dpi": 300}), (".svg", {"format": "svg"}), (".pdf", {"format": "pdf"})):
    plt.savefig(Path(visualization_directory, f"yearly_coauthorship_all{suffix}"), bbox_inches='tight', **save_options)
plt.show()

## 3.4. Coauthorships by professor

In [None]:
# Number of distinct coauthors per professor.
# Work on an explicit copy so adding a column does not raise SettingWithCopyWarning.
df_coauth_professors = df_productions[["name", "authors"]].copy()
df_coauth_professors["coauthors"] = pd.Series(
    [
        [author for author in authors if author != name]
        for name, authors in zip(df_coauth_professors["name"], df_coauth_professors["authors"])
    ],
    index=df_coauth_professors.index,
    dtype=object,
)
df_coauth_professors = (
    df_coauth_professors[["name", "coauthors"]]
    .explode("coauthors")
    # explode() turns an empty coauthor list into a NaN row; without this step every
    # professor with a solo-authored entry was credited with one extra "coauthor".
    .dropna(subset=["coauthors"])
    .drop_duplicates(subset=["name", "coauthors"])
    .groupby(by="name")
    .size()
    .reset_index(name="n_coauthorships")
    .sort_values(by="n_coauthorships", ascending=False)
)
df_coauth_professors

In [None]:
# Horizontal bar chart: distinct coauthors per professor, in the table's order.
fig_coauth_prof, ax_coauth_prof = plt.subplots(figsize=(8, 5))

sns.barplot(
    data=df_coauth_professors,
    x="n_coauthorships",
    y="name",
    ax=ax_coauth_prof,
    palette="viridis",
)

# Write each bar's value just past its end.
coauthorship_counts = df_coauth_professors["n_coauthorships"].tolist()
for bar_position in range(len(coauthorship_counts)):
    bar_value = coauthorship_counts[bar_position]
    ax_coauth_prof.text(bar_value + 1, bar_position, str(bar_value), color='black', va='center')

ax_coauth_prof.set(xlabel="Número de coautorias", ylabel="Professor")

plt.tight_layout()

for suffix, save_options in ((".png", {"dpi": 300}), (".svg", {"format": "svg"}), (".pdf", {"format": "pdf"})):
    plt.savefig(Path(visualization_directory, f"coauthorships_per_professor{suffix}"), bbox_inches='tight', **save_options)

plt.show()

## 3.5. Publications by professor

In [None]:
# Conference / journal row counts per professor, plus their total, sorted descending.
df_pub_by_prof = (
    df_productions.loc[df_productions["type"].isin(["CONFERENCIA", "PERIODICO"]), ["name", "type"]]
    .groupby(["name", "type"])
    .size()
    .reset_index(name="count")
    .pivot(index="name", columns="type", values="count")
    # A professor missing one of the two types gets NaN from the pivot; fill it and cast
    # back to int so counts are not exported or labelled as floats such as "12.0".
    .fillna(0)
    .astype(int)
)
df_pub_by_prof["total"] = df_pub_by_prof.sum(axis=1)
df_pub_by_prof = df_pub_by_prof.sort_values(by=["total", "PERIODICO", "CONFERENCIA"], ascending=False).reset_index()
df_pub_by_prof

In [None]:
# Export the per-professor publication table (without the index column) to the output folder.
df_pub_by_prof.to_csv(Path(visualization_directory, "publications_per_professor.csv"), index=False)

In [None]:
# Horizontal bar chart: total publications per professor.
fig_pub_prof, ax_pub_prof = plt.subplots(figsize=(8, 5))
sns.barplot(data=df_pub_by_prof, x="total", y="name", ax=ax_pub_prof, palette="viridis")

# Label each bar with its total. The totals can be floats (the pivot is NaN-filled), so
# format them as integers, as the stacked chart in the next cell already does.
for index, value in enumerate(df_pub_by_prof["total"]):
    ax_pub_prof.text(value + 1, index, str(int(value)), color='black', va='center')

ax_pub_prof.set_xlabel("Número de publicações")
ax_pub_prof.set_ylabel("Professor")

plt.tight_layout()

for suffix, save_options in ((".png", {"dpi": 300}), (".svg", {"format": "svg"}), (".pdf", {"format": "pdf"})):
    plt.savefig(Path(visualization_directory, f"publications_per_professor{suffix}"), bbox_inches='tight', **save_options)

plt.show()

In [None]:
# Stacked horizontal bars: conference + journal counts per professor, smallest total at
# the bottom.
# NOTE(review): this cell saves under the same "publications_per_professor.*" names as
# the plain bar chart of the previous cell, so those files are overwritten.
df_pub_by_prof = df_pub_by_prof.sort_values("total", ascending=True)

labels = df_pub_by_prof["name"].tolist()
conf = df_pub_by_prof["CONFERENCIA"].to_numpy()
peri = df_pub_by_prof["PERIODICO"].to_numpy()
totals = (conf + peri)
y = range(len(df_pub_by_prof))

fig_pub_prof, ax_pub_prof = plt.subplots(figsize=(8, 5))

# Journals are stacked to the right of the conference segment.
bars_conf = ax_pub_prof.barh(y, conf, color="lightcoral", label="Conferência")
bars_peri = ax_pub_prof.barh(y, peri, left=conf, color="skyblue", label="Periódico")

ax_pub_prof.set_yticks(y)
ax_pub_prof.set_yticklabels(labels)

ax_pub_prof.set(xlabel="Número de publicações", ylabel="Professor")
ax_pub_prof.legend(title="Tipo de publicação")

ax_pub_prof.xaxis.grid(linestyle='--', which='major', color='grey', alpha=.25)
ax_pub_prof.yaxis.grid(False)

# Segment values are centred inside their segment; the total goes right of the bar.
inside_segment = dict(fontsize=7, ha='center', va='center')
for i in y:
    c, p, t = conf[i], peri[i], totals[i]
    if c > 0:
        ax_pub_prof.text(c / 2, i, f"{int(c)}", **inside_segment)
    if p > 0:
        ax_pub_prof.text(c + p / 2, i, f"{int(p)}", **inside_segment)
    ax_pub_prof.text(t + max(totals) * 0.01 + 1, i, f"{int(t)}", fontsize=7, ha='left', va='center')

plt.tight_layout()

for suffix, save_options in ((".png", {"dpi": 300}), (".svg", {"format": "svg"}), (".pdf", {"format": "pdf"})):
    plt.savefig(Path(visualization_directory, f"publications_per_professor{suffix}"), bbox_inches='tight', **save_options)

plt.show()

## 3.6. Clusters of researchers

In [None]:
# Map every professor name to the string of its capitalised initials
# (name parts that start with a lowercase character are skipped).
# NOTE(review): two professors with the same initials would share one identifier — verify.
ids_professors = {
    name: ''.join(part[0] for part in name.split() if part[0].isupper())
    for name in sorted(df_productions["name"].unique())
}

ids_professors

In [None]:
# Edge list between professors: one row per (professor, coauthoring professor) pair,
# both expressed as initials, keeping only pairs that occur more than once.
professor_names = set(ids_professors.keys())

df_clusters_prof = df_productions[["name", "authors"]].explode("authors")
is_other_professor = (
    (df_clusters_prof["name"] != df_clusters_prof["authors"])
    & df_clusters_prof["authors"].isin(professor_names)
)
df_clusters_prof = (
    df_clusters_prof[is_other_professor]
    .map(lambda value: ids_professors[value] if value in ids_professors else value)
    .groupby(by=["name", "authors"])
    .size()
    .reset_index(name="n_coauthorships")
    .set_axis(["source", "target", "n_coauthorships"], axis=1)
)
df_clusters_prof = df_clusters_prof[df_clusters_prof["n_coauthorships"] > 1]
df_clusters_prof

In [None]:
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from networkx.algorithms.community import greedy_modularity_communities
from networkx.drawing.layout import rescale_layout
from pathlib import Path

# Fixed seed for the force-directed sub-layouts so the saved figure is reproducible
# across kernel restarts.
LAYOUT_SEED = 42

# Undirected professor graph; each edge carries its coauthorship count.
G = nx.from_pandas_edgelist(
    df_clusters_prof,
    source="source",
    target="target",
    edge_attr="n_coauthorships"
)

# Log-scaled widths keep heavy edges from dominating the drawing.
weights = [d["n_coauthorships"] for _, _, d in G.edges(data=True)]
edge_widths = [np.log1p(w) for w in weights]

# Communities ordered by size (largest first), ties broken by smallest member name,
# so community indices — and therefore colours — are stable.
communities = list(greedy_modularity_communities(G))
communities = sorted(communities, key=lambda c: (-len(c), sorted(c)[0]))

node_to_comm = {}
for idx, comm in enumerate(communities):
    for n in comm:
        node_to_comm[n] = idx

n_comms = len(communities)
palette = sns.color_palette("pastel", n_comms)
node_colors = [palette[node_to_comm[n]] for n in G.nodes]

# Short anonymised labels drawn on the nodes, assigned in node order.
labels_list = [
    "A","B","C","D","E","F","G","H","I","J","K","L","M",
    "N","O","P","Q","R","S","T","U","V","W","X","Y","Z",
    "α","β"
]
# The predefined list has 28 symbols; with more nodes the lookup used to raise
# IndexError. Extra nodes now fall back to numeric labels.
node_order = list(G.nodes)
missing_labels = len(node_order) - len(labels_list)
if missing_labels > 0:
    labels_list = labels_list + [str(k) for k in range(1, missing_labels + 1)]
labels = {n: labels_list[i] for i, n in enumerate(node_order)}

# Community-level graph: one node per community, edge weight = summed coauthorships
# between members of two different communities.
CG = nx.Graph()
CG.add_nodes_from(range(n_comms))
for u, v, d in G.edges(data=True):
    cu = node_to_comm[u]
    cv = node_to_comm[v]
    if cu != cv:
        w = d.get("n_coauthorships", 1)
        if CG.has_edge(cu, cv):
            CG[cu][cv]["weight"] += w
        else:
            CG.add_edge(cu, cv, weight=w)

# Community centres: Kamada-Kawai when communities are linked, otherwise a circle.
pos_comm = nx.kamada_kawai_layout(CG, weight="weight") if CG.number_of_edges() > 0 else {
    i: (np.cos(2*np.pi*i/n_comms), np.sin(2*np.pi*i/n_comms)) for i in range(n_comms)
}

# Lay out each community on its own, rescale it, and shift it to its community centre.
pos = {}
max_size = max(len(c) for c in communities) if communities else 1
for ci, comm in enumerate(communities):
    subG = G.subgraph(comm).copy()
    sub_pos = nx.spring_layout(subG, k=1, seed=LAYOUT_SEED)
    sub_pos = {n: np.array(p) for n, p in sub_pos.items()}
    arr = np.array(list(sub_pos.values()))
    arr = rescale_layout(arr, scale=1.0)
    # Larger communities get a slightly larger footprint.
    scale = 0.25 + 0.15*(len(subG)/max_size)
    center = np.array(pos_comm[ci])
    for (n, _), p in zip(sub_pos.items(), arr):
        pos[n] = center + p*scale

plt.figure(figsize=(8, 5))

nx.draw_networkx_nodes(
    G, pos,
    node_size=200,
    node_color=node_colors
)

nx.draw_networkx_edges(
    G, pos,
    width=edge_widths,
    edge_color="gray",
    alpha=0.6
)

nx.draw_networkx_labels(
    G, pos,
    labels=labels,
    font_size=9,
)

edge_labels = nx.get_edge_attributes(G, "n_coauthorships")
nx.draw_networkx_edge_labels(
    G, pos,
    edge_labels=edge_labels,
    font_size=8,
    bbox=dict(boxstyle="round,pad=0.2", fc="white", ec="white", alpha=0.3)
)

plt.axis("off")
plt.tight_layout()
plt.savefig(Path(visualization_directory, "coauthorship_network_communities.png"), dpi=300, bbox_inches='tight')
plt.savefig(Path(visualization_directory, "coauthorship_network_communities.svg"), format='svg', bbox_inches='tight')
plt.savefig(Path(visualization_directory, "coauthorship_network_communities.pdf"), format='pdf', bbox_inches='tight')
plt.show()

### 3.6.1. Clusters of institutions

In [None]:
# Rows from 2014-2023, reduced to the columns needed for the institution analysis.
df_clusters_inst = df_productions.loc[
    df_productions["year"].between(2014, 2023),
    ["name", "production_id", "institution", "authors"],
].copy()
df_clusters_inst

In [None]:
# Every distinct name that appears in any `authors` entry, sorted alphabetically.
all_authors_cited = sorted(
    {author for authors in df_clusters_inst["authors"] for author in authors}
)
len(all_authors_cited)

In [None]:
# Distinct values of the `name` column, sorted alphabetically.
all_authors_registered = sorted(set(df_clusters_inst["name"].unique()))
len(all_authors_registered)

In [None]:
# Institution of every cited author who is also a registered professor.
# Build the name -> institution lookup once (first row per name, as `.values[0]` did)
# and test membership against a set; the previous version scanned a list and filtered
# the whole DataFrame for every cited author.
registered_names = set(all_authors_registered)
first_row_per_name = df_clusters_inst.drop_duplicates(subset="name", keep="first")
institution_by_name = dict(zip(first_row_per_name["name"], first_row_per_name["institution"]))

matched_authors = [author for author in all_authors_cited if author in registered_names]

data_author_inst = {
    "name": matched_authors,
    "institution": [institution_by_name[author] for author in matched_authors],
}

df_author_inst = pd.DataFrame(data_author_inst)
df_author_inst = df_author_inst.sort_values(by="name").reset_index(drop=True)
df_author_inst

In [None]:
# Attach, to every row, the institutions of those authors that are registered professors.
# One dictionary lookup per author replaces rebuilding a set of names and filtering
# `df_author_inst` for each author of each row. Names in `df_author_inst` are unique,
# so the mapping is the same as taking `.values[0]`.
institution_by_author = dict(zip(df_author_inst["name"], df_author_inst["institution"]))

# Explicit copies keep the column assignments free of SettingWithCopyWarning.
df_clusters_inst = df_clusters_inst.copy()
df_clusters_inst["authors_institution"] = df_clusters_inst["authors"].apply(
    lambda authors: [institution_by_author[author] for author in authors if author in institution_by_author]
)

# Keep rows with at least two institution-resolved authors.
df_clusters_inst = df_clusters_inst[df_clusters_inst["authors_institution"].apply(len) > 1].copy()
# True when every author of the row could be mapped to an institution.
df_clusters_inst["has_all_authors_institution"] = (
    df_clusters_inst["authors_institution"].apply(len) == df_clusters_inst["authors"].apply(len)
)
df_clusters_inst

In [None]:
# Rows whose authors were all mapped to an institution.
df_clusters_inst.loc[df_clusters_inst["has_all_authors_institution"].eq(True)]

In [None]:
# Distribution of the number of resolved institutions among fully-resolved rows.
(
    df_clusters_inst.loc[df_clusters_inst["has_all_authors_institution"].eq(True), "authors_institution"]
    .apply(len)
    .value_counts()
)

In [None]:
# Keep only the production id and the list of author institutions for pair counting below.
df_clusters_inst_relations = df_clusters_inst[["production_id", "authors_institution"]]
df_clusters_inst_relations

In [None]:
def powerset(iterable, degree):
    """Return all combinations of exactly `degree` items drawn from `iterable`.

    Items are combined by position, so repeated values yield repeated tuples, and the
    result keeps the order produced by `itertools.combinations`.

    The previous implementation enumerated the whole powerset (2**n subsets) and then
    discarded everything but one size; generating only the requested size gives the
    same list without the exponential cost.

    Args:
        iterable: Any iterable of items.
        degree: Size of each combination.

    Returns:
        A list of tuples, each of length `degree`. Empty when `degree` is negative or
        larger than the number of items.
    """
    # combinations() rejects negative sizes; the original simply found no match.
    if degree < 0:
        return []
    return list(combinations(iterable, degree))

In [None]:
# Count how often each (alphabetically ordered) pair of institutions appears together.
# Pairs are formed by position, so two authors of the same institution yield a
# same-institution pair.
inst_relations = {}

for institutions in df_clusters_inst_relations["authors_institution"]:
    if len(institutions) >= 2:
        for institution_pair in powerset(institutions, 2):
            pair_key = tuple(sorted(institution_pair))
            inst_relations[pair_key] = inst_relations.get(pair_key, 0) + 1

len(inst_relations)

In [None]:
# Turn the pair counter into an edge table sorted by descending count.
relation_rows = [(first, second, count) for (first, second), count in inst_relations.items()]

data_inst_relations = {
    "source": [row[0] for row in relation_rows],
    "target": [row[1] for row in relation_rows],
    "n_coauthorships": [row[2] for row in relation_rows],
}

df_inst_relations = (
    pd.DataFrame(data_inst_relations)
    .sort_values(by="n_coauthorships", ascending=False)
    .reset_index(drop=True)
)
df_inst_relations

In [None]:
# Drop same-institution pairs and keep only pairs with at least 400 coauthorships.
crosses_institutions = df_inst_relations["source"] != df_inst_relations["target"]
is_frequent = df_inst_relations["n_coauthorships"] >= 400

df_inst_relations_cut = df_inst_relations[crosses_institutions & is_frequent]
df_inst_relations_cut

In [None]:
# Undirected institution graph; each edge carries its coauthorship count.
G = nx.from_pandas_edgelist(
    df_inst_relations_cut,
    source="source",
    target="target",
    edge_attr="n_coauthorships"
)

# Communities ordered by size (largest first), ties broken by smallest member name.
communities = list(greedy_modularity_communities(G))
communities = sorted(communities, key=lambda c: (-len(c), sorted(c)[0]))

node_to_comm = {}
for idx, comm in enumerate(communities):
    for n in comm:
        node_to_comm[n] = idx

n_comms = len(communities)
palette = sns.color_palette("pastel", n_comms)

# Community-level graph: edge weight = summed coauthorships across two communities.
C = nx.Graph()
C.add_nodes_from(range(n_comms))
for u, v, d in G.edges(data=True):
    cu, cv = node_to_comm[u], node_to_comm[v]
    if cu != cv:
        w = d.get("n_coauthorships", 1)
        if C.has_edge(cu, cv):
            C[cu][cv]["weight"] += w
        else:
            C.add_edge(cu, cv, weight=w)

# Community centres are spread on a circle.
pos_comm = nx.circular_layout(C, scale=5.0)

# Lay out each community separately, centre it, and move it to its community centre.
pos = {}
for c_idx, comm in enumerate(communities):
    sub = G.subgraph(comm)

    base = 1.0 / np.sqrt(max(len(sub), 1))
    k_sub = max(1.2, base * 2.0)

    pos_sub = nx.spring_layout(
        sub,
        k=k_sub,
        iterations=200,
        # The seed was commented out, so every run produced a different figure;
        # fixing it makes the saved images reproducible.
        seed=42,
        weight="n_coauthorships"
    )

    sub_xy = np.array(list(pos_sub.values()))
    if len(sub_xy) > 0:
        sub_xy = sub_xy - sub_xy.mean(axis=0, keepdims=True)
        # Larger communities are spread over a larger area.
        scale = 1.0 + 0.35 * np.log1p(len(sub))
        sub_xy = sub_xy * scale

    cx, cy = pos_comm.get(c_idx, (0.0, 0.0))
    for i, n in enumerate(sub.nodes()):
        if len(sub_xy) > 0:
            pos[n] = (sub_xy[i, 0] + cx, sub_xy[i, 1] + cy)
        else:
            pos[n] = (cx, cy)


# Log-scaled widths keep heavy edges from dominating the drawing.
weights = [d["n_coauthorships"] for _, _, d in G.edges(data=True)]
edge_widths = [np.log1p(w) for w in weights]

plt.figure(figsize=(12, 6))

nx.draw_networkx_edges(
    G, pos,
    width=edge_widths,
    edge_color="gray",
    alpha=0.6,
)

# Nodes are drawn as text boxes filled with their community colour.
node_colors = {n: palette[node_to_comm[n]] for n in G.nodes}
labels = {n: str(n) for n in G.nodes}
for n, (x, y) in pos.items():
    plt.text(
        x, y, labels[n],
        ha="center", va="center",
        fontsize=10,
        bbox=dict(boxstyle="square,pad=0.28", fc=node_colors[n], ec="black", lw=0.6)
    )

edge_labels = nx.get_edge_attributes(G, "n_coauthorships")
nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_size=9, rotate=False, bbox=dict(fc="white", ec="white", lw=0.5, alpha=0.3))

plt.axis("off")
plt.tight_layout()
plt.savefig(Path(visualization_directory, "coauthorship_institutions_communities.png"), dpi=300, bbox_inches='tight')
plt.savefig(Path(visualization_directory, "coauthorship_institutions_communities.svg"), format='svg', bbox_inches='tight')
plt.savefig(Path(visualization_directory, "coauthorship_institutions_communities.pdf"), format='pdf', bbox_inches='tight')
plt.show()

## 3.7. Sankey of researchers

In [None]:
# Sorted, de-duplicated identifiers that occur as either end of a professor edge.
initials = sorted(set(df_clusters_prof["source"]) | set(df_clusters_prof["target"]))
print(len(initials))
initials

In [None]:
# Sankey node names: all source-side nodes ("S-…") first, then all target-side nodes ("T-…").
labels = [f"{side}-{initial}" for side in ("S", "T") for initial in initials]

labels

In [None]:
# Display labels: the node name without its "S"/"T" side prefix.
labels_clean = []
for label in labels:
    labels_clean.append(label.split('-')[1])

In [None]:
# One colour per identifier; the source-side and target-side node of the same
# identifier share that colour (the colour list is repeated for the "T-" half).
colors = get_colors(len(initials))

color_mapping = dict(zip(labels, colors + colors))
color_mapping

In [None]:
def combine_rgb(rgb_colors):
    """Blend RGB triples by averaging each channel with integer (floor) division.

    Note: this definition shadows the `combine_rgb` imported from `src.utils` at the
    top of the notebook.

    Args:
        rgb_colors: Sequence of (r, g, b) triples.

    Returns:
        The channel-wise floor average as a 3-tuple, or (0, 0, 0) for an empty input.
    """
    color_count = len(rgb_colors)
    if not color_count:
        return (0, 0, 0)

    return tuple(
        sum(color[channel] for color in rgb_colors) // color_count
        for channel in range(3)
    )

In [None]:
def hex2rgb(hex_color: str):
    """Convert a "#rrggbb" (or "rrggbb") colour string into an (r, g, b) integer tuple.

    Leading "#" characters are stripped; the first six remaining characters are read
    as three two-digit hexadecimal channels.
    """
    digits = hex_color.lstrip("#")
    red, green, blue = (int(digits[start:start + 2], 16) for start in range(0, 6, 2))
    return (red, green, blue)

In [None]:
# One link colour per edge row: the blend of the source-side and target-side node colours,
# formatted as an "rgb(r, g, b)" string.
edge_colors = [
    "rgb" + str(
        combine_rgb([
            hex2rgb(color_mapping[f"S-{source_id}"]),
            hex2rgb(color_mapping[f"T-{target_id}"]),
        ])
    )
    for source_id, target_id in zip(df_clusters_prof["source"], df_clusters_prof["target"])
]

In [None]:
# Sankey of all professor-to-professor coauthorships.
# Node positions are looked up by label (labels are unique: sorted identifiers with a
# side prefix).
node_position = {label: position for position, label in enumerate(labels)}

source_indices = [node_position[f"S-{src}"] for src in df_clusters_prof["source"]]
source_colors = [color_mapping[f"S-{src}"] for src in df_clusters_prof["source"]]

target_indices = [node_position[f"T-{tgt}"] for tgt in df_clusters_prof["target"]]
target_colors = [color_mapping[f"T-{tgt}"] for tgt in df_clusters_prof["target"]]

values = df_clusters_prof["n_coauthorships"].tolist()

sankey_nodes = dict(
    pad=15,
    thickness=20,
    line=dict(color="black", width=0.5),
    label=labels_clean,
    color=list(color_mapping.values()),
)
sankey_links = dict(
    source=source_indices,
    target=target_indices,
    value=values,
    color=edge_colors,
)

fig_sankey = go.Figure(data=[go.Sankey(node=sankey_nodes, link=sankey_links)])
fig_sankey.update_layout(width=1920, height=1080)

fig_sankey.show(renderer="png")

### 3.7.1. One-to-many sankey

In [None]:
# Show the professor edge list again; the loop in the next cell filters it per source professor.
df_clusters_prof

In [None]:
# One Sankey image per source professor, showing only that professor's outgoing links.
# write_image does not create missing folders, so make sure the target folder exists.
sankey_directory = Path(visualization_directory, "sankey")
sankey_directory.mkdir(parents=True, exist_ok=True)

for s_prof in sorted(df_clusters_prof["source"].unique()):
    df_clusters_prof_unique = df_clusters_prof[df_clusters_prof["source"] == s_prof]

    source_indices = [labels.index(f"S-{src}") for src in df_clusters_prof_unique["source"]]
    target_indices = [labels.index(f"T-{tgt}") for tgt in df_clusters_prof_unique["target"]]

    values = df_clusters_prof_unique["n_coauthorships"].tolist()

    # Link colours must be computed for the filtered rows. Re-using the global
    # `edge_colors` list (built for the full edge table) paired the i-th filtered link
    # with the i-th colour of the full table, i.e. with an unrelated edge's colour.
    link_colors = []
    for src, tgt in zip(df_clusters_prof_unique["source"], df_clusters_prof_unique["target"]):
        source_rgb = hex2rgb(color_mapping[f"S-{src}"])
        target_rgb = hex2rgb(color_mapping[f"T-{tgt}"])
        link_colors.append(f"rgb{combine_rgb([source_rgb, target_rgb])}")

    fig_sankey = go.Figure(
        data=[
            go.Sankey(
                node = dict(
                    pad = 15,
                    thickness = 20,
                    line = dict(color="black", width=0.5),
                    label = labels_clean,
                    color = list(color_mapping.values())
                ),
                link = dict(
                    source = source_indices,
                    target = target_indices,
                    value = values,
                    color = link_colors
                )
            )
        ]
    )

    fig_sankey.update_layout(
        width=1280,
        height=720,
        title_text=f"Coautorias do professor {s_prof}",
        title_x=0.5
    )

    fig_sankey.write_image(Path(sankey_directory, f"sankey_{s_prof}.png"), scale=2)

## 3.8. Chord diagram of institution coauthorship

In [None]:
# Professor name -> institution. Zipping the two columns gives the same mapping as the
# previous row-by-row `iterrows()` loop (the last row for a name wins) without building
# a Series object for every row of the table.
institution_reference = dict(zip(df_productions["name"], df_productions["institution"]))

In [None]:
# Institutions involved in each 2023 row: authors found in `institution_reference` are
# mapped to their institution, duplicates collapse into a set, and only rows spanning
# more than one institution are kept. The result is also exported as CSV.
df_institution_coauth = df_productions.loc[
    df_productions["year"] == 2023, ["production_id", "authors"]
].copy()

df_institution_coauth["institutions"] = df_institution_coauth["authors"].apply(
    lambda authors: set(
        institution_reference[author] for author in authors if author in institution_reference
    )
)
df_institution_coauth = df_institution_coauth.drop(columns=["authors"])

spans_several_institutions = df_institution_coauth["institutions"].apply(len) > 1
df_institution_coauth = df_institution_coauth[spans_several_institutions]

df_institution_coauth.to_csv(Path(visualization_directory, "institution_coauthorships.csv"), index=False)
df_institution_coauth

In [None]:
# Frequency of each institution combination, most frequent first.
# Copy first so the column assignment does not raise SettingWithCopyWarning.
df_institution_coauth2 = df_institution_coauth[["institutions"]].copy()
# Sets have no guaranteed iteration order, so joining them directly could render the
# same combination under different keys (e.g. "A - B" and "B - A") and split its count.
# Sorting makes the key canonical.
df_institution_coauth2["institutions"] = df_institution_coauth2["institutions"].map(
    lambda institutions: " - ".join(sorted(institutions))
)
df_institution_coauth2 = df_institution_coauth2.groupby("institutions").size().sort_values(ascending=False)
df_institution_coauth2

In [None]:
# Institutions that appear in the 8 most frequent institution combinations.
top_combinations = df_institution_coauth2.index[:8]
coauths = [combination.split(" - ") for combination in top_combinations]

coauths_institutions = set().union(*coauths)
coauths_institutions

In [None]:
# Keep only rows whose institutions all belong to the selected top institutions.
# Note: this rebinds `df_institution_coauth`, which the chord-diagram cell reads next.
df_institution_coauth3 = df_institution_coauth.copy()

selected_institutions = set(coauths_institutions)
df_institution_coauth3["belongs"] = df_institution_coauth3["institutions"].map(
    lambda institutions: not (institutions - selected_institutions)
)
df_institution_coauth = df_institution_coauth3.loc[df_institution_coauth3["belongs"].eq(True)]
df_institution_coauth

In [None]:
hv.extension("bokeh")

# Count, for every pair of institutions, in how many rows they appear together.
edge_counts = Counter()
all_nodes = set()

for insts in df_institution_coauth["institutions"]:
    # Normalise to non-empty, stripped strings and drop missing values.
    unique_insts = {str(x).strip() for x in insts if pd.notna(x) and str(x).strip()}
    all_nodes.update(unique_insts)
    # Sorting gives each unordered pair a single canonical key.
    for u, v in combinations(sorted(unique_insts), 2):
        edge_counts[(u, v)] += 1

G = nx.Graph()
G.add_nodes_from(sorted(all_nodes))
for (u, v), w in edge_counts.items():
    G.add_edge(u, v, weight=w)

# Minimum pair count an edge needs to be drawn; nodes left without edges are removed.
MIN_W = 1
to_drop = [(u, v) for u, v, d in G.edges(data=True) if d["weight"] < MIN_W]
G.remove_edges_from(to_drop)
G.remove_nodes_from(list(nx.isolates(G)))

if len(G) == 0 or G.number_of_edges() == 0:
    chord = hv.Chord([]).opts(width=600, height=600)
else:
    # Chord links reference nodes by integer position.
    nodes_sorted = sorted(G.nodes())
    idx = {n: i for i, n in enumerate(nodes_sorted)}

    nodes_df = pd.DataFrame({
        "index": [idx[n] for n in nodes_sorted],
        "name": nodes_sorted,
    })

    links_df = pd.DataFrame([
        {"source": idx[u], "target": idx[v], "value": d.get("weight", 1)}
        for u, v, d in G.edges(data=True)
    ])

    nodes_ds = hv.Dataset(nodes_df, kdims="index")

    chord = hv.Chord((links_df, nodes_ds)).opts(
        width=600,
        height=600,
        labels="name",
        cmap="Category20",
        edge_color="source",
        node_color="index",
    )

renderer = hv.renderer("bokeh")
bokeh_plot = renderer.get_plot(chord).state

png_path = Path(visualization_directory) / "institution_coauthorship_network.png"
export_png(bokeh_plot, filename=str(png_path))

# SVG export requires switching the Bokeh output backend.
bokeh_plot.output_backend = "svg"
svg_path = Path(visualization_directory) / "institution_coauthorship_network.svg"
export_svgs(bokeh_plot, filename=str(svg_path))

# PDF export is best-effort (cairosvg is optional), but a skipped or failed export is
# now reported instead of being silently swallowed.
try:
    import cairosvg
except (ImportError, OSError) as exc:
    print(f"cairosvg is unavailable ({exc!r}); skipping PDF export of the chord diagram.")
else:
    pdf_path = Path(visualization_directory) / "institution_coauthorship_network.pdf"
    try:
        cairosvg.svg2pdf(url=str(svg_path), write_to=str(pdf_path))
    except Exception as exc:
        print(f"PDF export of the chord diagram failed: {exc!r}")

chord