In [1]:
import pandas as pd
import networkx as nx
import itertools
from tqdm import tqdm
import numpy as np

In [7]:

# Read data_clean
df = pd.read_csv("../data_light/data_clean.csv")

# Group researchers per publication
groups = df.groupby('pub_id')['researcher_id'].apply(list)

# create a dictionary to store the collaboration count for each pair of authors
collaboration_count = {}

# count the collaborations among authors
for group in groups:
    for pair in itertools.combinations(group, 2):
        if pair in collaboration_count:
            collaboration_count[pair] += 1
        else:
            collaboration_count[pair] = 1

# create the graph
G = nx.Graph()

# add the nodes
for author in df['researcher_id'].unique():
    G.add_node(author)

# add the weighted edges
for pair, weight in collaboration_count.items():
    author1, author2 = pair
    G.add_edge(author1, author2, weight=weight)

nx.write_graphml(G, "graphs/all.graphml")


In [8]:
# no edges and nodes
print(f"Number of nodes: {G.number_of_nodes()}")
print(f"Number of edges: {G.number_of_edges()}")

weights = [w for _, _, w in G.edges(data='weight')]
# print mean and std of weights using numpy
print(f"Mean of weights: {np.mean(weights)}")
print(f"Std of weights: {np.std(weights)}")
print(f"Number of connected components: {nx.number_connected_components(G)}")

density = nx.density(G)
print("Graph density:", density)

# Compute the global clustering coefficient
clustering_coefficient = nx.average_clustering(G)
print("Global clustering coefficient:", clustering_coefficient)


Number of nodes: 145878
Number of edges: 1352110
Mean of weights: 1.1707094836958531
Std of weights: 0.7662695839529495
Number of connected components: 6255
Graph density: 0.0001270765444306464
Global clustering coefficient: 0.7970357693090672
