## Building Graph of Researchers

In [58]:
import os
import json
import networkx as nx
import pickle

In [59]:
def add_paper_to_graph(graph, paper):
    """
    Add a paper to the graph. Nodes are authors, and edges represent collaborations on the paper.
    """
    # Extract author names (handle both list of strings and list of dicts)
    authors_raw = paper.get("authors", [])
    authors = []
    for author in authors_raw:
        if isinstance(author, dict) and "name" in author:
            authors.append(author["name"])
        elif isinstance(author, str):
            authors.append(author)

    arxiv_id = paper.get("arxivId", "Unknown Arxiv ID")
    title = paper.get("title", "Unknown Title")
    
    # Add nodes and update their attributes
    for author in authors:
        if author not in graph:
            graph.add_node(author, papers=[])
        # Add this paper (arxivId, title) to the author's list of papers
        graph.nodes[author]["papers"].append((arxiv_id, title))
    
    # Add edges for each pair of authors in the paper
    for i, author1 in enumerate(authors):
        for j, author2 in enumerate(authors):
            if i >= j:  # Avoid adding self-loops and duplicate edges
                continue
            
            # Add edge with paper attributes
            graph.add_edge(author1, author2, 
                           title=title,
                           year=paper.get("year"),
                           authors=authors,
                           doi=paper.get("doi"),
                           fieldsOfStudy=paper.get("fieldsOfStudy"),
                           influencialCitationCount=paper.get("influencialCitationCount"),
                           isOpenAccess=paper.get("isOpenAccess"),
                           numCitedBy=paper.get("numCitedBy"),
                           numCiting=paper.get("numCiting"),
                           arxivId=arxiv_id,
                           paperId=paper.get("paperId"))

In [60]:
G = nx.MultiGraph()

# Path to the directory containing JSON files
directory_path = "paper_jsons/metadata"

skipped_counter = 0
# Iterate through all JSON files in the directory
for filename in os.listdir(directory_path):
    if filename.endswith(".json"):
        file_path = os.path.join(directory_path, filename)
        with open(file_path, "r") as file:
            try:
                # Load JSON
                paper = json.load(file)
                
                # Skip empty JSON objects
                if not paper:
                    print(f"Skipping file {file_path}: Empty JSON file.")
                    continue
                
                add_paper_to_graph(G, paper)
            except json.JSONDecodeError:
                skipped_counter += 1
            except Exception as e:
                print(f"Error processing file {file_path}: {e}")

print(f"\nSkipped {skipped_counter} files due to Invalid JSON format or empty file")
print(f"Graph has {len(G.nodes)} nodes and {len(G.edges)} edges.")


Skipped 38 files due to Invalid JSON format or empty file
Graph has 18120 nodes and 49192 edges.


#### Save raw graph

In [None]:
# Save graph
name = "researchers_graph_raw.pkl"
with open(name, "wb") as f:
    pickle.dump(G, f)
print(f"Graph saved to {name}")

#### Check number of isolated nodes

In [61]:
# Identify isolated nodes in a MultiGraph (nodes with no edges)
isolated_nodes = [node for node, degree in G.degree() if degree == 0]

print(f"Isolated nodes: {isolated_nodes}")
print(f"Number of isolated nodes: {len(isolated_nodes)}")

Isolated nodes: ['Daegene Song', 'K. Sgarbas', 'Wilson Wong', 'S. Konstantopoulos', 'J. V. D. Bussche', 'I. Suslov', 'Alejandro Chinea Manrique De Lara', 'Patrizio Frosini', 'A. Berrones', 'C. Vidal', 'Hang T. Dinh', 'Catherine Recanati', 'F. Murtagh', 'A. Adamatzky', 'Riccardo Alberti', 'Konstantin P. Wishnevsky', 'E. Diamant', 'Eric Engle', 'M. Geiger', 'A. Pereira', 'M. Horvat', 'N. Kumar', 'P. Zizzi', 'Janardan Misra', 'Ernesto Diaz-Aviles', 'Y. Freund', 'Gilles Champenois', 'D. Le', 'K. Ammon', 'G. Paiva', 'York Sure', 'Ping Li', 'Robert Shour', 'Jonathan Timmis', 'Rüdiger Vaas', 'J. Han', 'A. D. Franco', 'Charles A. B. Robert', 'Evgeny Chutchev', 'P. Werbos', 'Dariusz M Plewczynski', 'J. Burger', 'Kush Agrawal', 'Shubham Chakraborty', 'M. I. Jordan', 'Eray Özkural', 'Ladislau Bölöni', 'P. Resnik', 'T. Hogg', 'R. Rosati', 'Radford M. Neal', 'M. Cristani', 'Eray Ozkural', 'N. M. Loghmani', 'Paola Di Maio', 'R. E. Kent', 'D. McDermott', 'David E. Smith', 'Amitabha Roy', 'Michael Dey

#### Filter self-loops

In [None]:
# Identify self-loops in the graph
self_loops = [(u, v, k) for u, v, k in G.edges(keys=True) if u == v]

# Print the self-loops
print("Self-loops in the graph:")
for u, v, k in self_loops:
    print(f"Self-loop: ({u}, {v}, key={k})")

# Remove the self-loops
G.remove_edges_from(self_loops)

print("\nSelf-loops have been removed.")

Self-loops in the graph:
Self-loop: (A. Coles, A. Coles, key=0)
Self-loop: (J. Wolff, J. Wolff, key=0)
Self-loop: (Iztok Fister, Iztok Fister, key=0)
Self-loop: (Iztok Fister, Iztok Fister, key=1)
Self-loop: (C. Turcu, C. Turcu, key=0)
Self-loop: (Ian J. Taylor, Ian J. Taylor, key=0)
Self-loop: (C. Blease, C. Blease, key=0)
Self-loop: (Cosima Locher, Cosima Locher, key=0)
Self-loop: (Jian Liu, Jian Liu, key=0)
Self-loop: (Giancarlo Maraﬁoti, Giancarlo Maraﬁoti, key=0)
Self-loop: (Ieee B Member, Ieee B Member, key=0)
Self-loop: (Ningyu Zhang, Ningyu Zhang, key=0)
Self-loop: (D. Das, D. Das, key=0)
Self-loop: (S. Ruggieri, S. Ruggieri, key=0)
Self-loop: (Amit Sheth, Amit Sheth, key=0)
Self-loop: (Lukás Sekanina, Lukás Sekanina, key=0)
Self-loop: (E. Németh, E. Németh, key=0)
Self-loops have been removed.


#### Extract Giant Connected Component

In [65]:
connected_components = list(nx.connected_components(G))

# Identify the largest connected component (the giant component)
giant_component = max(connected_components, key=len)

# Create a subgraph for the giant component
giant_subgraph = G.subgraph(giant_component).copy()

# Print some statistics about the giant component
print(f"Giant component has {len(giant_subgraph.nodes)} nodes and {len(giant_subgraph.edges)} edges.")

Giant component has 3675 nodes and 16709 edges.


#### Save graph in a pickle file

In [67]:
# Save graph
name = "researchers_graph_gcc.pkl"
with open(name, "wb") as f:
    pickle.dump(giant_subgraph, f)
print(f"Graph saved to {name}")

Graph saved to researchers_graph_gcc.pkl
