In [1]:
import networkx as nx
from helpers import read_jsonl, meta_path, texts_path

In [2]:
def create_edges(citations: dict) -> list:
    """Build a directed edge list from per-paper citation records.

    Args:
        citations: Mapping of paper id to a dict with the keys
            ``"inbound_citations"`` and ``"outbound_citations"``, each an
            iterable of paper ids.

    Returns:
        List of unique ``(source, target)`` tuples in first-seen order. Every
        inbound entry yields ``(entry, paper)`` and every outbound entry
        yields ``(paper, entry)``.
    """
    # A link between two papers that are both keys of `citations` appears
    # twice: in the outbound list of one and the inbound list of the other.
    # Collecting edges as dict keys drops the repeat and keeps insertion order.
    unique_edges = {}
    for node, links in citations.items():
        for parent in links["inbound_citations"]:
            unique_edges[(parent, node)] = None
        for child in links["outbound_citations"]:
            unique_edges[(node, child)] = None
    return list(unique_edges)

In [3]:
# Ids of every paper record read from `texts_path`.
relevant_papers = set(paper["paper_id"] for paper in read_jsonl(texts_path))

In [4]:
# Collect the citation lists of every paper whose id is in `relevant_papers`
# and whose "year" field is set and not earlier than 1991.
metadata = {}
for paper in read_jsonl(meta_path):
    _id = paper["paper_id"]
    date_published = paper["year"]
    if _id not in relevant_papers:
        continue
    if date_published is None:
        continue
    if not date_published >= 1991:
        continue
    metadata[_id] = {
        "inbound_citations": paper["inbound_citations"],
        "outbound_citations": paper["outbound_citations"],
    }

len(metadata)

9145

In [5]:
# Directed graph over all edges derived from the filtered metadata.
edges = create_edges(metadata)

# Passing the edge list to the constructor adds the edges on creation.
G = nx.DiGraph(edges)

## Measuring centrality

We assume that the more central a publication is within our citation network, the more important it is.

In [6]:
def calculate_centrality(G, method, limit=None):
    """Rank the nodes of ``G`` by a centrality measure, highest score first.

    Args:
        G: Graph passed unchanged to the centrality function.
        method: Either one of the names ``"degree_centrality"``,
            ``"closeness_centrality"`` or ``"betweenness_centrality"``, or a
            callable that takes ``G`` and returns a ``{node: score}`` mapping.
        limit: Upper slice bound applied to the ranking; ``None`` keeps every
            node.

    Returns:
        Dict of node -> score ordered by descending score. Nodes with equal
        scores keep the order in which the centrality function returned them.

    Raises:
        KeyError: If ``method`` is a string that is not one of the names above.
    """
    if callable(method):
        measure = method
    else:
        lookup = {
            "degree_centrality": nx.degree_centrality,
            "closeness_centrality": nx.closeness_centrality,
            "betweenness_centrality": nx.betweenness_centrality,
        }
        measure = lookup[method]

    scores = measure(G)
    # sorted() is stable, also with reverse=True, so ties keep their order.
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    # Slice the ranked pairs directly instead of filtering the full dict by
    # membership in a list of ids; `ranked[:None]` is the whole ranking.
    return dict(ranked[:limit])

In [7]:
# The ten highest-scoring nodes by degree centrality, best first.
centrality = calculate_centrality(G, "degree_centrality", limit=10)
centrality

{'54843082': 0.00847926957042359,
 '54634769': 0.004333183655141446,
 '154714183': 0.0031282742104475378,
 '18155508': 0.0028139500074839097,
 '154535201': 0.0027989821882951657,
 '31366427': 0.0026567879060020957,
 '11744346': 0.0025894327196527467,
 '66288': 0.002537045352492142,
 '154212598': 0.002462206256548421,
 '15952453': 0.002252656787906002}

In [8]:
# Citation record of the paper with the highest centrality score.
top_paper_id = max(centrality, key=centrality.get)
metadata[top_paper_id]

{'inbound_citations': ['153452739',
  '55341161',
  '145757980',
  '143884581',
  '147598599',
  '145467697',
  '31127674',
  '149910054',
  '209394140',
  '171091283',
  '54651782',
  '55685205',
  '26602744',
  '114968236',
  '167974493',
  '146858603',
  '143744730',
  '86024696',
  '143692172',
  '110300645',
  '145257913',
  '55548868',
  '151302065',
  '55906300',
  '202673625',
  '149028401',
  '20969212',
  '51834967',
  '142941623',
  '142712735',
  '56126312',
  '51916442',
  '109911891',
  '27124122',
  '149719886',
  '149598249',
  '146762649',
  '53591607',
  '154417657',
  '157526427',
  '54497152',
  '30681408',
  '142753400',
  '155594836',
  '145754878',
  '52265086',
  '109815548',
  '201038002',
  '84836084',
  '151856672',
  '149188189',
  '56166216',
  '145423454',
  '57569660',
  '55882682',
  '145443221',
  '199470828',
  '30346319',
  '145647840',
  '54648484',
  '156595779',
  '149238687',
  '76651513',
  '140998744',
  '202547293',
  '142993292',
  '153666174'