In [1]:
import networkx as nx
from helpers import read_jsonl, meta_path, texts_path, PROCESSED

In [2]:
def create_edges(citations: dict) -> list:
    edges = []
    for node in citations:
        for parent in citations[node]["inbound_citations"]:
            if parent in citations.keys():
                edges.append((parent, node))
        for child in citations[node]["outbound_citations"]:
            if child in citations.keys():
                edges.append((node, child))
    return edges

In [3]:
relevant_papers = {paper["paper_id"] for paper in read_jsonl(texts_path)}

In [4]:
metadata = {}
for paper in read_jsonl(meta_path):
    _id = paper["paper_id"]
    date_published = paper["year"]
    if _id in relevant_papers and date_published is not None and date_published >= 1991:
        metadata[_id] = dict(
            inbound_citations=paper["inbound_citations"],
            outbound_citations=paper["outbound_citations"],
        )
        
len(metadata)

9145

In [5]:
edges = create_edges(metadata)

G = nx.DiGraph()
G.add_edges_from(edges)

In [6]:
nx.write_gexf(G, PROCESSED / "210808-gephi.gexf")

## Measuring centrality

We assume that the more central a publication is located within our network, the more important it is

In [7]:
def calculate_centrality(G, method, limit=None):
    lookup = {
        "degree_centrality": nx.degree_centrality,
        "closeness_centrality": nx.closeness_centrality, 
        "betweenness_centrality": nx.betweenness_centrality,
    }
    dc = lookup[method](G)
    centrality = dict(sorted(dc.items(), key=lambda item: item[1], reverse=True))
    if limit is None:
        return centrality
    ids = list(centrality.keys())[:limit]
    return {k:v for k,v in centrality.items() if k in ids}

In [8]:
centrality = calculate_centrality(G, method="betweenness_centrality", limit=10)
centrality

{'3354592': 7.4156470152020756e-06,
 '159279438': 7.4156470152020756e-06,
 '53065201': 4.943764676801384e-06,
 '195490641': 4.943764676801384e-06,
 '55914559': 4.943764676801384e-06,
 '39342543': 4.943764676801384e-06,
 '55694829': 4.943764676801384e-06,
 '53135628': 3.7078235076010378e-06,
 '55024365': 2.471882338400692e-06,
 '29322734': 2.471882338400692e-06}

In [9]:
# most central paper
metadata[max(centrality, key=centrality.get)]

{'inbound_citations': ['3235947', '195490641', '4564427'],
 'outbound_citations': ['48451139',
  '155492710',
  '23331735',
  '22266624',
  '53546359',
  '20144768',
  '59146960',
  '154457832',
  '42295037',
  '155004773',
  '1675591',
  '145052632',
  '153346601',
  '155014357',
  '153563125',
  '32911950',
  '153507874',
  '5008492',
  '6107989',
  '43858142',
  '24536392',
  '35947628',
  '158837917',
  '154800815',
  '19338547',
  '157282379',
  '159279438',
  '154874851',
  '152762905',
  '22128517',
  '936875',
  '154325094',
  '154001332',
  '10760950',
  '20083357',
  '151102473',
  '4107075',
  '168069164',
  '154267120',
  '153422929',
  '6317075',
  '40227257',
  '12431554',
  '15211782',
  '154779638',
  '155179130',
  '126749605',
  '198320739',
  '55301358',
  '1906485',
  '11309762',
  '154230677',
  '157270565']}

# Community detection



In [10]:
from networkx.algorithms import community

In [11]:
# Girvan–Newman method
communities_generator = community.girvan_newman(G)
top_level_communities = next(communities_generator)

In [12]:
tuple(sorted(c) for c in top_level_communities)

(['54196724', '55910675'],
 ['54088067', '56383959', '59123355'],
 ['18767993', '211397564', '52064038', '7327300'],
 ['153623502', '28013613'],
 ['145262654', '158992653', '55024365', '59417790'],
 ['197828918', '59419400'],
 ['18649231', '52134745', '77699598'],
 ['152583680', '15618887', '55294335'],
 ['145260271', '152513305', '154618875', '154837519', '49386315'],
 ['210965542', '211741379'],
 ['154856882', '56050575'],
 ['150383584'],
 ['142498216', '43795574'],
 ['53344538'],
 ['37145156', '54080666'],
 ['150701262', '37985621'],
 ['14621366', '155076493'],
 ['147079461', '201363992'],
 ['158277288', '158509882', '210900772', '214502267'],
 ['153781623'],
 ['19344004',
  '32458550',
  '32961304',
  '45872353',
  '53395363',
  '55438065',
  '56353776'],
 ['153178289', '200085790'],
 ['148997571', '195503378'],
 ['150464724', '152455214'],
 ['153801675', '155582058'],
 ['153946252', '155038758'],
 ['152999432', '181461984'],
 ['12045602',
  '153736153',
  '159163646',
  '169689619

In [13]:
# asyn_fluidc
asyn_communities = asyn_fluidc(G)

NameError: name 'asyn_fluidc' is not defined

In [None]:
asyn_communities