### Tutorial 2 - Building a citation network
 - Loading the query results together with the edges file
 - Building a citation network
 - Exploring the network

#### Packages needed

In [68]:
from tqdm.auto import tqdm
import pandas as pd
import xnetwork as xnet
import numpy as np
import igraph as ig
import sys
import os
from pathlib import Path
import matplotlib.pyplot as plt
%matplotlib widget

### Set the query ID here

In [69]:
# Identifier of the query to analyse. IDs starting with "DEMO_" are read from
# ./demo_queries/; any other ID is read from ../query-results/.
queryID = "DEMO_NatureJournals"

# Another demo ID that can be used instead:
# queryID = "DEMO_Networks"
# or use the queryID provided by a query, such as
# queryID = "Nature_0e367ff2-8670-454b-9b31-ca4d0d85fe96"

#### Setting up some folders and paths

In [70]:
# Create the output folders on first run; exist_ok makes re-running this cell harmless.
for _outputFolder in ("networks", "figures"):
    os.makedirs(_outputFolder, exist_ok=True)

#### Fields available for MAG queries

In [71]:
# Column name -> Python type for the fields of a MAG query result.
# The mapping is handed to pandas as the `dtype` argument when the nodes CSV
# is read, and it also decides which columns become vertex attributes.
MAGColumnTypes = {
    "Paper_paperId": int,
    "Affiliation_displayName": str,
    "Author_authorId": str,
    "Author_rank": str,
    "Author_normalizedName": str,
    "Author_lastKnownAffiliationId": str,
    "Paper_bookTitle": str,
    "ConferenceInstance_conferenceInstanceId": str,
    "Paper_date": str,
    "Paper_docType": str,
    "Paper_doi": str,
    "FieldOfStudy_fieldOfStudyId": str,
    "Paper_issue": str,
    "Journal_normalizedName": str,
    "Journal_issn": str,
    "Journal_publisher": str,
    "Paper_originalTitle": str,
    "Paper_citationCount": int,
    "Paper_estimatedCitation": int,
    "Paper_firstPage": str,
    "Paper_lastPage": str,
    "Paper_publisher": str,
    "Paper_referenceCount": int,
    "Paper_paperTitle": str,
    "Paper_year": int,
    "isQueryPaper": str,
}

### Function to build a citation network from the data
This function also performs some network calculations and detects communities.

In [72]:
def networkFromQuery(queryID):
    """Build the citation network of a query, or load it from the cache.

    If ``networks/<queryID>.xnet`` already exists, that file is loaded and
    returned unchanged. Otherwise the nodes and edges CSV files of the query
    are read, a directed igraph graph is assembled, vertices whose
    ``isQueryPaper`` attribute equals ``"false"`` are removed, and the vertex
    attributes ``KCore``, ``In-Degree``, ``Out-Degree`` and ``year`` are
    computed. The weakly connected giant component, labelled with a
    multilevel ``Community`` attribute, is then written to the cache file
    and read back.

    Parameters
    ----------
    queryID : str
        Identifier of the query. IDs starting with ``DEMO_`` are read from
        ``./demo_queries/`` (with the prefix removed from the file names);
        every other ID is read from ``../query-results/``.

    Returns
    -------
    The graph returned by ``xnet.xnet2igraph`` for the cached network file.

    Notes
    -----
    Uses the notebook-level ``MAGColumnTypes`` mapping. The intermediate
    objects ``edgesData``, ``nodesData``, ``vertexAttributes``, ``index2ID``
    and ``graph`` are published as globals when the network is rebuilt.
    """
    networkPath = Path("networks")/("%s.xnet"%queryID)
    if networkPath.is_file():
        return xnet.xnet2igraph(networkPath.absolute())

    demoPrefix = "DEMO_"
    if queryID.startswith(demoPrefix):
        # Demo queries live in their own folder, stored without the prefix.
        # Only the leading prefix is removed, never later occurrences.
        queryResultsPath = Path("./demo_queries/")
        queryID = queryID[len(demoPrefix):]
    else:
        queryResultsPath = Path("../query-results/")

    nodes_file = queryResultsPath/("%s.csv"%queryID)
    edges_file = queryResultsPath/("%s_edges.csv"%queryID)

    # Kept as globals for backward compatibility: other cells may inspect them.
    global edgesData,nodesData,vertexAttributes,index2ID,graph
    edgesData = pd.read_csv(edges_file)
    nodesData = pd.read_csv(nodes_file, dtype=MAGColumnTypes)
    if "Paper_year" in nodesData:
        nodesData = nodesData.dropna(subset=["Paper_year"])

    # Replace NaN with an empty string. A new frame is assigned instead of
    # calling fillna(inplace=True) on a column selection, which does not
    # update the frame once pandas Copy-on-Write is active.
    fillValues = {key: "" for key in MAGColumnTypes if key in nodesData}
    nodesData = nodesData.fillna(fillValues)

    # Generating continuous indices for papers
    index2ID = nodesData["Paper_paperId"].tolist()
    ID2Index = {paperID: index for index, paperID in enumerate(index2ID)}

    # Column names of the edges file.
    # NOTE(review): the original comment mentions a capitalised "FROM" variant
    # for 2nd-degree queries; no such variant is handled here.
    fromKey = "From (Citing)"
    toKey = "To (Cited)"

    # Convert edges from paper IDs to vertex indices, skipping edges whose
    # endpoints are not both in the node table. Each edge is stored with the
    # "To (Cited)" paper as source and the "From (Citing)" paper as target.
    edgesZip = zip(edgesData[fromKey].tolist(), edgesData[toKey].tolist())
    edgesList = [
        (ID2Index[toID], ID2Index[fromID])
        for fromID, toID in edgesZip
        if fromID in ID2Index and toID in ID2Index
    ]

    vertexAttributes = {key: nodesData[key].tolist() for key in nodesData if key in MAGColumnTypes}

    for key, data in vertexAttributes.items():
        # The first entry decides whether the column is textual; `data` may be
        # empty when the node table has no rows.
        if data and isinstance(data[0], str):
            stripped = [entry.strip("[]") for entry in data]
            vertexAttributes[key] = [entry if len(entry) > 0 else "None" for entry in stripped]

    graph = ig.Graph(
        n=len(index2ID),
        edges=edgesList,
        directed=True,
        vertex_attrs=vertexAttributes
    )

    # Drop the vertices that are flagged as not being query papers.
    verticesToDelete = [
        index
        for index, value in enumerate(graph.vs["isQueryPaper"])
        if value == "false"
    ]
    if verticesToDelete:
        graph.delete_vertices(verticesToDelete)

    graph.vs["KCore"] = graph.shell_index(mode="IN")
    graph.vs["In-Degree"] = graph.degree(mode="IN")
    graph.vs["Out-Degree"] = graph.degree(mode="OUT")

    if "Paper_year" in graph.vertex_attributes():
        graph.vs["year"] = [int(year) for year in graph.vs["Paper_year"]]
    else:
        # Fall back to the first four characters of the publication date.
        graph.vs["year"] = [int(s[0:4]) for s in graph.vs["Paper_date"]]

    # Communities are detected on an undirected copy of the giant component
    # and stored as string labels on the directed one.
    giantComponent = graph.clusters(mode="WEAK").giant()
    giantCopy = giantComponent.copy()
    giantCopy.to_undirected()
    giantComponent.vs["Community"] = [str(c) for c in giantCopy.community_multilevel().membership]

    # Save and reload so a fresh build returns the same kind of object as a
    # cached load.
    xnet.igraph2xnet(giantComponent, networkPath.absolute())
    return xnet.xnet2igraph(networkPath.absolute())

#### Generating and saving the network

In [73]:
# The first run builds the network and caches it as networks/<queryID>.xnet;
# later runs load that cached file instead of rebuilding it.
g = networkFromQuery(queryID)

#### Getting the citation distribution inside the query

In [74]:
# In-degree of every vertex of g; plotted below as the citations received inside the query.
citations = np.array(g.degree(mode="IN"))

In [75]:
# Log-binned density of the citations received inside the query.
# Papers without citations cannot be placed on a logarithmic axis and are left out.
citedCounts = citations[citations > 0]

fig, ax = plt.subplots()
if citedCounts.size > 0:
    # geomspace returns the end points exactly, so the most cited paper always
    # falls inside the last bin; 10**log10(max) can round to just below max.
    logbins = np.geomspace(1, citedCounts.max(), 20)
    hist, logbins = np.histogram(citedCounts, bins=logbins, density=True)
    # Each point is drawn at the left edge of its bin.
    ax.plot(logbins[:-1], hist, "o")
ax.set_xscale("log")
ax.set_yscale("log")
ax.set_xlabel("Received citations")
ax.set_ylabel("Density")
ax.set_title("Citation distribution inside the query")
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

#### Top papers according to betweenness centrality

In [76]:
# Betweenness of every vertex, tabulated next to the paper title and year and
# ordered from the highest to the lowest value.
betweenessCentrality = g.betweenness()
papersByBetweenness = pd.DataFrame(
    {
        "Title": g.vs["Paper_originalTitle"],
        "Year": g.vs["Paper_year"],
        "Betweenness": betweenessCentrality,
    }
).sort_values(by="Betweenness", ascending=False)

In [77]:
# Show the five papers with the highest betweenness, without truncating long titles.
with pd.option_context("display.max_colwidth", None):
    display(papersByBetweenness.head(5))

Unnamed: 0,Title,Year,Betweenness
1594,Mammalian Rho GTPases: new insights into their functions from in vivo studies.,2008.0,30960.394304
7696,The spindle-assembly checkpoint in space and time.,2007.0,29280.906265
6839,The endocytic pathway: a mosaic of domains,2001.0,21541.40231
5359,ARF proteins: roles in membrane traffic and beyond.,2006.0,20878.975269
2512,"Boveri revisited: chromosomal instability, aneuploidy and tumorigenesis",2009.0,20347.069128
