### Tutorial 3 - Communities and text data
 - Loading a citation network
 - Use the Bardosova method to extract topics from the network communities
 - Plot word clouds for each community

#### Packages needed

In [22]:
!pip install wordcloud
from tqdm.auto import tqdm
import pandas as pd
import xnetwork as xnet
import numpy as np
import igraph as ig
import sys
import os
import math
import random
from pathlib import Path
import matplotlib.pyplot as plt
import matplotlib as mpl
from wordcloud import WordCloud

%matplotlib widget



### Set the query ID here

In [9]:
# Identifier of the query whose results are loaded below. IDs that start with
# "DEMO_" are read from the bundled demo queries folder (see networkFromQuery).
# queryID = "DEMO_NatureJournals"

queryID = "DEMO_ComplexNetworkField"
# or use the queryID provided by a query, such as
# queryID = "Nature_0e367ff2-8670-454b-9b31-ca4d0d85fe96"

#### Setting up some folders and paths

In [10]:
# Output folders: cached networks and exported figures.
for folderName in ("networks", "figures"):
    os.makedirs(folderName, exist_ok=True)

#### Fields available for MAG queries

In [11]:
# Column name -> dtype used when reading the MAG query results with pandas.
# Identifiers and free-text fields are read as strings; counts and the year are
# read as floats (which lets pandas represent missing values as NaN).
MAGColumnTypes = {
    # paper identifier
    "Paper_paperId": int,
    # affiliation / author
    "Affiliation_displayName": str,
    "Author_authorId": str,
    "Author_rank": str,
    "Author_normalizedName": str,
    "Author_lastKnownAffiliationId": str,
    # venue and bibliographic fields
    "Paper_bookTitle": str,
    "ConferenceInstance_conferenceInstanceId": str,
    "Paper_date": str,
    "Paper_docType": str,
    "Paper_doi": str,
    "FieldOfStudy_fieldOfStudyId": str,
    "Paper_issue": str,
    "Journal_normalizedName": str,
    "Journal_issn": str,
    "Journal_publisher": str,
    "Paper_originalTitle": str,
    # numeric fields
    "Paper_citationCount": np.float64,
    "Paper_estimatedCitation": np.float64,
    "Paper_firstPage": str,
    "Paper_lastPage": str,
    "Paper_publisher": str,
    "Paper_referenceCount": np.float64,
    "Paper_paperTitle": str,
    "Paper_year": np.float64,
    # flag column, kept as the raw string found in the file
    "isQueryPaper": str,
}

### Function to build a citation network from the data
This function also performs some network calculations and detects communities.

In [12]:
def networkFromQuery(queryID):
    """
    Build the citation network for a query, or load it from the cache.

    If ``networks/<queryID>.xnet`` already exists it is loaded and returned
    directly. Otherwise the node and edge CSV files of the query are read,
    a directed igraph graph is built, vertices whose ``isQueryPaper`` value is
    ``"false"`` are removed, a few vertex measurements are added
    (``KCore``, ``In-Degree``, ``Out-Degree``, ``year``), the giant weakly
    connected component is extracted, communities are detected on its
    undirected copy with the multilevel method, and the result is saved to
    ``networks/<queryID>.xnet``.

    Parameters
    ----------
    queryID : str
        Query identifier. When it starts with ``"DEMO_"`` the CSV files are
        read from ``./demo_queries/`` (with the prefix removed from the file
        name); otherwise they are read from ``../query-results/``.

    Returns
    -------
    igraph.Graph
        The network as read back from the ``.xnet`` file.

    Notes
    -----
    As a side effect the intermediate objects ``edgesData``, ``nodesData``,
    ``vertexAttributes``, ``index2ID`` and ``graph`` are published as
    module-level globals so they can be inspected from other cells.
    """
    networkPath = Path("networks")/("%s.xnet"%queryID)
    if networkPath.is_file():
        return xnet.xnet2igraph(networkPath.absolute())

    demoPrefix = "DEMO_"
    if queryID.startswith(demoPrefix):
        # Demo queries are loaded from the demo folder instead.
        queryResultsPath = Path("./demo_queries/")
        # Strip only the leading prefix; str.replace would also remove any
        # later occurrence of "DEMO_" inside the identifier.
        queryID = queryID[len(demoPrefix):]
    else:
        queryResultsPath = Path("../query-results/")

    nodes_file = queryResultsPath/("%s.csv"%queryID)
    edges_file = queryResultsPath/("%s_edges.csv"%queryID)

    # Kept for backward compatibility: other cells may inspect these objects.
    global edgesData,nodesData,vertexAttributes,index2ID,graph
    edgesData = pd.read_csv(edges_file)
    nodesData = pd.read_csv(nodes_file, dtype=MAGColumnTypes).dropna(subset=["Paper_year"])

    # Replace NaN with the empty string. The result is assigned back to the
    # column: a chained ``nodesData[key].fillna(..., inplace=True)`` does not
    # update the frame when pandas Copy-on-Write is active.
    for key in MAGColumnTypes:
        if key in nodesData:
            nodesData[key] = nodesData[key].fillna("")

    # Continuous indices for the papers: position in index2ID <-> paper ID.
    index2ID = nodesData["Paper_paperId"].tolist()
    ID2Index = {paperID:index for index, paperID in enumerate(index2ID)}

    # Column names of the edges file.
    fromKey = "From (Citing)"
    toKey = "To (Cited)"

    # Convert edges from paper IDs to vertex indices, dropping edges with an
    # endpoint that is not in the node table. Each edge is stored inverted,
    # i.e. as (cited index, citing index).
    edgesZip = zip(edgesData[fromKey].tolist(),edgesData[toKey].tolist())
    edgesList = [(ID2Index[toID],ID2Index[fromID]) for fromID,toID in edgesZip if fromID in ID2Index and toID in ID2Index]

    vertexAttributes = {key:nodesData[key].tolist() for key in nodesData if key in MAGColumnTypes}

    for key,data in vertexAttributes.items():
        # A column is treated as textual when its first entry is a string;
        # the length check avoids an IndexError on an empty node table.
        if len(data)>0 and isinstance(data[0],str):
            # Strip surrounding brackets and mark empty entries as "None".
            strippedEntries = [entry.strip("[]") for entry in data]
            vertexAttributes[key] = [sEntry if len(sEntry)>0 else "None" for sEntry in strippedEntries]

    graph = ig.Graph(
        n=len(index2ID),
        edges=edgesList,
        directed=True,
        vertex_attrs=vertexAttributes
    )

    # Keep only the papers that belong to the query itself (when the flag
    # column is available in the data).
    if "isQueryPaper" in graph.vertex_attributes():
        verticesToDelete = [index for index,value in enumerate(graph.vs["isQueryPaper"]) if value=="false"]
        graph.delete_vertices(verticesToDelete)

    graph.vs["KCore"] = graph.shell_index(mode="IN")
    graph.vs["In-Degree"] = graph.degree(mode="IN")
    graph.vs["Out-Degree"] = graph.degree(mode="OUT")

    if "Paper_year" in graph.vertex_attributes():
        graph.vs["year"] = [int(year) for year in graph.vs["Paper_year"]]
    else:
        # Defensive fallback: take the first four characters of the date.
        # NOTE(review): assumes Paper_date starts with a 4-digit year - confirm.
        graph.vs["year"] = [int(s[0:4]) for s in graph.vs["Paper_date"]]

    # Communities are detected on an undirected copy of the giant weakly
    # connected component and stored (as strings) on the directed one.
    giantComponent = graph.clusters(mode="WEAK").giant()
    giantCopy = giantComponent.copy()
    giantCopy.to_undirected()
    giantComponent.vs["Community"] = [str(c) for c in giantCopy.community_multilevel().membership]

    # Save and reload, so a fresh build returns the same thing as a cache hit.
    xnet.igraph2xnet(giantComponent, networkPath.absolute())
    return xnet.xnet2igraph(networkPath.absolute())

#### Generating and saving the network

In [13]:
# Build the citation network for the selected query (or load the cached
# networks/<queryID>.xnet file when it already exists).
g = networkFromQuery(queryID)

In [14]:
def lighten_color(color, amount=0.5):
    """
    Lightens the given color by multiplying (1-luminosity) by the given amount.
    Input can be matplotlib color string, hex string, or RGB tuple.

    The new lightness is ``1 - amount * (1 - lightness)``, clamped to the
    range [0, 1] so that the returned RGB components are always valid, even
    for ``amount`` values greater than 1. Hue and saturation are unchanged.

    Parameters
    ----------
    color : str or tuple
        Any color specification accepted by ``matplotlib.colors.to_rgb``.
    amount : float
        0 gives white, 1 returns the original color, values above 1 darken.

    Returns
    -------
    tuple of float
        The ``(r, g, b)`` components, each in [0, 1].

    Examples:
    >> lighten_color('g', 0.3)
    >> lighten_color('#F034A3', 0.6)
    >> lighten_color((.3,.55,.1), 0.5)
    """
    import matplotlib.colors as mc
    import colorsys
    try:
        c = mc.cnames[color]
    except (KeyError, TypeError):
        # KeyError: not a named color (e.g. a hex string or an RGB tuple).
        # TypeError: unhashable specification such as a list.
        c = color
    hue, lightness, saturation = colorsys.rgb_to_hls(*mc.to_rgb(c))
    newLightness = min(1.0, max(0.0, 1 - amount * (1 - lightness)))
    return colorsys.hls_to_rgb(hue, newLightness, saturation)


def generateColorFunction(originalColor):
    """
    Create a WordCloud ``color_func`` that draws words in shades of one color.

    The returned function lightens ``originalColor`` by an amount that grows
    with the font size and includes a random component, and returns the result
    as an ``(r, g, b)`` tuple of integers in the 0-255 range.

    The random component is drawn from the ``random_state`` object handed to
    the color function when it provides a ``random()`` method, so a word cloud
    created with a fixed ``random_state`` gets reproducible colors. Otherwise
    the global ``random`` module is used.
    """
    def lighten_color_func(word, font_size, position, orientation, random_state=None,**kwargs):
        # Fall back to the module-level generator when no usable state is given.
        drawRandom = getattr(random_state, "random", random.random)
        c = lighten_color(originalColor,font_size/200*0.6+0.2+0.4*drawRandom())
        return (int(c[0]*255),int(c[1]*255),int(c[2]*255))
    return lighten_color_func


def sortByFrequency(arr):
    """
    Return the distinct values of ``arr`` ordered from most to least frequent.

    Values with the same number of occurrences keep the order in which they
    first appear in ``arr``. An empty input gives an empty list.
    """
    from collections import Counter
    # Counter keeps keys in first-appearance order and sorted() is stable,
    # so ties are resolved by first appearance without calling arr.index().
    counts = Counter(arr)
    return sorted(counts, key=lambda value: -counts[value])
 


# Palette used for the communities, in order of community size.
_styleColors = [
    "#1f77b4","#ff7f0e","#2ca02c","#d62728","#9467bd",
    "#8c564b","#e377c2","#7f7f7f","#bcbd22","#17becf",
    "#aec7e8","#ffbb78","#98df8a","#ff9896","#c5b0d5",
    "#c49c94","#f7b6d2","#c7c7c7","#dbdb8d","#9edae5",
]

maxInternalWords = 10000  # word limit for each community word cloud
maxAllWords = 10000       # word limit passed to the WordCloud that counts words over all titles
maxCommunities = 6        # only the largest communities are plotted

# Largest communities first.
communities = g.vs["Community"]
sortedCommunities = sortByFrequency(communities)[0:maxCommunities]

# Two word clouds per row.
gridRows = math.ceil(len(sortedCommunities)/2)
fig = plt.figure(figsize=(10,2.5*gridRows))

# Word counts over the titles of the whole network (the baseline).
allTitles = "\n".join(g.vs["Paper_originalTitle"])
allFrequencies = WordCloud(max_words=maxAllWords).process_text(allTitles)

# Mask with a 10-pixel border set to 255 and the interior set to 0.
amask = np.full((500,1000),255,dtype='B')
amask[10:-10,10:-10] = 0

for index,community in enumerate(sortedCommunities):
    if index<len(_styleColors):
        communityColor = _styleColors[index]
    else:
        communityColor = "#aaaaaa"

    # Titles of the papers that belong to this community.
    titles = "\n".join([
        title
        for title,membership in zip(g.vs["Paper_originalTitle"],communities)
        if membership==community
    ])

    ax = fig.add_subplot(gridRows,2,index+1)
    wc = WordCloud(
        background_color="white",
        max_words=maxInternalWords,
        width=1000,
        height=500,
        mask=amask,
        contour_width=10,
        contour_color=communityColor,
        random_state=3,
        color_func=generateColorFunction(communityColor),
    )
    totalTitles = len(titles.split("\n"))
    inCommunityFrequency = wc.process_text(titles)

    # Score of a word: its count per community title divided by (1 + its
    # count outside the community). Only words whose overall count is larger
    # than the in-community count are kept.
    relativeFrequencies = {
        key:frequency/totalTitles/(allFrequencies[key]-frequency+1)
        for key,frequency in inCommunityFrequency.items()
        if key in allFrequencies and allFrequencies[key]>frequency
    }
    wc.generate_from_frequencies(relativeFrequencies)

    ax.imshow(wc, interpolation='bilinear')
    ax.axis("off")

fig.tight_layout()
# plt.savefig("wordcloud.pdf")
plt.show()
# plt.close(fig)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [23]:
# # color communities
# from collections import Counter
# community2Index = {community:index for index,(community,_) in enumerate(Counter(g.vs["Community"]).most_common(10))}
# communityColors = [mpl.cm.tab10(community2Index[community]) if community in community2Index else "#888888" for community in g.vs["Community"]]
# # Size of node changes with degree
# node_degrees = np.array(g.degree())
# ig.plot(g,
#     layout="lgl", # lgl, drl, davidson_harel, circle, kamada_kawai, fruchterman_reingold, graphopt
#     vertex_size=5,
#     vertex_color=communityColors,
#     edge_arrow_size=0,
#     )