In [52]:
import csv
import itertools
from collections import Counter
import networkx as nx
import numpy as np

In [7]:
# Paper citation graph, loaded as an undirected graph with integer paper ids as nodes
G = nx.read_edgelist(
    '../data/initial_data/edgelist.txt',
    delimiter=',',
    create_using=nx.Graph(),
    nodetype=int,
)

# Map each paper id to the list of its author ids.
# Each line of the file has the form '<paper_id>|--|<author_id>,<author_id>,...'
paper_authors = {}
with open('../data/processed_data/authors_ids.txt', 'r') as authors_file:
    for raw_line in authors_file:
        paper_id, author_ids = raw_line.rstrip('\n').split('|--|')
        paper_authors[int(paper_id)] = [int(author_id) for author_id in author_ids.split(',')]


In [4]:
# Basic size statistics of the paper citation graph
nodes = list(G.nodes())
n, m = G.number_of_nodes(), G.number_of_edges()
print('Number of nodes:', n)
print('Number of edges:', m)

Number of nodes: 138499
Number of edges: 1091955


## Author collaboration network

We want to create an undirected graph of authors, where two authors are connected by an edge with weight $k$ if there are $k$ papers that they co-authored.

In [56]:
# Collect one (author, author) tuple per pair of co-authors of every paper.
# A pair appears once for each paper the two authors share.
author_collabs = []
for authors in paper_authors.values():
    # itertools.combinations(p, r) yields every r-length tuple of elements of p,
    # following the order of p and never using the same position twice
    # e.g. : list(itertools.combinations('ABC', 2)) >>> [('A', 'B'), ('A', 'C'), ('B', 'C')]
    # extend() consumes the iterator directly, no intermediate list is needed
    author_collabs.extend(itertools.combinations(authors, r=2))

In [119]:
print(len(author_collabs))
# A collaboration is undirected: (author_1, author_2) and (author_2, author_1)
# are the same pair, so every pair is put in ascending order.
# sorted() returns a list, hence the conversion back to a (hashable) tuple.
author_collabs = [tuple(sorted(pair)) for pair in author_collabs]
print(len(author_collabs))

734098
734098


In [120]:
# Count how many times each (author_1, author_2) pair occurs;
# the number of keys is the number of distinct pairs
count_author_collabs = Counter()
count_author_collabs.update(author_collabs)
print(len(count_author_collabs))

529595


In [121]:
# Save the weighted collaboration edges, one per line, as 'author_1,author_2,n_collabs':
# author_1 and author_2 co-authored n_collabs papers
with open("../data/processed_data/author_collab_edgelist.txt", 'w+') as edgelist_file:
    edgelist_file.writelines(
        f"{author_1},{author_2},{weight}\n"
        for (author_1, author_2), weight in count_author_collabs.items()
    )

In [122]:
# Number of authors, taken as the number of lines of id2author.txt
# (counted lazily, without loading the whole file in memory)
with open("../data/processed_data/id2author.txt", 'r') as f:
    n_authors = sum(1 for _ in f)

# Author collaboration graph: undirected, integer author ids as nodes,
# edge attribute 'weight' read as an int from the third column of the edge list
G_author_collab = nx.read_edgelist(
    '../data/processed_data/author_collab_edgelist.txt',
    delimiter=',',
    create_using=nx.Graph(),
    nodetype=int,
    data=(("weight", int),)
)

# Authors who never co-authored a paper have no line in the edge list,
# so they are missing from the graph at this point.
# We pass every author id as parameter: the ones already in the graph are kept
# as they are, the others are added as isolated nodes.
# NOTE(review): assumes author ids are exactly 0 .. n_authors - 1 — confirm against id2author.txt
G_author_collab.add_nodes_from(range(n_authors))

In [123]:
# Node and edge counts of the author collaboration graph
print(G_author_collab.number_of_nodes())
print(G_author_collab.number_of_edges())

149682
529595


## Author citation network

We want to create an undirected graph of authors, where two authors are connected by an edge with weight $k$ if there are $k$ citations between a paper written by one of them and a paper written by the other (in either direction).

In [128]:
# Collect one (author, author) tuple per pair of authors linked by an edge of the paper graph
author_citations = []
for paper_a, paper_b in G.edges():
    # Pair every author of one paper with every author of the other paper.
    # itertools.product(p, q) is the cartesian product (equivalent to a nested for-loop)
    # e.g. : list(itertools.product('AB', 'CD')) >>> [('A', 'C'), ('A', 'D'), ('B', 'C'), ('B', 'D')]
    author_citations.extend(itertools.product(paper_authors[paper_a], paper_authors[paper_b]))

In [130]:
print(len(author_citations))
# A citation link is treated as undirected: (author_1, author_2) and
# (author_2, author_1) are the same pair, so every pair is put in ascending order.
# sorted() returns a list, hence the conversion back to a (hashable) tuple.
author_citations = [tuple(sorted(pair)) for pair in author_citations]
print(len(author_citations))

13355619
13355619


In [131]:
# Count how many times each (author_1, author_2) pair occurs;
# the number of keys is the number of distinct pairs
count_author_citations = Counter()
count_author_citations.update(author_citations)
print(len(count_author_citations))

8050051


In [132]:
# Save the weighted citation edges, one per line, as 'author_1,author_2,n_citations':
# n_citations is the number of times the pair (author_1, author_2) was counted,
# whichever of the two authors cited the other
with open("../data/processed_data/author_citation_edgelist.txt", 'w+') as edgelist_file:
    edgelist_file.writelines(
        f"{author_1},{author_2},{weight}\n"
        for (author_1, author_2), weight in count_author_citations.items()
    )

In [133]:
# Number of authors, taken as the number of lines of id2author.txt
# (counted lazily, without loading the whole file in memory)
with open("../data/processed_data/id2author.txt", 'r') as f:
    n_authors = sum(1 for _ in f)

# Author citation graph: undirected, integer author ids as nodes,
# edge attribute 'weight' read as an int from the third column of the edge list
G_author_citation = nx.read_edgelist(
    '../data/processed_data/author_citation_edgelist.txt',
    delimiter=',',
    create_using=nx.Graph(),
    nodetype=int,
    data=(("weight", int),)
)

# Authors without any citation link have no line in the edge list.
# As for the collaboration graph, we pass every author id as parameter so that
# both graphs share the same node set: the ids already in the graph are kept
# as they are, the others are added as isolated nodes.
# NOTE(review): assumes author ids are exactly 0 .. n_authors - 1 — confirm against id2author.txt
G_author_citation.add_nodes_from(range(n_authors))

In [135]:
# Node and edge counts of the author citation graph
print(G_author_citation.number_of_nodes())
print(G_author_citation.number_of_edges())

149682
8050051
