# AUTHOR & PAPER COLLABORATION NETWORK

The objectives of this notebook are: 
- assign an ID to each author for easier maniplation
- build a dictionnary with authors as keys and the list of their papers as values. 
- buil an author collaboration graph where nodes represent authors and edges the number of time they co-authored a paper (no node if they never co-authored)

In [1]:
import os
import itertools
import numpy as np

import paths # script with all data paths

## 1. Processing of the authors

We will encode the authors names into identificators so that we can easily manipulate them.

In [2]:
if not os.path.isfile(paths.ID_2_AUTHOR_PATH) or \
    not os.path.isfile(paths.PAPER_2_AUTHORS_ID_PATH):
    
    all_authors = set() # set of all the authors in the dataset
    paper_authors = dict() # dictionnary  {paper : list of authors names}
    with open(paths.AUTHORS_PATH, 'r') as f:
        for line in f:
            paper, authors = line.rstrip('\n').split('|--|')
            authors = authors.split(',')
            paper_authors[int(paper)] = authors
            all_authors |= set(authors) # '|' is the union operator


    id2author = dict() # dictionnary {author id : author name}
    author2id = dict() # dictionnary {author name : author id}
    for i, author in enumerate(all_authors):
        id2author[i] = author
        author2id[author] = i

    # Save the (id,name) pairs as a text file
    with open(paths.ID_2_AUTHOR_PATH, 'w+') as f: 
        for id_, author in id2author.items(): 
            f.write(f"{id_},{author}\n") 

    # Create a new file similar to authors.txt but with authors ids instead of their names
    with open(paths.PAPER_2_AUTHORS_ID_PATH, 'w+') as f: 
        for paper, authors in paper_authors.items():
            authors_id = list(map(author2id.get, authors))
            authors_id = list(map(str, authors_id))
            f.write(f"{paper}|--|{','.join(authors_id)}\n")
else:
    print("The files already exist !")

The files already exist !


## 2. Get all the papers of each author

In [3]:
if not os.path.isfile(paths.AUTHOR_ID_2_PAPERS_PATH):
    # {paper : authors ids} dict
    paper_authors = dict()
    with open(paths.PAPER_2_AUTHORS_ID_PATH, 'r') as f:
        for line in f:
            paper, co_authors = line.rstrip('\n').split('|--|')
            paper_authors[int(paper)] = list(map(int,co_authors.split(',')))

    # Build {author id : papers} 
    author_papers = dict()
    for paper, authors in paper_authors.items():
        for author in authors:
            if author in author_papers:
                author_papers[author] += [paper]
            else:
                author_papers[author] = [paper]

    # Create a new file with each line as "author|--|paper1,paper2,..."
    with open(paths.AUTHOR_ID_2_PAPERS_PATH, 'w+') as f: 
        for author, papers in author_papers.items():
            papers = list(map(str, papers))
            f.write(f"{author}|--|{','.join(papers)}\n")
else:
    print("The file already exists !")
    # We just read the file
    author_papers = dict()
    with open(paths.AUTHOR_ID_2_PAPERS_PATH, 'r') as f:
        for line in f:
            author, papers = line.rstrip('\n').split('|--|')
            author_papers[int(author)] = list(map(int,papers.split(',')))

The file already exists !


## 3. Building the author collaboration network

We want to create an undirected graph of authors, where two authors are connected by an edge with weight $k$ if there are $k$ papers that they co-authored.

We first build the adjacency matrix of this graph as a weighted collaboration matrix $W \in \mathbb{R}^{n\times n}$ (n := number of authors) such that for two authors $i$ and $j$:

$$
W_{ij} = \sum_{p \in papers} \frac{\delta^p_i \delta^p_j}{n_p - 1} 
\quad \text{if} \quad  i \neq j \quad\quad \text{and} \quad\quad
W_{ii} = 0 
$$

where $n_p$ is the number of authors of paper $p$ and $\delta^p_i$ = $\mathbf{1}$($i \in$ {authors of $p$}) 

In [4]:
if not os.path.isfile(paths.AUTHORS_EDGELIST_PATH):
    # {paper : authors ids} dict
    paper_authors = dict()
    with open(paths.PAPER_2_AUTHORS_ID_PATH, 'r') as f:
        for line in f:
            paper, co_authors = line.rstrip('\n').split('|--|')
            paper_authors[int(paper)] = list(map(int,co_authors.split(',')))

    # Total number of authors
    with open(paths.ID_2_AUTHOR_PATH, 'r') as f:
        n_authors = len(f.readlines())
    print("Number of authors:", n_authors)

    # Adjacency matrix of our future graph
    author_collab_weights = np.zeros((n_authors, n_authors))

    all_author_collabs = set()
    for paper in paper_authors:
        # Create tuples of author collaborations for one paper
        # itertools.combinations(p, r) creates r-length tuples, in sorted order, no repeated elements
        # e.g. : list(itertools.combinations('ABC', 2)) >>> [('A', 'B'), ('A', 'B'), ('B', 'C')] 
        authors = paper_authors[paper]
        author_collabs = list(itertools.combinations(authors, r=2))
        all_author_collabs |= set(author_collabs)
        for author_1, author_2 in author_collabs:
            author_collab_weights[author_1, author_2] += 1/(len(authors)-1)
            author_collab_weights[author_2, author_1] += 1/(len(authors)-1)


    print("# of collabs before the sort:", len(all_author_collabs))
    # We sort each pair of collab because we consider  that 
    # a collab (author_1, author_2) is the same as a collab (author_2, author_1)
    all_author_collabs = list(map(sorted, all_author_collabs))
    # The result of sorted is a list so we put it back as a tuple
    all_author_collabs = set(map(tuple, all_author_collabs))
    print("# of collabs after the sort:", len(all_author_collabs))


    # Write the collaborations in a file where each line 'author_1,author_2,n_collabs'
    # means that author_1 and author_2 co-authored n_collabs papers
    # NOTE: the graph will not contains authors that never collaborated with anyone
    with open(paths.AUTHORS_EDGELIST_PATH, 'w+') as f:
        for (author_1, author_2) in all_author_collabs:
            weight = author_collab_weights[author_1, author_2]
            f.write(f"{author_1},{author_2},{round(weight,2)}\n")
else:
    print("The file already exist !")

The file already exist !
