In [None]:
import yaml
import numpy as np
import pandas as pd
import pickle
from tqdm import tqdm
import networkx as nx
import torch
import json
import torch_geometric
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer

# Read config YAML

In [None]:
# Load the run configuration; later cells read file paths and the TF-IDF
# threshold from this dict.
config_path = "../dblp.yaml"
with open(config_path) as config_file:
    config = yaml.safe_load(config_file)

# Parse DBLP pickle file
The DBLP file contains a list of papers, each with its authors, title, and abstract. First, we gather the papers and the collaborators of each author.

In [None]:
# Deserialize the DBLP dump. The parsing loops iterate over it and read the
# 'authors', 'title' and 'abstract' keys of every record.
# NOTE: pickle.load can execute arbitrary code -- only point this at a trusted file.
with open(config['dblp_pickle'], "rb") as dblp_file:
    dblp_data = pickle.load(dblp_file)

In [None]:
# author name -> integer id, id -> list of paper dicts, id -> list of co-author ids
author_id = {}
author_papers = {}
author_colabs = {}

# Pass 1: give every (whitespace-stripped) author name an id the first time it
# is seen, and file the paper's title/abstract under each of its authors.
for record in tqdm(dblp_data):
    for raw_name in record['authors']:
        name = raw_name.strip()
        # len(author_id) is evaluated before insertion, so ids run 0, 1, 2, ...
        idx = author_id.setdefault(name, len(author_id))
        author_papers.setdefault(idx, []).append(
            {"title": record['title'], "abstract": record["abstract"]}
        )

# Pass 2: for every paper, append the ids of all other authors of that paper
# to each author's collaborator list. Repeats are kept (one entry per shared
# paper), so the list can later be counted to measure collaboration frequency.
for record in tqdm(dblp_data):
    member_ids = [author_id[raw_name.strip()] for raw_name in record['authors']]
    for aid in member_ids:
        author_colabs.setdefault(aid, []).extend(
            oid for oid in member_ids if oid != aid
        )

## Data Cleaning

We filter the authors using a list of authors from previous work.

We keep collaborations that have been repeated at least 5 times. We also keep only authors who have at least 10 papers and at least 3 collaborators.

In [None]:
# NOTE: this cell overwrites `author_id` and `author_colabs` in place, so it is
# not idempotent -- re-run the parsing cell before running it a second time.

# Curated list of author names to keep.
# NOTE: pickle.load can execute arbitrary code -- only load a trusted file.
with open(config['good_authors'], "rb") as f:
    good_authors = pickle.load(f)

min_collab_repeats = 5   # a collaborator is kept only if seen MORE than this many times
min_collaborators = 3    # minimum number of retained collaborators per author
min_papers = 10          # minimum number of papers per author

# For every author, keep only the collaborators they co-authored with
# repeatedly. The filtered list must be stored under the id of the author
# being processed (writing it under a leftover `aid` from an earlier loop
# would leave every author's list unfiltered).
# NOTE(review): the notebook text says "at least 5 times" but the comparison
# is strict (> 5); confirm which one is intended.
for aid in author_id.values():
    repeat_counts = Counter(author_colabs[aid])
    author_colabs[aid] = [
        other for other, cnt in repeat_counts.items() if cnt > min_collab_repeats
    ]

# Restrict to the curated authors that still have enough frequent collaborators.
# A name in good_authors that is missing from author_id raises KeyError.
author_id = {
    name: author_id[name]
    for name in good_authors
    if len(author_colabs[author_id[name]]) >= min_collaborators
}

author_colabs = {aid: author_colabs[aid] for aid in author_id.values()}

# Reverse lookup (id -> name) and the final list of author ids with enough papers.
id_author = {v: k for k, v in author_id.items()}
filtered_authors = [aid for aid in author_id.values() if len(author_papers[aid]) >= min_papers]
len(filtered_authors)

## Keyword Extraction

This section builds the universal skill set $S$ by extracting keywords from paper titles with KeyBERT. If this set already exists in the dataset, this section can be skipped.

In [None]:
from keybert import KeyBERT
import nltk
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Keyword extractor backed by the 'allenai-specter' embedding model.
keybert_model = KeyBERT(model='allenai-specter')

# Fetch the NLTK resources requested by this notebook, in the same order.
for nltk_resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(nltk_resource)

In [None]:
# Titles of all papers of the retained authors. A paper shared by several
# retained authors contributes its title once per author.
total_text = [
    paper['title']
    for author in filtered_authors
    for paper in author_papers[author]
]

# Extract the top keywords of every title, then store the result on disk so
# the following cell can reload it.
number_of_keywords_per_paper = 5
paper_keywords = keybert_model.extract_keywords(total_text, top_n=number_of_keywords_per_paper)

with open(config['paper_keywords_path'], "wb") as out_file:
    pickle.dump(paper_keywords, out_file)

In [None]:
# Reload the stored per-paper keyword lists.
with open(config['paper_keywords_path'], "rb") as in_file:
    paper_keywords = pickle.load(in_file)

# Count how often each lemmatized keyword occurs over all papers.
# Element [0] of every keyword entry is the keyword text that gets lemmatized.
lemmatizer = WordNetLemmatizer()
counter = Counter(
    lemmatizer.lemmatize(kw[0])
    for keywords in tqdm(paper_keywords)
    for kw in keywords
)

# The most frequent lemmas form the candidate skill vocabulary.
total_skills_count = 2000
all_skills = [skill for skill, _ in counter.most_common(total_skills_count)]

# Find skills of each expert

In this section, we concatenate all the text (paper titles and abstracts) of each author and compute TF-IDF values over the skill vocabulary for each author's text. We then assign a skill to an expert when the corresponding TF-IDF value exceeds a predefined threshold.

In [None]:
# One document per retained author: all of their titles and abstracts,
# joined with single spaces.
corpus = [
    ' '.join(paper['title'] + " " + paper['abstract'] for paper in author_papers[author_idx])
    for author_idx in tqdm(filtered_authors)
]

# TF-IDF restricted to the skill vocabulary; row r of the dense matrix belongs
# to filtered_authors[r].
vectorizer = TfidfVectorizer(vocabulary=all_skills)
tfidf_vecs = vectorizer.fit_transform(corpus).toarray()
tfidf_threshold = config['tfidf_threshold']

# Binary skill vector per author: 1 where the TF-IDF weight is strictly above
# the threshold, 0 otherwise (stored as a plain list of ints).
expert_skills = {}
for row, author_idx in zip(tqdm(tfidf_vecs), filtered_authors):
    expert_skills[author_idx] = (row > tfidf_threshold).astype(int).tolist()

Then, we eliminate skills that are too rare or too common from the universal skill set.

In [None]:
# skill -> list of authors whose skill vector has that skill switched on.
skill_author = {}
for ind, skill in enumerate(tqdm(all_skills)):
    skill_author[skill] = [
        author for author, flags in expert_skills.items() if flags[ind] > 0
    ]

skill_count = [(skill, len(holders)) for skill, holders in skill_author.items()]

# Keep skills held by more than 20 and fewer than 2200 experts (both bounds
# exclusive), remembering each surviving skill's position in all_skills.
good_skills = [
    (skill, all_skills.index(skill))
    for skill, count in skill_count
    if 20 < count < 2200
]
good_skills_indices = [position for _, position in good_skills]

# Shrink the vocabulary and every expert's vector to the surviving skills.
# NOTE: all_skills and expert_skills are overwritten, so this cell must not be
# re-run without first re-running the cells that build them.
all_skills = [all_skills[i] for i in good_skills_indices]
for author in tqdm(expert_skills):
    expert_skills[author] = [expert_skills[author][i] for i in good_skills_indices]

# Create Collaboration network

In [None]:
# Edge list: one (author, collaborator) pair for every retained collaboration
# whose two endpoints both have a skill vector.
edges = [
    (i, j)
    for i in tqdm(expert_skills)
    for j in author_colabs[i]
    if j in expert_skills
]

In [None]:
# Undirected collaboration graph built from the edge list. Only authors that
# appear in at least one edge become nodes.
g = nx.Graph(edges)
# Attach each node's binary skill vector under the attribute name "x".
nx.set_node_attributes(g, values=expert_skills, name="x")

# Save collaboration network to files

In [None]:
# Pickle every artifact to the path configured under config['saving_paths'].
# NOTE(review): `vectorizer` was fitted with the skill vocabulary as it was
# before rare/common skills were removed, so its output columns do not line up
# with the saved `all_skills` -- confirm downstream code accounts for this.
artifacts = {
    'graph': g,
    'authors_id': author_id,
    'all_skills': all_skills,
    'vectorizer': vectorizer,
}
for path_key, artifact in artifacts.items():
    with open(config['saving_paths'][path_key], "wb") as out_file:
        pickle.dump(artifact, out_file)