# Part 3: Graph features

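In this part we use the abstract embeddings to build features for the nodes of the graph: each author's feature vector is the sum of the embeddings of the abstracts of their papers, stored together with the author's paper count.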

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use("classic")
import operator
import functools
import itertools
import dataclasses
import collections
import pickle
import scipy.sparse as sp
from tqdm.autonotebook import tqdm



In [9]:
@dataclasses.dataclass
class ScriptParams():
    """File paths shared by the cells of this notebook.

    Every attribute carries a type annotation so that ``dataclasses`` registers
    it as a real field: without annotations the decorator sees no fields at
    all and the values are only plain class attributes. With fields in place,
    any path can be overridden per instance, e.g.
    ``ScriptParams(embed_p="other.npy")``, while ``ScriptParams().embed_p``
    keeps returning the default.
    """
    embed_p: str = "abstract_embeddings.npy"  # saved abstract embedding path.
    stop_words_p: str = "stopwords.txt"  # not used by the cells visible here
    first_pass_p: str = "abstracts_p1.txt"  # not used by the cells visible here
    # ".npz" is appended to this path when the paper index is loaded.
    second_pass_p: str = "abstracts_p2.txt"
    author_paper_pkl: str = "author_papers.pkl"  # pickled author -> papers data
    author_index_pkl: str = "author_index.pkl"  # not used by the cells visible here
    adjacency_p: str = "adj.txt"  # not used by the cells visible here
    authors_p: str = "authors.txt"  # not used by the cells visible here
    author_emb_p: str = "authors_emb.txt"  # target of the final np.savez call

In [58]:
def _sum_embeddings_per_author(author_papers, abstracts_embeddings, index_paper):
  """Sum, for every author, the embeddings of that author's papers.

  parameters:
  -----
  author_papers: mapping whose values are sequences of paper ids (one entry per author).
  abstracts_embeddings: array of shape (n_abstracts, n_h), one row per embedded abstract.
  index_paper: sequence of paper ids; position i names row i of abstracts_embeddings.

  returns:
  -----
  author features: array of shape (n_authors, n_h), rows in the iteration order of author_papers.
  n_papers: array of shape (n_authors,), number of papers listed for each author.
  """
  n_h = abstracts_embeddings.shape[1]
  if len(author_papers) == 0:
    # np.concatenate refuses an empty list, so answer the degenerate case directly.
    return np.zeros((0, n_h)), np.zeros(0, dtype=int)

  # Extra all-zero row at the end: index -1 selects it, so a paper without an
  # embedding contributes nothing to its author's sum.
  padded = np.concatenate([abstracts_embeddings, np.zeros((1, n_h))], axis=0)

  # Plain dict + .get(): unlike defaultdict.__getitem__, a miss does not insert
  # the unknown paper id into the table.
  row_of_paper = dict(zip(index_paper, range(len(index_paper))))
  # otypes is given explicitly so the lookup also works when no author has any paper.
  to_idx = np.vectorize(lambda paper_id: row_of_paper.get(paper_id, -1), otypes=[np.intp])

  papers = list(author_papers.values())
  offsets = np.cumsum([len(p) for p in papers])  # end position of each author's papers in the flat list
  idxs = to_idx(np.concatenate(papers))  # embedding row of every paper, all authors flattened
  per_author = np.split(padded[idxs], offsets[:-1])  # split back into one chunk per author
  new_vals = np.array([chunk.sum(axis=0) for chunk in per_author])
  return new_vals, np.diff(offsets, prepend=0)


def abstract_to_author():
  """
  build author features from abstract features,
  simply by summing the embeddings of each author's papers.

  Reads three files whose paths come from ScriptParams: the pickled
  author -> papers data, the abstract embedding matrix, and the ".npz"
  archive (second_pass_p + ".npz") whose "authors" entry gives the paper id
  of each embedding row.

  returns:
  -----
  author features: array: author_idx to its embedding obtained by summing its paper embeddings.
  n_papers: array: author_idx to its number of published papers.
  """
  params = ScriptParams()

  # NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
  with open(params.author_paper_pkl, "rb") as f:
    author_papers = pickle.load(f)
  print("author_paper loaded.")

  abstracts_embeddings = np.load(params.embed_p)
  print("abstract_embedding loaded.")

  # load the previously computed text data
  with np.load(params.second_pass_p+".npz", allow_pickle=True) as data:
    index_paper = data["authors"]
  print("index_author loaded.")

  return _sum_embeddings_per_author(author_papers, abstracts_embeddings, index_paper)

In [59]:
# Build the per-author features: summed paper embeddings and the paper count of each author.
author_embs, authors_npapers = abstract_to_author()

author_paper loaded.
abstract_embedding loaded.
index_author loaded.
(231239,)
231239
231239


In [66]:
# Persist both arrays in one archive under the keys "author_embs" and "author_npaper".
# np.savez appends ".npz" when the file name does not already end with it, so the
# archive is written to author_emb_p + ".npz".
np.savez(ScriptParams().author_emb_p, author_embs = author_embs, author_npaper = authors_npapers)