In [1]:
import urllib
import os
import requests
import json

from collections import defaultdict, namedtuple
import urllib.request

import numpy as np
import pandas as pd

import jsonlines
import json
import time
from tqdm import tqdm
from numpy import dot
from numpy.linalg import norm
import statistics

In [2]:
def get_paper_by_ids(paperIds, timeout=60):
    """Fetch citation count, title and corpus id for a batch of papers.

    Posts ``paperIds`` to the Semantic Scholar Graph API batch endpoint.

    Parameters
    ----------
    paperIds : list
        Paper identifiers, sent as the ``ids`` field of the JSON request body.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    The decoded JSON body of the response.  Callers in this notebook treat it
    as a list of per-paper dicts in which an entry may be ``None``.

    Raises
    ------
    requests.HTTPError
        If the API answers with a 4xx/5xx status (for example when rate
        limited).  Without this check the error payload would be returned as
        if it were the paper list and only fail much later downstream.
    """
    r = requests.post(
        'https://api.semanticscholar.org/graph/v1/paper/batch',
        params={'fields': 'citationCount,title,corpusId'},
        json={"ids": paperIds},
        timeout=timeout,
    )
    # Fail fast on error responses instead of handing an error dict to callers.
    r.raise_for_status()
    return r.json()

In [3]:
# Helpers from https://github.com/kwchurch/JSALT_Better_Together/blob/main/src/create_rotation_matrix.py

def record_size_from_dir(dir):
    """Read the record size stored in ``<dir>/record_size``.

    The file's text is cut at the first tab character (if any) and the leading
    field is parsed as an integer.
    """
    size_path = dir + '/record_size'
    with open(size_path, 'r') as handle:
        contents = handle.read()
    # Everything before the first tab; the whole text when there is no tab.
    leading_field = contents.partition('\t')[0]
    return int(leading_field)

def map_from_dir(dir):
    """Memory-map ``<dir>/map.old_to_new.i`` as a read-only 1-D int32 array.

    The number of entries is derived from the file size, at 4 bytes per entry.
    """
    map_path = dir + '/map.old_to_new.i'
    n_entries = int(os.path.getsize(map_path) / 4)
    return np.memmap(map_path, mode='r', dtype=np.int32, shape=(n_entries))

def embedding_from_dir(dir, K):
    """Memory-map ``<dir>/embedding.f`` as a read-only float32 matrix.

    The result has ``K`` columns; the row count is derived from the file size,
    at 4 bytes per value.
    """
    emb_path = dir + '/embedding.f'
    bytes_per_row = 4 * K
    n_rows = int(os.path.getsize(emb_path) / bytes_per_row)
    return np.memmap(emb_path, mode='r', dtype=np.float32, shape=(n_rows, K))

def directory_to_config(dir):
    """Bundle everything loaded from one embedding directory into a dict.

    Keys: ``'record_size'`` (value read from the directory), ``'dir'`` (the
    directory itself), ``'map'`` (memory-mapped id map) and ``'embedding'``
    (memory-mapped matrix with ``record_size`` columns).
    """
    width = record_size_from_dir(dir)
    config = {'record_size': width, 'dir': dir}
    config['map'] = map_from_dir(dir)
    config['embedding'] = embedding_from_dir(dir, width)
    return config

In [4]:
# Embedding directories passed to directory_to_config below. Each one must
# contain the files `record_size`, `map.old_to_new.i` and `embedding.f`.
# NOTE(review): absolute paths on a mounted volume; adjust for your machine.
specter = '/Volumes/kwc_JSALT/JSALTdir/semantic_scholar/embeddings/specter'
proposed = '/Volumes/kwc_JSALT/JSALTdir/semantic_scholar/embeddings/proposed'

In [5]:
# Load one config dict per embedding directory, in the order (specter, proposed).
inputs = specter, proposed

configs = [directory_to_config(d) for d in inputs]

# Index 0 -> specter: id map and embedding matrix (both memory-mapped).
map0 = configs[0]['map']
emb0 = configs[0]['embedding']

# Index 1 -> proposed: id map and embedding matrix (both memory-mapped).
map1 = configs[1]['map']
emb1 = configs[1]['embedding']

In [6]:
# Spot check: show the value stored at index 8 of the proposed id map.
print(map1[8])

5


In [7]:
# Time one lookup in each embedding table. The id map is indexed by 278919 and
# the value found there selects the embedding row; the printed lengths are the
# widths of the proposed and specter vectors respectively.
t0 = time.time() # measure wall time
print(len(emb1[map1[278919]]))
print(len(emb0[map0[278919]]))
print(format(time.time() - t0, ".3f"), " seconds wall time to fetch specter & proposed.")

280
768
0.068  seconds wall time to fetch specter & proposed.


In [8]:
# Base directory for every file this notebook reads or writes; the functions
# below build file names as PATH + <base name> + <suffix/extension>.
# NOTE(review): absolute, user-specific path; adjust for your machine.
PATH = '/Users/shabnamtafreshi/Desktop/JSULT/paper_reviewer_matching/'
# Base names (without the '.jsonl' extension) of the input files.
softFile = 'test_soft_qrel'
hardFile = 'test_hard_qrel'
reviewer_metafile = 'reviewer_metadata'

In [9]:
def gen_corpusId_potential_reviewrs(softFile, hardFile):
    """Collect query embeddings and candidate reviewers from the two qrel files.

    Reads ``PATH + softFile + '.jsonl'`` and then ``PATH + hardFile + '.jsonl'``.
    Records whose ``query_id`` contains ``'icip'`` are skipped.  For every other
    record the query's specter and proposed embeddings are looked up through the
    module-level ``map0``/``emb0`` and ``map1``/``emb1`` (once per distinct
    query), and the record's ``cand_id`` is appended to that query's
    comma-separated candidate string.

    All three dictionaries are keyed by the raw ``obj["query_id"]`` value, and
    membership is tested with that same value.  Previously the tests used
    ``int(...)`` / ``str(...)`` of the id, which never matched the stored key
    when the types differed: embeddings were re-read for every record, or
    candidate lists were overwritten instead of extended.

    Parameters
    ----------
    softFile, hardFile : str
        Base names (without ``.jsonl``) of the two input files under ``PATH``.

    Returns
    -------
    tuple of (dict, dict, dict)
        ``(specter_dataset, proposed_dataset, corpusId_potential_reviewrs)``:
        query id -> specter embedding row, query id -> proposed embedding row,
        query id -> comma-separated ``cand_id`` string.

    NOTE(review): a candidate listed for the same query in both files appears
    twice in the joined string -- confirm whether that is intended.
    """
    t0 = time.time() # measure wall time
    specter_dataset_local = {}
    proposed_dataset_local = {}
    corpusId_potential_reviewrs = {}
    for file in (softFile, hardFile):
        with jsonlines.open(PATH + file + '.jsonl') as reader:
            for obj in reader:
                query_id = obj["query_id"]
                if 'icip' in str(query_id):
                    continue
                # Look the embeddings up only the first time a query is seen.
                if query_id not in specter_dataset_local:
                    corpus_id = int(query_id)
                    specter_dataset_local[query_id] = emb0[map0[corpus_id].tolist()]
                    proposed_dataset_local[query_id] = emb1[map1[corpus_id].tolist()]
                if query_id in corpusId_potential_reviewrs:
                    corpusId_potential_reviewrs[query_id] += ',' + obj['cand_id']
                else:
                    corpusId_potential_reviewrs[query_id] = obj['cand_id']
    print(format(time.time() - t0, ".3f"), " seconds wall time to fetch specter & proposed.")
    return specter_dataset_local, proposed_dataset_local, corpusId_potential_reviewrs

In [10]:
def save_files(datasource, finename, namextension):
    """Write ``datasource`` as indented UTF-8 JSON under ``PATH``.

    The output file is ``PATH + finename + namextension + '.json'``; non-ASCII
    characters are written as-is rather than escaped.
    """
    target = PATH + finename + namextension + '.json'
    with open(target, 'w', encoding='utf-8') as sink:
        json.dump(datasource, sink, indent=4, ensure_ascii=False)

In [11]:
def gen_reviewers_hash(reviewer_metafile):
    """Build a reviewer id -> paper metadata mapping from the reviewer file.

    Reads ``PATH + reviewer_metafile + '.jsonl'``; for each record, the
    ``papers`` ids are resolved with ``get_paper_by_ids``.  The collected
    ``{"r_id", "papers"}`` records are saved via ``save_files`` with the
    ``'_papers_meta'`` suffix, then folded into the returned dict.  A running
    record counter and the total wall time are printed.
    """
    started = time.time() # measure wall time
    source_path = PATH + reviewer_metafile + '.jsonl'
    records = []
    with jsonlines.open(source_path) as reader:
        for position, reviewer in enumerate(reader):
            papers = get_paper_by_ids(reviewer['papers'])
            records.append({"r_id": reviewer["r_id"], 'papers': papers})
            print(position, end='\r')
    save_files(records, reviewer_metafile, '_papers_meta')
    # Later records win when an r_id repeats, as with sequential assignment.
    reviewers_hash = {record['r_id']: record['papers'] for record in records}
    print(format(time.time() - started, ".3f"), " seconds wall time to gen reviewers_hash.")
    return reviewers_hash

In [13]:
def _embedding_row(id_map, embedding, corpus_id):
    """Return the embedding row for ``corpus_id``, or ``None`` if it cannot index ``id_map``."""
    if not 0 <= corpus_id < len(id_map):
        return None
    # NOTE(review): if the map uses a sentinel value for "no embedding", it
    # should be filtered here as well -- confirm against the map's producer.
    return embedding[id_map[corpus_id].tolist()]


def _cosine(u, v):
    """Cosine similarity between two vectors."""
    return dot(u, v) / (norm(u) * norm(v))


def _mean_top_k(scores, k):
    """Mean of the ``k`` largest values in ``scores``; ``0`` for an empty list."""
    if not scores:
        return 0
    return statistics.mean(sorted(scores, reverse=True)[:k])


def gen_peer_review_cosine(corpusId_potential_reviewrs,
                           reviewers_hash,
                           specter_dataset,
                           proposed_dataset,
                           specter_table=None,
                           proposed_table=None,
                           top_k=3):
    """Score every (query paper, candidate reviewer) pair by cosine similarity.

    For each query and each of its comma-separated candidate reviewer ids that
    is present in ``reviewers_hash``, the query embedding is compared with the
    embedding of each of the reviewer's papers.  The reviewer's score is the
    mean of the ``top_k`` highest cosines, computed separately for the specter
    and the proposed embeddings.

    Fixes relative to the previous version:

    * A paper is kept when its corpus id is a valid index into both id maps.
      The old ``int(id) in map0`` test checked membership among the map's
      *values*, which is a different condition and scans the whole map for
      every paper.
    * The mean is taken over the ``top_k`` largest cosines.  The old code sliced
      the first three cosines and sorted afterwards, so the sort had no effect
      on the mean.
    * Papers whose ``corpusId`` is ``None`` are skipped instead of crashing.
    * A leftover ``index == 16`` debug print was removed.

    Parameters
    ----------
    corpusId_potential_reviewrs : dict
        Query id -> comma-separated string of candidate reviewer ids.
    reviewers_hash : dict
        Reviewer id -> list of paper dicts (entries may be ``None``); each dict
        is read for its ``'corpusId'``.
    specter_dataset, proposed_dataset : dict
        Query id -> embedding vector.  Both are also updated in place with the
        embedding of every reviewer paper that is scored, keyed by the paper's
        ``corpusId``.
    specter_table, proposed_table : tuple, optional
        ``(id_map, embedding)`` pairs.  Default to the module-level
        ``(map0, emb0)`` and ``(map1, emb1)``.
    top_k : int, optional
        How many of the highest cosines are averaged per reviewer (default 3).

    Returns
    -------
    list of dict
        One dict per scored pair with keys ``'corpusId'``, ``'r_id'``,
        ``'specter_score'`` and ``'proposed_score'`` (both as strings),
        ``'numOfPapers'`` (papers that had embeddings) and
        ``'numOfPapersWEmbed'`` (how many of them entered the mean).
    """
    t0 = time.time() # measure wall time
    specter_map, specter_emb = (map0, emb0) if specter_table is None else specter_table
    proposed_map, proposed_emb = (map1, emb1) if proposed_table is None else proposed_table

    cosine_results_local = []
    print(len(corpusId_potential_reviewrs))
    for index, (corpusId, potential_reviewrs) in enumerate(corpusId_potential_reviewrs.items()):
        for reviewerID in potential_reviewrs.split(','):
            if reviewerID not in reviewers_hash:
                continue
            specter_cosines = []
            proposed_cosines = []
            for paper in reviewers_hash[reviewerID]:
                if paper is None or paper.get('corpusId') is None:
                    continue
                paper_id = paper['corpusId']
                specter_row = _embedding_row(specter_map, specter_emb, int(paper_id))
                proposed_row = _embedding_row(proposed_map, proposed_emb, int(paper_id))
                if specter_row is None or proposed_row is None:
                    continue
                # Cache the paper embeddings alongside the query embeddings.
                specter_dataset[paper_id] = specter_row
                proposed_dataset[paper_id] = proposed_row
                specter_cosines.append(_cosine(specter_dataset[corpusId], specter_row))
                proposed_cosines.append(_cosine(proposed_dataset[corpusId], proposed_row))

            papers_with_embedding = len(specter_cosines)
            cosine_results_local.append({
                'corpusId': corpusId,
                'r_id': reviewerID,
                'specter_score': str(_mean_top_k(specter_cosines, top_k)),
                'proposed_score': str(_mean_top_k(proposed_cosines, top_k)),
                'numOfPapers': papers_with_embedding,
                'numOfPapersWEmbed': min(top_k, papers_with_embedding),
            })
        print(index, end='\r')
    print(format(time.time() - t0, ".3f"), " seconds wall time to gen gen_peer_review_cosine.")
    return cosine_results_local

In [None]:
# End-to-end run.
# 1) Read both qrel files: query embeddings plus candidate reviewers per query.
specter_dataset, proposed_dataset, corpusId_potential_reviewrs = gen_corpusId_potential_reviewrs(softFile, hardFile)
# 2) Resolve each reviewer's papers through get_paper_by_ids (one HTTP request
#    per reviewer); also writes reviewer_metadata_papers_meta.json under PATH.
reviewers_hash = gen_reviewers_hash(reviewer_metafile)
# 3) Score every (query, candidate reviewer) pair with both embeddings.
cosine_results = gen_peer_review_cosine(corpusId_potential_reviewrs,
                                        reviewers_hash,
                                        specter_dataset,
                                        proposed_dataset)
# 4) Persist the scores as reviewer_metadata_papers_meta_scores.json under PATH.
save_files(cosine_results, reviewer_metafile, '_papers_meta_scores')

1.172  seconds wall time to fetch specter & proposed.
448.542  seconds wall time to gen reviewers_hash.
34
16