In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
import os
import gc
import math
import json
import torch
import time
import torch.nn as nn

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [3]:
from transformers import *

tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased', output_attentions=False, output_hidden_states=False)

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--allenai--scibert_scivocab_uncased/snapshots/24f92d32b1bfb0bcaf9ab193ff3ad01e87732fc1/config.json
Model config BertConfig {
  "_name_or_path": "allenai/scibert_scivocab_uncased",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 31090
}

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--allenai--scibert_scivocab_uncased/snapshots/24f92d32b

In [13]:
def compute_rankings(ip_dir, file, max_length):
    '''
        function to compute rankings based on ratings using cosine similarity between query facet text and corresponding candidate facet text.
    '''
    input_dir = ip_dir
    filename = file
    query_facet = filename.split('-')[1]
    query_dicts = dict()
    candidate_dicts = list()
    rankings = dict()
    max_length = max_length

    with open(input_dir + filename, 'r') as f:
        annot = json.load(f)
        query_dict = annot[0]
        candidate_dicts = annot[1:]
        del annot                        # delete unnecessary files using del and gc.collect() to save RAM
        gc.collect()
        
        try:       # needed because some files don't have query facet text for the requested query
            query_text = query_dict[query_facet + '_label']        # query facet text
            candi_texts = list()                                   # corresponding candidate facet texts
            candi_ratings = list()
            candidate_relevances = list()
            for candi_dict in candidate_dicts:
                try:
                    candi_texts.append(candi_dict[query_facet+'_label'])
                except:
                    candi_texts.append('')
                candidate_relevances.append(candi_dict['adju_relevance'])

            del query_dict, candidate_dicts
            gc.collect()

            # important! grad accumulation not needed because we're doing inference
            # without it, RAM gets eaten up pretty quickly
            with torch.no_grad():     
                query_encoding = tokenizer(query_text, max_length=max_length, padding='max_length', return_tensors='pt')
                query_dense_output = model(**query_encoding)['pooler_output'].flatten()
                csim = nn.CosineSimilarity(dim=0)

                for text in candi_texts:
                    encoding = tokenizer(text, max_length=max_length, padding='max_length', return_tensors='pt')
                    output = model(**encoding)['pooler_output'].flatten()
                    rating = csim(query_dense_output, output)        # compute cosine similarity
#                     print(rating)
                    candi_ratings.append(rating)

                sorted_indices = torch.argsort(torch.tensor(candi_ratings), descending=True).numpy()
                assert len(sorted_indices) == len(candidate_relevances)
                rankings = {i: int(docid) for i, docid in enumerate(sorted_indices)}
                candi_relevances = [candidate_relevances[int(docid)] for docid in sorted_indices]    # compute ground truth relevance values for the ranked docs

                return rankings, candi_relevances
        except:
            print("Facet : {}, Dict : {}".format(query_facet, query_dict.keys()))
            return None

In [None]:
input_dir = '/kaggle/input/parsed-annotations/parsed_annotations/'
max_length = 400
files_parsed = 0     # to keep track of files parsed till now; to prevent code from starting over again

all_files  = os.listdir(input_dir)
filenames = [file for file in all_files if os.path.isfile(os.path.join(input_dir, file))]
try:
    with open('ranking_relevances.json', 'r') as f:
        results = json.load(f)
except:
    results = dict()
start = time.time()

file_names = filenames[files_parsed:]          # added so that when cell restarted, code parses only the files left, not the ones that were parsed earlier
for i, filename in enumerate(file_names):
    paper_id = filename.split('-')[0]
    facet = filename.split('-')[1]
    try:
        rankings, relevances = compute_rankings(input_dir, filename, max_length)
        results[paper_id+'_'+facet] = {'rankings':rankings, 'relevances':relevances}
    except KeyboardInterrupt:          # when cell stopped, results dict() in its current state will be saved into file
        with open('ranking_relevances.json', 'w') as f:
            json.dump(results, f)
            break
    except Exception as e:
        print(e)
        results[paper_id+'_'+facet] = None
        
    end = time.time()
    files_parsed+=1
    print("{} papers analyzed, Time : {}".format(files_parsed, end-start))

1 papers analyzed, Time : 94.2567412853241
2 papers analyzed, Time : 162.69917726516724
3 papers analyzed, Time : 237.49411964416504
4 papers analyzed, Time : 416.7727208137512
5 papers analyzed, Time : 599.7255144119263
6 papers analyzed, Time : 675.973653793335
Facet : result, Dict : dict_keys(['paper_id', 'title', 'background_label', 'abstract'])
cannot unpack non-iterable NoneType object
7 papers analyzed, Time : 676.5959901809692
Facet : result, Dict : dict_keys(['paper_id', 'title', 'background_label', 'abstract'])
cannot unpack non-iterable NoneType object
8 papers analyzed, Time : 677.2151629924774
9 papers analyzed, Time : 760.2884352207184
10 papers analyzed, Time : 838.6516993045807
11 papers analyzed, Time : 913.9706315994263
12 papers analyzed, Time : 995.2099525928497
Facet : method, Dict : dict_keys(['paper_id', 'title', 'background_label', 'abstract'])
cannot unpack non-iterable NoneType object
13 papers analyzed, Time : 995.8408181667328
14 papers analyzed, Time : 1067

In [None]:
print(results)

In [15]:
!rm ranking_relevances.json

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
# with open('ranking_relevances.json', 'w') as f:
#     json.dump(results, f)

In [9]:
with open('ranking_relevances.json', 'r') as f:
    results = json.load(f)
    print(results)

{'11844559': None, '6173686': None, '53082542': None, '11629674': None, '10695055': {'rankings': {'0': 58, '1': 14, '2': 99, '3': 45, '4': 0, '5': 1, '6': 204, '7': 59, '8': 15, '9': 177, '10': 82, '11': 85, '12': 185, '13': 2, '14': 46, '15': 23, '16': 144, '17': 219, '18': 217, '19': 130, '20': 125, '21': 128, '22': 127, '23': 47, '24': 88, '25': 221, '26': 119, '27': 13, '28': 195, '29': 56, '30': 224, '31': 212, '32': 118, '33': 57, '34': 192, '35': 113, '36': 171, '37': 84, '38': 112, '39': 95, '40': 143, '41': 208, '42': 25, '43': 102, '44': 31, '45': 230, '46': 156, '47': 44, '48': 111, '49': 199, '50': 153, '51': 24, '52': 52, '53': 169, '54': 207, '55': 229, '56': 165, '57': 148, '58': 123, '59': 176, '60': 76, '61': 136, '62': 138, '63': 103, '64': 237, '65': 172, '66': 182, '67': 147, '68': 214, '69': 110, '70': 6, '71': 4, '72': 62, '73': 174, '74': 35, '75': 9, '76': 36, '77': 191, '78': 223, '79': 8, '80': 141, '81': 198, '82': 30, '83': 133, '84': 235, '85': 97, '86': 75

In [None]:
import gc

del results
gc.collect()