In [3]:
from transformers import AutoTokenizer
from adapters import AutoAdapterModel
import sys
import os
sys.path.append(os.path.abspath("../../"))
from utils import utils

# Load the SPECTER2 base checkpoint and activate the "specter2" adapter on top of it.
tokenizer = AutoTokenizer.from_pretrained('allenai/specter2_base')
model = AutoAdapterModel.from_pretrained('allenai/specter2_base')
model.load_adapter("allenai/specter2", load_as="specter2", source="hf", set_active=True)

# Two sample papers, repeated four times (8 entries) so that batching can be observed below.
bert_paper = {
    'title': 'BERT',
    'abstract': 'We introduce a new language representation model called BERT',
}
attention_paper = {
    'title': 'Attention is all you need',
    'abstract': 'The dominant sequence transduction models are based on complex recurrent or convolutional neural networks',
}
papers = [dict(paper) for _ in range(4) for paper in (bert_paper, attention_paper)]

# Model input text is "<title><sep_token><abstract>"; a missing/empty abstract becomes ''.
text_batch = [f"{d['title']}{tokenizer.sep_token}{d.get('abstract') or ''}" for d in papers]

# Show how utils.batch_iterator slices the texts when asked for batches of 2.
for data in utils.batch_iterator(text_batch, 2):
    print(data)

# Tokenize all texts at once, padded to a common length and truncated at 512 tokens.
inputs = tokenizer(
    text_batch,
    max_length=512,
    truncation=True,
    padding=True,
    return_token_type_ids=False,
    return_tensors="pt",
)
output = model(**inputs)

# The hidden state at position 0 of every sequence is used as that paper's embedding.
embeddings = output.last_hidden_state[:, 0]

# Width of one embedding vector.
print(embeddings.shape[-1])

Fetching 4 files: 100%|██████████| 4/4 [00:00<00:00, 22280.50it/s]


['BERT[SEP]We introduce a new language representation model called BERT', 'Attention is all you need[SEP]The dominant sequence transduction models are based on complex recurrent or convolutional neural networks']
['BERT[SEP]We introduce a new language representation model called BERT', 'Attention is all you need[SEP]The dominant sequence transduction models are based on complex recurrent or convolutional neural networks']
['BERT[SEP]We introduce a new language representation model called BERT', 'Attention is all you need[SEP]The dominant sequence transduction models are based on complex recurrent or convolutional neural networks']
['BERT[SEP]We introduce a new language representation model called BERT', 'Attention is all you need[SEP]The dominant sequence transduction models are based on complex recurrent or convolutional neural networks']
768


In [None]:
from transformers import AutoTokenizer
from adapters import AutoAdapterModel
import sys
import os
sys.path.append(os.path.abspath("../../"))
from utils import utils
from eval.retrieval.kv_store import KVStore
from eval.retrieval.kv_store import TextType
from typing import List, Any
from tqdm import tqdm

# Tokenizer and base model come from the same checkpoint; the "specter2" adapter is
# then fetched from the Hugging Face hub, registered under that name and made active.
SPECTER2_BASE_CHECKPOINT = 'allenai/specter2_base'
tokenizer = AutoTokenizer.from_pretrained(SPECTER2_BASE_CHECKPOINT)
model = AutoAdapterModel.from_pretrained(SPECTER2_BASE_CHECKPOINT)
model.load_adapter(
    "allenai/specter2",
    source="hf",
    load_as="specter2",
    set_active=True,
)

def encode_batch(texts: List[str], type: TextType, show_progress_bar: bool = True, batch_size: int = 256) -> List[Any]:
    """Embed ``texts`` with the module-level ``tokenizer`` / ``model``.

    Args:
        texts: Strings to embed. Each one is tokenized as-is (truncated at 512 tokens).
        type: Accepted for interface compatibility only; it is not used by this
            function. (The name shadows the builtin ``type`` inside the body.)
        show_progress_bar: When True, wrap the batch loop in a tqdm progress bar.
        batch_size: Number of texts sent through the model per forward pass.

    Returns:
        A flat list with one embedding tensor per input text, in input order.
        Each embedding is the last-layer hidden state at sequence position 0.
        An empty ``texts`` yields an empty list.
    """
    # Local import: the import block of this cell does not bring torch into scope.
    import torch

    if not texts:
        return []

    encoded_keys = []
    total_batches = utils.batch_size_calc(texts, batch_size)

    iterator = utils.batch_iterator(texts, batch_size)
    if show_progress_bar:
        iterator = tqdm(iterator, total=total_batches, desc="Encoding Batches")

    # Inference only: without no_grad every forward pass would build an autograd
    # graph that is immediately discarded, costing memory for no benefit.
    with torch.no_grad():
        for text_batch in iterator:
            inputs = tokenizer(
                text_batch,
                padding=True,
                truncation=True,
                return_tensors="pt",
                return_token_type_ids=False,
                max_length=512,
            )
            output = model(**inputs)
            # clone() copies just the position-0 slice so the full hidden-state
            # tensor of the batch is not kept alive by the returned embeddings.
            embeddings = output.last_hidden_state[:, 0, :].detach().clone()
            # Iterating a 2-D tensor yields its rows, i.e. one vector per text.
            encoded_keys.extend(embeddings)
    return encoded_keys



# Four minimal records whose title and abstract are the same one-character string.
temp_text = [
    {"corpusid": corpusid, "title": label, "abstract": label}
    for corpusid, label in (
        (123, "1"),
        (238, "2"),
        (124, "3"),
        (12095483, "4"),
    )
]

# Map each record's "title<SEP>abstract" text to its cleaned corpus id.
kv_pairs = dict(
    (utils.get_title_abstract_SEPtoken(record), utils.get_clean_corpusid(record))
    for record in temp_text
)
print(kv_pairs)



# Disabled experiment, kept for reference (note that temp_text holds dicts, not strings):
# temp = encode_batch(temp_text, TextType.KEY)
# print(len(temp))
# for i in range(len(temp)):
#      print(len(temp[i]))

  from .autonotebook import tqdm as notebook_tqdm
BertAdapterModel has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Fetching 4 files: 100%|██████████| 4/4 [00:00<00:00, 39016.78it/s]


{'1[SEP]1': 123, '2[SEP]2': 238, '3[SEP]3': 124, '4[SEP]4': 12095483}


In [5]:
# Inspect the tokenizer's separator token: its text, its Python type and its second character.
sep_token = tokenizer.sep_token
print(sep_token, type(sep_token), sep_token[1], sep="\n")

[SEP]
<class 'str'>
S


In [3]:
# Six "<n>[SEP]<n>" strings, n = 1..6, used to check how batch_iterator groups items.
temp_text = [f"{n}[SEP]{n}" for n in range(1, 7)]

# Walk the texts two at a time and show every batch.
iterator = utils.batch_iterator(temp_text, 2)
for i in iterator:
    print(i)

['1[SEP]1', '2[SEP]2']
['3[SEP]3', '4[SEP]4']
['5[SEP]5', '6[SEP]6']


In [1]:
import sys
import os
sys.path.append(os.path.abspath("../../"))
from utils import utils
import json
import torch
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from tqdm import tqdm

import ast
from threading import Lock
import pickle
from transformers import AutoTokenizer
from adapters import AutoAdapterModel

class ManualClean():
    """Build a (key, value, embedding) index from a folder of JSON paper records.

    For every record the key is a "Title: ...\\nAbstract: ..." string, the value is
    the cleaned corpus id, and the embedding is taken from the record's
    ``embedding_v2_vector`` field when that field holds a usable vector, otherwise
    it is computed with the SPECTER2 model loaded in ``__init__``.

    Every instance attribute whose name does not start with "_" is written to the
    pickle produced by :meth:`save`.
    """

    # Expected length of one embedding vector; stored vectors of any other size
    # are treated as missing and recomputed.
    EMBEDDING_DIM = 768

    def __init__(self, input_folder: str, output_dir: str, model_path: str = "allenai/specter2_base"):
        """Load tokenizer, model and adapter, and initialise the empty index.

        Args:
            input_folder: Directory that is scanned for ``*.json`` files.
            output_dir: Directory the pickled index is written to (created on save).
            model_path: Checkpoint used for both the tokenizer and the base model.
        """
        # input output dir
        self._input_folder = Path(input_folder)
        self._output_dir = output_dir

        # specter v2 model config and load
        self.model_path = model_path
        self._tokenizer = AutoTokenizer.from_pretrained(self.model_path)
        self._model = AutoAdapterModel.from_pretrained(self.model_path)
        self._model.load_adapter("allenai/specter2", source="hf", load_as="specter2", set_active=True)

        # index
        self.index_name = 'specter2'
        self.index_type = 'specter2'
        self.key_instruction = "Represent the title and abstract of the research paper for retrieval:"
        self.query_instruction = "Represent the research question for retrieving relevant research paper abstracts:"
        # The three lists below are kept index-aligned: entry i of each belongs to the same paper.
        self.keys = []
        self.values = []
        self.encoded_keys = []

        # others
        logging.basicConfig(level=logging.INFO)
        self._logger = logging.getLogger(__name__)

    def _parse_stored_embedding(self, record):
        """Return the record's precomputed embedding, or None if it is unusable.

        The vector is stored as the string form of a Python list. It counts as
        unusable when the field is absent, is not a string literal_eval can parse
        (e.g. null or truncated text), cannot be turned into a tensor, or is not a
        1-D vector of ``EMBEDDING_DIM`` entries.
        """
        try:
            vector = torch.tensor(ast.literal_eval(record['embedding_v2_vector']))
        except (KeyError, ValueError, SyntaxError, TypeError, RuntimeError):
            return None
        if vector.dim() != 1 or vector.shape[0] != self.EMBEDDING_DIM:
            return None
        return vector.requires_grad_(False)

    def _compute_embedding(self, title, abstract, lock):
        """Embed "<title><sep_token><abstract>" with the model and return one vector.

        The tokenizer and model calls are made while holding ``lock`` so that only
        one worker thread uses them at a time.
        """
        text = title + self._tokenizer.sep_token + (abstract or '')
        # no_grad: this is pure inference, so no autograd graph needs to be recorded.
        with lock, torch.no_grad():
            inputs = self._tokenizer([text], padding=True, truncation=True,
                                     return_tensors="pt", return_token_type_ids=False, max_length=512)
            output = self._model(**inputs)

        # Hidden state at position 0 of the single sequence in the batch.
        return output.last_hidden_state[0, 0, :].detach().clone().requires_grad_(False)

    def process_each_file(self, file_path: Path, lock) -> None:
        """Add every record of one JSON file to the index.

        Args:
            file_path: JSON file holding a mapping of record id -> record.
            lock: Lock shared by all workers; guards model use and the index lists.
        """
        # key is title and abstract
        # value is corpusid
        # embedding is torch.tensor
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        for item in data:
            record = data[item]
            value = utils.get_clean_corpusid(record)

            # title and abstract
            abstract = utils.get_clean_abstract(record)
            try:
                title = utils.get_clean_title(record)
            except KeyError:
                title = ''
            key = f"Title: {title}\nAbstract: {abstract}"

            # embeddings: prefer the stored vector, fall back to computing one
            embedding = self._parse_stored_embedding(record)
            if embedding is None:
                self._logger.error(f'No specter2 embedding for corpusid: {value}')
                embedding = self._compute_embedding(title, abstract, lock)

            # Append all three under one lock acquisition so the lists stay aligned
            # even though several threads append concurrently.
            with lock:
                self.keys.append(key)
                self.values.append(value)
                self.encoded_keys.append(embedding)

    def process_all_files(self) -> None:
        """Process every ``*.json`` file of the input folder on a pool of 10 threads.

        A failure in one file is logged and does not stop the remaining files.
        The order in which files contribute to the index follows thread scheduling.
        """
        lock = Lock()

        json_files = list(self._input_folder.glob('*.json'))
        if not json_files:
            self._logger.warning(f'No .json files found in {self._input_folder}')

        with ThreadPoolExecutor(max_workers=10) as executor:
            futures = {
                executor.submit(self.process_each_file, file_path, lock): file_path
                for file_path in json_files
            }

            for future in tqdm(as_completed(futures), total=len(json_files), desc='Processing all files'):
                file_path = futures[future]
                try:
                    future.result()
                except Exception as exc:
                    self._logger.error(f'{file_path} generated an exception: {exc}')

    def _stacked_encoded_keys(self):
        """Return ``encoded_keys`` as one 2-D tensor.

        Safe to call repeatedly: an already stacked tensor is returned unchanged,
        and an empty index gives a tensor with zero rows instead of an error.
        """
        if isinstance(self.encoded_keys, torch.Tensor):
            return self.encoded_keys
        if not self.encoded_keys:
            return torch.empty((0, self.EMBEDDING_DIM))
        return torch.stack(self.encoded_keys)

    def return_vals(self, save_as_tensor: bool = False) -> None:
        """Print keys, values and embeddings.

        With ``save_as_tensor=True`` the embeddings are first converted, in place,
        from a list of vectors to a single stacked tensor.
        """
        if save_as_tensor:
            self.encoded_keys = self._stacked_encoded_keys()
        print(self.keys)
        print(self.values)
        print(self.encoded_keys)

    def save(self, save_as_tensor: bool = False) -> None:
        """Pickle all public instance attributes to the output directory.

        The file is named ``<list|tensor>_<index_name>.<index_type>``. With
        ``save_as_tensor=True`` the embeddings are first converted, in place, to a
        single stacked tensor; calling this more than once is supported.
        """
        savetype = 'list'
        if save_as_tensor:
            self.encoded_keys = self._stacked_encoded_keys()
            savetype = 'tensor'

        # Attributes starting with "_" (paths, tokenizer, model, logger) are not saved.
        save_dict = {name: attr for name, attr in self.__dict__.items() if not name.startswith("_")}

        target = os.path.join(self._output_dir, f'{savetype}_{self.index_name}.{self.index_type}')
        print(f"Saving index to {target}")
        os.makedirs(self._output_dir, exist_ok=True)
        with open(target, 'wb') as file:
            pickle.dump(save_dict, file, protocol=pickle.HIGHEST_PROTOCOL)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# The JSON inputs are read from, and the pickled index is written to, the same scratch folder.
input_dir = '/usr/xtmp/hc387/ai_reviewer/LitSearch/eval/temp_testing/temp_folder'
output_dir = '/usr/xtmp/hc387/ai_reviewer/LitSearch/eval/temp_testing/temp_folder'

# Build the index from every JSON file, then persist it with the embeddings stacked into one tensor.
temp = ManualClean(input_folder=input_dir, output_dir=output_dir)
temp.process_all_files()
temp.save(save_as_tensor=True)

BertAdapterModel has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Fetching 4 files: 100%|██████████| 4/4 [00:00<00:00, 25191.02it/s]
ERROR:__main__:No specter2 embedding for corpusid: 1
ERROR:__main__:No specter2 embedding for corpusid: 5
Processing all files:   0%|          | 0/2 [00:00<?, ?it/s]ERROR:__main__:No specter2 embedding for corpusid: 2
ERROR:__main__:No specter2 embedding for corpusid: 6
ERROR:__main__:No sp

Saving index to /usr/xtmp/hc387/ai_reviewer/LitSearch/eval/temp_testing/temp_folder/tensor_specter2.specter2


In [5]:
import pickle

# Location of the index pickle written by ManualClean.save(save_as_tensor=True)
file_path = '/usr/xtmp/hc387/ai_reviewer/LitSearch/eval/temp_testing/temp_folder/tensor_specter2.specter2'

# Read the whole index back into memory
with open(file_path, mode='rb') as file:
    data = pickle.load(file)

# Show the three index-aligned parts: keys, values and embeddings, one per line
print(data['keys'], data['values'], data['encoded_keys'], sep='\n')



FileNotFoundError: [Errno 2] No such file or directory: '/usr/xtmp/hc387/ai_reviewer/LitSearch/eval/temp_testing/temp_folder/tensor_specter2.specter2'

In [3]:
import json
import ast
import torch
# One shard of the combined Semantic Scholar dump; used to sanity-check stored embedding sizes.
path_file = '/usr/xtmp/hc387/ai_reviewer/data/semantic_scholar/2024_10_8/everything_combined_more/file_621.json'

with open(path_file, 'r') as f:
    data = json.load(f)

# Each record stores its embedding as the string form of a list: parse it,
# convert it to a tensor and print how many entries it has.
for item, record in data.items():
    embedding = torch.tensor(ast.literal_eval(record['embedding_v2_vector'])).requires_grad_(False)

    print(len(embedding))

768
768
768
768
768
768
768
768
768


In [6]:
import torch

# Toy retrieval demo: score 10 random 2-D keys against a query by dot product
# and pick the 2 highest-scoring keys.

# Fix the RNG so that the keys, and therefore the printed output, are the same on every run.
torch.manual_seed(0)

# With query = [1, 0] the dot product of a key with it is simply the key's first component.
query = torch.tensor([1, 0]).float()

encoded_keys = torch.randn(10, 2)
print(encoded_keys)

# (10, 2) @ (2,) -> (10,): one score per key
x = torch.matmul(encoded_keys, query)

print(x)
# topk returns both the best scores and the row indices of the keys they belong to
print(torch.topk(x, 2))

tensor([[ 0.5656, -0.6054],
        [-0.3655,  0.3068],
        [ 0.7903, -0.1476],
        [-0.2881, -0.4693],
        [-1.4547,  0.5003],
        [ 0.1156,  0.1400],
        [-0.2620,  0.7061],
        [ 2.6374, -0.1449],
        [-0.3929,  1.8097],
        [ 1.0943,  0.7799]])
tensor([ 0.5656, -0.3655,  0.7903, -0.2881, -1.4547,  0.1156, -0.2620,  2.6374,
        -0.3929,  1.0943])
torch.return_types.topk(
values=tensor([2.6374, 1.0943]),
indices=tensor([7, 9]))
