In [1]:
import os
import sys

#Import config file. Update config.py according to your environment
import config

import pandas as pd
import numpy as np

import requests, json

from tqdm import tqdm
import time
from datetime import datetime

In [7]:
start_year = 2022
end_year = 2025
current_year = datetime.now().year
year_list = list(range(start_year,end_year + 1,1))
citecount_list = (current_year-2) - np.array(year_list)
citecount_list[citecount_list < 0] = 0
citecount_list
 
fieldsOfStudy = 'Computer Science,Biology'#,Medicine,Physics,Geology,Psychology,Mathematics,Environmental Science,Agricultural and Food Sciences'
fields2return = 'title,citationCount,abstract,venue,authors,publicationDate,fieldsOfStudy'
additional_specs = 'influentialCitationCount,openAccessPdf,references'#,authors.affiliations,references.paperId,embedding.specter_v2'

## Downloading Semantics Scholar dataset

In [None]:
def merge_dicts(dict_list):
    merged = {}
    for d in dict_list:
        for key, value in d.items():
            merged.setdefault(key, []).append(value)  # Collect values as lists
    return merged

data_source = 'semanticsscholar'
metadata_file_path = os.path.join(config.path_to_data, 'metadata')
text_file_path = os.path.join(config.path_to_data, 'text')
Foverwrite = True

for i, (year, minCitationCount) in enumerate(zip(year_list, citecount_list)):
    year_metadata_dir = os.path.join(metadata_file_path, str(year))
    year_text_dir = os.path.join(text_file_path, str(year))
    os.makedirs(year_metadata_dir, exist_ok=True)
    os.makedirs(year_text_dir, exist_ok=True)

    articles = None
    endpoint = f'https://api.semanticscholar.org/graph/v1/paper/search/bulk?fields={fields2return}&fieldsOfStudy={fieldsOfStudy}&minCitationCount={str(minCitationCount)}&year={str(year)}&openAccessPdf'
    dataset_list = None
    waitsec = 1
    while (dataset_list is None or 'total' not in dataset_list.keys()) and waitsec < 100:
        dataset_list = requests.get(endpoint).json()
        if 'total' not in dataset_list.keys():
            print(dataset_list)
        time.sleep(waitsec)
        waitsec += 1

    N = dataset_list['total']
    N_dw = 0
    batch_size = len(dataset_list['data'])
    token = 0
    old_len = 0
    with tqdm(range(N//batch_size + 1), desc=f'Year {year} / {len(year_list)} years ({N} papers on that year)', unit='batch') as pbar:
        for k in pbar:
            if token is not None:
                token_str = f'&token={token}' if k > 0 else ''
                
                dataset_list = None
                waitsec = 1
                while (dataset_list is None or 'total' not in dataset_list.keys() or (dataset_list['token'] is None and k < N//batch_size)) and waitsec < 100:
                    pbar.set_postfix({'status': f'loading batch, attempt #{waitsec}', 'token': token})
                    dataset_list = requests.get(endpoint + token_str).json()
                    if 'total' not in dataset_list.keys() or (dataset_list['token'] is None and k < N//batch_size):
                        print(dataset_list)
                    time.sleep(waitsec)
                    waitsec += 1

                token = dataset_list['token']
                pbar.set_postfix({'status': 'batch loaded', 'token': token})

                df = pd.DataFrame(dataset_list)
                N_dw += len(df)
                df = pd.concat([df.drop(columns=['data','token','total']), pd.json_normalize(df['data'])], axis=1)
                df = pd.concat([df.drop(columns=['authors']), pd.json_normalize(df['authors'].apply(merge_dicts))], axis=1)
                df = df.dropna(subset=['publicationDate','name','fieldsOfStudy']).drop_duplicates(subset='title')
                df = df[df['venue'].str.len()>3]
                df['authorId'] = df['authorId'].apply(lambda x: ','.join([name for name in x if name is not None]))
                df['authorName'] = df['name'].apply(lambda x: ','.join(x))
                df = df.drop(columns='name')
                df['fieldsOfStudy'] = df['fieldsOfStudy'].apply(lambda x: ','.join([name for name in x if name is not None]))
                df['publicationYear'] = year

                if additional_specs:
                    B = len(df)
                    minibatch_size = 500
                    for k in range(B//minibatch_size + 1):
                        r = None
                        waitsec = 1
                        while (r is None or type(r) is not list) and waitsec < 100:
                            try:
                                r = requests.post('https://api.semanticscholar.org/graph/v1/paper/batch',
                                                params={'fields': additional_specs},
                                                json={"ids": df.paperId[k*minibatch_size:min(B,(k+1)*minibatch_size)].to_list()}
                                                ).json()
                            except requests.exceptions.JSONDecodeError:
                                print("Invalid JSON response received. Trying again")
                            time.sleep(waitsec)
                            waitsec += 1

                        r = [doc for doc in r if doc is not None]
                        df_temp = pd.DataFrame(r)
                        df_temp['openAccessPdf'] = df_temp['openAccessPdf'].apply(lambda x: x['url'] if x is not None else x)
                        df_temp['referenceIds'] = df_temp['references'].apply(lambda x: ';'.join([ref['paperId'] for ref in x if ref['paperId'] is not None]))
                        df_temp['referenceTitles'] = df_temp['references'].apply(lambda x: ';'.join([ref['title'].replace(';', ',') for ref in x if ref if ref['title'] is not None]))
                        df_temp = df_temp.drop(columns=['references'])
                        df_temp['influentialCitationCount'] = df_temp['influentialCitationCount'].astype(float).fillna(0)
                        if k == 0:
                            df_specs = df_temp
                        else:
                            df_specs = pd.concat([df_specs, df_temp], axis=0)
                    
                    df = pd.merge(left=df, right=df_specs, how='left', left_on='paperId', right_on='paperId')
                    df['influentialCitationCount'] = df['influentialCitationCount'].fillna(0)
                    df['openAccessPdf'] = df['openAccessPdf'].fillna('None')

                    df = df[['paperId', 'title', 'abstract', 'openAccessPdf', 'fieldsOfStudy', 'venue', 'authorName', 'authorId', 'citationCount', 'influentialCitationCount', 'publicationDate', 'publicationYear', 'referenceIds', 'referenceTitles']]
                    pbar.set_postfix({'status': 'additional specs loaded', 'token': token})

                else:
                    df = df[['paperId', 'title', 'abstract', 'fieldsOfStudy', 'venue', 'authorName', 'authorId', 'citationCount', 'publicationDate', 'publicationYear']]

                if articles is None:
                    articles = df
                else:
                    articles = pd.concat([articles, df], axis=0)
                    articles = articles.drop_duplicates(subset='paperId')

                new_articles = len(articles) - old_len
                old_len = len(articles)
                pbar.set_postfix({'status': f"{new_articles} articles added", 'token': token})
            else:
                print(f"end of dataset at batch {k} / {N//batch_size} token: {token}")
                if k < N//batch_size - 1:
                    pbar.close()
                    raise ValueError("Returned token is None before end of dataset")
            
    articles = articles.reset_index(drop=True)
    str_col = articles.columns[articles.dtypes==object]
    for col in str_col:
        articles[col] = articles[col].astype(str).str.strip().str.encode('utf-8', 'ignore').str.decode('utf-8')

    print(f'Total number of articles for {year}: {len(articles)} ({N_dw} downloaded)')

    year_metadata_file_path = os.path.join(year_metadata_dir, f'{data_source}_metadata_{year}.parquet')
    year_text_file_path = os.path.join(year_text_dir, f'{data_source}_text_{year}.parquet')
    
    metadata = articles.drop(columns=['title', 'abstract'])
    metadata.to_parquet(year_metadata_file_path, engine="pyarrow", compression="snappy", index=True)

    text = articles[['paperId', 'title', 'abstract', 'referenceTitles', 'openAccessPdf']]
    text.to_parquet(year_text_file_path, engine="pyarrow", compression="snappy", index=True)

Year 2022 / 4 years (428586 papers on that year): 100%|██████████| 429/429 [3:57:07<00:00, 33.16s/batch, status=426 articles added, token=None]                                                                                                                                  


Total number of articles for 2022: 311076 (428598 downloaded)


Year 2023 / 4 years (578172 papers on that year):  27%|██▋       | 156/579 [1:06:57<2:55:24, 24.88s/batch, status=batch loaded, token=PCOKWVSKJJGM4TWNJNI3EUSQJIWVNUSRKBFEYK2JFUBHFI4VJRGM2DKTSPGCYDJNRQGAZEYMQFKGFIUJRGSUSUTSSKAZDILBSKMWTITBLJJKFCKSNQWQBYDSCPRA]              

In [4]:
token

'PCOA3RZLB2ADADAA2CVSZVMIMW2IK4IVQLUNPEBAABD3QOZ464PFBMJ5B2D2KQC4GAKJBPHD7S5UEUVAU52K3VOCCTDYMHW2NBBO4PDXJ4LKFKAM3P5QCCXUCTIA'

## Download PDFs and extract text

In [142]:
data_path = os.path.join(config.path_to_data, 'semnaticsscholar_')
metadata = pd.read_parquet(data_path + "metadata.parquet", engine="pyarrow")
textdata = pd.read_parquet(data_path + "text.parquet", engine="pyarrow")

display(metadata)
display(textdata)

Unnamed: 0,paperId,openAccessPdf,fieldsOfStudy,venue,authorName,authorId,citationCount,influentialCitationCount,publicationDate,publicationYear
0,0001d5fbff6f7c763a78dd5d141292416dcfae59,https://aacrjournals.org/clincancerres/article...,"Biology,Medicine",Clinical Cancer Research,"T. Ishibe,T. Nakayama,T. Okamoto,T. Aoyama,K. ...","14839966,10239302,2000448727,3201672,7147569,2...",88,6.0,2005-04-01,2005
1,000372290caae5482dcdd7954feb895ccbab921d,http://www.merl.com/publications/docs/TR2005-0...,"Computer Science,Mathematics",IEEE Transactions on Communications,"Juntan Zhang,M. Fossorier",319335048891339,319,26.0,2005-03-07,2005
2,000376ec6f6ade6261fc12df9eddd25a88c1e9ca,https://www.jstage.jst.go.jp/article/bpb/28/8/...,Medicine,Biological and Pharmaceutical Bulletin,"Bi-qi Zhang,Shen-Jiang Hu,Lihong Qiu,Q. Shan,J...","11809650,6617669,123635715,11305673,2152146096...",24,2.0,2005-08-01,2005
3,0003a5747ee3676be1b6d2c72c13a04e41811274,https://onlinelibrary.wiley.com/doi/pdfdirect/...,"Biology,Medicine",Developmental Dynamics,"Barden Chan,S. Sinha,Dan Cho,R. Ramchandran,V....",356909232114125286211325257421347733658233,53,2.0,2005-01-01,2005
4,00043e28cffe744909bf69cde70e6dac0bce0a10,http://www.jbc.org/article/S0021925820589394/pdf,"Chemistry,Medicine",Journal of Biological Chemistry,"M. Westfall,A. M. Lee,D. Robinson",50259834924991148706510,70,3.0,2005-12-16,2005
...,...,...,...,...,...,...,...,...,...,...
3737188,fea6042ae3d4b2e873426e75ef4e3908fefdf2e7,http://arxiv.org/pdf/2501.11288,Computer Science,IEEE transactions on consumer electronics,"Yanchao Wang,Dawei Zhang,Run Li,Zhonglong Zhen...","2298569900,2275025303,2298564302,2278244261,23...",0,0.0,2025-01-20,2025
3737189,fed4e1c03728d1747984ccc035e9e52d4f576666,https://www.biorxiv.org/content/biorxiv/early/...,Biology,bioRxiv,"Niko Kasalo,Mirjana Domazet-Lošo,Tomislav Doma...",214555818114141043121397755049,2,0.0,2025-01-16,2025
3737190,fee975b0352fbf35fdc1926b25a16d8531ebb4e6,https://www.researchsquare.com/article/rs-4129...,Computer Science,Multim. Syst.,"Liang Yang,Qi Yang,Jingjie Zeng,Tao Peng,Zhiha...","2143920912,2308550856,2000361118,2068930271,15...",0,0.0,2025-02-06,2025
3737191,ff8fd5f30461bb788c6a32f37fe0f7821eafebf6,http://arxiv.org/pdf/2501.00799,"Computer Science,Mathematics",IEEE Signal Processing Letters,"S. Mukhopadhyay,Debasmita Mukherjee",462876652338269614,0,0.0,2025-01-01,2025


Unnamed: 0,paperId,title,abstract,openAccessPdf
0,0001d5fbff6f7c763a78dd5d141292416dcfae59,Disruption of Fibroblast Growth Factor Signal ...,Purpose: Synovial sarcoma is a soft tissue sar...,https://aacrjournals.org/clincancerres/article...
1,000372290caae5482dcdd7954feb895ccbab921d,Shuffled iterative decoding,Shuffled versions of iterative decoding of low...,http://www.merl.com/publications/docs/TR2005-0...
2,000376ec6f6ade6261fc12df9eddd25a88c1e9ca,Diphasic effects of Astragalus membranaceus BU...,This study was designed to investigate the eff...,https://www.jstage.jst.go.jp/article/bpb/28/8/...
3,0003a5747ee3676be1b6d2c72c13a04e41811274,Critical roles of CD146 in zebrafish vascular ...,"In this report, we use zebrafish as a model sy...",https://onlinelibrary.wiley.com/doi/pdfdirect/...
4,00043e28cffe744909bf69cde70e6dac0bce0a10,Differential Contribution of Troponin I Phosph...,Cardiac troponin I is a phosphorylation target...,http://www.jbc.org/article/S0021925820589394/pdf
...,...,...,...,...
3737188,fea6042ae3d4b2e873426e75ef4e3908fefdf2e7,PD-SORT: Occlusion-Robust Multi-Object Trackin...,Multi-object tracking (MOT) is a rising topic ...,http://arxiv.org/pdf/2501.11288
3737189,fed4e1c03728d1747984ccc035e9e52d4f576666,Massive outsourcing of energetically costly am...,Animals are generally capable of synthesizing ...,https://www.biorxiv.org/content/biorxiv/early/...
3737190,fee975b0352fbf35fdc1926b25a16d8531ebb4e6,Dialogue sentiment analysis based on dialogue ...,,https://www.researchsquare.com/article/rs-4129...
3737191,ff8fd5f30461bb788c6a32f37fe0f7821eafebf6,Follow the Approximate Sparse Leader for No-Re...,We consider the problem of online sparse linea...,http://arxiv.org/pdf/2501.00799


In [147]:
from pdfminer.high_level import extract_text, extract_pages
from pdfminer.layout import LTTextContainer, LTChar, LTTextLine
import multiprocessing
import os
import config
import glob
import re
from scipy import stats


def extract_text_on_fontsize(pdf_path):
    # Step 1: Collect all font sizes
    sizes = []
    for page_layout in extract_pages(pdf_path):
        for element in page_layout:
            if isinstance(element, LTTextContainer):
                for text_line in element:
                    if isinstance(text_line, LTTextLine):  # Ensure it's a text line
                        for character in text_line:
                            if isinstance(character, LTChar):
                                sizes.append(character.size)

    if not sizes:
        return ""  # No text found

    # Step 2: Determine the most common font size (mode)
    size_mode = stats.mode(sizes, keepdims=True)[0][0]

    # Step 3: Extract text, keeping only characters near the mode font size
    filtered_text = []
    for page_layout in extract_pages(pdf_path):
        for element in page_layout:
            if isinstance(element, LTTextContainer):
                for text_line in element:
                    if isinstance(text_line, LTTextLine):
                        line_text = []
                        for character in text_line:
                            if isinstance(character, LTChar) and (0.99 * size_mode <= character.size <= 1.01 * size_mode):

                                line_text.append(character.get_text())
                            else:
                                line_text.append(' ')
                        line_str = "".join(line_text).strip()
                        # Append line if it contains valid characters
                        if not re.fullmatch(r"\d+", line_str):  # Keeps lines that are NOT just numbers
                            filtered_text.append(line_str)
                        filtered_text.append("\n")  # Preserve line breaks

    final_text = "".join(filtered_text)
    final_text = re.sub(r'\(cid:[^\)]+\)', '', final_text)
    final_text = re.sub(r'\n+', '\n', final_text).strip()  # Replace multiple \n with a single one
    
    return final_text

def find_section(pdf_text, section_name, next_section_names, max_chars=20000):
    section_text = ""
    lower_text = re.sub(r'\s+', ' ', pdf_text).strip()
    lower_text = lower_text.lower()
    # find the start of the section
    start_idx = lower_text.find(section_name.lower())
    
    if start_idx == -1:
        return section_text  # not found
    else:
        start_idx += len(section_name)

    # find the earliest next heading after the start
    possible_indices = []
    for next_heading in next_section_names:
        idx = lower_text.find(next_heading.lower(), start_idx + 1)
        if idx != -1:
            possible_indices.append(idx)

    if possible_indices:
        end_idx = min(possible_indices)
    else:
        # if no next heading is found, limit by max_chars
        end_idx = start_idx + max_chars

    section_text = pdf_text[start_idx:end_idx].strip()
    return section_text

def extract_pdf(pdf_path):
    txt = extract_text_on_fontsize(pdf_path)
    return txt

def extract_text_from_pdfs(pdf_path):
    txt = extract_pdf(pdf_path)
    pdf_abstract = find_section(txt, "abstract", 
                                ["\n\n"], max_chars=3000)
    pdf_introduction = find_section(txt, "introduction", 
                                    ["methods", "methodology", "materials and methods", "results", "discussion"], max_chars=5000)
    pdf_discussion = find_section(txt, "discussion", 
                                ["conclusion", "references", "acknowledgments"], max_chars=5000)
    
    pdf_text = find_section(txt, "", 
                                ["conclusion", "references", "acknowledgments"], max_chars=5000)
    
    paperId = pdf_path.split('/')[-1].split('.')[0]

    return {'paperId': paperId, 'pdf_text': pdf_text, 'pdf_abstract': pdf_abstract, 'pdf_introduction': pdf_introduction, 'pdf_discussion': pdf_discussion}

In [None]:
import requests
import time
import glob
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from urllib3.exceptions import ReadTimeoutError
from selenium.common.exceptions import TimeoutException
from tqdm import tqdm
import shutil

def download_pdfs(inputs):
# Example with ChromeDriver - you'll need to have ChromeDriver installed
    txtdata, downloads_dir = inputs
    service = Service(executable_path="/usr/bin/chromedriver")
    if downloads_dir is None:
        downloads_dir = './data/pdf/downloads'

    output_dir = '/'.join(downloads_dir.split('/')[:-1])

    timeout_seconds = 15
    timeout_startdw_seconds = 2
    block_markers = ["captcha", "verify you are human", "forbidden", "this site can’t be reached"]

    options = webdriver.ChromeOptions()
    prefs = {
        # Disable Chrome's PDF viewer
        "plugins.always_open_pdf_externally": True,
        # Don't prompt for download
        "download.prompt_for_download": False,
        "download.default_directory": downloads_dir,
        "download.directory_upgrade": True
    }
    options.add_experimental_option("prefs", prefs)

    driver = webdriver.Chrome(service=service, options=options)
    s = requests.Session()

    # with tqdm(range(len(txtdata)), desc=f'{len(txtdata)} papers', unit='batch') as pbar:
    #     for k in pbar:
    fraction_print = 0
    text_pdfs = []
    for k in range(len(txtdata)):
        try:
            filename = []
            driver.set_page_load_timeout(timeout_seconds)
            driver.get(txtdata[k]['openAccessPdf'])
            end_time = time.time() + timeout_seconds
            end_time_startdw = time.time() + timeout_startdw_seconds
            page_source = driver.page_source.lower()
            # if not "captcha" in page_source and not "verify you are human" in page_source:
            while True:
                filename = glob.glob(os.path.join(downloads_dir, '*.pdf'))
                time.sleep(0.1)
                if filename and filename[0].split('.')[-1]=='pdf':
                    filename = filename[0]
                    break
                elif any(s.lower() in driver.page_source.lower() for s in block_markers):
                    break
                if time.time() > end_time_startdw and not glob.glob(os.path.join(downloads_dir, '*.crdownload')):
                    print(f"Paper {k}: Download still not started after {timeout_startdw_seconds} seconds.")
                    break
                if time.time() > end_time:
                    print(f"Paper {k}: Download still not finished after {timeout_seconds} seconds.")
                    break

            if filename:
                old_filepath = os.path.join(downloads_dir, filename)
                new_filepath = os.path.join(output_dir, txtdata[k]['paperId'] + ".pdf")
                os.rename(old_filepath, new_filepath)
                pdf_data = extract_text_from_pdfs(new_filepath)
                pdf_data['problem'] = None
                os.remove(new_filepath)
            else:
                pdf_data = {'paperId': txtdata[k]['paperId'], 'pdf_text': None, 'pdf_abstract': None, 'pdf_introduction': None, 'pdf_discussion': None, 'problem': 'captcha'}
        except:
            print(f"Paper {k}: Timeout loading  or web driver issue for {txtdata[k]['openAccessPdf']}")
            pdf_data = {'paperId': txtdata[k]['paperId'], 'pdf_text': None, 'pdf_abstract': None, 'pdf_introduction': None, 'pdf_discussion': None, 'problem': 'broken link'}
            continue
        
        text_pdfs.append(pdf_data)
        fraction_done = k / len(txtdata) * 100
        if fraction_done > fraction_print:
            print(str(fraction_done) + " % of papers done in " + downloads_dir)
            fraction_print = fraction_print + 20

    driver.quit()
    shutil.rmtree(downloads_dir)

    return text_pdfs

In [160]:
import multiprocessing

num_processes = 20
download_basedir = '/home/jul/DST/recoRAG/data/pdf'
batch_size = 1000 // num_processes
textdata_list = [(textdata[['paperId','openAccessPdf']].iloc[k*batch_size:(k+1)*batch_size].to_dict(orient='records'), os.path.join(download_basedir, str(k))) for k in range(num_processes)]
with multiprocessing.Pool(processes=num_processes) as pool:
        results = list(pool.imap(download_pdfs, textdata_list))

Paper 0: Timeout loading  or web driver issue for http://homes.dsi.unimi.it/~ghilardi/allegati/frocos05-I2.pdf
2.0 % of papers done in /home/jul/DST/recoRAG/data/pdf/10
2.0 % of papers done in /home/jul/DST/recoRAG/data/pdf/12
Paper 0: Download still not finished after 15 seconds.
Paper 0: Download still not finished after 15 seconds.
Paper 0: Download still not finished after 15 seconds.
Paper 0: Download still not finished after 15 seconds.
Paper 0: Download still not finished after 15 seconds.
2.0 % of papers done in /home/jul/DST/recoRAG/data/pdf/17
Paper 1: Download still not finished after 15 seconds.
2.0 % of papers done in /home/jul/DST/recoRAG/data/pdf/2
2.0 % of papers done in /home/jul/DST/recoRAG/data/pdf/3
2.0 % of papers done in /home/jul/DST/recoRAG/data/pdf/9
2.0 % of papers done in /home/jul/DST/recoRAG/data/pdf/11
2.0 % of papers done in /home/jul/DST/recoRAG/data/pdf/14
2.0 % of papers done in /home/jul/DST/recoRAG/data/pdf/13
2.0 % of papers done in /home/jul/DST/re

The PDF <_io.BufferedReader name='/home/jul/DST/recoRAG/data/pdf/005604bb280bc0c2573426c5498262f1493591de.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
The PDF <_io.BufferedReader name='/home/jul/DST/recoRAG/data/pdf/005604bb280bc0c2573426c5498262f1493591de.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case


22.0 % of papers done in /home/jul/DST/recoRAG/data/pdf/9
Paper 9: Download still not finished after 15 seconds.
22.0 % of papers done in /home/jul/DST/recoRAG/data/pdf/13
Paper 4: Download still not finished after 15 seconds.
Paper 4: Download still not finished after 15 seconds.


The PDF <_io.BufferedReader name='/home/jul/DST/recoRAG/data/pdf/00839b4b2347faad03b41eaadb3191b8c81efada.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case


22.0 % of papers done in /home/jul/DST/recoRAG/data/pdf/16
22.0 % of papers done in /home/jul/DST/recoRAG/data/pdf/15
22.0 % of papers done in /home/jul/DST/recoRAG/data/pdf/18
Paper 5: Download still not finished after 15 seconds.


The PDF <_io.BufferedReader name='/home/jul/DST/recoRAG/data/pdf/00839b4b2347faad03b41eaadb3191b8c81efada.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case


22.0 % of papers done in /home/jul/DST/recoRAG/data/pdf/12
22.0 % of papers done in /home/jul/DST/recoRAG/data/pdf/14
22.0 % of papers done in /home/jul/DST/recoRAG/data/pdf/10
22.0 % of papers done in /home/jul/DST/recoRAG/data/pdf/17
22.0 % of papers done in /home/jul/DST/recoRAG/data/pdf/5
Paper 12: Download still not finished after 15 seconds.
Paper 8: Download still not finished after 15 seconds.
22.0 % of papers done in /home/jul/DST/recoRAG/data/pdf/0
22.0 % of papers done in /home/jul/DST/recoRAG/data/pdf/11
Paper 8: Download still not finished after 15 seconds.
Paper 13: Download still not finished after 15 seconds.
Paper 11: Download still not finished after 15 seconds.
22.0 % of papers done in /home/jul/DST/recoRAG/data/pdf/3


The PDF <_io.BufferedReader name='/home/jul/DST/recoRAG/data/pdf/0010aa2b264826b06d3839c2f139da8286930131.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case


Paper 10: Timeout loading  or web driver issue for https://thorax.bmj.com/content/thoraxjnl/61/1/3.full.pdf
22.0 % of papers done in /home/jul/DST/recoRAG/data/pdf/19
22.0 % of papers done in /home/jul/DST/recoRAG/data/pdf/2
22.0 % of papers done in /home/jul/DST/recoRAG/data/pdf/6


The PDF <_io.BufferedReader name='/home/jul/DST/recoRAG/data/pdf/0010aa2b264826b06d3839c2f139da8286930131.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case


22.0 % of papers done in /home/jul/DST/recoRAG/data/pdf/1


The PDF <_io.BufferedReader name='/home/jul/DST/recoRAG/data/pdf/005ae77005a4138ba8a27cd4c667a237dd268230.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
The PDF <_io.BufferedReader name='/home/jul/DST/recoRAG/data/pdf/005ae77005a4138ba8a27cd4c667a237dd268230.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
The PDF <_io.BufferedReader name='/home/jul/DST/recoRAG/data/pdf/01ff3e30074e61c4b456be3be166e63dd78465a7.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case


22.0 % of papers done in /home/jul/DST/recoRAG/data/pdf/8
22.0 % of papers done in /home/jul/DST/recoRAG/data/pdf/7
Paper 13: Download still not finished after 15 seconds.


The PDF <_io.BufferedReader name='/home/jul/DST/recoRAG/data/pdf/01ff3e30074e61c4b456be3be166e63dd78465a7.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case


22.0 % of papers done in /home/jul/DST/recoRAG/data/pdf/4
Paper 17: Timeout loading  or web driver issue for http://schof.colorado.edu/~pao/anonftp/BernsteinLawrencePao_WHC05.pdf
Paper 12: Download still not finished after 15 seconds.
Paper 18: Download still not finished after 15 seconds.
Paper 14: Download still not finished after 15 seconds.
Paper 16: Download still not finished after 15 seconds.
Paper 15: Timeout loading  or web driver issue for https://openaccess.marmara.edu.tr/bitstreams/04ed5615-7602-40b9-9607-e19db937255e/download
Paper 18: Download still not finished after 15 seconds.
Paper 19: Download still not finished after 15 seconds.
Paper 17: Timeout loading  or web driver issue for http://www.csee.wvu.edu/~mvalenti/documents/ValentiChengJSAC2005.pdf
42.0 % of papers done in /home/jul/DST/recoRAG/data/pdf/13
42.0 % of papers done in /home/jul/DST/recoRAG/data/pdf/2
Paper 17: Download still not finished after 15 seconds.
42.0 % of papers done in /home/jul/DST/recoRAG/dat

The PDF <_io.BufferedReader name='/home/jul/DST/recoRAG/data/pdf/02ae38fbd436a190b9fccd3cc347aada2502870c.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case


Paper 15: Timeout loading  or web driver issue for https://www.cosic.esat.kuleuven.be/publications/article-644.pdf
Paper 25: Download still not finished after 15 seconds.
42.0 % of papers done in /home/jul/DST/recoRAG/data/pdf/17


The PDF <_io.BufferedReader name='/home/jul/DST/recoRAG/data/pdf/02ae38fbd436a190b9fccd3cc347aada2502870c.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case


Paper 18: Timeout loading  or web driver issue for http://eprints.maths.manchester.ac.uk/977/1/tisseur2.pdf
62.0 % of papers done in /home/jul/DST/recoRAG/data/pdf/13
62.0 % of papers done in /home/jul/DST/recoRAG/data/pdf/2
62.0 % of papers done in /home/jul/DST/recoRAG/data/pdf/16
42.0 % of papers done in /home/jul/DST/recoRAG/data/pdf/1
Paper 29: Download still not finished after 15 seconds.
62.0 % of papers done in /home/jul/DST/recoRAG/data/pdf/6
Paper 28: Timeout loading  or web driver issue for https://www.karger.com/Article/Pdf/88642
62.0 % of papers done in /home/jul/DST/recoRAG/data/pdf/19
62.0 % of papers done in /home/jul/DST/recoRAG/data/pdf/9
Paper 34: Download still not finished after 15 seconds.
62.0 % of papers done in /home/jul/DST/recoRAG/data/pdf/4
Paper 35: Download still not finished after 15 seconds.
Paper 30: Timeout loading  or web driver issue for http://learnmem.cshlp.org/content/12/5/450.full.pdf
62.0 % of papers done in /home/jul/DST/recoRAG/data/pdf/7
62.0

The PDF <_io.BufferedReader name='/home/jul/DST/recoRAG/data/pdf/0269f75280b5d5429934c20f617ce556d0dbc8ea.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case


Paper 34: Download still not finished after 15 seconds.
Paper 34: Download still not finished after 15 seconds.


The PDF <_io.BufferedReader name='/home/jul/DST/recoRAG/data/pdf/0269f75280b5d5429934c20f617ce556d0dbc8ea.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case


Paper 35: Download still not finished after 15 seconds.
62.0 % of papers done in /home/jul/DST/recoRAG/data/pdf/17
Paper 34: Download still not finished after 15 seconds.
82.0 % of papers done in /home/jul/DST/recoRAG/data/pdf/6
62.0 % of papers done in /home/jul/DST/recoRAG/data/pdf/0
Paper 39: Download still not finished after 15 seconds.
Paper 37: Download still not finished after 15 seconds.
82.0 % of papers done in /home/jul/DST/recoRAG/data/pdf/16
Paper 39: Download still not finished after 15 seconds.
Paper 32: Download still not finished after 15 seconds.
Paper 35: Download still not finished after 15 seconds.
62.0 % of papers done in /home/jul/DST/recoRAG/data/pdf/1
Paper 40: Download still not finished after 15 seconds.
Paper 40: Download still not finished after 15 seconds.
Paper 45: Download still not finished after 15 seconds.
Paper 40: Download still not finished after 15 seconds.
82.0 % of papers done in /home/jul/DST/recoRAG/data/pdf/13
Paper 43: Download still not fini

The PDF <_io.BufferedReader name='/home/jul/DST/recoRAG/data/pdf/030cf4596df70494483c059c1d1d78c274dcce1e.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
The PDF <_io.BufferedReader name='/home/jul/DST/recoRAG/data/pdf/030cf4596df70494483c059c1d1d78c274dcce1e.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case


82.0 % of papers done in /home/jul/DST/recoRAG/data/pdf/1
Paper 48: Download still not finished after 15 seconds.
Paper 27: Download still not finished after 15 seconds.


The PDF <_io.BufferedReader name='/home/jul/DST/recoRAG/data/pdf/002bc0bcbf3b5f2dc32d0f9690afddeeadebe2f3.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
The PDF <_io.BufferedReader name='/home/jul/DST/recoRAG/data/pdf/002bc0bcbf3b5f2dc32d0f9690afddeeadebe2f3.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case


Paper 40: Download still not finished after 15 seconds.
62.0 % of papers done in /home/jul/DST/recoRAG/data/pdf/8
Paper 29: Download still not finished after 15 seconds.
Paper 45: Download still not finished after 15 seconds.
62.0 % of papers done in /home/jul/DST/recoRAG/data/pdf/12
Paper 41: Download still not finished after 15 seconds.
82.0 % of papers done in /home/jul/DST/recoRAG/data/pdf/3
Paper 47: Download still not finished after 15 seconds.
82.0 % of papers done in /home/jul/DST/recoRAG/data/pdf/8
Paper 44: Download still not finished after 15 seconds.
82.0 % of papers done in /home/jul/DST/recoRAG/data/pdf/12
Paper 44: Download still not finished after 15 seconds.
Paper 42: Timeout loading  or web driver issue for https://heart.bmj.com/content/heartjnl/91/10/1366.full.pdf
Paper 45: Download still not finished after 15 seconds.
Paper 46: Download still not finished after 15 seconds.
Paper 49: Download still not finished after 15 seconds.


In [158]:
pdf_texts = pd.DataFrame([pdf for result in results for pdf in result])
display(pdf_texts)
pdf_texts['pdf_text'].isna().sum()

Unnamed: 0,paperId,pdf_text,pdf_abstract,pdf_introduction,pdf_discussion,problem
0,0001d5fbff6f7c763a78dd5d141292416dcfae59,Purpose: Synovial sarcoma is a soft tissue sar...,,,,
1,000372290caae5482dcdd7954feb895ccbab921d,MITSUBISHI ELECTRIC RESEARCH LABORATORIES\nhtt...,Replica shufﬂed versions of interactive decode...,,,
2,000376ec6f6ade6261fc12df9eddd25a88c1e9ca,The roots of Astragalus membranaceus B ...,,,-Dependent Vasomotor Effects by A.\nmembra...,
3,0003a5747ee3676be1b6d2c72c13a04e41811274,"In this report, we use zebraﬁsh as a model sys...",,,,
4,00043e28cffe744909bf69cde70e6dac0bce0a10,Cardiac troponin I is a phosphorylation target...,,,,
...,...,...,...,...,...,...
937,037b3d6c245389f57f418f8b24c4664f28826e86,SPECTRAL APPROXIMATION OF THE HELMHOLTZ EQUATI...,,. Time harmonic wave propagations appear in ma...,d in this paper a complete error analysis\nand...,
938,037b9162bcd5be989ed2497b226f7a3fd9212182,Ca /calmodulin-dependent protein kinase II\n(...,,,,
939,037c108e3832f51114e22839ee7df4a7ed95cd6d,"DANIELA TRISCIUOGLIO, MARIANNA DESIDERI, LUD...",,,,
940,037c98b171306b8b4ea1b9487cd7688d0ef89f12,,,,,captcha


np.int64(286)

In [None]:
#Merging with text database and saving
pdf_texts = pd.DataFrame([pdf for result in results for pdf in result])
textdata_pdf = pd.merge(left=textdata, right=pdf_texts, on='paperId', how='left')
textdata_pdf.to_parquet(file_path + '_text_pdf.parquet', engine="pyarrow", compression="snappy", index=True)

np.int64(39)

In [76]:
pdf_text = extract_text(os.path.join(config.path_to_data, 'pdf', '002dea70291555c99aa2d88b87db7e8d3ccad66a.pdf'))

def process_pdf(pdf_text):
    
    # Find sections. We’ll guess some common next headings for each section:
    abstract_text = find_section(pdf_text, "abstract", 
                                 ["\n\n"], max_chars=3000)
    introduction_text = find_section(pdf_text, "introduction", 
                                     ["methods", "methodology", "materials and methods", "results", "discussion"], max_chars=5000)
    discussion_text = find_section(pdf_text, "discussion", 
                                   ["conclusion", "references", "acknowledgements"], max_chars=5000)
    
    all_text = find_section(pdf_text, " ", 
                                   ["conclusion", "references", "acknowledgements"], max_chars=5000)
    
    return abstract_text, introduction_text, discussion_text, all_text

abstract_text, introduction_text, discussion_text, all_text = process_pdf(pdf_text)

In [124]:
censored_List = ['laboratory', 'received', 'accepted', 'corresponding']
pdf_text = all_text.split('\n') #pdf_text.split('.')
pdf_text = [txt for txt in pdf_text if len(txt.split()) > 3 and not any(censored.lower() in txt.lower().split() for censored in censored_List)]

# k = 0
# while k < len(pdf_text)-1:
#     if len(pdf_text[k].split()) < 150:
#         pdf_text[k] = pdf_text[k] + pdf_text[k+1]
#         pdf_text.pop(k+1)
#     else:
#         k +=1
[txt for txt in pdf_text]

['MOLECULAR AND CELLULAR BIOLOGY, May 2005, p. 4166–4175',
 'Copyright © 2005, American Society for Microbiology. All Rights Reserved.',
 'Vol. 25, No. 10',
 'Neuronal Leucine-Rich Repeat Protein 4 Functions in',
 'Takayoshi Bando,1 Keisuke Sekine,1 Shizuka Kobayashi,3 Ayako M. Watabe,3 Armin Rump,1',
 'Minoru Tanaka,1 Yoshikuni Suda,1 Shigeaki Kato,2 Yoshihiro Morikawa,4',
 'Toshiya Manabe,3 and Atsushi Miyajima1*',
 'Cellular Biosciences, University of Tokyo, Yayoi, Bunkyo-ku, Tokyo 113-0032, Japan; Division of Neuronal',
 'Network, Department of Basic Medical Sciences, Institute of Medical Science, University of Tokyo, 4-6-1',
 'Shirokanedai, Minato-ku, Tokyo 108-8639, Japan3; and Department of Anatomy and Neurobiology,',
 'Wakayama Medical University, 811-1 Kimidera, Wakayama 641-8509, Japan4',
 'Neuronal leucine-rich repeat proteins (NLRRs) are type I transmembrane proteins and expressed in',
 'neuronal tissues, but their function remains unknown. Here, we describe the identiﬁcati

In [126]:
import torch
from transformers import AutoTokenizer, AutoModel
from adapters import AutoAdapterModel

model_name = 'sentence-transformers/all-mpnet-base-v1' #'allenai/specter2'#
batch_size = 128
max_length = 512
Nsamples = len(pdf_text)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

if 'sentence-transformer' in model_name:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
elif 'specter2' in model_name:
    tokenizer = AutoTokenizer.from_pretrained('allenai/specter2_base')
    model = AutoAdapterModel.from_pretrained('allenai/specter2_base')
    model.load_adapter(model_name, source="hf", set_active=True)

model.to(device);
for param in model.parameters():
    param.requires_grad = False

if 'sentence-transformer' in model_name:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model_query = AutoModel.from_pretrained(model_name)
elif 'specter2' in model_name:
    tokenizer = AutoTokenizer.from_pretrained('allenai/specter2_base')
    model_query = AutoAdapterModel.from_pretrained('allenai/specter2_base')
    model_query.load_adapter("allenai/specter2_adhoc_query", source="hf", set_active=True)

model_query.to(device);
for param in model_query.parameters():
    param.requires_grad = False

In [127]:
embeddings = []
model.eval()
with torch.no_grad():
    with tqdm(range(Nsamples//batch_size + 1), desc=f'total of {Nsamples} papers', unit='batch') as pbar:
        for i in pbar:
            batch_text = pdf_text[i*batch_size:(i + 1)*batch_size]
            if batch_text:
                batch_tokens = tokenizer(batch_text, padding=True, truncation=True, return_tensors="pt", return_token_type_ids=False, max_length=max_length)
                for key in batch_tokens.keys():
                    batch_tokens[key] = batch_tokens[key].to(device)
                
                output = model(**batch_tokens)
                # first token in the batch as the embedding
                if 'pooler_output' in output.keys():
                    embeddings_batch = output.pooler_output.cpu().numpy().astype(np.float32)
                else:
                    embeddings_batch = output.last_hidden_state[:, 0, :].cpu().numpy().astype(np.float32)
                    
                embeddings.append(embeddings_batch)

total of 524 papers: 100%|██████████| 5/5 [00:00<00:00,  5.02batch/s]


In [128]:
embeddings = np.concat(embeddings, axis=0)

In [129]:
#normalize embeddings
embeddings_norm = embeddings / np.linalg.norm(embeddings,axis=-1, keepdims=True)

In [138]:
import numpy as np
import umap
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt

# Step 1: Reduce Dimensionality with UMAP (or PCA)
umap_reducer = umap.UMAP(n_components=2, metric='cosine', random_state=42)
reduced_embeddings = umap_reducer.fit_transform(embeddings_norm)

# Step 2: Apply DBSCAN Clustering
dbscan = DBSCAN(eps=0.5, min_samples=25, metric='euclidean')  # Tune eps and min_samples
labels = dbscan.fit_predict(reduced_embeddings)

# Step 3: Identify and Remove Outliers (DBSCAN assigns -1 to outliers)
filtered_embeddings = embeddings_norm[labels != -1]
filtered_labels = labels[labels != -1]
filtered_text = np.array(pdf_text)[labels != -1]

  warn(


In [139]:
print(filtered_text)

['Neuronal Leucine-Rich Repeat Protein 4 Functions in'
 'Neuronal leucine-rich repeat proteins (NLRRs) are type I transmembrane proteins and expressed in'
 'tion of a new member of the NLRR family, NLRR4, and its potential role in long-lasting memory. We generated'
 'ported, such as N-methyl-D-aspartate receptor, calcium-calm-'
 'In this study, we describe a novel type I transmembrane'
 'protein termed NLRR4 which exhibits similarity to neuronal'
 'leucine-rich repeat proteins NLRR1 to NLRR3. NLRR1 and'
 'NLRR2 are expressed in the developing nervous system (4, 14,'
 '28, 29). NLRR3 is induced by brain injury and regulated by'
 'However, their functions remain to be studied. To uncover the'
 'function of NLRR4 in vivo, we generated NLRR4-deﬁcient'
 'dase gene by homologous'
 'normal. These data indicate that NLRR4 is important for'
 'Expression cloning of NLRR4 cDNA. Expression cloning of a cDNA encoding'
 'FIG. 1. Structural properties of NLRR4. (a) The deduced amino acid sequences of

In [117]:
cluster_id=10
for k, txt in enumerate(filtered_text):
    if filtered_labels[k]==cluster_id:
        print(k, sum(labels==cluster_id))
        print(txt)
        print()
        print()


In [36]:
print(' '.join(pdf_text))

MOLECULAR AND CELLULAR BIOLOGY, May 2005, p 2005
Copyright © 2005, American Society for Microbiology  10

Neuronal Leucine-Rich Repeat Protein 4 Functions in
Hippocampus-Dependent Long-Lasting Memory
Takayoshi Bando,1 Keisuke Sekine,1 Shizuka Kobayashi,3 Ayako M  Watabe,3 Armin Rump,1
Minoru Tanaka,1 Yoshikuni Suda,1 Shigeaki Kato,2 Yoshihiro Morikawa,4
Toshiya Manabe,3 and Atsushi Miyajima1*
Laboratory of Cell Growth and Differentiation1 and Laboratory of Neuclear Signaling,2 Institute of Molecular and
Cellular Biosciences, University of Tokyo, Yayoi, Bunkyo-ku, Tokyo 113-0032, Japan; Division of Neuronal
Network, Department of Basic Medical Sciences, Institute of Medical Science, University of Tokyo, 4-6-1
Shirokanedai, Minato-ku, Tokyo 108-8639, Japan3; and Department of Anatomy and Neurobiology,
Wakayama Medical University, 811-1 Kimidera, Wakayama 641-8509, Japan4

Received 11 November 2004/Accepted 13 February 2005

Neuronal leucine-rich repeat proteins (NLRRs) are type I transme

In [50]:
query = "Neuronal leucine-rich repeat proteins (NLRRs) are type I transmembrane proteins and expressed in neuronal tissues"
query_input = tokenizer([query], padding=True, truncation=True,
                              return_tensors="pt", return_token_type_ids=False, max_length=max_length)

for key in query_input.keys():
    query_input[key] = query_input[key].to(device)
with torch.no_grad():
    output = model_query(**query_input)
    if 'pooler_output' in output.keys():
        embs = output.pooler_output.cpu().numpy()
    else:
        embs = output.last_hidden_state[:, 0, :].cpu().numpy()

#normalize embedding
embs_norm = np.median(embeddings, axis=0, keepdims=True)
embs_norm = embs / np.linalg.norm(embs,axis=-1, keepdims=True)

In [51]:
#Compute cos similarity and find topK
K = len(pdf_text)
similarity_matrix = np.einsum('bc,kc->bk', embs_norm, embeddings_norm)
topK_indices = np.argsort(-similarity_matrix, axis=-1)[:,:K]
topK_values = np.take_along_axis(similarity_matrix, topK_indices, axis=-1)

topK_indices = topK_indices[0]
topK_values = topK_values[0]

In [52]:
K

331

In [53]:
for k in topK_indices:
    if len(pdf_text[k].split(' '))>15:
        print(k)
        print(pdf_text[k])

22


In this study, we describe a novel type I transmembrane
protein termed NLRR4 which exhibits similarity to neuronal
leucine-rich repeat proteins NLRR1 to NLRR3
102
 Due
to the similarity to the previously identiﬁed neuronal leucine
rich-repeat proteins NLRR1 to NLRR3 (4, 14, 28, 29), we
named B61 NLRR4
155
 In the adult brain, NLRR4 was strongly
expressed in the hippocampus and weakly in the cerebellum
(Fig
311
 Since NLRR4 is highly expressed in the
hippocampus, it is possible that NLRR4 is involved in learning
and/or memory
324
 Axon of
CA1 pyramidal cells mainly extend to the layer V in entorhinal

cortex and NLRR4 is expressed in CA1, CA3, and dentate
gyrus in the hippocampus and in layers V and VI in the cortex
2
 10

Neuronal Leucine-Rich Repeat Protein 4 Functions in
Hippocampus-Dependent Long-Lasting Memory
Takayoshi Bando,1 Keisuke Sekine,1 Shizuka Kobayashi,3 Ayako M
4
 Here, we describe the identiﬁcation and characteriza-
tion of a new member of the NLRR family, NLRR4, a

In [87]:
filtered_metadata = pd.read_parquet(file_path + '_metadata.parquet', engine="pyarrow", filters=[("citationCount", ">", 100)])
filtered_metadata

Unnamed: 0,paperId,openAccessPdf,fieldsOfStudy,venue,authorName,authorId,citationCount,influentialCitationCount,publicationDate,publicationYear
1,000372290caae5482dcdd7954feb895ccbab921d,http://www.merl.com/publications/docs/TR2005-0...,"Computer Science,Mathematics",IEEE Transactions on Communications,"Juntan Zhang,M. Fossorier",319335048891339,319,26.0,2005-03-07,2005
5,0004b93367db8768d29c7e21bf410d55a30f5205,http://dl.acm.org/ft_gateway.cfm?id=1047480&ty...,Computer Science,Technical Symposium on Computer Science Education,"Susan Bergin,R. Reilly",1459851582688685,287,26.0,2005-02-23,2005
6,0004e56a361394b92b9ac1d15a524ef7ecfe2767,https://www.ahajournals.org/doi/pdf/10.1161/01...,"Medicine,Biology","Arteriosclerosis, Thrombosis and Vascular Biology","E. Egorina,M. Sovershaev,G. Bjørkøy,F. Gruber,...","8887769,4962786,11414707,50148632,33502527,139...",148,2.0,2005-07-01,2005
7,000646db4a4a269f9261f5b285ba1d07ef006809,https://europepmc.org/articles/pmc549252?pdf=r...,"Biology,Medicine",Antimicrobial Agents and Chemotherapy,"T. Levin,B. Suh,P. Axelrod,A. Truant,T. Fekete",242180701941629144104608496112849307125,179,10.0,2005-03-01,2005
23,000fd65de92d766ada2dab424f7f66e526e2f063,https://infoscience.epfl.ch/record/87259/files...,Computer Science,IEEE transactions on circuits and systems for ...,"A. Cavallaro,Olivier Steiger,T. Ebrahimi",14544015223590241681498,120,7.0,2005-10-01,2005
...,...,...,...,...,...,...,...,...,...,...
6964,07eb0e2775f4c22f59c7049474fe631921aaea1a,https://www.jneurosci.org/content/jneuro/26/1/...,"Psychology,Medicine",Journal of Neuroscience,"Julien Voisin,A. Bidet-Caulet,O. Bertrand,P. F...",1440923261401757187351793563195672,153,5.0,2006-01-04,2006
6966,07ec944496ff49fa128b004cf3cdeb302ae2c243,https://europepmc.org/articles/pmc3565218?pdf=...,"Biology,Medicine",Developmental Biology,"C. A. Whittaker,K. Bergeron,J. Whittle,B. Bran...","49692462,66505346,4519819,5628946,34842592,355...",179,13.0,2006-12-01,2006
6969,07ef49f1a7de858badbab788dc93f027fdf91980,https://academic.oup.com/endo/article-pdf/147/...,"Biology,Medicine",Endocrinology,"Wan Huang,N. Dedousis,Archana Bandi,G. Lopasch...",4750458136505434058745648299391397132755,121,3.0,2006-03-01,2006
6975,07f45af2cd5df5ae6795fe1b6c0fabef0d543b63,https://europepmc.org/articles/pmc1797767?pdf=...,"Biology,Medicine",Antimicrobial Agents and Chemotherapy,"R. Shandil,R. Jayaram,P. Kaur,S. Gaonkar,B. L....","152540199,13981673,2192681880,34197093,4017113...",212,16.0,2006-12-04,2006


In [93]:
filtered_ids = filtered_metadata.index.tolist()
filtered_text = pd.read_parquet(file_path + '_text.parquet', engine="pyarrow", filters=[("__index_level_0__", "in", filtered_ids)])
filtered_text

Unnamed: 0,paperId,title,abstract,openAccessPdf
1,000372290caae5482dcdd7954feb895ccbab921d,Shuffled iterative decoding,Shuffled versions of iterative decoding of low...,http://www.merl.com/publications/docs/TR2005-0...
5,0004b93367db8768d29c7e21bf410d55a30f5205,Programming: factors that influence success,"This paper documents a study, carried out in t...",http://dl.acm.org/ft_gateway.cfm?id=1047480&ty...
6,0004e56a361394b92b9ac1d15a524ef7ecfe2767,Intracellular and Surface Distribution of Mono...,Objective—The high and low responder phenomeno...,https://www.ahajournals.org/doi/pdf/10.1161/01...
7,000646db4a4a269f9261f5b285ba1d07ef006809,Potential Clindamycin Resistance in Clindamyci...,ABSTRACT The erm gene product confers clindamy...,https://europepmc.org/articles/pmc549252?pdf=r...
23,000fd65de92d766ada2dab424f7f66e526e2f063,Semantic video analysis for adaptive content d...,We present an encoding framework which exploit...,https://infoscience.epfl.ch/record/87259/files...
...,...,...,...,...
6964,07eb0e2775f4c22f59c7049474fe631921aaea1a,Listening in Silence Activates Auditory Areas:...,Directing attention to some acoustic features ...,https://www.jneurosci.org/content/jneuro/26/1/...
6966,07ec944496ff49fa128b004cf3cdeb302ae2c243,The echinoderm adhesome.,,https://europepmc.org/articles/pmc3565218?pdf=...
6969,07ef49f1a7de858badbab788dc93f027fdf91980,Liver triglyceride secretion and lipid oxidati...,Leptin has potent lipid-lowering effects in pe...,https://academic.oup.com/endo/article-pdf/147/...
6975,07f45af2cd5df5ae6795fe1b6c0fabef0d543b63,"Moxifloxacin, Ofloxacin, Sparfloxacin, and Cip...",ABSTRACT Members of the fluoroquinolone class ...,https://europepmc.org/articles/pmc1797767?pdf=...


In [35]:
print(max_itemsize)
str_col = articles.columns[articles.dtypes==object]

max_len = articles['authorName'].apply(lambda x: len(x.encode('utf-8')[:4785].decode('utf-8')) if isinstance(x, str) else 0)
for i in range(len(articles)):
    if max_len.iloc[i] >=4790:
        print(articles['authorName'].iloc[i])
    # max_bytes = articles[col].apply(lambda x: len(x.encode('utf-8')) if isinstance(x, str) else 0).max()
    # print(articles[col].astype(str).str.strip().str.encode('utf-8', 'ignore').str.decode('utf-8').str.len().max(), max_bytes)

{'title': np.int64(450), 'abstract': np.int64(15000), 'openAccessPdf': np.int64(639), 'fieldsOfStudy': np.int64(91), 'venue': np.int64(301), 'authorName': np.int64(4785), 'authorId': np.int64(3916), 'publicationDate': np.int64(15)}


In [None]:
fields2return = 'influentialCitationCount,authors.affiliations,references.paperId,embedding.specter_v2'
r = requests.post('https://api.semanticscholar.org/graph/v1/paper/batch',
                            params={'fields': fields2return},
                            json={"ids": articles.paperId[:500].to_list()}
                        ).json()

In [None]:
r = requests.post(
    'https://api.semanticscholar.org/graph/v1/paper/batch',
    params={'fields': 'title,abstract,venue,publicationVenue,year,citationCount,influentialCitationCount,openAccessPdf,authors.name,authors.affiliations,authors.paperCount,authors.citationCount,authors.hIndex,embedding.specter_v2,references.paperId'},
    json={"ids": ['10.1016/j.cub.2020.07.006']}
).json()

In [50]:
data_path = os.path.join(config.path_to_data, 'articles_semnatics.h5')
with pd.HDFStore(data_path, mode="r") as store:
    data = store.select("articles/main")#, where="publicationYear == 2022")
display(data)


Unnamed: 0_level_0,title,abstract,openAccessPdf,fieldsOfStudy,venue,authorName,authorId,citationCount,influentialCitationCount,publicationDate,publicationYear
paperId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0001d5fbff6f7c763a78dd5d141292416dcfae59,Disruption of Fibroblast Growth Factor Signal ...,Purpose: Synovial sarcoma is a soft tissue sar...,https://aacrjournals.org/clincancerres/article...,"Biology,Medicine",Clinical Cancer Research,"T. Ishibe,T. Nakayama,T. Okamoto,T. Aoyama,K. ...","14839966,10239302,2000448727,3201672,7147569,2...",88,6.0,2005-04-01,2005
000372290caae5482dcdd7954feb895ccbab921d,Shuffled iterative decoding,Shuffled versions of iterative decoding of low...,http://www.merl.com/publications/docs/TR2005-0...,"Computer Science,Mathematics",IEEE Transactions on Communications,"Juntan Zhang,M. Fossorier",319335048891339,319,26.0,2005-03-07,2005
000376ec6f6ade6261fc12df9eddd25a88c1e9ca,Diphasic effects of Astragalus membranaceus BU...,This study was designed to investigate the eff...,https://www.jstage.jst.go.jp/article/bpb/28/8/...,Medicine,Biological and Pharmaceutical Bulletin,"Bi-qi Zhang,Shen-Jiang Hu,Lihong Qiu,Q. Shan,J...","11809650,6617669,123635715,11305673,2152146096...",24,2.0,2005-08-01,2005
0003a5747ee3676be1b6d2c72c13a04e41811274,Critical roles of CD146 in zebrafish vascular ...,"In this report, we use zebrafish as a model sy...",https://onlinelibrary.wiley.com/doi/pdfdirect/...,"Biology,Medicine",Developmental Dynamics,"Barden Chan,S. Sinha,Dan Cho,R. Ramchandran,V....",356909232114125286211325257421347733658233,53,2.0,2005-01-01,2005
00043e28cffe744909bf69cde70e6dac0bce0a10,Differential Contribution of Troponin I Phosph...,Cardiac troponin I is a phosphorylation target...,http://www.jbc.org/article/S0021925820589394/pdf,"Chemistry,Medicine",Journal of Biological Chemistry,"M. Westfall,A. M. Lee,D. Robinson",50259834924991148706510,70,3.0,2005-12-16,2005
...,...,...,...,...,...,...,...,...,...,...,...
ff32bd5f65db2b468501ee31d43600ef7140c90a,Escherichia coli O157 outbreak associated with...,SUMMARY A family cluster of three cases of Esc...,https://www.cambridge.org/core/services/aop-ca...,"Biology,Medicine",Epidemiology and Infection,"E. Espié,V. Vaillant,P. Mariani‐Kurkdjian,F. G...","32722639,144570095,1395683780,6424901,14229538...",129,2.0,2005-07-22,2005
ff342110ddcce74ca03a8f133a3f3ebd94ae5791,Induction of fibroblast growth factor-9 and in...,Motorcycle exhaust particulates (MEP) contain ...,https://academic.oup.com/toxsci/article-pdf/87...,"Biology,Medicine",Toxicological Sciences,"T. Ueng,Chia-Chi Hung,M. Kuo,P. Chan,Shih-Hsiu...","3850380,2064328719,49201256,87085105,2191091,2...",29,2.0,2005-10-01,2005
ff344be182b082370516662b41ace844028fde1f,A role for the CPF 3'-end processing machinery...,The prevailing view of the RNA polymerase II (...,http://genesdev.cshlp.org/content/19/24/2969.f...,"Medicine,Biology",Genes & Development,"A. Ansari,M. Hampsey",383240902899685,272,11.0,2005-12-15,2005
ff348f4d4f4c0e26ae13329d3e709bb66796ba1c,Direction Selectivity in the Goldfish Tectum R...,Abstract: Responses of direction‐selective (DS...,http://iitp.ru/upload/publications/328/DSGCNYA...,"Biology,Medicine",Annals of the New York Academy of Sciences,"V. Maximov,E. Maximova,P. Maximov",3473460344210306621364,42,3.0,2005-06-01,2005


In [51]:
data['influentialCitationCount'].dtypes

dtype('float64')