In [1]:
import os, glob, shutil
import sys

#Import config file. Update config.py according to your environment
from config import path_to_data

import pandas as pd
import numpy as np
from scipy import stats

import requests, json, re

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from urllib3.exceptions import ReadTimeoutError
from selenium.common.exceptions import TimeoutException, WebDriverException

from pdfminer.high_level import extract_text, extract_pages
from pdfminer.layout import LTTextContainer, LTChar, LTTextLine, LTTextBox

from tqdm import tqdm
import time
from datetime import datetime
import multiprocessing

In [2]:
# Prefix of the parquet file names read and written by this notebook
# (e.g. '<dataset_source>_metadata_<year>.parquet').
dataset_source = 'semanticsscholar'  # alternative value: 'arxiv'

In [3]:
def load_dataset(dataset_source='semanticsscholar', years=None, data_types=None):
    """Load the per-year metadata and/or text parquet files of a dataset.

    Files are looked up under ``path_to_data`` as
    ``<kind>/<year>/<dataset_source>_<kind>_<year>.parquet`` where ``kind`` is
    ``metadata`` or ``text``. Years whose file does not exist are skipped and
    the years actually loaded are printed.

    Args:
        dataset_source: Prefix of the parquet file names.
        years: Either a single year (int or str) or a collection of years
            (list, tuple, set, frozenset or range). A falsy scalar such as
            ``None`` selects every all-digit sub-directory of
            ``<path_to_data>/metadata``, in sorted order. An empty collection
            selects no year at all.
        data_types: Collection containing ``'metadata'`` and/or ``'text'``;
            defaults to both when falsy.

    Returns:
        A one-shot generator yielding the concatenated metadata frame, then the
        concatenated text frame, omitting whichever is empty. Unpacking into two
        names therefore only works when both kinds were found.

    Raises:
        ValueError: If both kinds were requested and their row counts differ.
    """
    if isinstance(years, (list, tuple, set, frozenset, range)):
        # Any collection of years is accepted, not only lists; a tuple or range
        # must not be stringified as a whole.
        years = [str(year) for year in years]
    elif years:
        years = [str(years)]
    else:
        metadata_root = os.path.join(path_to_data, 'metadata')
        # os.listdir returns entries in arbitrary order; sort so that the
        # concatenation order is reproducible.
        years = sorted(entry for entry in os.listdir(metadata_root) if entry.isdigit())
    data_types = data_types or ['metadata', 'text']

    load_metadata = 'metadata' in data_types
    load_textdata = 'text' in data_types

    metadata, textdata = [], []
    metadata_years, textdata_years = [], []

    for year in years:
        if load_metadata:
            metadata_path = os.path.join(path_to_data, 'metadata', year, f'{dataset_source}_metadata_{year}.parquet')
            if os.path.isfile(metadata_path):
                metadata.append(pd.read_parquet(metadata_path, engine="pyarrow"))
                metadata_years.append(year)

        if load_textdata:
            textdata_path = os.path.join(path_to_data, 'text', year, f'{dataset_source}_text_{year}.parquet')
            if os.path.isfile(textdata_path):
                textdata.append(pd.read_parquet(textdata_path, engine="pyarrow"))
                textdata_years.append(year)

    # An empty list stands in for "nothing loaded" so that len() works below.
    metadata = pd.concat(metadata, axis=0) if metadata else []
    textdata = pd.concat(textdata, axis=0) if textdata else []

    msg_parts = []
    if load_metadata:
        msg_parts.append(f'metadata loaded for years: {metadata_years}')
    if load_textdata:
        msg_parts.append(f'text data loaded for years: {textdata_years}')

    if msg_parts:
        print("; ".join(msg_parts))

    if load_metadata and load_textdata and len(metadata) != len(textdata):
        raise ValueError("Metadata and text data don't have the same length.")

    output = (data for data in [metadata, textdata] if len(data) > 0)
    return output

In [None]:
def pdf_main_fontsize(pdf_path):
    """Return the most frequent character size in a PDF, or 0 when none is found.

    Walks every page returned by ``extract_pages`` and collects the ``size``
    attribute of each ``LTChar`` located in an ``LTTextLine`` of an
    ``LTTextContainer``; the mode of those sizes is returned.
    """
    char_sizes = []
    for layout in extract_pages(pdf_path):
        for container in layout:
            if not isinstance(container, LTTextContainer):
                continue
            for line in container:
                if not isinstance(line, LTTextLine):
                    continue
                for glyph in line:
                    if isinstance(glyph, LTChar):
                        char_sizes.append(glyph.size)

    if not char_sizes:
        return 0

    mode_values = stats.mode(char_sizes, keepdims=True)[0]
    return mode_values[0]

def extract_pdf_sections(pdf_path, authorlist, paperId, possible_section_headings=None, size_threshold=None):
    """Split the text of a paper PDF into tagged blocks and group them by section.

    The PDF is read text box by text box. Boxes close to the page border, date
    headers, captions and orphan short headers are discarded; the remaining
    boxes are tagged as ABSTRACT, INTRODUCTION, a detected section heading, or
    UNDEFINED. Reading stops at the first box whose first three words contain an
    acknowledgment/reference marker.

    Args:
        pdf_path: Path of the PDF file, passed to pdfminer's ``extract_pages``.
        authorlist: Comma-separated author names. The last word of each name is
            searched on the first two pages to locate the author block. A
            non-string value or empty names are tolerated and simply yield no
            author detection.
        paperId: Identifier copied unchanged into the returned dictionary.
        possible_section_headings: Lower-case substrings identifying a section
            heading; a built-in set is used when falsy.
        size_threshold: Minimum ratio between a box's largest character size and
            the document's most common size for the box to be kept (boxes with
            more than 50 words are kept regardless). Defaults to 0.9.

    Returns:
        A tuple ``(sections, text_blocks, tag_blocks)`` where ``sections`` is a
        dict with keys ``paperId``, ``pdf_abstract``, ``pdf_introduction``,
        ``pdf_results``, ``pdf_discussion``, ``pdf_methods``, ``pdf_text`` and
        ``author_list``; ``text_blocks`` is the list of kept block texts and
        ``tag_blocks`` the parallel list of their tags.
    """
    # Last whitespace-separated token of every author name. Missing author lists
    # and empty names (e.g. a trailing comma) are skipped instead of raising.
    author_names = authorlist.split(',') if isinstance(authorlist, str) else []
    author_last_names = [tokens[-1] for tokens in (name.split() for name in author_names) if tokens]
    last_section_names = {'acknowledgment', 'acknowledgement', 'acknowlegment', 'reference'}
    month_markers = {'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september',
                     'october', 'november', 'december',
                     'jan', 'feb', 'mar', 'apr', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec'}
    year_markers = {str(y) for y in range(1900, 2030)}
    section_list = possible_section_headings or {
        'introduction', 'results', 'discussion', 'conclusions', 'methods', 'materials', 'experimental',
        'materials and methods', 'experimental procedure', 'related work', 'i.', 'ii.', 'iii.', 'iv.', 'v.', 'vi.'
    }

    size_threshold = size_threshold if size_threshold is not None else 0.9

    # 595 x 842 matches an A4 page in PDF points; the margin filter below uses
    # these fixed dimensions whatever the actual page size is.
    page_width, page_height = 595, 842
    nword_abstract_th, nword_sections_th = 30, 30

    #Determine the most common font size (mode)
    size_mode = pdf_main_fontsize(pdf_path)

    text_blocks, tag_blocks, nwords_in_blocks = [], [], []
    section_to_find, section_heading, tag = 'AUTHORS', 'UNDEFINED', 'UNDEFINED'
    reached_end = False
    for p, page_layout in enumerate(extract_pages(pdf_path)):
        if reached_end:
            break

        for element in page_layout:
            if not isinstance(element, LTTextBox):
                continue

            # Keep only boxes fully inside the central 90% of the page.
            x0, y0, x1, y1 = element.bbox
            if not (y0 > 0.05*page_height and y1 < 0.95*page_height and x0 > 0.05*page_width and x1 < 0.95*page_width):
                continue

            filtered_text, sizes = [], []
            for text_line in element:
                if isinstance(text_line, LTTextLine):
                    line_text = [character.get_text() for character in text_line]
                    sizes.extend([character.size for character in text_line if isinstance(character, LTChar)])
                    line_str = "".join(line_text).strip()
                    if not re.fullmatch(r"\d+", line_str): # Keeps lines that are NOT just numbers
                        filtered_text.append(line_str)
                    filtered_text.append("\n") # Preserve line breaks

            #joining characters in a single text while removing weird markers generated by pdfminer
            filtered_text = re.sub(r'\(cid:[^\)]+\)', '', "".join(filtered_text).strip())
            word_list = re.split(r'[\n\s]+', filtered_text.lower().strip())
            nwords = len(word_list)

            if any(end_section in ' '.join(word_list[:3]) for end_section in last_section_names):
                # Nothing after this point is kept, so stop scanning the page
                # right away: later boxes must not pop or retag earlier blocks.
                reached_end = True
                break

            #removing everything before the author block as well as the correspondance fields
            if p <= 1 and author_last_names:
                block_words = ' '.join(word_list)
                nauthors_detected = sum(lastname.lower() in block_words for lastname in author_last_names)
                if nauthors_detected >= 0.5 * len(author_last_names) and y0 > 0.3 * page_height:
                    text_blocks, tag_blocks, nwords_in_blocks = [], [], []
                    section_to_find = 'ABSTRACT'
                    filtered_text = ''
                if nauthors_detected > 0 and '@' in filtered_text:
                    filtered_text = ''

            #removing blocks likely headers with publication date
            if any(m in word_list for m in month_markers) and any(y in word_list for y in year_markers) and nwords < 10:
                continue

            #removing figure captions (substring match on the first word of the block)
            if any(figname in word_list[0] for figname in ['fig', 'figure', 'table', 'image']):
                continue

            #removing previous block if likely a header but not followed by capitalized paragraph
            if filtered_text and not filtered_text[0].isupper() and nwords_in_blocks and nwords_in_blocks[-1] <= 3:
                text_blocks.pop()
                tag_blocks.pop()
                nwords_in_blocks.pop()
            elif (filtered_text and filtered_text.strip() and filtered_text.strip()[0].isupper() and nwords_in_blocks and
                  nwords_in_blocks[-1] <= 3 and text_blocks and any(h in re.sub(r'[\n\s]+', ' ', text_blocks[-1].lower()) for h in section_list)):
                # The previous short block is a known heading: it becomes the
                # current section name, upper-cased with digits and dots removed.
                section_heading = re.sub(r'[\d.]', '', text_blocks[-1]).upper()
                if nwords > nword_sections_th:
                    tag_blocks[-1] = section_heading

            if filtered_text and (max(sizes, default=0) >= size_threshold * size_mode or nwords > 50):
                if section_to_find == 'ABSTRACT' and nwords > nword_abstract_th:
                    tag = 'ABSTRACT'
                    # The abstract is considered finished once a block ends with a period.
                    if word_list[-1][-1] == '.':
                        section_to_find = 'INTRODUCTION'
                elif section_to_find == 'INTRODUCTION':
                    if nwords > nword_sections_th:
                        tag = 'INTRODUCTION'
                        section_heading, section_to_find = 'INTRODUCTION', 'NEXTHEADING'
                    else:
                        tag = 'UNDEFINED'
                elif section_to_find == 'NEXTHEADING' and nwords > nword_sections_th:
                    tag = section_heading

                text_blocks.append(filtered_text)
                tag_blocks.append(tag)
                nwords_in_blocks.append(nwords)

    def _join_tagged(markers):
        # Concatenate the blocks whose lower-cased tag contains one of the markers.
        return '\n'.join(text for t, text in zip(tag_blocks, text_blocks) if any(s in t.lower() for s in markers))

    # NOTE(review): headings have their dots stripped before being used as tags,
    # so the dotted markers below ('i.', 'ii.', 'v.', 'vi.') never match a tag.
    sections = {'paperId': paperId,
            'pdf_abstract': _join_tagged({'abstract'}),
            'pdf_introduction': _join_tagged({'introduction', 'related work', 'i.', 'ii.'}),
            'pdf_results': _join_tagged({'results', 'experiment', 'i.', 'ii.'}),
            'pdf_discussion': _join_tagged({'discussion', 'conclusion', 'v.', 'vi.'}),
            'pdf_methods': _join_tagged({'methods', 'materials', 'experimental', 'materials and methods', 'experimental procedure'}),
            'pdf_text': '\n'.join(text for t, text in zip(tag_blocks, text_blocks) if 'undefined' not in t.lower()),
            'author_list': authorlist
            }

    return sections, text_blocks, tag_blocks

In [None]:
def download_pdfs(inputs):
    """Download the open-access PDF of each paper with Chrome and extract its sections.

    ChromeDriver must be installed at ``/usr/bin/chromedriver``.

    Args:
        inputs: Tuple ``(txtdata, downloads_dir)``. ``txtdata`` is a list of
            dicts with keys ``paperId``, ``openAccessPdf`` and ``authorName``.
            ``downloads_dir`` is the working directory for this batch and
            defaults to ``./data/pdf/downloads`` when falsy. It is deleted
            before returning.

    Returns:
        A list with one dict per paper, in input order. A successful record is
        the dictionary returned by ``extract_pdf_sections`` plus ``problem``
        set to ``None``. A failed record has every ``pdf_*`` field set to
        ``None`` and ``problem`` set to ``'captcha'`` (no PDF obtained in time
        or blocking page detected), ``'broken link'`` (missing URL or browser
        error) or ``'pdf parsing error'`` (the downloaded file could not be
        processed).

    Raises:
        ImportError: If more than one PDF shows up in a paper's download
            directory. NOTE(review): the exception type is unusual for this
            situation but is kept for compatibility.
    """
    txtdata, downloads_dir = inputs
    # Absolute path so that Chrome, glob and this process agree on the location
    # whatever the working directory of the driver is.
    downloads_dir = os.path.abspath(downloads_dir or './data/pdf/downloads')
    os.makedirs(downloads_dir, exist_ok=True)

    timeout_seconds = 10
    timeout_startdw_seconds = 5
    block_markers = {"captcha", "verify you are human", "forbidden", "this site can’t be reached"}

    def _failed_record(paper, problem):
        # Placeholder record for a paper whose PDF could not be processed.
        return {'paperId': paper['paperId'], 'author_list': paper['authorName'], 'pdf_text': None, 'pdf_abstract': None,
                'pdf_introduction': None, 'pdf_discussion': None, 'pdf_results': None, 'pdf_methods': None,
                'problem': problem}

    def _wait_for_pdf(driver, download_dir, k, clean_url):
        # Poll download_dir until a PDF appears; return its path or None on failure.
        end_time = time.time() + timeout_seconds
        end_time_startdw = time.time() + timeout_startdw_seconds
        while time.time() < end_time:
            time.sleep(0.1)
            # Each paper downloads into its own directory, so any PDF found
            # here belongs to this paper; glob already returns the full path.
            found = glob.glob(os.path.join(download_dir, '*.pdf'))
            if found:
                if len(found) > 1:
                    raise ImportError('more than one download to process', found)
                return found[0]
            page_source = driver.page_source.lower()
            if any(marker in page_source for marker in block_markers):
                return None
            if time.time() > end_time_startdw and not glob.glob(os.path.join(download_dir, '*.crdownload')):
                print(f"Paper {k}: Download not started after {timeout_startdw_seconds} seconds for {clean_url}.")
                return None
        print(f"Paper {k}: Download not finished after {timeout_seconds} seconds for {clean_url}.")
        return None

    service = Service(executable_path="/usr/bin/chromedriver")
    options = webdriver.ChromeOptions()
    prefs = {
        "plugins.always_open_pdf_externally": True,
        "download.prompt_for_download": False,
        "download.default_directory": downloads_dir,
        "download.directory_upgrade": True
    }
    options.add_experimental_option("prefs", prefs)

    driver = webdriver.Chrome(service=service, options=options)
    text_pdfs = []
    fraction_print = 0

    try:
        for k, paper in enumerate(txtdata):
            raw_url = paper['openAccessPdf']
            if not isinstance(raw_url, str):
                # No URL to open (e.g. None): nothing can be downloaded.
                print(f"Paper {k}: Error loading {raw_url}")
                pdf_data = _failed_record(paper, 'broken link')
            else:
                clean_url = re.sub(r'^http://', 'https://', raw_url)
                custom_download_dir = os.path.join(downloads_dir, paper['paperId'])
                try:
                    os.makedirs(custom_download_dir, exist_ok=True)

                    # Set Chrome's download directory dynamically
                    driver.execute_cdp_cmd(
                        "Page.setDownloadBehavior",
                        {"behavior": "allow",
                         "downloadPath": custom_download_dir}
                    )
                    driver.set_page_load_timeout(timeout_seconds)
                    driver.get(clean_url)
                    pdf_path = _wait_for_pdf(driver, custom_download_dir, k, clean_url)
                except (TimeoutException, WebDriverException):
                    print(f"Paper {k}: Error loading {clean_url}")
                    pdf_data = _failed_record(paper, 'broken link')
                else:
                    if pdf_path:
                        try:
                            pdf_data, _, _ = extract_pdf_sections(pdf_path, paper['authorName'], paper['paperId'])
                            pdf_data['problem'] = None
                        except Exception as exc:
                            # One unreadable PDF must not abort the whole batch.
                            print(f"Paper {k}: Could not parse downloaded PDF for {clean_url}: {exc}")
                            pdf_data = _failed_record(paper, 'pdf parsing error')
                        finally:
                            shutil.rmtree(custom_download_dir)
                    else:
                        pdf_data = _failed_record(paper, 'captcha')

            text_pdfs.append(pdf_data)
            fraction_done = (k + 1) / len(txtdata) * 100
            if fraction_done > fraction_print:
                print(f"{fraction_done:.0f}% of papers processed in {downloads_dir}")
                fraction_print = fraction_print + 20
    finally:
        # Always release the browser, even when an unexpected error propagates.
        driver.quit()

    #Extracting pdfs from downloads that took too long to be processed immediately
    if os.path.isdir(downloads_dir):
        for dirname in os.listdir(downloads_dir):
            dir_path = os.path.join(downloads_dir, dirname)
            late_pdfs = glob.glob(os.path.join(dir_path, '*.pdf'))
            if not late_pdfs:
                if os.path.isdir(dir_path):
                    shutil.rmtree(dir_path)
                continue

            paper_data = next((record for record in text_pdfs if record['paperId'] == dirname), None)
            if paper_data:
                try:
                    pdf_data, _, _ = extract_pdf_sections(late_pdfs[0], paper_data['author_list'], paper_data['paperId'])
                except Exception as exc:
                    print(f"Paper {dirname}: Could not parse downloaded PDF: {exc}")
                    paper_data['problem'] = 'pdf parsing error'
                else:
                    paper_data.update(pdf_data)
                    paper_data['problem'] = None

        shutil.rmtree(downloads_dir)

    return text_pdfs

In [None]:
# Download and parse the PDFs of every paper, one year at a time, and store the
# text data enriched with the PDF sections next to the original text parquet.
start_year = 2020
end_year = 2025
num_processes = 10
batch_size = 100 #len(textdata) // num_processes
# NOTE(review): machine-specific absolute path; consider deriving it from path_to_data.
download_basedir = '/home/jul/DST/recoRAG/data/pdf'

for year in range(start_year, end_year+1):
    print(f"Downloading pdfs for year {year}")
    metadata, textdata = load_dataset(dataset_source, years=year)
    data4pdf = pd.merge(textdata[['paperId','openAccessPdf']], metadata[['paperId','authorName']], on='paperId', how='left')

    # One (records, working directory) pair per batch; the directory name is the
    # batch index. Stepping through the merged frame never yields empty batches.
    records = data4pdf.to_dict(orient='records')
    textdata_list = [
        (records[start:start + batch_size], os.path.join(download_basedir, str(start // batch_size)))
        for start in range(0, len(records), batch_size)
    ]
    with multiprocessing.Pool(processes=num_processes) as pool:
        results = list(pool.imap(download_pdfs, textdata_list))

    pdf_texts = pd.DataFrame([pdf for result in results for pdf in result])
    textdata_pdf = pd.merge(left=textdata, right=pdf_texts, on='paperId', how='left')

    # Fill abstracts stored as the string 'None' with the abstract extracted from
    # the PDF. A single .loc assignment is required: chained indexing
    # (df[mask]['abstract'] = ...) writes to a temporary copy and changes nothing.
    # Rows without an extracted abstract keep their 'None' placeholder.
    pdf_abstract = textdata_pdf['pdf_abstract']
    fill_mask = (textdata_pdf['abstract'] == 'None') & pdf_abstract.notna() & (pdf_abstract != '')
    textdata_pdf.loc[fill_mask, 'abstract'] = pdf_abstract[fill_mask]

    filepath = os.path.join(path_to_data, 'text', str(year), f'{dataset_source}_textpdf_{year}.parquet')
    textdata_pdf.to_parquet(filepath, engine="pyarrow", compression="snappy", index=True)

In [68]:
# Number of rows of `textdata` whose abstract is the literal string 'None'.
textdata['abstract'].eq('None').sum()

np.int64(76016)

In [None]:
# Ad-hoc run of the section extraction on a single local PDF; the second and
# third results are the kept text blocks and their tags.
pdf_data, pdf_sections, pdf_tags = extract_pdf_sections(
    os.path.join(path_to_data, 'pdf', '0', 'PIIS009286740500098X (1).pdf'),
    'I. Krylova,E. Sablin,Jamie M. R. Moore,R. Xu,G. Waitt,J. A. MacKay,D. Juzumiene,J. Bynum,K. Madauss,V. Montana,L. Lebedeva,M. Suzawa,Jon D. Williams,Shawn P. Williams,R. Guy,J. W. Thornton,R. Fletterick,T. Willson,H. Ingraham',
    0,
)