In [1]:
import os, glob, shutil
import sys

#Import config file. Update config.py according to your environment
from config import path_to_data

import pandas as pd
import numpy as np
from scipy import stats

import requests, json, re

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from urllib3.exceptions import ReadTimeoutError
from selenium.common.exceptions import TimeoutException, WebDriverException

from pdfminer.high_level import extract_text, extract_pages
from pdfminer.layout import LTTextContainer, LTChar, LTTextLine, LTTextBox

from tqdm import tqdm
import time
from datetime import datetime
import multiprocessing
import operator
import tempfile

In [2]:
def load_dataset(path_to_data, dataset_source='semanticsscholar', years=None, data_types=None):
    """Load the per-year parquet tables (metadata and/or text) stored under ``path_to_data``.

    Files are looked up at
    ``<path_to_data>/<kind>/<year>/<dataset_source>_<suffix>_<year>.parquet`` with
    ``(kind, suffix)`` in ``('metadata', 'metadata')``, ``('text', 'text')`` and
    ``('pdf', 'textpdf')``.

    Parameters
    ----------
    path_to_data : str
        Root directory of the dataset.
    dataset_source : str
        Prefix of the parquet file names.
    years : int, str, list or tuple, optional
        Year(s) to load. When omitted, every all-digit sub-directory of
        ``<path_to_data>/metadata`` is used, in sorted order.
    data_types : str, list or tuple, optional
        Any of ``'metadata'``, ``'text'``, ``'pdf'`` (default ``['metadata', 'text']``).
        ``'pdf'`` also loads the text table and overlays the pdf table on it, keyed
        by ``paperId``; if a year has no pdf file its text file is used instead.

    Returns
    -------
    generator
        Yields the metadata table then the text table, skipping any that is empty,
        so ``metadata, textdata = load_dataset(...)`` works when both were found.

    Raises
    ------
    ValueError
        If metadata and text were both requested but have different lengths.
    """
    if isinstance(years, (list, tuple)):
        years = [str(year) for year in years]
    elif years:
        years = [str(years)]
    else:
        # Sorted so the row order of the concatenated tables does not depend on
        # the arbitrary order returned by os.listdir.
        years = sorted(
            year for year in os.listdir(os.path.join(path_to_data, 'metadata')) if year.isdigit()
        )

    data_types = data_types or ['metadata', 'text']
    if not isinstance(data_types, (list, tuple)):
        data_types = [data_types]

    load_metadata = 'metadata' in data_types
    load_textdata = 'text' in data_types or 'pdf' in data_types
    load_pdfdata = 'pdf' in data_types

    metadata, textdata, pdfdata = [], [], []
    metadata_years, textdata_years, pdfdata_years = [], [], []

    for year in years:
        if load_metadata:
            metadata_path = os.path.join(path_to_data, 'metadata', year, f'{dataset_source}_metadata_{year}.parquet')
            if os.path.isfile(metadata_path):
                metadata.append(pd.read_parquet(metadata_path, engine="pyarrow"))
                metadata_years.append(year)
            else:
                print(f"No metadata for year {year}")

        if load_textdata:
            textdata_path = os.path.join(path_to_data, 'text', year, f'{dataset_source}_text_{year}.parquet')
            if os.path.isfile(textdata_path):
                textdata.append(pd.read_parquet(textdata_path, engine="pyarrow"))
                textdata_years.append(year)
            else:
                print(f"No text data for year {year}")

        if load_pdfdata:
            pdfdata_path = os.path.join(path_to_data, 'pdf', year, f'{dataset_source}_textpdf_{year}.parquet')
            if not os.path.isfile(pdfdata_path):
                # Fall back to the plain text table for this year.
                pdfdata_path = os.path.join(path_to_data, 'text', year, f'{dataset_source}_text_{year}.parquet')
                print(f"No pdf data for year {year}. Will try to load text data instead")
            if os.path.isfile(pdfdata_path):
                pdfdata.append(pd.read_parquet(pdfdata_path, engine="pyarrow"))
                pdfdata_years.append(year)
            else:
                print(f"No pdf or text data for year {year}")

    # Each name becomes a single DataFrame, or stays an empty list when nothing was found.
    metadata = pd.concat(metadata, axis=0, ignore_index=True) if metadata else []
    textdata = pd.concat(textdata, axis=0, ignore_index=True) if textdata else []
    pdfdata = pd.concat(pdfdata, axis=0, ignore_index=True) if pdfdata else []

    msg_parts = []
    if load_metadata:
        msg_parts.append(f'metadata loaded for years: {metadata_years}')
    if load_textdata:
        msg_parts.append(f'text data loaded for years: {textdata_years}')
    if load_pdfdata:
        msg_parts.append(f'pdf data loaded for years: {pdfdata_years}')

    if msg_parts:
        print("; ".join(msg_parts))

    if load_pdfdata:
        has_pdf = isinstance(pdfdata, pd.DataFrame)
        has_text = isinstance(textdata, pd.DataFrame)
        if has_pdf and has_text:
            # pdf values take priority; text values fill the gaps.
            textdata = pdfdata.set_index("paperId").combine_first(textdata.set_index("paperId")).reset_index(drop=False)
        elif has_pdf:
            # Only pdf files were found: there is nothing to overlay them on.
            textdata = pdfdata
        # When neither exists, textdata stays an empty list and is dropped below.

    if load_metadata and load_textdata and len(metadata) != len(textdata):
        raise ValueError("Metadata and text data don't have the same length.")

    output = (data for data in [metadata, textdata] if len(data) > 0)
    return output

In [3]:
def pdf_main_fontsize(pdf_path):
    """Return the most frequent character size found in the PDF at ``pdf_path``.

    Only characters (``LTChar``) inside text lines (``LTTextLine``) of text
    containers (``LTTextContainer``) are counted. Returns 0 when no such
    character exists and ``None`` when anything raises while reading the file.
    """
    try:
        char_sizes = []
        for layout in extract_pages(pdf_path):
            for container in layout:
                if not isinstance(container, LTTextContainer):
                    continue
                for line in container:
                    if not isinstance(line, LTTextLine):
                        continue
                    for glyph in line:
                        if isinstance(glyph, LTChar):
                            char_sizes.append(glyph.size)
        if not char_sizes:
            return 0
        return stats.mode(char_sizes, keepdims=True)[0][0]
    except Exception:
        # Any failure is reported to the caller as None.
        return None

def extract_pdf_sections(pdf_path, authorlist, paperId, possible_section_headings=None, size_threshold=None):
    """Split a paper PDF into text blocks, tag each block with a section and join them per section.

    Parameters
    ----------
    pdf_path : str
        PDF to parse; passed to ``pdf_main_fontsize`` and pdfminer's ``extract_pages``.
    authorlist : str
        Comma-separated author names. The last word of each name is searched for on
        the first two pages to locate the author block. Blank entries are ignored;
        a non-string value is treated as "no authors".
    paperId
        Copied unchanged into the returned ``sections`` dict.
    possible_section_headings : collection of str, optional
        Lowercase strings that mark a short block as a section heading.
    size_threshold : float, optional
        A block is kept when its largest character size is at least
        ``size_threshold`` times the dominant font size, or when it has more than
        50 words. Defaults to 0.9.

    Returns
    -------
    tuple
        ``(sections, text_blocks, tag_blocks)``: ``sections`` maps ``paperId``,
        ``pdf_abstract``, ``pdf_introduction``, ``pdf_results``, ``pdf_discussion``,
        ``pdf_methods``, ``pdf_text`` and ``author_list`` to strings; ``text_blocks``
        and ``tag_blocks`` are the kept blocks and their tags, in parallel.
        When ``pdf_main_fontsize`` returns ``None`` a message is printed and all
        section strings are empty.
    """
    # Blank entries (e.g. a trailing comma) have no last word to take.
    author_names = authorlist.split(',') if isinstance(authorlist, str) else []
    author_last_names = [name.split()[-1] for name in author_names if name.strip()]
    last_section_names = {'acknowledgment', 'acknowledgement', 'acknowlegment', 'reference'}
    month_markers = {'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october',
                     'november', 'december',
                     'jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec'}
    year_markers = {str(y) for y in range(1900, 2030)}
    section_list = possible_section_headings or {
        'introduction', 'results', 'discussion', 'conclusions', 'methods', 'materials', 'experimental',
        'materials and methods', 'experimental procedure', 'related work', 'i.', 'ii.', 'iii.', 'iv.', 'v.', 'vi.'
    }

    size_threshold = size_threshold if size_threshold is not None else 0.9

    # Fixed page size (A4 in PDF points) used for the margin and author-block tests.
    # NOTE(review): pages of another size are filtered with these same bounds.
    page_width, page_height = 595, 842
    nword_abstract_th, nword_sections_th = 30, 30

    #Determine the most common font size (mode)
    size_mode = pdf_main_fontsize(pdf_path)

    text_blocks, tag_blocks, nwords_in_blocks = [], [], []
    # Without any author name the author block can never be located, so the
    # abstract search starts immediately.
    section_to_find = 'AUTHORS' if author_last_names else 'ABSTRACT'
    section_heading, tag = 'UNDEFINED', 'UNDEFINED'
    reached_end = False
    if size_mode is not None:
        for p, page_layout in enumerate(extract_pages(pdf_path)):
            if reached_end:
                break

            for element in page_layout:
                if not isinstance(element, LTTextBox):
                    continue

                # Skip boxes touching the outer 5% margins of the page.
                x0, y0, x1, y1 = element.bbox
                if not (y0 > 0.05*page_height and y1 < 0.95*page_height and x0 > 0.05*page_width and x1 < 0.95*page_width):
                    continue

                filtered_text, sizes = [], []
                for text_line in element:
                    if isinstance(text_line, LTTextLine):
                        line_text = [character.get_text() for character in text_line]
                        sizes.extend([character.size for character in text_line if isinstance(character, LTChar)])
                        line_str = "".join(line_text).strip()
                        if not re.fullmatch(r"\d+", line_str): # Keeps lines that are NOT just numbers
                            filtered_text.append(line_str)
                        filtered_text.append("\n") # Preserve line breaks

                #joining characters in a single text while removing weird markers generated by pdfminer
                filtered_text = re.sub(r'\(cid:[^\)]+\)', '', "".join(filtered_text).strip())
                word_list = re.split(r'[\n\s]+', filtered_text.lower().strip())
                nwords = len(word_list)

                if any(end_section in ' '.join(word_list[:3]) for end_section in last_section_names):
                    # Nothing after the end section is ever kept, so stop here;
                    # otherwise later blocks on this page could still pop, retag
                    # or reset the blocks already collected.
                    reached_end = True
                    break

                #removing everything before the author block as well as the correspondance fields
                if p <= 1:
                    nauthors_detected = sum(lastname.lower() in ' '.join(word_list) for lastname in author_last_names)
                    if author_last_names and nauthors_detected >= 0.5 * len(author_last_names) and y0 > 0.3 * page_height:
                        text_blocks, tag_blocks, nwords_in_blocks = [], [], []
                        section_to_find = 'ABSTRACT'
                        filtered_text = ''  # empty text: this block is dropped below
                    if nauthors_detected > 0 and '@' in filtered_text:
                        filtered_text = ''

                #removing blocks likely headers with publication date
                if any(m in word_list for m in month_markers) and any(y in word_list for y in year_markers) and nwords < 10:
                    continue

                #removing figure captions
                if any(figname in word_list[0] for figname in ['fig', 'figure', 'table', 'image']):
                    continue

                #removing previous block if likely a header but not followed by capitalized paragraph
                if filtered_text and not filtered_text[0].isupper() and nwords_in_blocks and nwords_in_blocks[-1] <= 3:
                    text_blocks.pop()
                    tag_blocks.pop()
                    nwords_in_blocks.pop()
                elif (filtered_text and filtered_text.strip() and filtered_text.strip()[0].isupper() and nwords_in_blocks and
                    nwords_in_blocks[-1] <= 3 and text_blocks and any(h in re.sub(r'[\n\s]+', ' ', text_blocks[-1].lower()) for h in section_list)):
                    # The previous short block is a heading: it names the current section
                    # (upper-cased, digits and dots removed).
                    section_heading = ''.join([w.upper() for w in re.sub(r'[\d.]', '', text_blocks[-1])])
                    if nwords > nword_sections_th:
                        tag_blocks[-1] = section_heading

                if not reached_end and filtered_text and (max(sizes, default=0) >= size_threshold * size_mode or nwords > 50):
                    # `tag` persists between blocks: a block too short to change
                    # the state keeps the previous block's tag.
                    if section_to_find == 'ABSTRACT' and nwords > nword_abstract_th:
                        tag = 'ABSTRACT'
                        # The abstract is considered finished once a block ends with a period.
                        if word_list[-1][-1] == '.':
                            section_to_find = 'INTRODUCTION'
                    elif section_to_find == 'INTRODUCTION':
                        if nwords > nword_sections_th:
                            tag = 'INTRODUCTION'
                            section_heading, section_to_find = 'INTRODUCTION', 'NEXTHEADING'
                        else:
                            tag = 'UNDEFINED'
                    elif section_to_find == 'NEXTHEADING' and nwords > nword_sections_th:
                        tag = section_heading

                    text_blocks.append(filtered_text)
                    tag_blocks.append(tag)
                    nwords_in_blocks.append(nwords)
    else:
        print(f"PDF {pdf_path} likely corrupted")

    def blocks_tagged(keywords):
        """Join, in order, the blocks whose lowercase tag contains any of ``keywords``."""
        return '\n'.join(text for t, text in zip(tag_blocks, text_blocks) if any(s in t.lower() for s in keywords))

    # NOTE(review): tags never contain '.', because headings have digits and dots
    # stripped above, so the roman-numeral keywords below never match.
    sections = {'paperId': paperId,
            'pdf_abstract': blocks_tagged({'abstract'}),
            'pdf_introduction': blocks_tagged({'introduction', 'related work', 'i.', 'ii.'}),
            'pdf_results': blocks_tagged({'results', 'experiment', 'i.', 'ii.'}),
            'pdf_discussion': blocks_tagged({'discussion', 'conclusion', 'v.', 'vi.'}),
            'pdf_methods': blocks_tagged({'methods', 'materials', 'experimental', 'materials and methods', 'experimental procedure'}),
            'pdf_text': '\n'.join(text for t, text in zip(tag_blocks, text_blocks) if not any(s in t.lower() for s in {'undefined'})),
            'author_list': authorlist
            }

    return sections, text_blocks, tag_blocks

In [None]:
def _failed_pdf_record(paper, problem):
    """Build the placeholder result for a paper whose PDF could not be obtained."""
    return {'paperId': paper['paperId'], 'author_list': paper['authorName'], 'pdf_text': None, 'pdf_abstract': None,
            'pdf_introduction': None, 'pdf_discussion': None, 'pdf_results': None, 'pdf_methods': None,
            'problem': problem, 'openAccessPdf': paper['openAccessPdf']}


def extract_pdfs(inputs):
    """Download the open-access PDF of each paper with Chrome and extract its sections.

    Parameters
    ----------
    inputs : tuple
        ``(txtdata, downloads_dir)``. ``txtdata`` is a list of dicts with keys
        ``paperId``, ``openAccessPdf`` (URL) and ``authorName``; ``downloads_dir``
        is a scratch directory, created if needed and deleted before returning.

    Returns
    -------
    list of dict
        One record per paper, in input order: the ``sections`` dict from
        ``extract_pdf_sections`` plus ``openAccessPdf`` and ``problem`` (``None``
        on success). When no PDF was obtained all ``pdf_*`` fields are ``None`` and
        ``problem`` is ``'captcha'`` (no file appeared) or ``'broken link'`` (an
        exception was raised while handling the paper).

    Raises
    ------
    RuntimeError
        If no Chrome session could be created after all retries.
    """
    #you'll need to have ChromeDriver installed
    txtdata, downloads_dir = inputs
    # Absolute path: it is handed to Chrome as a download location, and the
    # paths returned by glob below are used directly.
    downloads_dir = os.path.abspath(downloads_dir)
    os.makedirs(downloads_dir, exist_ok=True)
    service = Service(executable_path="/usr/bin/chromedriver")
    run_headless = False #True #

    timeout_enddw = 10     # seconds allowed for a download to finish
    timeout_startdw = 5    # seconds allowed for a download to start
    timeout_loadpage = timeout_startdw + timeout_enddw
    block_markers = {"captcha", "verify you are human", "not a robot", "forbidden", "this site can’t be reached"}
    block_markers = {marker.lower() for marker in block_markers}

    options = webdriver.ChromeOptions()
    user_data_dir = tempfile.mkdtemp(prefix="chrome-profile-", dir="/tmp")
    options.add_argument(f"--user-data-dir={user_data_dir}")
    options.add_argument("--no-sandbox")
    # options.add_argument("--disable-dev-shm-usage")

    if run_headless:
        options.add_argument("--headless=new")
        options.add_argument("--disable-blink-features=AutomationControlled")
    prefs = {
        "plugins.always_open_pdf_externally": True,
        "download.prompt_for_download": False,
        "download.default_directory": downloads_dir,
        "download.directory_upgrade": True
    }
    options.add_experimental_option("prefs", prefs)

    max_retries = 10
    retry_delay = 3
    last_error = None
    for attempt in range(1, max_retries + 1):
        try:
            driver = webdriver.Chrome(service=service, options=options)
            if run_headless:
                driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
            break
        except Exception as exc:
            # Saved under another name: the `as` target is unbound when the
            # except clause ends, so it cannot be read after the loop.
            last_error = exc
            print(f"Session creation failed. Retrying ({attempt}/{max_retries})...")
            time.sleep(retry_delay)  # Wait before retrying
    else:
        shutil.rmtree(user_data_dir, ignore_errors=True)
        raise RuntimeError(f"Failed to create a Selenium session after multiple attempts. Error: {last_error}") from last_error

    text_pdfs = []
    processed_list = []  # paperIds already extracted; files under their directory are ignored

    try:
        for paper in txtdata:
            try:
                paper_download_dir = os.path.join(downloads_dir, paper['paperId'])
                os.makedirs(paper_download_dir, exist_ok=True)

                # Set Chrome's download directory dynamically
                driver.execute_cdp_cmd(
                        "Page.setDownloadBehavior",
                        {"behavior": "allow",
                        "downloadPath": paper_download_dir}
                        )

                clean_url = re.sub(r'^http://', 'https://', paper['openAccessPdf'])
                driver.set_page_load_timeout(timeout_loadpage)
                driver.get(clean_url)
                end_time_enddw = time.time() + timeout_enddw
                end_time_startdw = time.time() + timeout_startdw

                filename = None
                while time.time() < end_time_enddw:
                    time.sleep(0.1)
                    candidates = [fname for fname in glob.glob(os.path.join(paper_download_dir, '*.pdf'))
                                  if not any(done_id in fname for done_id in processed_list)]
                    if candidates:
                        if len(candidates) > 1:
                            raise RuntimeError('more than one download to process', candidates, processed_list)
                        filename = candidates[0]
                        break
                    # Fetched once per poll rather than once per marker.
                    page_source = driver.page_source.lower()
                    if any(marker in page_source for marker in block_markers):
                        break
                    # Give up if no download is in progress after the start timeout.
                    if time.time() > end_time_startdw and not glob.glob(os.path.join(paper_download_dir, '*.crdownload')):
                        break

                if filename:
                    # `filename` is already the full path returned by glob.
                    pdf_data,_,_ = extract_pdf_sections(filename, paper['authorName'], paper['paperId'])
                    processed_list.append(paper['paperId'])
                    pdf_data['openAccessPdf'] = paper['openAccessPdf']
                    pdf_data['problem'] = None
                    shutil.rmtree(paper_download_dir)
                else:
                    pdf_data = _failed_pdf_record(paper, 'captcha')
            except Exception:
                pdf_data = _failed_pdf_record(paper, 'broken link')

            text_pdfs.append(pdf_data)
    finally:
        try:
            driver.quit()
        finally:
            # The profile directory is not empty after a Chrome run, so os.rmdir
            # could never remove it.
            shutil.rmtree(user_data_dir, ignore_errors=True)

    #Extracting pdfs from downloads that took too long to be processed immediately
    if os.path.isdir(downloads_dir):
        for dirname in os.listdir(downloads_dir):
            dir_path = os.path.join(downloads_dir, dirname)
            late_pdfs = glob.glob(os.path.join(dir_path, '*.pdf'))
            if late_pdfs:
                paper_data = next((record for record in text_pdfs if record['paperId'] == dirname), None)
                if paper_data:
                    try:
                        pdf_data,_,_ = extract_pdf_sections(late_pdfs[0], paper_data['author_list'], paper_data['paperId'])
                    except Exception as exc:
                        # One unreadable late download must not discard the whole batch.
                        print(f"Could not extract late download {late_pdfs[0]}: {exc}")
                        continue
                    paper_data.update(pdf_data)
                    paper_data['problem'] = None
            elif os.path.isdir(dir_path):
                shutil.rmtree(dir_path)

        shutil.rmtree(downloads_dir, ignore_errors=True)

    return text_pdfs

In [8]:
def download_PDFs(path_to_data, dataset_source=None, years=None, n_jobs=None, filters=None,
                  minibatch_size=50, max_papers=None):
    """Download and parse open-access PDFs for the given years and save the merged text table.

    For each year the metadata and text tables are loaded with ``load_dataset``,
    the selected papers are sent to ``extract_pdfs`` in parallel, the results are
    overlaid on the text table (keyed by ``paperId``) and the table is written to
    ``<path_to_data>/pdf/<year>/<dataset_source>_textpdf_<year>.parquet``.

    Parameters
    ----------
    path_to_data : str
        Root directory of the dataset; ``temp`` and ``pdf`` sub-directories are created in it.
    dataset_source : str, optional
        File-name prefix, ``'semanticsscholar'`` by default.
    years : int or list, optional
        Year(s) to process; the current year by default.
    n_jobs : int, optional
        Number of worker processes; ``multiprocessing.cpu_count()`` by default.
    filters : list, optional
        One ``[column, op, value]`` filter or a list of them, applied to the text
        table. ``op`` is one of ``== != > < >= <= isna notna``; ``value`` may be
        omitted for ``isna``/``notna``.
    minibatch_size : int
        Number of papers handled by one ``extract_pdfs`` call (one browser session).
    max_papers : int, optional
        Maximum number of papers processed per year. ``None`` keeps the historical
        cap of a single batch (``n_jobs * minibatch_size``); pass a larger number,
        or ``float('inf')``, to process more.

    Raises
    ------
    ValueError
        If some selected papers have no matching row in the metadata table.
    """
    dataset_source = 'semanticsscholar' if dataset_source is None else dataset_source
    years = [datetime.now().year] if years is None else years
    years = [years] if not isinstance(years, list) else years

    download_basedir = os.path.join(path_to_data, 'temp')
    os.makedirs(download_basedir, exist_ok=True)
    output_dir = os.path.join(path_to_data, 'pdf')
    os.makedirs(output_dir, exist_ok=True)

    #Cleaning-up tmp dir (best effort: profiles that cannot be removed are skipped)
    for dirname in os.listdir("/tmp"):
        if "chrome-profile-" in dirname:
            shutil.rmtree(os.path.join("/tmp", dirname), ignore_errors=True)

    # filters = [['abstract','==','None']]
    # A single filter is wrapped; an empty list means "no filter".
    if filters and not isinstance(filters[0], (list, tuple)):
        filters = [filters]
    n_jobs = multiprocessing.cpu_count() if n_jobs is None else n_jobs

    ops = {
            "==": operator.eq,
            "!=": operator.ne,
            ">": operator.gt,
            "<": operator.lt,
            ">=": operator.ge,
            "<=": operator.le,
            "isna": pd.isna,
            "notna": pd.notna
        }

    try:
        for year in years:
            print(f"Downloading pdfs for year {year}")
            metadata, textdata = load_dataset(path_to_data, dataset_source, years=year, data_types=['metadata', 'pdf'])
            filtered_indices = textdata.index
            for flt in filters or []:
                col, op = flt[0], flt[1]
                if op in ("isna", "notna"):
                    row_mask = ops[op](textdata[col])
                else:
                    row_mask = ops[op](textdata[col], flt[2])
                filtered_indices = filtered_indices.intersection(textdata[row_mask].index)

            # Match on paperId against the whole metadata table: the text table
            # has been re-indexed by combine_first, so its index labels do not
            # identify the same papers in `metadata`.
            data4pdf = pd.merge(textdata.loc[filtered_indices, ['paperId', 'openAccessPdf']],
                                metadata[['paperId', 'authorName']], on='paperId', how='inner')
            if len(data4pdf) < len(filtered_indices):
                raise ValueError("List of metadata and text data are notmatching the same articles")

            batch_size = n_jobs * minibatch_size
            paper_cap = batch_size if max_papers is None else max_papers
            Npapers = int(max(0, min(paper_cap, len(data4pdf))))
            records = data4pdf.iloc[:Npapers].to_dict(orient='records')

            batch_starts = range(0, Npapers, batch_size)
            with tqdm(batch_starts, desc=f'Year {year} / {len(years)} years ({Npapers} papers on that year)', unit='batch') as pbar:
                for start in pbar:
                    batch = records[start:start + batch_size]
                    # Mini-batches are cut from the current batch, each with its
                    # own download directory.
                    textdata_list = [
                        (batch[k:k + minibatch_size], os.path.join(download_basedir, str(k // minibatch_size)))
                        for k in range(0, len(batch), minibatch_size)
                    ]
                    if not textdata_list:
                        continue

                    with multiprocessing.Pool(processes=n_jobs) as pool:
                        results = list(pool.imap(extract_pdfs, textdata_list))

                    pdfdata = pd.DataFrame([pdf for result in results for pdf in result])
                    # New pdf values take priority; existing values fill the gaps.
                    textdata = pdfdata.set_index("paperId").combine_first(textdata.set_index("paperId")).reset_index(drop=False)

                    # Replace abstracts stored as the string 'None' with the one read from the PDF.
                    if 'abstract' in textdata.columns and 'pdf_abstract' in textdata.columns:
                        mask = textdata['abstract'] == 'None'
                        textdata.loc[mask, 'abstract'] = textdata.loc[mask, 'pdf_abstract']

            output_dir_year = os.path.join(output_dir, str(year))
            os.makedirs(output_dir_year, exist_ok=True)
            filepath = os.path.join(output_dir_year, f'{dataset_source}_textpdf_{year}.parquet')

            textdata.to_parquet(filepath, engine="pyarrow", compression="snappy", index=True)
    finally:
        # Remove the scratch directory even when a year fails.
        shutil.rmtree(download_basedir, ignore_errors=True)

In [9]:
# Run the PDF download/extraction pipeline for 2005 with 20 worker processes.
# The commented-out `filters` argument would restrict the run to rows whose
# abstract is the string 'None'.
download_PDFs(path_to_data, years=2005, n_jobs=20)#, filters=[['abstract','==','None']])

Downloading pdfs for year 2005
No pdf data for year 2005. Will try to load text data instead
metadata loaded for years: ['2005']; text data loaded for years: ['2005']; pdf data loaded for years: ['2005']


Year 2005 / 1 years (1000 papers on that year):   0%|          | 0/2 [00:00<?, ?batch/s]The PDF <_io.BufferedReader name='/home/jul/DST/recoRAG/data/temp/2/005ae77005a4138ba8a27cd4c667a237dd268230/yigak03062.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
The PDF <_io.BufferedReader name='/home/jul/DST/recoRAG/data/temp/2/005ae77005a4138ba8a27cd4c667a237dd268230/yigak03062.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
The PDF <_io.BufferedReader name='/home/jul/DST/recoRAG/data/temp/3/00839b4b2347faad03b41eaadb3191b8c81efada/article.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an erro

In [10]:
# Reload the 2005 metadata together with the text table overlaid with the saved PDF extraction.
metadata, textdata = load_dataset(path_to_data, years=[2005], data_types=['metadata', 'pdf'])

metadata loaded for years: ['2005']; text data loaded for years: ['2005']; pdf data loaded for years: ['2005']


In [12]:
# Number of rows, among the first 1000, that have no extracted PDF text.
textdata.iloc[:1000]["pdf_text"].isna().sum()

np.int64(447)

In [34]:
# Toy example of the `combine_first` overlay used to merge PDF results into the text table
df1 = pd.DataFrame({
    "id": [1, 2, 3],
    "name": ["Alice", "Bob", "Charlie"],
    "age": [25, 30, 35],
})

df2 = pd.DataFrame({
    "id": [3, 4],  # id=3 already exists in df1, id=4 is new
    "name": ["Charlie", "David"],  # same name for id=3, new name for id=4
    "age": [36, 40],  # updated age for id=3 (35 -> 36), age of the new id=4
    "taille": [170, 180]  # column that does not exist in df1
})

# df2 values take priority; rows and values missing from df2 are filled from df1
merged_df = df2.set_index("id").combine_first(df1.set_index("id")).reset_index(drop=False)

print(merged_df)

   id  age     name  taille
0   1   25    Alice     NaN
1   2   30      Bob     NaN
2   3   36  Charlie   170.0
3   4   40    David   180.0


In [None]:
# Manual check of the section extraction on one locally stored PDF, with its
# comma-separated author list and a placeholder paperId of 0.
pdf_data, pdf_sections, pdf_tags = extract_pdf_sections(os.path.join(path_to_data, 'pdf', '0', 'PIIS009286740500098X (1).pdf'),'I. Krylova,E. Sablin,Jamie M. R. Moore,R. Xu,G. Waitt,J. A. MacKay,D. Juzumiene,J. Bynum,K. Madauss,V. Montana,L. Lebedeva,M. Suzawa,Jon D. Williams,Shawn P. Williams,R. Guy,J. W. Thornton,R. Fletterick,T. Willson,H. Ingraham', 0)