In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import urllib
import requests
from urllib.parse import urlparse, urlunparse
from io import BytesIO
import pymupdf
import os
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import tempfile
import time
import re

## Loading in dataset

In [2]:
# Read the CSV of paper links into a DataFrame; the path is relative to the notebook's working directory.
good_papers = pd.read_csv(
    filepath_or_buffer="../../data/good_paper_links.csv",
)

## Scraping with Selenium

In [3]:
# Fresh temporary folder that Chrome is told to save downloads into.
download_dir = tempfile.mkdtemp()

# Chrome preferences passed through the experimental "prefs" option.
download_prefs = {
    # Set download location
    "download.default_directory": download_dir,
    # Disable download prompts
    "download.prompt_for_download": False,
    # Download PDFs instead of opening them
    "plugins.always_open_pdf_externally": True,
}

chrome_options = uc.ChromeOptions()
chrome_options.add_experimental_option("prefs", download_prefs)

# Start the undetected-chromedriver browser session used by the scraping cells.
driver = uc.Chrome(options=chrome_options)

In [4]:
# Display the path of the temporary download directory created with tempfile.mkdtemp().
download_dir

'C:\\Users\\nicco\\AppData\\Local\\Temp\\tmppmqsds4o'

In [5]:
def get_base_url(url):
    """Return ``url`` reduced to its scheme, network location and path.

    The ``;params``, ``?query`` and ``#fragment`` components are discarded,
    so two links that differ only in those parts map to the same string.
    """
    scheme, netloc, path, *_ignored = urlparse(url)
    # Rebuild the URL with the three trailing components blanked out.
    return urlunparse((scheme, netloc, path, '', '', ''))

In [6]:
def open_pdf_if_button(driver):
    """Check whether the current page exposes a PDF, opening an iframe-hosted one if needed.

    Args:
        driver: Selenium WebDriver positioned on the page to inspect.

    Returns:
        True if the page contains an ``embed`` element whose type contains
        ``application/pdf``, or if an ``iframe`` of type ``application/pdf``
        was found and the driver was navigated to its ``src``.
        False otherwise, including when inspecting an iframe fails.
    """
    # find_elements returns an empty list when nothing matches, so a missing
    # <embed> no longer has to be signalled through an exception.
    try:
        if driver.find_elements(By.XPATH, "//embed[contains(@type, 'application/pdf')]"):
            return True
    except Exception:
        # Best effort: if the lookup itself fails, fall through to the iframe check.
        # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupts working.
        pass

    try:
        iframes = driver.find_elements(By.TAG_NAME, "iframe")
    except Exception:
        print("No open button found for current PDF")
        return False

    for iframe in iframes:
        try:
            if iframe.get_attribute("type") != "application/pdf":
                continue
            pdf_src = iframe.get_attribute("src")
            if not pdf_src:
                # A PDF iframe without a src gives us nothing to navigate to.
                continue
            driver.get(pdf_src)
            return True
        except Exception:
            print(f"Failed to get link {iframe}")
            return False
    return False


In [7]:
def download_pdf_urls(url, paper_index, download_timeout=60):
    """Download every PDF linked from ``url`` with Selenium and merge them into one file.

    Relies on the notebook-level ``driver`` (Selenium browser) and
    ``download_dir`` (folder Chrome downloads into).

    Args:
        url: Address of the paper's landing page.
        paper_index: Identifier used for the output name
            ``../../data/pdfs/{paper_index}.pdf``.
        download_timeout: Maximum number of seconds to wait for the started
            downloads to appear as ``.pdf`` files before merging what is there.

    Returns:
        The list of PDF hrefs found on the page, or ``None`` when the page
        contains no PDF links.
    """
    driver.get(url)

    pdf_pattern = re.compile(r'(?<!e)\.pdf$|/pdf/', re.IGNORECASE)
    seen_keys = set()
    pdf_links = []
    for link in driver.find_elements(By.TAG_NAME, "a"):
        try:
            href = link.get_attribute("href")
            # selenium cannot download epdfs, hence the (?<!e) in the pattern
            if not href or "scholar.google" in href or not pdf_pattern.search(href):
                continue
            # De-duplicate on the URL without params/query/fragment, but keep the
            # full href for downloading. Computed inline so the result does not
            # depend on which notebook-level get_base_url definition is live.
            dedup_key = urlparse(href)._replace(params='', query='', fragment='').geturl()
            if dedup_key not in seen_keys:
                seen_keys.add(dedup_key)
                pdf_links.append(href)
        except Exception:
            print(f"Failed to get link {link}")

    if not pdf_links:
        print(f"No PDF links found for paper {url}")
        return

    downloadable_links_count = 0
    for pdf_link in pdf_links:
        # Resolve relative links against the page URL; absolute links pass through unchanged.
        pdf_url = urllib.parse.urljoin(url, pdf_link)
        if "pdf" not in pdf_url.lower():  # case-insensitive, like pdf_pattern
            continue
        try:
            num_of_files_prev = len(os.listdir(download_dir))
            curr_url = driver.current_url
            driver.get(pdf_url)
            if curr_url != driver.current_url:  # redirected to another page
                open_pdf_if_button(driver)
            time.sleep(1)
            num_of_files_now = len(os.listdir(download_dir))
            if num_of_files_now > num_of_files_prev:
                downloadable_links_count += 1
        except Exception:
            print(f"Skipping invalid PDF at {pdf_url}")
            continue

    # Wait for the started downloads to finish, but never forever: a download
    # that fails or that is not a .pdf would otherwise block the whole run.
    deadline = time.monotonic() + download_timeout
    while True:
        downloaded_pdfs = [f for f in os.listdir(download_dir) if f.endswith('.pdf')]
        if len(downloaded_pdfs) >= downloadable_links_count:
            break
        if time.monotonic() >= deadline:
            print(f"Timed out waiting for downloads for paper {url}")
            break
        time.sleep(1)

    # Sorted so the page order of the merged file is reproducible.
    pdf_files = [os.path.join(download_dir, f) for f in sorted(downloaded_pdfs)]
    output_path = f'../../data/pdfs/{paper_index}.pdf'
    merged_pdf = pymupdf.open()
    try:
        for pdf in pdf_files:
            print(pdf)
            try:
                # Context manager closes the source so it can be deleted afterwards.
                with pymupdf.open(pdf) as source_pdf:
                    merged_pdf.insert_pdf(source_pdf)
            except Exception as exc:
                print(f"Skipping invalid PDF at {pdf}: {exc}")
        if merged_pdf.page_count > 0:
            merged_pdf.save(output_path)
            print(f"Merged PDF saved as {output_path}")
        else:
            # Saving a document with zero pages raises, so report instead.
            print(f"No PDFs could be merged for paper {url}")
    finally:
        merged_pdf.close()
        # Always empty the download folder so leftovers are not merged into the next paper.
        for pdf in pdf_files:
            os.remove(pdf)
    return pdf_links

In [8]:
# Scrape and merge the PDFs of every paper, naming each output after the row's index.
for paper_index, paper in good_papers.iterrows():
    paper_link = paper['Link']
    download_pdf_urls(paper_link, paper_index)

C:\Users\nicco\AppData\Local\Temp\tmppmqsds4o\science.adm9474.pdf
C:\Users\nicco\AppData\Local\Temp\tmppmqsds4o\science.adm9474_sm.pdf
Merged PDF saved as ../../data/pdfs/0.pdf
C:\Users\nicco\AppData\Local\Temp\tmppmqsds4o\41566_2019_398_MOESM1_ESM.pdf
C:\Users\nicco\AppData\Local\Temp\tmppmqsds4o\41566_2019_398_MOESM2_ESM.pdf
C:\Users\nicco\AppData\Local\Temp\tmppmqsds4o\s41566-019-0398-2.pdf
Merged PDF saved as ../../data/pdfs/1.pdf
C:\Users\nicco\AppData\Local\Temp\tmppmqsds4o\41560_2020_749_MOESM1_ESM.pdf
C:\Users\nicco\AppData\Local\Temp\tmppmqsds4o\41560_2020_749_MOESM2_ESM.pdf
C:\Users\nicco\AppData\Local\Temp\tmppmqsds4o\best-research-cell-efficiencies.20200708.pdf
C:\Users\nicco\AppData\Local\Temp\tmppmqsds4o\s41560-020-00749-7.pdf
Merged PDF saved as ../../data/pdfs/2.pdf
C:\Users\nicco\AppData\Local\Temp\tmppmqsds4o\science.abq7652.pdf
C:\Users\nicco\AppData\Local\Temp\tmppmqsds4o\science.abq7652_sm.pdf
Merged PDF saved as ../../data/pdfs/3.pdf
C:\Users\nicco\AppData\Local\T

- **TODO:** Use the base URL (without query string or fragment) to detect duplicate links, but download from the full original URL (reference: i = 12).
- **TODO:** No PDF links are being found for i = 40.

## Making PDFs of each paper

In [9]:
def get_base_url(url):
    """Return only the scheme and host of ``url`` (e.g. https://www.nature.com).

    NOTE: this redefines the ``get_base_url`` declared earlier in the notebook,
    which also keeps the path. Once this cell has run, the earlier version is
    shadowed for every later call.
    """
    parts = urlparse(url)
    return "{}://{}".format(parts.scheme, parts.netloc)


In [10]:
def get_actual_url(url, timeout=30):
    """Follow redirects for ``url`` and return the origin of the final location.

    Only the scheme and host of the redirected address are returned, with a
    trailing slash (e.g. ``https://www.nature.com/``); the full final URL is
    printed but not returned.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` can block indefinitely on an unresponsive host.

    Returns:
        ``"<scheme>://<netloc>/"`` of the URL reached after redirection.

    Raises:
        requests.exceptions.RequestException: If the request fails or times out.
    """
    # Browser-like User-Agent sent with the request.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'}
    r = requests.get(url, headers=headers, allow_redirects=True, timeout=timeout)
    parsed_uri = urlparse(r.url)
    print(r.url)
    return f'{parsed_uri.scheme}://{parsed_uri.netloc}/'

In [11]:
def reset_eof(pdf_content):
    """Normalise the ``%%EOF`` marker of raw PDF bytes so it sits at the very end.

    Some PDFs carry an early EOF marker, which confuses PyMuPDF. Every
    occurrence of the marker is stripped and a single one is appended.
    When the content has no marker at all, the last six bytes are dropped
    and the marker is appended in their place.
    """
    marker = b'%%EOF'

    if marker not in pdf_content:
        # Some files really have no EOF marker; one such file printed
        # b'\n%%EO%E' as its tail, so the final six bytes are replaced.
        return pdf_content[:-6] + marker

    # Remove every (possibly early) marker, then terminate the data with one.
    without_markers = pdf_content.replace(marker, b'')
    return without_markers + marker

In [12]:
def download_all_pdfs(url, paper_index, timeout=60):
    """Find, download and merge all PDFs linked from a paper's page.

    Every ``<a href>`` whose URL path contains ``pdf`` is fetched with
    ``requests``; the documents that PyMuPDF can open are appended, in the
    order the links appear on the page, to one merged file saved as
    ``../data/pdfs/{paper_index}.pdf``.

    Args:
        url: Address of the paper's page.
        paper_index: Identifier used for the output file name.
        timeout: Seconds to wait for each PDF request before skipping it.
    """
    # NOTE(review): get_page is not defined in the visible part of this notebook;
    # it has to be defined before this function is called — confirm where it lives.
    page = get_page(url)
    url = get_actual_url(url)
    print(url)
    # errors="replace" keeps a stray non-UTF-8 byte from aborting the whole paper.
    soup = BeautifulSoup(page.read().decode("utf-8", errors="replace"), 'html.parser')

    # Find all unique PDF links on the page. dict.fromkeys de-duplicates while
    # keeping page order, so the merged file is identical from run to run.
    hrefs = (link.get('href') for link in soup.find_all('a', href=True))
    pdf_links = list(dict.fromkeys(
        href for href in hrefs if href and 'pdf' in urlparse(href).path.lower()
    ))
    print(pdf_links)

    # New empty PDF document that collects the merged pages.
    merged_pdf = pymupdf.open()
    try:
        for pdf_link in pdf_links:
            # Ensure each link is a full URL; urljoin leaves absolute links
            # untouched and resolves relative ones against the site origin.
            pdf_url = urllib.parse.urljoin(url, pdf_link)
            print(pdf_url)
            try:
                res = requests.get(pdf_url, allow_redirects=True, timeout=timeout)
                print(f"Processing PDF URL: {res.url}")
            except Exception:
                print(f"Skipping invalid PDF at {pdf_url}")
                continue

            # Load the PDF content into a PyMuPDF document and append all of its
            # pages in one call; inserting page by page copies shared resources
            # repeatedly and bloats the output.
            try:
                with pymupdf.open(stream=BytesIO(res.content), filetype="pdf") as pdf_document:
                    merged_pdf.insert_pdf(pdf_document)
            except Exception as e:
                print(f"Skipping invalid PDF at {pdf_url}: {e}")

        # Save the merged PDF to disk
        if merged_pdf.page_count > 0:
            merged_pdf.save(f'../data/pdfs/{paper_index}.pdf')
            print(f"Merged PDF saved as '../data/pdfs/{paper_index}.pdf'")
        else:
            print(f"No PDF links found for index {paper_index}.")
            # TODO: use selenium to download the pdf by printing
    finally:
        # Close the merged document on every path, including the empty case.
        merged_pdf.close()

In [13]:
#creating pdfs for each row in the merged dataframe
# Create a merged PDF for each row of the merged dataframe.
# NOTE(review): merged_df is not defined anywhere in the visible notebook and the
# recorded run of this cell ended in a NameError — confirm where it should come from.
for paper_index, paper in merged_df.iterrows():
    paper_link = paper['link']
    print(paper_link)
    download_all_pdfs(paper_link, paper_index)

NameError: name 'merged_df' is not defined

In [None]:
# Names (without extension) of everything already present in ../data/pdfs.
existing_files = {os.path.splitext(name)[0] for name in os.listdir('../data/pdfs')}

# List the papers whose PDF was not produced (done manually for now, will be done with Selenium later).
for index, row in merged_df.iterrows():
    if str(index) in existing_files:
        continue
    print(index)
    print(row['link'])

38
https://pubs.rsc.org/en/content/articlehtml/2019/ta/c9ta01070j
39
https://pubs.rsc.org/en/content/articlehtml/2019/ee/c9ee00453j
71
https://doi.org/10.48550/arXiv.2102.10399
74
https://doi.org/10.1146/annurev-food-022814-015651
76
https://doi.org/10.1039/C9CS00711C
82
https://doi.org/10.1039/TF937330008B
90
https://doi.org/10.1039%2FC7TA00434F
97
https://doi.org/10.1017/CBO9780511608810
100
https://doi.org/10.1146/annurev.bb.12.060183.001035
105
https://doi.org/10.1146/annurev.physchem.51.1.209
120
https://doi.org/10.1039/D1SM01707A
125
https://doi.org/10.1039%2FC4TA05033A
128
https://doi.org/10.1146/annurev.ms.12.080182.000535
129
https://doi.org/10.1039/DC9786500007


## Convert PDFs to XML using GROBID

In [None]:
# Endpoint of the GROBID service; GROBID must be running on port 8070 for this to work.
grobid_url = "http://localhost:8070/api/processFulltextDocument"
# Bare file names (not paths) of the XML files that already exist.
xml_names = os.listdir("../data/xmls")

for pdf_file in os.listdir("../data/pdfs"):
    # Only look at PDF files.
    if not pdf_file.endswith(".pdf"):
        continue

    xml_file_name = pdf_file.replace('.pdf', '.xml')
    # Do not convert already converted files. The comparison uses the bare file
    # name because xml_names holds names, not paths; comparing the full PDF path
    # never matched, so every PDF was converted again on each run.
    if xml_file_name in xml_names:
        continue

    pdf_path = os.path.join("../data/pdfs", pdf_file)
    with open(pdf_path, 'rb') as file:
        response = requests.post(
            grobid_url,
            files={'input': file},
            headers={'Accept': 'application/xml'}
        )

    if response.status_code == 200:
        xml_file_path = os.path.join('../data/xmls', xml_file_name)
        with open(xml_file_path, 'w', encoding='utf-8') as xml_file:
            xml_file.write(response.text)
        print(f"Converted {pdf_file} to XML.")
    else:
        print(f"Failed to convert {pdf_file}. Status code: {response.status_code}")
        print(response.text)