In [1]:
import requests
import time
import pandas as pd
from bs4 import BeautifulSoup
import urllib
from urllib.parse import urlparse, urlunparse
from io import BytesIO
import pymupdf
import os
import undetected_chromedriver as uc
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import tempfile
import time
import re
import xml.etree.ElementTree as ET
import grobid_tei_xml
import json

In [2]:
# Collect Crossref metadata records matching a free-text query, paging through
# the /works endpoint with the `rows` / `offset` parameters.
query = "perovskite solar halide passivation"
base_url = "https://api.crossref.org/works"

rows_per_request = 1000
max_records = 2000  # stop paging once this many records have been requested
offset = 0
all_dois = []  # despite the name, holds the full metadata record of each work

while offset < max_records:
    # Let requests build and URL-encode the query string, so that a search
    # term containing characters such as '&' or '#' cannot corrupt the request.
    try:
        response = requests.get(
            base_url,
            params={
                "filter": "from-pub-date:2024",
                "query": query,
                "rows": rows_per_request,
                "offset": offset,
            },
            timeout=60,  # without a timeout a stalled connection blocks the cell forever
        )
    except requests.RequestException as e:
        print(f"Failed request at offset {offset}: {e}")
        break

    if response.status_code != 200:
        print(f"Failed request at offset {offset}: {response.status_code}")
        break

    data = response.json()
    items = data['message']['items']

    if not items:  # Stop when there are no more results
        break

    # Keep every record that has a DOI. The whole record is stored (not just
    # the DOI string) because later cells read other keys such as "URL" and
    # "abstract" from it.
    all_dois.extend(item for item in items if 'DOI' in item)

    print(f"Fetched {len(items)} records (Offset: {offset})")
    offset += rows_per_request  # Move to the next batch
    time.sleep(1)  # Be polite and avoid rate limits

Fetched 1000 records (Offset: 0)
Fetched 1000 records (Offset: 1000)


In [3]:
# Number of Crossref records collected by the fetch loop above.
len(all_dois)

2000

In [4]:
import numpy as np

# Fraction of the collected Crossref records that carry an 'abstract' key.
np.mean(['abstract' in record for record in all_dois])

0.282

In [17]:
# Directory that Chrome is told to save downloads into.
# Set the PDF_DOWNLOAD_DIR environment variable to point this somewhere else
# on another machine; the fallback is the path the notebook was written with.
# abspath() resolves a relative override against the current working directory.
download_dir = os.path.abspath(
    os.environ.get("PDF_DOWNLOAD_DIR", '/Users/nicco/source/repos/DSC180_B11_Q2/data/pdfs')
)

chrome_prefs = {
    "download.default_directory": download_dir,  # Set download location
    "download.prompt_for_download": False,       # Disable download prompts
    "plugins.always_open_pdf_externally": True   # Download PDFs instead of opening them
}
chrome_options = uc.ChromeOptions()
chrome_options.add_experimental_option("prefs", chrome_prefs)

# Shared browser session; the scraping helpers below use this module-level `driver`.
service = Service(ChromeDriverManager().install())
driver = uc.Chrome(service=service, options=chrome_options)

In [18]:
def get_base_url(url):
    """Return ``url`` reduced to its scheme, network location and path.

    The path parameters, query string and fragment are all discarded.

    Args:
        url: URL string to strip.

    Returns:
        str: the URL rebuilt from scheme, netloc and path only.
    """
    # Only the first three components are kept; the rest are blanked out.
    scheme, netloc, path, *_ = urlparse(url)
    return urlunparse((scheme, netloc, path, '', '', ''))

In [19]:
def open_pdf_if_button(driver):
    """Check whether the page loaded in ``driver`` embeds a PDF, opening it if framed.

    Two checks are made, in order:

    1. If the page has an ``<embed>`` element whose ``type`` contains
       ``application/pdf``, return True without navigating.
    2. Otherwise scan the page's ``<iframe>`` elements; for the first one whose
       ``type`` attribute equals ``application/pdf``, navigate the driver to
       that iframe's ``src`` and return True.

    Args:
        driver: Selenium WebDriver positioned on the page to inspect.

    Returns:
        bool: True if an embedded PDF was found (check 1) or a PDF iframe was
        opened (check 2); False otherwise, including when inspecting or
        opening an iframe raises.
    """
    # `except Exception` (not a bare `except:`) so that Ctrl-C / kernel
    # interrupts still stop a long scraping run.
    try:
        # find_elements returns an empty list when nothing matches, so the
        # common "no embed" case does not rely on catching an exception.
        if driver.find_elements(By.XPATH, "//embed[contains(@type, 'application/pdf')]"):
            return True
    except Exception:
        # Best effort: fall through to the iframe check.
        pass
    try:
        for iframe in driver.find_elements(By.TAG_NAME, "iframe"):
            try:
                frame_type = iframe.get_attribute("type")
                if frame_type == "application/pdf":
                    driver.get(iframe.get_attribute("src"))
                    return True
            except Exception:
                # The scan stops at the first iframe that fails; remaining
                # iframes are not inspected.
                print(f"Failed to get link {iframe}")
                return False
    except Exception:
        print("No open button found for current PDF")
    return False

In [20]:
def download_pdf_urls(url, paper_index):
    """Open a paper's landing page and visit every PDF-looking link on it.

    Loads ``url`` in the module-level ``driver``, collects the ``href`` of each
    ``<a>`` element that looks like a PDF link, de-duplicates the candidates by
    their URL without query string/fragment (see ``get_base_url``), and then
    navigates the driver to each candidate in turn. For each candidate a
    boolean is printed: True when the driver's URL did not change after the
    navigation, otherwise the result of ``open_pdf_if_button``.

    Args:
        url: Landing-page URL of the paper.
        paper_index: Identifier of the paper (the caller passes the DOI).
            Currently unused.
            TODO: use it to rename the downloaded file once that step is implemented.

    Returns:
        list[str] | None: The collected hrefs (first occurrence per base URL,
        query string kept), or None when the page has no candidate links.
    """
    driver.get(url)
    time.sleep(2)  # fixed pause before reading the page's links

    # A link is a candidate when it ends in ".pdf" or contains one of the PDF
    # path segments. No lookbehind is needed on `\.pdf$`: "/epdf/" reader links
    # are already not matched by "/pdf/", and guarding on a preceding "e" would
    # wrongly reject file names such as "article.pdf".
    pdf_pattern = re.compile(r'\.pdf$|/pdf/|/articlepdf/|/article-pdf/', re.IGNORECASE)

    pdfs_unique = set()  # base URLs already seen, used for de-duplication
    pdf_links = []       # hrefs to visit, in page order
    for link in driver.find_elements(By.TAG_NAME, "a"):
        try:
            href = link.get_attribute("href")
            if href and "scholar.google" not in href and pdf_pattern.search(href):
                base_url = get_base_url(href)
                print(base_url)
                if base_url not in pdfs_unique:
                    pdfs_unique.add(base_url)
                    pdf_links.append(href)
        except Exception:
            # `except Exception` rather than a bare `except:` so the run can
            # still be interrupted from the keyboard.
            print(f"Failed to get link {link}")

    if not pdf_links:
        print(f"No PDF links found for paper {url}")
        return

    for pdf_link in pdf_links:
        # Ensure each link is a full URL; relative links are resolved against
        # the landing page rather than glued onto the end of its path.
        pdf_url = pdf_link if pdf_link.startswith('http') else urllib.parse.urljoin(url, pdf_link)
        # Case-insensitive to agree with pdf_pattern (e.g. links ending in ".PDF").
        if "pdf" not in pdf_url.lower():
            continue
        try:
            curr_url = driver.current_url
            driver.get(pdf_url)
            downloaded = True
            # NOTE(review): an unchanged URL is taken to mean the browser
            # downloaded the file instead of navigating - this presumably
            # relies on the "always_open_pdf_externally" Chrome pref; confirm.
            if curr_url != driver.current_url:  # redirected to another page
                downloaded = open_pdf_if_button(driver)
            print(downloaded)
        except Exception as e:
            print(e)
            print(f"Skipping invalid PDF at {pdf_url}")
            continue
    return pdf_links

In [21]:
# Work on at most the first 1000 collected Crossref records.
initial_data = all_dois[:1000]

In [22]:
# Smoke test of download_pdf_urls on a single page.
# NOTE(review): the record's own URL is immediately overwritten with a fixed
# RSC article URL, and the loop stops after one iteration, so only that one
# hard-coded page is visited (the record still supplies the DOI argument).
# Remove the override and the `break` to process every record from index 5 on.
for row in initial_data[5:]:
    url = row["URL"]
    url = "https://pubs.rsc.org/en/content/articlelanding/2024/qm/d4qm00560k"
    download_pdf_urls(url, row["DOI"])
    break

https://pubs.rsc.org/en/content/articlepdf/2024/qm/d4qm00560k
True


Convert PDFs to XML using GROBID

In [None]:
# Send every PDF in the PDF directory to a GROBID server and store the
# returned XML under the same base name in the XML directory.
grobid_url = "http://localhost:8070/api/processFulltextDocument"
pdf_dir = "../../data/pdfs"
xml_dir = "../../data/xmls"

os.makedirs(xml_dir, exist_ok=True)  # make sure the output directory exists
xml_names = os.listdir(xml_dir)
already_converted = set(xml_names)   # bare file names, e.g. "paper.xml"

for pdf_file in os.listdir(pdf_dir):

    # only looks at pdf files
    if not pdf_file.endswith(".pdf"):
        continue

    # Swap only the extension; str.replace would also rewrite any other
    # ".pdf" occurring inside the file name.
    xml_file_name = os.path.splitext(pdf_file)[0] + ".xml"

    # Do not convert already converted files. The comparison must use the bare
    # file name, because os.listdir returns names without the directory part.
    if xml_file_name in already_converted:
        continue

    pdf_path = os.path.join(pdf_dir, pdf_file)
    with open(pdf_path, 'rb') as file:
        # GROBID must be running on port 8070 for this to work
        response = requests.post(
            grobid_url,
            files={'input': file},
            headers={'Accept': 'application/xml'}
        )

    if response.status_code == 200:
        xml_file_path = os.path.join(xml_dir, xml_file_name)
        # Write the raw response bytes so the document is stored exactly as
        # the server sent it, independent of requests' encoding detection.
        with open(xml_file_path, 'wb') as xml_file:
            xml_file.write(response.content)
    else:
        print(f"Failed to convert {pdf_file}. Status code: {response.status_code}")
        print(response.text)