In [10]:
import requests
import time
import pandas as pd
import urllib
from urllib.parse import urlparse, urlunparse
from io import BytesIO
import os
import undetected_chromedriver as uc
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import NoSuchElementException
import tempfile
import time
import re
import xml.etree.ElementTree as ET
import json

In [2]:
download_dir = tempfile.mkdtemp() # os.getcwd() + '/data/pdfs'
chrome_options = uc.ChromeOptions()
chrome_options.add_experimental_option("prefs", {
    "download.default_directory": download_dir,  # Set download location
    "download.prompt_for_download": False,       # Disable download prompts
    "plugins.always_open_pdf_externally": True   # Download PDFs instead of opening them
})
service = Service(ChromeDriverManager().install())
driver = uc.Chrome(service=service, options=chrome_options)
driver.implicitly_wait(3)

### Scraping Algorithm

In [3]:
def get_base_url(url):
    parsed_url = urlparse(url)
    # Reconstruct URL without query parameters and fragment
    return urlunparse((parsed_url.scheme, parsed_url.netloc, parsed_url.path, '', '', ''))

In [4]:
def open_pdf_if_button(driver):
    try:
        driver.find_element(By.XPATH, "//embed[contains(@type, 'application/pdf')]")
        return True
    except:
        pass
    try:
        for iframe in driver.find_elements(By.TAG_NAME, "iframe"):
            try:
                frame_type = iframe.get_attribute("type")
                if frame_type == "application/pdf":
                    driver.get(iframe.get_attribute("src"))
                    return True
            except:
                print(f"Failed to get link {iframe}")
                return False
    except:
        print("No open button found for current PDF")
    return False

In [11]:
def download_pdf_urls(url, paper_index):

    driver.get(url)
    try:
    # Wait for up to 5 seconds for the button to appear and be clickable
        cookie_button = WebDriverWait(driver, 3).until(
        EC.element_to_be_clickable((By.XPATH, "//*[contains(text(), 'Accept all cookies')]"))
    )
        cookie_button.click()
        print("Accepted cookies.")
    except:
        print("No 'Accept all cookies' button found or it is not clickable.")


    pdfs_unique = set()
    pdf_links = []
    pdf_pattern = re.compile(r'(?<!e)\.pdf$|/pdf/|/articlepdf/|/article-pdf/', re.IGNORECASE)
    for link in driver.find_elements(By.TAG_NAME, "a"):
        try:
            href = link.get_attribute("href")
            try:
                link.find_element(By.XPATH, "./ancestor::div[@id='recommended-articles']")
                continue #skips if link is in recommended
            except NoSuchElementException:
                try:    
                    link.find_element(By.XPATH, "./ancestor::ol[@class='references']")
                    continue #skips if link is in references
                except:
                    if href and "scholar.google" not in href and pdf_pattern.search(href): #selenium cannot download epdfs
                        base_url = get_base_url(href)
                        if base_url not in pdfs_unique:
                            pdfs_unique.add(base_url)
                            pdf_links.append(href)
        except Exception as e:
            print(e)
            break
            print(f"Failed to get link {link}")
    i = 0
    if len(pdf_links) == 0:
        print(f"No PDF links found for paper {url}")
        return
    downloadable_links_count = 0
    for pdf_link in pdf_links:
        # Ensure each link is a full URL
        pdf_url = pdf_link if pdf_link.startswith('http') else get_base_url(url) + pdf_link
        if "pdf" not in pdf_url: #skips non-pdfs after base url is used
            continue
        try:
            num_of_files_prev = len([f for f in os.listdir(download_dir)])
            curr_url = driver.current_url
            driver.get(pdf_url)
            if curr_url != driver.current_url: # redirected to another page
                open_pdf_if_button(driver)
            time.sleep(1)
            num_of_files_now = len([f for f in os.listdir(download_dir)])
            downloadable_links_count += num_of_files_now > num_of_files_prev
            # if downloaded:
            #     time.sleep(1)
            #     os.chdir(download_dir)
            #     files = filter(os.path.isfile , os.listdir(download_dir)) 
            #     files = [os.path.join(download_dir, f) for f in files]
            #     files.sort(key=lambda x: os.path.getmtime(x))
            #     newest_file = files[-1]
            #     os.rename(newest_file, f"{paper_index}_{i}.pdf")
        except Exception as e:
            print(e)
            print(f"Skipping invalid PDF at {pdf_url}")
            continue
    downloaded_pdfs = [f for f in os.listdir(download_dir) if f.endswith('.pdf')]
    while len(downloaded_pdfs) < downloadable_links_count:
        time.sleep(1)
        downloaded_pdfs = [f for f in os.listdir(download_dir) if f.endswith('.pdf')]
    print(f"Downloaded {len(downloaded_pdfs)} PDFs for {url}")
    pdf_files = [os.path.join(download_dir, f) for f in os.listdir(download_dir) if f.endswith('.pdf')]
    i = 1
    for pdf in pdf_files:
        output_path = os.path.abspath(os.getcwd() +f'/data/pdfs/{paper_index}_{i}.pdf')
        os.rename(pdf, output_path)
        i += 1 

        
    # for pdf in pdf_files:
    #     if os.path.exists(pdf):
    #         os.remove(pdf)
    return len(pdf_files)

In [12]:
dataset = pd.read_csv('../data/crossref_data.csv')

In [13]:
dataset[dataset["URL"].str.contains('\/j\.') & (dataset["publisher"] == "Elsevier BV")]

Unnamed: 0,DOI,URL,year,title,publisher
16,10.1016/j.matlet.2024.137407,https://doi.org/10.1016/j.matlet.2024.137407,2024,['Defect passivation of organometal halide per...,Elsevier BV
39,10.1016/j.cej.2024.152955,https://doi.org/10.1016/j.cej.2024.152955,2024,['4-Methoxy phenethylammonium halide salts for...,Elsevier BV
44,10.1016/j.solmat.2023.112630,https://doi.org/10.1016/j.solmat.2023.112630,2023,['Surface defect passivation by copper incorpo...,Elsevier BV
46,10.1016/j.solener.2024.112968,https://doi.org/10.1016/j.solener.2024.112968,2024,['Defects passivation in chloride-iodide perov...,Elsevier BV
47,10.1016/j.cej.2024.150903,https://doi.org/10.1016/j.cej.2024.150903,2024,['In-Situ passivation of mixed-halide perovski...,Elsevier BV
...,...,...,...,...,...
49943,10.1016/j.jobe.2019.101080,https://doi.org/10.1016/j.jobe.2019.101080,2019,['Multi-objective design of grid-tied solar ph...,Elsevier BV
49948,10.1016/j.solener.2020.04.040,https://doi.org/10.1016/j.solener.2020.04.040,2020,['Experimental investigation on the geometric ...,Elsevier BV
49949,10.1016/j.solener.2020.04.090,https://doi.org/10.1016/j.solener.2020.04.090,2020,['Thermal performance evaluation of a passive ...,Elsevier BV
49950,10.1016/j.jastp.2020.105472,https://doi.org/10.1016/j.jastp.2020.105472,2020,['Trends of thermodynamic indices thresholds o...,Elsevier BV


In [14]:
ssrn = dataset[dataset["URL"].str.contains("ssrn")]
ssrn["URL"]

30       https://doi.org/10.2139/ssrn.4918801
33       https://doi.org/10.2139/ssrn.4803193
53       https://doi.org/10.2139/ssrn.4972021
55       https://doi.org/10.2139/ssrn.5070698
60       https://doi.org/10.2139/ssrn.4932764
                         ...                 
49515    https://doi.org/10.2139/ssrn.3604729
49546    https://doi.org/10.2139/ssrn.3603627
49859    https://doi.org/10.2139/ssrn.3517092
49880    https://doi.org/10.2139/ssrn.3735400
49894    https://doi.org/10.2139/ssrn.3716945
Name: URL, Length: 1794, dtype: object

### Running Scraping

In [15]:
i = 2000
failed_links = []
for index, row in dataset.iterrows():
    url = row["URL"]
    url = "https://doi.org/10.1016/j.matlet.2024.137407"
    print(url)
    num_links = download_pdf_urls(url, i)
    if num_links == 0:
        failed_links.append(row)
    i += 1
    time.sleep(1)
    break

https://doi.org/10.1016/j.matlet.2024.137407
No 'Accept all cookies' button found or it is not clickable.
Downloaded 1 PDFs for https://doi.org/10.1016/j.matlet.2024.137407


FileNotFoundError: [Errno 2] No such file or directory: '/var/folders/8g/4zq2fslx2tv4wd4xlkpfgyb80000gn/T/tmp8lhpc3fb/1-s2.0-S0167577X24015477-main.pdf' -> '/Users/nic-macbook/repos/DSC180_B11_Q2/q2_submission_notebooks/data/pdfs/2000_1.pdf'

In [40]:
print(os.getcwd())

c:\Users\nicco\source\repos\DSC180_B11_Q2


Convert PDFs to XML using GROBID

In [None]:
grobid_url = "http://localhost:8070/api/processFulltextDocument"
xml_names = os.listdir("../../data/xmls")

for pdf_file in os.listdir("../../data/pdfs"):
    
    #only looks at pdf files
    if pdf_file.endswith(".pdf"):
        pdf_path = os.path.join("../../data/pdfs", pdf_file)
        #doe not convert already converted files
        if pdf_path.replace('.pdf', '.xml') in xml_names:
            continue
        with open(pdf_path, 'rb') as file:
            #GROBID must be running on port 8070 for this to work
            response = requests.post(
                grobid_url,
                files={'input': file},
                headers={'Accept': 'application/xml'}
            )

            if response.status_code == 200:
                xml_file_path = os.path.join('../../data/xmls', pdf_file.replace('.pdf', '.xml'))
                with open(xml_file_path, 'w', encoding='utf-8') as xml_file:
                    xml_file.write(response.text)
            else:
                print(f"Failed to convert {pdf_file}. Status code: {response.status_code}")
                print(response.text)