In [4]:
import pandas as pd
from bs4 import BeautifulSoup
import urllib
import requests
from urllib.parse import urlparse, urlunparse
from io import BytesIO
import pymupdf
import os
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import tempfile
import time
import re
import xml.etree.ElementTree as ET
import grobid_tei_xml
import json

## Loading in dataset

In [5]:
good_papers = pd.read_csv("../data/good_paper_links.csv")

## Scraping with Selenium

In [16]:
download_dir = tempfile.mkdtemp()
chrome_options = uc.ChromeOptions()
chrome_options.add_experimental_option("prefs", {
    "download.default_directory": download_dir,  # Set download location
    "download.prompt_for_download": False,       # Disable download prompts
    "plugins.always_open_pdf_externally": True   # Download PDFs instead of opening them
})
driver = uc.Chrome(options=chrome_options)

In [17]:
download_dir

'C:\\Users\\nicco\\AppData\\Local\\Temp\\tmpnpfzn9o3'

In [18]:
def get_base_url(url):
    parsed_url = urlparse(url)
    # Reconstruct URL without query parameters and fragment
    return urlunparse((parsed_url.scheme, parsed_url.netloc, parsed_url.path, '', '', ''))

In [19]:
def open_pdf_if_button(driver):
    try:
        driver.find_element(By.XPATH, "//embed[contains(@type, 'application/pdf')]")
        return True
    except:
        pass
    try:
        for iframe in driver.find_elements(By.TAG_NAME, "iframe"):
            try:
                frame_type = iframe.get_attribute("type")
                if frame_type == "application/pdf":
                    driver.get(iframe.get_attribute("src"))
                    return True
            except:
                print(f"Failed to get link {iframe}")
                return False
    except:
        print("No open button found for current PDF")
    return False


In [29]:
def download_pdf_urls(url, paper_index):

    driver.get(url)

    pdfs_unique = set()
    pdf_links = []
    pdf_pattern = re.compile(r'(?<!e)\.pdf$|/pdf/|/articlepdf/|/article-pdf/', re.IGNORECASE)
    for link in driver.find_elements(By.TAG_NAME, "a"):
        try:
            href = link.get_attribute("href")
            if href and "scholar.google" not in href and pdf_pattern.search(href): #selenium cannot download epdfs
                base_url = get_base_url(href)
                if base_url not in pdfs_unique:
                    pdfs_unique.add(base_url)
                    pdf_links.append(href)
        except:
            print(f"Failed to get link {link}")
            return
    merged_pdf = pymupdf.open()
    i = 0
    if len(pdf_links) == 0:
        print(f"No PDF links found for paper {url}")
        return
    downloadable_links_count = 0
    for pdf_link in pdf_links:
        # Ensure each link is a full URL
        pdf_url = pdf_link if pdf_link.startswith('http') else get_base_url(url) + pdf_link
        if "pdf" not in pdf_url: #skips non-pdfs after base url is used
            continue
        try:
            num_of_files_prev = len([f for f in os.listdir(download_dir)])
            curr_url = driver.current_url
            driver.get(pdf_url)
            if curr_url != driver.current_url: # redirected to another page
                open_pdf_if_button(driver)
            time.sleep(1)
            num_of_files_now = len([f for f in os.listdir(download_dir)])
            downloadable_links_count += num_of_files_now > num_of_files_prev
        except:
            print(f"Skipping invalid PDF at {pdf_url}")
            continue
    downloaded_pdfs = [f for f in os.listdir(download_dir) if f.endswith('.pdf')]
    print(downloaded_pdfs)
    while len(downloaded_pdfs) < downloadable_links_count:
        time.sleep(1)
        downloaded_pdfs = [f for f in os.listdir(download_dir) if f.endswith('.pdf')]
        
    pdf_files = [os.path.join(download_dir, f) for f in os.listdir(download_dir) if f.endswith('.pdf')]
    output_path = f'../../data/pdfs/{paper_index}.pdf'
    for pdf in pdf_files:
        print(pdf)
        merged_pdf.insert_pdf(pymupdf.open(pdf))
    merged_pdf.save(output_path)
    merged_pdf.close()
    
    for pdf in pdf_files:
        os.remove(pdf)
    print(f"Merged PDF saved as {output_path}")
    return pdf_links

In [None]:
for index, row in good_papers.iterrows():
    download_pdf_urls(row['Link'], index)

TODO: use base url for checking duplicates but the actual url for downloading (reference i = 12)
TODO: not getting links for i = 40

## Convert PDFs to XML using GROBID

In [45]:
grobid_url = "http://localhost:8070/api/processFulltextDocument"
xml_names = os.listdir("../../data/xmls")

for pdf_file in os.listdir("../../data/pdfs"):
    #only looks at pdf files
    if pdf_file.endswith(".pdf"):
        pdf_path = os.path.join("../../data/pdfs", pdf_file)
        #doe not convert already converted files
        if pdf_path.replace('.pdf', '.xml') in xml_names:
            continue
        with open(pdf_path, 'rb') as file:
            #GROBID must be running on port 8070 for this to work
            response = requests.post(
                grobid_url,
                files={'input': file},
                headers={'Accept': 'application/xml'}
            )

            if response.status_code == 200:
                xml_file_path = os.path.join('../../data/xmls', pdf_file.replace('.pdf', '.xml'))
                with open(xml_file_path, 'w', encoding='utf-8') as xml_file:
                    xml_file.write(response.text)
            else:
                print(f"Failed to convert {pdf_file}. Status code: {response.status_code}")
                print(response.text)

Failed to convert 38.pdf. Status code: 500
[BAD_INPUT_DATA] PDF to XML conversion failed with error code: 139


In [46]:
def xml_to_text(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()
    print(file_path)
    print(root.find(".//title"))
    title = ""
    sections = []
    figures = []
    
    namespace = { 'd': root.tag.split('}')[0].strip('{') if '}' in root.tag else '' }
    def ns_tag(tag):
        return f"{{{namespace}}}tag" if namespace else tag

    print(namespace)
    namespace = { 'd': "http://www.tei-c.org/ns/1.0" }
    title_element = root.find('d:title', namespace)
    if title_element is not None:
        title = title_element.text
    
    for div in root.findall(".//div"):
        section_text = "/n".join(div.itertext())
        sections.append(section_text)
    
    for figure in root.findall(".//figure"):
        fig_head = figure.find(".//head")
        fig_description = figure.find(".//figDesc")
        fig_info = (fig_head.text if fig_head else "Fig:") + " " 
        + (fig_description.text if fig_description else "unkown description")
        figures.append(fig_info)
    return title + "/n" + "/n".join(sections) + "/n" + "/n".join(figures)



In [51]:
def parse_grobid_xml(file_path):
    with open(file_path, "r") as xml_file:
        doc = grobid_tei_xml.parse_document_xml(xml_file.read())
        title = doc.header.title or ""
        abstract = doc.abstract or ""
        body = doc.body or ""
        index = file_path.split("/")[-1].split(".")[0]
        return f"Paper #: {index}\n{title}\n{abstract}\n{body}" #title, abstract, body
    # print(json.dumps(doc.to_dict(), indent=2))

In [None]:
xml_dir = "../../data/xmls"
txt_dir = "../../data/txts"
for filename in os.listdir("../../data/xmls"):
    if filename.endswith(".xml"):
        print(filename)
        txt_content = parse_grobid_xml(os.path.join(xml_dir, filename))
        txt_file = os.path.join(txt_dir, f"{os.path.splitext(filename)[0]}.txt")
        with open(txt_file, "w", encoding="utf-8") as f:
            f.write(txt_content)

NameError: name 'os' is not defined

In [27]:
good_papers.iloc[34]

Link    https://doi.org/10.1063%2F1.1840606
Name: 34, dtype: object

In [30]:
for index, row in good_papers.iterrows():
    if index == 34:
        print(row['Link'])
        download_pdf_urls(row['Link'], index)

https://doi.org/10.1063%2F1.1840606
['4590_1_online.pdf']
C:\Users\nicco\AppData\Local\Temp\tmpnpfzn9o3\4590_1_online.pdf
C:\Users\nicco\AppData\Local\Temp\tmpnpfzn9o3\ContentPlatform_UserGuide_FINAL.pdf
Merged PDF saved as ../../data/pdfs/34.pdf
