In [1]:
import os
import urllib
import urllib.request
from io import BytesIO
from urllib.parse import urljoin, urlparse

import pandas as pd
import pymupdf
import requests
from bs4 import BeautifulSoup

## Loading the dataset

In [2]:
# Read the labelled paper table; the preview shows `link`, `label` and `text` columns.
merged_df = pd.read_csv(filepath_or_buffer='../data/merged_label.csv')
# Preview the first five rows (the default for `head`).
merged_df.head(5)

Unnamed: 0,link,label,text
0,https://www.nature.com/articles/s41566-019-0398-2,1,Surface passivation of perovskite film for eff...
1,https://www.nature.com/articles/s41560-020-007...,1,Intact 2D/3D halide junction perovskite solar ...
2,https://www.nature.com/articles/s41467-021-236...,1,Multication perovskite 2D/3D interfaces form v...
3,https://doi.org/10.1038%2Fs41586-022-04604-5,1,Stability-limiting heterointerfaces of perovsk...
4,https://doi.org/10.1038%2Fs41467-022-30426-0,1,Imaging and quantifying non-radiative losses a...


## Making PDFs of each paper

In [None]:
def get_base_url(url):
    """Return the ``scheme://host`` prefix of ``url``.

    Example: ``https://www.nature.com/articles/x`` -> ``https://www.nature.com``.
    Path, query and fragment are discarded and no trailing slash is appended.
    """
    # A parse result is a tuple whose first two fields are (scheme, netloc).
    scheme, netloc = urlparse(url)[:2]
    return "{}://{}".format(scheme, netloc)


In [None]:
def get_page(url, timeout=30):
    """Open ``url`` with browser-like request headers and return the response.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait on blocking network operations before giving
            up (default 30). Pass ``None`` to disable the timeout.

    Returns:
        The response object produced by ``urllib.request.urlopen``. The caller
        is responsible for reading it and closing it.

    Raises:
        urllib.error.URLError: on a network failure; HTTP error statuses are
            raised as its subclass ``urllib.error.HTTPError``.
    """
    # Browser-like headers: sent so the request looks like a regular Firefox visit.
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
    }
    req = urllib.request.Request(url, headers=browser_headers)

    # Without a timeout a single stalled server would hang the whole batch run.
    return urllib.request.urlopen(req, timeout=timeout)

In [None]:
def get_actual_url(url, timeout=30):
    """Follow redirects from ``url`` and return the base of the final address.

    Note that only ``scheme://host/`` of the post-redirect URL is returned
    (with a trailing slash), not the full URL; the full final URL is printed.

    Args:
        url: Address to resolve (e.g. a DOI link that redirects to a publisher).
        timeout: Seconds to wait for the server before giving up (default 30).
            Pass ``None`` to wait indefinitely.

    Returns:
        ``"<scheme>://<netloc>/"`` of the URL reached after redirection.

    Raises:
        requests.RequestException: if the request fails or times out.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'}
    # requests never times out by default, so an unresponsive host would block forever.
    r = requests.get(url, headers=headers, allow_redirects=True, timeout=timeout)
    parsed_uri = urlparse(r.url)
    print(r.url)
    return f'{parsed_uri.scheme}://{parsed_uri.netloc}/'

In [None]:
def reset_eof(pdf_content):
    """Return ``pdf_content`` (bytes) with a single ``%%EOF`` marker at the very end.

    Intended for PDFs whose end-of-file marker appears early, which is
    reported to confuse PyMuPDF.

    * If the marker occurs anywhere, every occurrence is stripped out and one
      marker is appended to the end.
    * If it does not occur at all, the last six bytes are dropped and replaced
      by the marker (a garbled trailer such as ``b'\\n%%EO%E'`` was observed
      in such files).
    """
    marker = b'%%EOF'

    if marker not in pdf_content:
        # No marker at all: overwrite the tail of the file with a proper one.
        return pdf_content[:-6] + marker

    # Splitting on the marker and re-joining removes every occurrence of it.
    without_markers = b''.join(pdf_content.split(marker))
    return without_markers + marker

In [None]:
def download_all_pdfs(url, paper_index, output_dir='../data/pdfs', timeout=60):
    """Find every PDF linked from a paper's page and merge them into one file.

    The page at ``url`` is fetched with ``get_page``; every ``<a href>`` whose
    URL path contains ``pdf`` is treated as a PDF link. Links are resolved
    against the URL the page was actually served from (after redirects),
    de-duplicated in page order, downloaded, and appended page by page into a
    single PyMuPDF document saved as ``<output_dir>/<paper_index>.pdf``.
    Links that cannot be downloaded or parsed as PDF are skipped with a message.

    Args:
        url: Address of the paper's landing page (may redirect, e.g. a DOI).
        paper_index: Identifier used as the output file's base name.
        output_dir: Directory for the merged PDF; created if missing.
        timeout: Seconds to wait for each PDF download before skipping it.

    Returns:
        None. The result is the file written to disk (nothing is written when
        no page could be collected).

    Raises:
        Whatever ``get_page`` raises when the landing page itself cannot be
        fetched (e.g. ``urllib.error.URLError``).
    """
    # Read the page and remember where it was really served from; the response
    # is closed as soon as the HTML has been read.
    with get_page(url) as page:
        page_url = page.geturl()
        html = page.read()
    print(page_url)

    # Hand BeautifulSoup the raw bytes so it can detect the page's encoding
    # instead of assuming UTF-8.
    soup = BeautifulSoup(html, 'html.parser')

    # Collect absolute PDF URLs. urljoin handles absolute, root-relative,
    # document-relative and protocol-relative hrefs alike. dict keys keep the
    # order of appearance, so the merged PDF is the same on every run.
    unique_urls = {}
    for anchor in soup.find_all('a', href=True):
        href = anchor.get('href')
        if not href:
            continue
        try:
            if 'pdf' in urlparse(href).path.lower():
                unique_urls[urljoin(page_url, href)] = None
        except ValueError:
            # Malformed href (e.g. a broken IPv6 literal): not a usable link.
            continue
    pdf_urls = list(unique_urls)
    print(pdf_urls)

    # New empty PDF document that receives the pages of every download.
    merged_pdf = pymupdf.open()
    try:
        for pdf_url in pdf_urls:
            print(pdf_url)
            try:
                res = requests.get(pdf_url, allow_redirects=True, timeout=timeout)
                # Treat 4xx/5xx as failures rather than parsing an error page.
                res.raise_for_status()
                print(f"Processing PDF URL: {res.url}")
            except requests.RequestException as e:
                print(f"Skipping invalid PDF at {pdf_url}: {e}")
                continue

            # Load the downloaded bytes as a PDF and append all of its pages.
            try:
                with pymupdf.open(stream=BytesIO(res.content), filetype="pdf") as pdf_document:
                    if pdf_document.page_count > 0:
                        merged_pdf.insert_pdf(pdf_document)
            except Exception as e:
                # Best effort: content that is not a readable PDF is skipped.
                print(f"Skipping invalid PDF at {pdf_url}: {e}")

        # Save the merged PDF to disk
        if merged_pdf.page_count > 0:
            os.makedirs(output_dir, exist_ok=True)
            output_path = os.path.join(output_dir, f'{paper_index}.pdf')
            merged_pdf.save(output_path)
            print(f"Merged PDF saved as '{output_path}'")
        else:
            print(f"No PDF links found for index {paper_index}.")
            #TODO : use selenium to download the pdf by printing
    finally:
        # Release the document whether or not anything was saved.
        merged_pdf.close()

In [None]:
# Create one merged PDF per row of the merged dataframe.
# Papers whose PDF already exists are skipped so the cell can be re-run cheaply,
# and a failure on one paper is reported without aborting the rest of the batch.
for index, row in merged_df.iterrows():
    print(row['link'])
    if os.path.exists(f'../data/pdfs/{index}.pdf'):
        print(f"PDF for index {index} already exists, skipping.")
        continue
    try:
        download_all_pdfs(row['link'], index)
    except Exception as e:
        # e.g. the landing page is unreachable or answers with an HTTP error
        print(f"Failed to process index {index} ({row['link']}): {e}")

In [107]:
# Base names (extension removed) of every file currently in the PDF folder.
existing_files = {stem for stem, _ext in map(os.path.splitext, os.listdir('../data/pdfs'))}

# List the papers that still have no PDF on disk: index first, then link.
# These were handled manually for now; Selenium printing is planned for later.
for index, row in merged_df.iterrows():
    index_str = str(index)
    if index_str in existing_files:
        continue
    print(index)
    print(row['link'])

38
https://pubs.rsc.org/en/content/articlehtml/2019/ta/c9ta01070j
39
https://pubs.rsc.org/en/content/articlehtml/2019/ee/c9ee00453j
71
https://doi.org/10.48550/arXiv.2102.10399
74
https://doi.org/10.1146/annurev-food-022814-015651
76
https://doi.org/10.1039/C9CS00711C
82
https://doi.org/10.1039/TF937330008B
90
https://doi.org/10.1039%2FC7TA00434F
97
https://doi.org/10.1017/CBO9780511608810
100
https://doi.org/10.1146/annurev.bb.12.060183.001035
105
https://doi.org/10.1146/annurev.physchem.51.1.209
120
https://doi.org/10.1039/D1SM01707A
125
https://doi.org/10.1039%2FC4TA05033A
128
https://doi.org/10.1146/annurev.ms.12.080182.000535
129
https://doi.org/10.1039/DC9786500007


## Convert PDFs to XML using GROBID

In [None]:
# Convert every PDF in ../data/pdfs to XML with GROBID's full-text endpoint.
# GROBID must be running on port 8070 for this to work.
grobid_url = "http://localhost:8070/api/processFulltextDocument"
os.makedirs("../data/xmls", exist_ok=True)
xml_names = os.listdir("../data/xmls")

for pdf_file in os.listdir("../data/pdfs"):
    # only look at pdf files
    if not pdf_file.endswith(".pdf"):
        continue

    # Name of the XML this PDF maps to. It is compared as a bare file name,
    # because that is what os.listdir returned for the XML folder.
    xml_name = os.path.splitext(pdf_file)[0] + '.xml'
    # do not convert already converted files
    if xml_name in xml_names:
        continue

    pdf_path = os.path.join("../data/pdfs", pdf_file)
    with open(pdf_path, 'rb') as file:
        response = requests.post(
            grobid_url,
            files={'input': file},
            headers={'Accept': 'application/xml'}
        )

    if response.status_code == 200:
        xml_file_path = os.path.join('../data/xmls', xml_name)
        # Write the bytes exactly as GROBID sent them, so the file always
        # matches the encoding declared inside the XML itself.
        with open(xml_file_path, 'wb') as xml_file:
            xml_file.write(response.content)
        print(f"Converted {pdf_file} to XML.")
    else:
        print(f"Failed to convert {pdf_file}. Status code: {response.status_code}")
        print(response.text)