## Preparing the CSV file to feed into ChatExtract

The CSV file needs 3 columns:
- passage: the title, the previous sentence, and the sentence we want to analyze
- sentence: the sentence we want to analyze
- DOI


Note: We will parse the XML to extract the information we need.

In [4]:
import pandas as pd
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import urllib
import requests
from urllib.parse import urlparse, urlunparse
from io import BytesIO
import pymupdf
import os
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import tempfile
import time
import re
import xml.etree.ElementTree as ET
import grobid_tei_xml
import json

## We will download a separate PDF for each downloadable link in paper 149 and parse each of them into XML separately

In [35]:
# Table of papers; the 'Link' column is read further down to get each paper's URL.
good_papers = pd.read_csv("../data/good_paper_links.csv")

In [36]:
# Every download goes into a fresh temporary directory.
download_dir = tempfile.mkdtemp()

# Chrome preferences: fixed download location, no prompt, and PDFs are saved
# to disk instead of being opened in the built-in viewer.
download_prefs = {
    "download.default_directory": download_dir,
    "download.prompt_for_download": False,
    "plugins.always_open_pdf_externally": True,
}

chrome_options = uc.ChromeOptions()
chrome_options.add_experimental_option("prefs", download_prefs)
driver = uc.Chrome(options=chrome_options)

In [8]:
def get_base_url(url):
    """Return ``url`` reduced to its scheme, network location and path.

    Params, query string and fragment are dropped, so links that differ only
    in those parts map to the same value.
    """
    scheme, netloc, path = urlparse(url)[:3]
    return urlunparse((scheme, netloc, path, '', '', ''))

In [9]:
def open_pdf_if_button(driver):
    """Report whether the current page shows a PDF, entering a PDF iframe if needed.

    Args:
        driver: Selenium driver positioned on the page to inspect.

    Returns:
        ``True`` when the page has an ``<embed>`` whose type contains
        ``application/pdf``, or when an ``<iframe>`` of type ``application/pdf``
        is found (the driver is then navigated to that iframe's ``src``).
        ``False`` otherwise.
    """
    try:
        driver.find_element(By.XPATH, "//embed[contains(@type, 'application/pdf')]")
        return True
    except Exception:
        # No embedded viewer (or the lookup failed): fall back to the iframes.
        pass

    try:
        iframes = driver.find_elements(By.TAG_NAME, "iframe")
    except Exception:
        print("No open button found for current PDF")
        return False

    for iframe in iframes:
        try:
            if iframe.get_attribute("type") == "application/pdf":
                driver.get(iframe.get_attribute("src"))
                return True
        except Exception:
            # One unreadable iframe must not stop the remaining ones from being checked.
            print(f"Failed to get link {iframe}")
    return False


In [15]:
def download_pdf_urls(url, paper_index, download_timeout=60):
    """Download every PDF linked from ``url`` and save each as ``{paper_index}-{n}.pdf``.

    The page is opened with the module-level Selenium ``driver``. Chrome places
    the files in ``download_dir``; each PDF found there is re-saved into the
    current working directory and the downloaded copy is then deleted.

    Args:
        url: Address of the page to scan for PDF links.
        paper_index: Prefix used for the output file names.
        download_timeout: Maximum number of seconds to wait for started
            downloads to show up as ``.pdf`` files before carrying on.

    Returns:
        The list of PDF hrefs found on the page, or ``None`` when none is found.
    """
    from urllib.parse import urljoin

    driver.get(url)

    # NOTE(review): `(?<!e)\.pdf$` rejects every URL ending in "e.pdf" (the
    # look-behind tests the character right before the dot) - confirm that
    # this is what is meant by skipping "epdf" links.
    pdf_pattern = re.compile(r'(?<!e)\.pdf$|/pdf/|/articlepdf/|/article-pdf/', re.IGNORECASE)

    # Collect candidate links, de-duplicated on the URL without query/fragment.
    seen_base_urls = set()
    pdf_links = []
    for link in driver.find_elements(By.TAG_NAME, "a"):
        try:
            href = link.get_attribute("href")
        except Exception:
            # A single unreadable anchor should not abort the whole page.
            print(f"Failed to get link {link}")
            continue
        # selenium cannot download epdfs
        if not href or "scholar.google" in href or not pdf_pattern.search(href):
            continue
        base_url = get_base_url(href)
        if base_url not in seen_base_urls:
            seen_base_urls.add(base_url)
            pdf_links.append(href)

    if not pdf_links:
        print(f"No PDF links found for paper {url}")
        return None

    # Visit every link and count how many of them made a new file appear.
    downloadable_links_count = 0
    for pdf_link in pdf_links:
        # Resolve relative links against the page URL; absolute links are kept as-is.
        pdf_url = urljoin(url, pdf_link)
        if "pdf" not in pdf_url:  # skips non-pdfs after the URL is resolved
            continue
        try:
            files_before = len(os.listdir(download_dir))
            url_before = driver.current_url
            driver.get(pdf_url)
            if url_before != driver.current_url:  # redirected to another page
                open_pdf_if_button(driver)
            time.sleep(1)
            if len(os.listdir(download_dir)) > files_before:
                downloadable_links_count += 1
        except Exception:
            print(f"Skipping invalid PDF at {pdf_url}")
            continue

    # Wait until every started download is a finished .pdf, but never forever:
    # a download that does not end in ".pdf" would otherwise block this loop.
    downloaded_pdfs = [f for f in os.listdir(download_dir) if f.endswith('.pdf')]
    print(downloaded_pdfs)
    deadline = time.monotonic() + download_timeout
    while len(downloaded_pdfs) < downloadable_links_count:
        if time.monotonic() >= deadline:
            print(f"Timed out waiting for downloads of paper {url}")
            break
        time.sleep(1)
        downloaded_pdfs = [f for f in os.listdir(download_dir) if f.endswith('.pdf')]

    # Sorted so the numbering of the output files is the same on every run.
    pdf_files = [
        os.path.join(download_dir, f)
        for f in sorted(os.listdir(download_dir))
        if f.endswith('.pdf')
    ]
    for count, pdf in enumerate(pdf_files):
        print(pdf)
        output_path = f'{paper_index}-{count}.pdf'
        # Both documents are closed on exit, so the download can be removed afterwards.
        with pymupdf.open(pdf) as source_doc, pymupdf.open() as output_doc:
            output_doc.insert_pdf(source_doc)
            output_doc.save(output_path)

    for pdf in pdf_files:
        os.remove(pdf)
    return pdf_links

In [17]:
# Only the row labelled 149 is downloaded here; all other papers are skipped.
for index, row in good_papers[good_papers.index == 149].iterrows():
    download_pdf_urls(row['Link'], index)

['41586_2021_4372_MOESM1_ESM.pdf', 's41586-021-04372-8.pdf']
C:\Users\Luna\AppData\Local\Temp\tmpud7gsy94\41586_2021_4372_MOESM1_ESM.pdf
C:\Users\Luna\AppData\Local\Temp\tmpud7gsy94\s41586-021-04372-8.pdf


## Convert these PDF into XML using Grobid

1) Run the Docker container
2) Go to the GROBID documentation, copy the terminal command given there, and run it on your machine
3) Run the code below

In [21]:
# GROBID must be running on port 8070 for this cell to work.
grobid_url = "http://localhost:8070/api/processFulltextDocument"
# Names of the XML files that already exist, so converted PDFs can be skipped.
xml_names = os.listdir("data/xmls")

for pdf_file in os.listdir("data/pdfs"):
    # Only look at PDF files.
    if not pdf_file.endswith(".pdf"):
        continue

    # Do not convert files that were already converted. The comparison uses the
    # bare file name because xml_names holds names, not paths.
    xml_name = os.path.splitext(pdf_file)[0] + ".xml"
    if xml_name in xml_names:
        continue

    pdf_path = os.path.join("data/pdfs", pdf_file)
    with open(pdf_path, 'rb') as file:
        response = requests.post(
            grobid_url,
            files={'input': file},
            headers={'Accept': 'application/xml'}
        )

    if response.status_code == 200:
        xml_file_path = os.path.join('data/xmls', xml_name)
        with open(xml_file_path, 'w', encoding='utf-8') as xml_file:
            xml_file.write(response.text)
    else:
        print(f"Failed to convert {pdf_file}. Status code: {response.status_code}")
        print(response.text)

## Convert XML into text

In [9]:
# Names of the XML files of paper 149 (`_sup` presumably stands for the
# supplementary material, `_article` for the main article).
filename_sup = "149-0.xml"
filename_article = "149-1.xml"
filenames = [filename_sup, filename_article]

# DOI of paper 149. It was looked up by hand; to feed ChatExtract with several
# papers we need a way to acquire the corresponding link of each paper.
DOI_149 = "https://doi.org/10.1038/s41586-021-04372-8"
# The links are available in this CSV file.
good_papers = pd.read_csv("../data/good_paper_links.csv")

In [10]:
def xml_to_text(file_path):
    """Flatten an XML document into plain text: title, sections, then figures.

    The namespace is taken from the root tag, so both namespaced documents
    (such as TEI) and documents without a namespace are handled.

    Args:
        file_path: Path of the XML file to read.

    Returns:
        A string made of the first ``title`` text, the text of every ``div``
        and one ``"<head> <figDesc>"`` line per ``figure``, separated by
        newlines. Missing parts are replaced by empty strings or by the
        placeholders ``"Fig:"`` / ``"unkown description"``.
    """
    root = ET.parse(file_path).getroot()

    # A namespaced root tag looks like "{uri}name"; keep only the uri.
    if root.tag.startswith('{'):
        ns_uri = root.tag[1:].split('}', 1)[0]
    else:
        ns_uri = ''

    def ns_tag(tag):
        """Qualify ``tag`` with the document namespace, if there is one."""
        return f"{{{ns_uri}}}{tag}" if ns_uri else tag

    title_element = root.find(f".//{ns_tag('title')}")
    title = ""
    if title_element is not None and title_element.text:
        title = title_element.text

    sections = ["\n".join(div.itertext()) for div in root.iter(ns_tag('div'))]

    figures = []
    for figure in root.iter(ns_tag('figure')):
        fig_head = figure.find(f".//{ns_tag('head')}")
        fig_description = figure.find(f".//{ns_tag('figDesc')}")
        # Test against None: an element without children is falsy.
        head_text = fig_head.text if fig_head is not None and fig_head.text else "Fig:"
        description_text = (
            fig_description.text
            if fig_description is not None and fig_description.text
            else "unkown description"
        )
        figures.append(head_text + " " + description_text)

    return title + "\n" + "\n".join(sections) + "\n" + "\n".join(figures)


In [11]:
def parse_grobid_xml(file_path):
    """Turn a GROBID TEI XML file into a labelled plain-text document.

    Args:
        file_path: Path of the XML file to read.

    Returns:
        A string with four labelled parts: ``Paper #:`` (the file name without
        directory and extension), ``title:``, ``Abstract:`` and ``Body:``.
        Parts missing from the parsed document are left empty.
    """
    # The XML files are written as UTF-8, so read them back with the same
    # encoding instead of the platform default.
    with open(file_path, "r", encoding="utf-8") as xml_file:
        doc = grobid_tei_xml.parse_document_xml(xml_file.read())

    title = doc.header.title or ""
    abstract = doc.abstract or ""
    body = doc.body or ""
    # basename/splitext handle both "/" and the platform separator, which
    # splitting on "/" alone does not.
    index = os.path.splitext(os.path.basename(file_path))[0]
    return f"Paper #: {index}\ntitle:{title}\nAbstract:\n{abstract}\nBody:\n{body}"

In [12]:
xml_dir = "data/xmls"
txt_dir = "data/txts"
# Make sure the output folder exists before writing into it.
os.makedirs(txt_dir, exist_ok=True)

# Sorted so the files are processed in the same order on every run.
for filename in sorted(os.listdir(xml_dir)):
    if filename.endswith(".xml"):
        print(filename)
        txt_content = parse_grobid_xml(os.path.join(xml_dir, filename))
        txt_file = os.path.join(txt_dir, f"{os.path.splitext(filename)[0]}.txt")
        with open(txt_file, "w", encoding="utf-8") as f:
            f.write(txt_content)



149-0.xml
149-1.xml


# Create rc_data.csv where sentence is a paragraph

Idea: Each line of the txt file is its own paragraph

In [13]:
# Text files of paper 149; os.path.join picks the separator of the current platform.
txt_149_article = os.path.join('data/txts', '149-1.txt')
txt_149_sup = os.path.join('data/txts', '149-0.txt')

In [14]:
## Identify the title and the paragraphs in the article text
paragraph_list = []
title = ""  # stays empty when the file has no "title:" line

# Read the article by its explicit path rather than through `txt_file`, which
# only holds whatever path the conversion loop wrote last. Backslashes are
# mapped to the platform separator so a Windows-style path opens everywhere.
article_txt_path = txt_149_article.replace("\\", os.sep)
with open(article_txt_path, "r", encoding="utf-8") as f:
    txt_list = f.readlines()

for line in txt_list:
    if line.startswith("title:"):
        # Keep everything after the "title:" label (trailing newline included).
        title = line[6:]
    elif not line.startswith(("Paper #", "Abstract:", "Body:")):
        # Every remaining line that is not a section label is one paragraph.
        paragraph_list.append(line)

    

In [19]:
## Build the paragraph-level DataFrame

# One row per paragraph; the single DOI is repeated on every row. The passage
# is the title, the previous paragraph (empty for the first row) and the
# paragraph itself, concatenated in that order.
article_149 = (
    pd.DataFrame({"sentence": paragraph_list, "DOI": DOI_149})
    .assign(
        passage=lambda frame: title + frame["sentence"].shift(1).fillna('') + frame["sentence"]
    )
    .loc[:, ['passage', 'sentence', 'DOI']]
)

article_149.to_csv("rc_data_149Paragraph.csv", index=False)

In [28]:
# Step 1: drop tabs and newlines from every paragraph and trim the ends.
# Step 2: skip paragraphs that end up empty and split the others on '. '.
cleaned_paragraph_list = [
    trimmed.split('. ')
    for trimmed in (
        raw.replace("\t", "").replace("\n", "").strip() for raw in paragraph_list
    )
    if trimmed != ""
]

# Step 3: flatten the per-paragraph lists into one list of sentences.
sentence_list = [
    sentence
    for paragraph_sentences in cleaned_paragraph_list
    for sentence in paragraph_sentences
]
print(sentence_list)


['All-perovskite tandem solar cells hold the promise of surpassing the efficiency limits of single-junction solar cells  [1] [2] [3]  ; however, until now, the best-performing all-perovskite tandem solar cells have exhibited lower certified efficiency than have single-junction perovskite solar cells  4, 5  ', 'A thick mixed Pb-Sn narrow-bandgap subcell is needed to achieve high photocurrent density in tandem solar cells 6 , yet this is challenging owing to the short carrier diffusion length within Pb-Sn perovskites', 'Here we develop ammonium-cation-passivated Pb-Sn perovskites with long diffusion lengths, enabling subcells that have an absorber thickness of approximately 1.2 μm', 'Molecular dynamics simulations indicate that widely used phenethylammonium cations are only partially adsorbed on the surface defective sites at perovskite crystallization temperatures', 'The passivator adsorption is predicted to be enhanced using 4-trifluoromethyl-phenylammonium (CF3-PA), which exhibits a s

In [29]:
## Build the sentence-level DataFrame

article_149 = pd.DataFrame({
    "sentence": sentence_list,
    "DOI": DOI_149  # the same DOI is assigned to every row
})

previous_sentence = article_149["sentence"].shift(1).fillna('')
# The sentences were split on '. ', which removed the whitespace between them.
# Put a space back after the previous sentence so the two do not run together;
# the first row has no previous sentence and gets no padding.
previous_with_gap = previous_sentence.where(previous_sentence == '', previous_sentence + ' ')

article_149["passage"] = title + previous_with_gap + article_149["sentence"]
article_149 = article_149[['passage', 'sentence', 'DOI']]

article_149.to_csv("rc_data_149Sentence.csv", index=False)

In [30]:
# Preview the first rows of the table built above.
article_149.head()


Unnamed: 0,passage,sentence,DOI
0,All-perovskite tandem solar cells with improve...,All-perovskite tandem solar cells hold the pro...,https://doi.org/10.1038/s41586-021-04372-8
1,All-perovskite tandem solar cells with improve...,A thick mixed Pb-Sn narrow-bandgap subcell is ...,https://doi.org/10.1038/s41586-021-04372-8
2,All-perovskite tandem solar cells with improve...,Here we develop ammonium-cation-passivated Pb-...,https://doi.org/10.1038/s41586-021-04372-8
3,All-perovskite tandem solar cells with improve...,Molecular dynamics simulations indicate that w...,https://doi.org/10.1038/s41586-021-04372-8
4,All-perovskite tandem solar cells with improve...,The passivator adsorption is predicted to be e...,https://doi.org/10.1038/s41586-021-04372-8
