In [1]:
import grobid_tei_xml
import pandas as pd
import os

In [8]:
def parse_grobid_xml(file_path):
    with open(file_path, "r", encoding="utf-8") as xml_file:
        try:
            doc = grobid_tei_xml.parse_document_xml(xml_file.read())
        except:
            return ""
        title = "Title: " + doc.header.title if doc.header.title else ""
        abstract = doc.abstract or ""
        body = doc.body or ""
        index = file_path.split("/")[-1].split(".")[0]
        return f"Paper #: {index}\n{title}\n{abstract}\n{body}\n" #title, abstract, body

In [9]:
def convert_grobid_xml_to_csv(input_dir, output_file, previous_batch_path=None):
    crossref_df = pd.read_csv("data/crossref_data.csv")
    crossref_df["text"] = None
    for index, _ in crossref_df.iterrows():
        i = 1
        added_count = 0
        paper_text = ""
        while added_count < 3 and i <= 6: #prevents more than 3 files from being added, up to 6 accounts for grobid failures to generate xml
            file_path = f"{input_dir}/{index}_{i}.xml"
            if os.path.exists(file_path):
                paper_text += parse_grobid_xml(file_path)
                added_count += 1
            i += 1
        crossref_df.at[index, "text"] = paper_text
    output_df = crossref_df[crossref_df["text"].str.len() > 0]
    unique_dois = output_df.groupby('DOI').first()
    if previous_batch_path is not None:
        previous_batch = pd.read_csv(previous_batch_path)
        unique_dois = pd.concat([previous_batch, unique_dois], ignore_index=True)
    unique_dois.to_csv(output_file)
    return unique_dois

In [10]:
output_df = convert_grobid_xml_to_csv("data/xmls", "data/scraped_papers_final.csv", "data/scraped_papers.csv")

In [11]:
df = pd.read_csv("data/scraped_papers_final.csv")
df

Unnamed: 0.1,Unnamed: 0,DOI,URL,year,title,publisher,text
0,0,10.1002/9781119578826.ch15,https://doi.org/10.1002/9781119578826.ch15,2024,['Quantum Dots Solar Cells'],Wiley,Paper #: 7386_1\nTitle: Role of carrier deloca...
1,1,10.1002/9781119578826.ch16,https://doi.org/10.1002/9781119578826.ch16,2024,['Singlet Fission for Solar Cells'],Wiley,Paper #: 7742_1\nTitle: Perovskite-Based Tande...
2,2,10.1002/9781119578826.ch27,https://doi.org/10.1002/9781119578826.ch27,2024,['Deployment of<scp>Solar</scp>Photovoltaic Sy...,Wiley,Paper #: 9299_1\nTitle: Inhomogeneous Halide A...
3,3,10.1002/9781119578826.ch6,https://doi.org/10.1002/9781119578826.ch6,2024,['Tunnel Oxide Passivated Contact (<scp>TOPCon...,Wiley,Paper #: 7949_1\nTitle: Efficiency assessment ...
4,4,10.1002/9781119578826.ch8,https://doi.org/10.1002/9781119578826.ch8,2024,['Update on Non‐silicon‐based Low‐Temperature ...,Wiley,Paper #: 8891_1\nTitle: Author Correction: Con...
...,...,...,...,...,...,...,...
10259,10259,,https://doi.org/10.7836/kses.2023.43.5.043,2023,['Performance Evaluation of LSTM Based Solar I...,The Korean Solar Energy Society,Paper #: 16412_1\nTitle: Supporting Informatio...
10260,10260,,https://doi.org/10.7836/kses.2023.43.6.013,2024,['A Methodology for Calculating the Limit Capa...,The Korean Solar Energy Society,Paper #: 16980_1\nTitle: Nanosecond Laser Pass...
10261,10261,,https://doi.org/10.7836/kses.2023.43.6.067,2024,['Prediction Model For Colored BIPV Modules'],The Korean Solar Energy Society,Paper #: 19568_1\nTitle: Improve Perovskite So...
10262,10262,,https://doi.org/10.7836/kses.2023.43.6.087,2024,['Transformer-Based Photovoltaic Power Predict...,The Korean Solar Energy Society,Paper #: 19781_1\nTitle: Electron-hole diffusi...
