# Pip Install

In [None]:
# Run this cell once per environment.
# NOTE(review): elsapy is installed twice below -- first from the GitHub master
# archive, then from PyPI; the second command may replace the first. Confirm
# which build is intended.
# NOTE(review): later cells call to_parquet(..., engine = "pyarrow"), but
# pyarrow is not installed by this cell; only fastparquet is.
!pip install https://github.com/ElsevierDev/elsapy/archive/master.zip
!pip install elsapy
!pip install xmltodict
!pip install pubchempy
!pip install fastparquet

# Libraries

In [None]:
# Elsevier Libraries
from elsapy.elsclient import ElsClient
from elsapy.elsprofile import ElsAuthor, ElsAffil
from elsapy.elsdoc import FullDoc, AbsDoc
from elsapy.elssearch import ElsSearch
# Libraries for URL access & basic functionality
import json
import csv
import pprint
import requests
import xmltodict
import urllib3
import numpy as np
import pandas as pd
from tqdm import tqdm
from pprint import pprint
from time import sleep
from pathlib import Path
import os
import itertools 
from requests.adapters import HTTPAdapter, Retry
import urllib.parse
# Import warnings library
import warnings
warnings.filterwarnings("ignore")
# xml.etree.ElementTree — The ElementTree XML API
import xml.etree.ElementTree as ET
import os
import pandas as pd
import glob

# API Key & Institution Token

In [None]:
# Credentials for the Elsevier APIs; replace both placeholders with real values.
CONFIG = dict(apikey="YOUR_API_KEY", insttoken="YOUR_INST_TOKEN")
API_KEY, INST_TOKEN = CONFIG["apikey"], CONFIG["insttoken"]

# Authenticated elsapy client, used later for the full-text/abstract retrieval.
client = ElsClient(API_KEY)
client.inst_token = INST_TOKEN

# The search API returns at most 6000 hits, delivered 100 per page.
# NUM holds the 60 page start offsets: 0, 100, 200, ..., 5900.
NUM = np.arange(0, 6000, 100, dtype=int)

# Root folder for all query results; created on first use.
QUERY_PATH = "D:/MS_Thesis/API_Queries/"
os.makedirs(QUERY_PATH, exist_ok=True)
os.path.exists(QUERY_PATH)

# Lists for Queries

## Plastic polymers, plastic type, recycling technologies

In [None]:
# Search terms for each polymer, grouped by recycling category (1-7).
category1 = [
    "polyethylene terephthalate",
    "pet",
]
category2 = [
    "high-density polyethylene",
    "high density polyethylene",
    "hdpe",
]
category3 = [
    "polyvinyl chloride",
    "pvc",
]
category4 = [
    "low-density polyethylene",
    "low density polyethylene",
    "ldpe",
]
category5 = [
    "polypropylene",
    "pp",
]
category6 = [
    "polystyrene",
    "styrene",
    "PS",
]
category7 = [
    "polyurethane",
    "PUR",
    "acrylonitrile butadiene styrene",
    "ABS",
    "polyacrylate",
    "acrylic",
    "polycarbonate",
    "PC",
    "nylon",
    "polylactic acid",
    "biodegradable plastic",
    "bio-plastic",
    "bioplastic",
    "bioplastics",
    "polymethyl methacrylate",
    "PMMA",
    "polytetrafluoroethylene",
    "teflon",
    "polyimide",
    "polysulfone",
    "polyethersulfone",
    "polyarylsulfone",
    "polyphenylene sulfide",
]

# Search terms for plastic product types (singular and plural spellings).
plastic_types = [
    "films", "film",
    "bottle", "bottles",
    "label", "labels",
    "container", "containers",
    "wrap", "wrapper",
    "bag", "bags",
    "multi-layer", "multi-layers",
    "multilayer", "multilayers",
    "multimaterial", "multimaterials",
    "multi-materials", "multi-material",
    "sack", "sacks",
    "cap", "caps",
    "lid", "lids",
    "coating", "coatings",
    "basket", "baskets",
    "tray", "trays",
]

# Every polymer search term in one flat list, in category order.
all_polymers = [
    *category1,
    *category2,
    *category3,
    *category4,
    *category5,
    *category6,
    *category7,
]

#  Functions

In [None]:
def XML_DOI(link, api_key=None, inst_token=None, out_path="doi.xml"):
    """
    Fetch the XML returned by an Elsevier API URL & save it to disk.

    Parameters
    ----------
    link : str
        Full URL of the endpoint that returns XML.
    api_key : str, optional
        API key sent in the "X-ELS-APIKey" header. Defaults to the
        notebook-level API_KEY.
    inst_token : str, optional
        Institution token sent in the "X-ELS-Insttoken" header. Defaults to
        the notebook-level INST_TOKEN.
    out_path : str, optional
        File the response body is written to. Defaults to "doi.xml".

    Returns
    -------
    None
        Writes the raw response bytes to `out_path`, overwriting any
        previous content. The HTTP status is not checked, so an error
        response is written to the file as well.
    """

    # Use the credentials configured once at the top of the notebook instead
    # of a second, hard-coded copy of the placeholders.
    headers_dict = {
        "X-ELS-APIKey": API_KEY if api_key is None else api_key,
        "X-ELS-Insttoken": INST_TOKEN if inst_token is None else inst_token,
        "Accept": "application/xml",
    }

    # Send the GET request; the response body is the XML document
    response = requests.get(link, headers=headers_dict)

    # Save raw XML content to local file
    with open(out_path, "wb") as f:
        f.write(response.content)

# QUERY: Plastic {plastic_type} (Life Cycle Analysis OR Life Cycle Assessment)

In [None]:
# Collect ScienceDirect search hits for "plastic" + each product type + LCA
# terms, and store one Parquet file of Title / Pub_name / DOI / Link per type.

# XML namespaces of the elements read from each search result entry
atom_ns = "{http://www.w3.org/2005/Atom}"
prism_ns = "{http://prismstandard.org/namespaces/basic/2.0/}"
dc_ns = "{http://purl.org/dc/elements/1.1/}"

# to_parquet does not create missing folders, so make the target folder first
output_dir = os.path.join(QUERY_PATH, "Plastic_Product_LCA")
os.makedirs(output_dir, exist_ok=True)

for word in plastic_types:
    # Both lists are rebuilt for every product type so that each output file
    # only holds the hits of its own query.
    link_list = []  # One search URL per result page
    info_list = []  # One metadata dictionary per search hit

    # Build raw query with Boolean logic, see Elsevier documentation!
    raw_query1 = "'plastic' AND '{}' AND ('life cycle analysis' OR 'life cycle assessment')".format(word)
    encoded_query1 = urllib.parse.quote(raw_query1)

    # One URL per page: NUM holds the start offsets, 100 hits per page
    for offset in NUM:
        link_list.append(
            "https://api.elsevier.com/content/search/sciencedirect?"
            f"start={offset}&count=100"
            f"&query={encoded_query1}"
            f"&apiKey={API_KEY}&insttoken={INST_TOKEN}"
        )

    # Probe the first page as JSON to report the hit count or the service error
    response = requests.get(link_list[0])
    data = response.json()
    if "service-error" in data:
        print(f"Error for '{word}':", data["service-error"]["status"]["statusText"])
    else:
        total_results = int(data.get("search-results", {}).get("opensearch:totalResults", 0))
        print(f"'{word}': {total_results} results found")

    # Fetch each page exactly once (XML_DOI writes it to doi.xml) and parse it
    for page_url in tqdm(link_list):
        XML_DOI(page_url)
        root = ET.parse("doi.xml").getroot()
        for entry in root.findall(f"{atom_ns}entry"):
            # A missing or empty element is stored as None instead of crashing
            info_list.append({
                "URL": entry.findtext(f"{prism_ns}url") or None,
                "Title": entry.findtext(f"{dc_ns}title") or None,
                "Pub_Name": entry.findtext(f"{prism_ns}publicationName") or None,
                "DOI": entry.findtext(f"{prism_ns}doi") or None,
            })

    print("Total number of papers:", len(info_list))

    # Data frame with the metadata of this query
    df = pd.DataFrame(info_list, columns=["Title", "Pub_Name", "DOI"])
    df = df.rename(columns={"Pub_Name": "Pub_name"})

    # Article URL per DOI, authenticated with the configured credentials.
    # NOTE(review): this stores the API key and institution token inside the
    # saved file -- confirm that is acceptable before sharing the output.
    string1 = "https://api.elsevier.com/content/article/doi/"
    string2 = f"?apiKey={API_KEY}&insttoken={INST_TOKEN}"
    df["Link"] = df["DOI"].apply(lambda x: string1 + str(x) + string2)

    # gzip compression matches the ".gzip" file extension
    df.to_parquet(os.path.join(output_dir, f"plastic_{word}_LCA_links.gzip"),
                  compression="gzip")

# QUERY: Recycling Technology OR Recycle Plastic {plastic_type}

In [None]:
# Collect ScienceDirect search hits for recycling terms + "plastic" + each
# product type, and store one Parquet file of Title / Pub_name / DOI / Link
# per product type.

# XML namespaces of the elements read from each search result entry
atom_ns = "{http://www.w3.org/2005/Atom}"
prism_ns = "{http://prismstandard.org/namespaces/basic/2.0/}"
dc_ns = "{http://purl.org/dc/elements/1.1/}"

# to_parquet does not create missing folders, so make the target folder first
output_dir = os.path.join(QUERY_PATH, "Recycle_Plastic_Products")
os.makedirs(output_dir, exist_ok=True)

for word in plastic_types:
    # Both lists are rebuilt for every product type so that each output file
    # only holds the hits of its own query.
    link_list = []  # One search URL per result page
    info_list = []  # One metadata dictionary per search hit

    # Build raw query with Boolean logic, see Elsevier documentation!
    raw_query2 = "('recycling technology' OR 'recycle') AND 'plastic' AND '{}'".format(word)
    encoded_query2 = urllib.parse.quote(raw_query2)

    # One URL per page: NUM holds the start offsets, 100 hits per page
    for offset in NUM:
        link_list.append(
            "https://api.elsevier.com/content/search/sciencedirect?"
            f"start={offset}&count=100"
            f"&query={encoded_query2}"
            f"&apiKey={API_KEY}&insttoken={INST_TOKEN}"
        )

    # Probe the first page as JSON to report the hit count or the service error
    response = requests.get(link_list[0])
    data = response.json()
    if "service-error" in data:
        print(f"Error for '{word}':", data["service-error"]["status"]["statusText"])
    else:
        total_results = int(data.get("search-results", {}).get("opensearch:totalResults", 0))
        print(f"'{word}': {total_results} results found")

    # Fetch each page exactly once (XML_DOI writes it to doi.xml) and parse it
    for page_url in tqdm(link_list):
        XML_DOI(page_url)
        root = ET.parse("doi.xml").getroot()
        for entry in root.findall(f"{atom_ns}entry"):
            # A missing or empty element is stored as None instead of crashing
            info_list.append({
                "URL": entry.findtext(f"{prism_ns}url") or None,
                "Title": entry.findtext(f"{dc_ns}title") or None,
                "Pub_Name": entry.findtext(f"{prism_ns}publicationName") or None,
                "DOI": entry.findtext(f"{prism_ns}doi") or None,
            })

    print("Total number of papers:", len(info_list))

    # Data frame with the metadata of this query
    df = pd.DataFrame(info_list, columns=["Title", "Pub_Name", "DOI"])
    df = df.rename(columns={"Pub_Name": "Pub_name"})

    # Article URL per DOI, authenticated with the configured credentials.
    # NOTE(review): this stores the API key and institution token inside the
    # saved file -- confirm that is acceptable before sharing the output.
    string1 = "https://api.elsevier.com/content/article/doi/"
    string2 = f"?apiKey={API_KEY}&insttoken={INST_TOKEN}"
    df["Link"] = df["DOI"].apply(lambda x: string1 + str(x) + string2)

    # gzip compression matches the ".gzip" file extension
    df.to_parquet(os.path.join(output_dir, f"recycle_plastic_{word}_links.gzip"),
                  compression="gzip")

# QUERY: {polymer type} Life Cycle Analysis OR Life Cycle Assessment

In [None]:
# Collect ScienceDirect search hits for each polymer term + LCA terms, and
# store one Parquet file of Title / Pub_name / DOI / Link per polymer term.

# XML namespaces of the elements read from each search result entry
atom_ns = "{http://www.w3.org/2005/Atom}"
prism_ns = "{http://prismstandard.org/namespaces/basic/2.0/}"
dc_ns = "{http://purl.org/dc/elements/1.1/}"

# to_parquet does not create missing folders, so make the target folder first
output_dir = os.path.join(QUERY_PATH, "Polymer_LCA")
os.makedirs(output_dir, exist_ok=True)

for word in all_polymers:
    # Both lists are rebuilt for every polymer term so that each output file
    # only holds the hits of its own query.
    link_list = []  # One search URL per result page
    info_list = []  # One metadata dictionary per search hit

    # Build raw query with Boolean logic, see Elsevier documentation!
    raw_query3 = "'{}' AND ('life cycle analysis' OR 'life cycle assessment')".format(word)
    encoded_query3 = urllib.parse.quote(raw_query3)

    # One URL per page: NUM holds the start offsets, 100 hits per page
    for offset in NUM:
        link_list.append(
            "https://api.elsevier.com/content/search/sciencedirect?"
            f"start={offset}&count=100"
            f"&query={encoded_query3}"
            f"&apiKey={API_KEY}&insttoken={INST_TOKEN}"
        )

    # Probe the first page as JSON to report the hit count or the service error
    response = requests.get(link_list[0])
    data = response.json()
    if "service-error" in data:
        print(f"Error for '{word}':", data["service-error"]["status"]["statusText"])
    else:
        total_results = int(data.get("search-results", {}).get("opensearch:totalResults", 0))
        print(f"'{word}': {total_results} results found")

    # Fetch each page exactly once (XML_DOI writes it to doi.xml) and parse it
    for page_url in tqdm(link_list):
        XML_DOI(page_url)
        root = ET.parse("doi.xml").getroot()
        for entry in root.findall(f"{atom_ns}entry"):
            # A missing or empty element is stored as None instead of crashing
            info_list.append({
                "URL": entry.findtext(f"{prism_ns}url") or None,
                "Title": entry.findtext(f"{dc_ns}title") or None,
                "Pub_Name": entry.findtext(f"{prism_ns}publicationName") or None,
                "DOI": entry.findtext(f"{prism_ns}doi") or None,
            })

    print("Total number of papers:", len(info_list))

    # Data frame with the metadata of this query
    df = pd.DataFrame(info_list, columns=["Title", "Pub_Name", "DOI"])
    df = df.rename(columns={"Pub_Name": "Pub_name"})

    # Article URL per DOI, authenticated with the configured credentials.
    # NOTE(review): this stores the API key and institution token inside the
    # saved file -- confirm that is acceptable before sharing the output.
    string1 = "https://api.elsevier.com/content/article/doi/"
    string2 = f"?apiKey={API_KEY}&insttoken={INST_TOKEN}"
    df["Link"] = df["DOI"].apply(lambda x: string1 + str(x) + string2)

    # gzip compression matches the ".gzip" file extension
    df.to_parquet(os.path.join(output_dir, f"{word}_LCA_links.gzip"),
                  compression="gzip")

# QUERY: Recycling Technology OR Recycle {polymer}

In [None]:
# Collect ScienceDirect search hits for recycling terms + each polymer term,
# and store one Parquet file of Title / Pub_name / DOI / Link per polymer term.

# XML namespaces of the elements read from each search result entry
atom_ns = "{http://www.w3.org/2005/Atom}"
prism_ns = "{http://prismstandard.org/namespaces/basic/2.0/}"
dc_ns = "{http://purl.org/dc/elements/1.1/}"

# to_parquet does not create missing folders, so make the target folder first
output_dir = os.path.join(QUERY_PATH, "Recycle_Polymers")
os.makedirs(output_dir, exist_ok=True)

for word in all_polymers:
    # Both lists are rebuilt for every polymer term so that each output file
    # only holds the hits of its own query.
    link_list = []  # One search URL per result page
    info_list = []  # One metadata dictionary per search hit

    # Build raw query with Boolean logic, see Elsevier documentation!
    raw_query4 ="('recycling technology' OR 'recycle') AND '{}'".format(word)
    encoded_query4 = urllib.parse.quote(raw_query4)

    # One URL per page: NUM holds the start offsets, 100 hits per page
    for offset in NUM:
        link_list.append(
            "https://api.elsevier.com/content/search/sciencedirect?"
            f"start={offset}&count=100"
            f"&query={encoded_query4}"
            f"&apiKey={API_KEY}&insttoken={INST_TOKEN}"
        )

    # Probe the first page as JSON to report the hit count or the service error
    response = requests.get(link_list[0])
    data = response.json()
    if "service-error" in data:
        print(f"Error for '{word}':", data["service-error"]["status"]["statusText"])
    else:
        total_results = int(data.get("search-results", {}).get("opensearch:totalResults", 0))
        print(f"'{word}': {total_results} results found")

    # Fetch each page exactly once (XML_DOI writes it to doi.xml) and parse it
    for page_url in tqdm(link_list):
        XML_DOI(page_url)
        root = ET.parse("doi.xml").getroot()
        for entry in root.findall(f"{atom_ns}entry"):
            # A missing or empty element is stored as None instead of crashing
            info_list.append({
                "URL": entry.findtext(f"{prism_ns}url") or None,
                "Title": entry.findtext(f"{dc_ns}title") or None,
                "Pub_Name": entry.findtext(f"{prism_ns}publicationName") or None,
                "DOI": entry.findtext(f"{prism_ns}doi") or None,
            })

    print("Total number of papers:", len(info_list))

    # Data frame with the metadata of this query
    df = pd.DataFrame(info_list, columns=["Title", "Pub_Name", "DOI"])
    df = df.rename(columns={"Pub_Name": "Pub_name"})

    # Article URL per DOI, authenticated with the configured credentials.
    # NOTE(review): this stores the API key and institution token inside the
    # saved file -- confirm that is acceptable before sharing the output.
    string1 = "https://api.elsevier.com/content/article/doi/"
    string2 = f"?apiKey={API_KEY}&insttoken={INST_TOKEN}"
    df["Link"] = df["DOI"].apply(lambda x: string1 + str(x) + string2)

    # gzip compression matches the ".gzip" file extension
    df.to_parquet(os.path.join(output_dir, f"recycle_{word}_links.gzip"),
                  compression="gzip")

# Combining queries into one dataframe

In [None]:
# query_files = names of the entries located directly inside QUERY_PATH
# (os.listdir is not recursive, so files inside sub-folders are not counted).
# The bare len(...) expression is left as the last line so its value can be
# inspected as the cell result.
query_files = os.listdir(QUERY_PATH)
len(query_files)

In [None]:
# Merge the per-query Parquet files into a single de-duplicated file.

# Output file for the merged query results, stored in the query root folder
ALL_QUERIES_PATH = os.path.join(QUERY_PATH, "All_Queries.gzip")

# Paths that must not be read back in when this cell is re-run: the merged
# output itself and the abstract files, which are also written below QUERY_PATH.
merged_output = os.path.normcase(os.path.abspath(ALL_QUERIES_PATH))
abstracts_root = os.path.normcase(
    os.path.abspath(os.path.join(QUERY_PATH, "Abstracts"))) + os.sep

# Find all .gzip files recursively in subdirectories
gzip_files = []
for candidate in glob.glob(os.path.join(QUERY_PATH, "**/*.gzip"), recursive = True):
    resolved = os.path.normcase(os.path.abspath(candidate))
    if resolved == merged_output or resolved.startswith(abstracts_root):
        continue
    gzip_files.append(candidate)

if not gzip_files:
    raise ValueError(f"No query .gzip files found below {QUERY_PATH}")

# Read and concatenate all DataFrames
df_list = [pd.read_parquet(file) for file in gzip_files]
combined_df = pd.concat(df_list, ignore_index = True)

# Before saving, clean up the dataframe
combined_df = combined_df.dropna(subset = ["Title", "DOI"])  # Rows without a title or DOI
combined_df = combined_df.drop_duplicates(subset = ["DOI", "Title"], keep = "first")  # Repeated DOI/Title pairs

# Save the combined DataFrame as a single gzip file
combined_df.to_parquet(ALL_QUERIES_PATH, compression = "gzip", engine = "pyarrow")

print(f"Combined {len(gzip_files)} gzip files from subdirectories into {ALL_QUERIES_PATH}")

# Abstract Extraction

### Either use the code below or use batched_abstracts.py with the sbatch file submit_abstracts.sh for faster processing. 

In [None]:
# Load the merged query results and prepare the DOI lookup structures used by
# the abstract extraction cell.
df = pd.read_parquet(ALL_QUERIES_PATH)

# DOI -> plastic category lookup.
# NOTE(review): none of the query cells in this notebook writes a
# "Plastic_Category" column; when it is absent the lookup stays empty and the
# extraction cell falls back to its "Unknown" default. Confirm where this
# column is supposed to come from.
if "Plastic_Category" in df.columns:
    doi_to_category = df.set_index("DOI")["Plastic_Category"].to_dict()
else:
    doi_to_category = {}

doi_series = df["DOI"].reset_index(drop = True)  # Sequential index
max_index = len(doi_series) - 1  # Maximum valid index

In [None]:
%%time
# Lists to store extracted data
list_abstract = []
list_doi = []
list_title = []
list_date = []
list_journal = []
list_category = []  # New list for plastic category
# Run in small batches in case of error, e.g. 0-1000 at a time.
start = 0 
end = 5
for i in tqdm(range(start, end + 1)):
    try:
        current_doi = doi_series[i]
        doi_doc = FullDoc(doi=current_doi)
        if doi_doc.read(client):
            abstract = doi_doc._data["coredata"]["dc:description"]
            title = doi_doc.title
            date = doi_doc._data["coredata"]["prism:coverDisplayDate"]
            journal = doi_doc._data["coredata"]["prism:publicationName"]
            
            list_abstract.append(abstract)
            list_doi.append(current_doi)
            list_title.append(title)
            list_date.append(date)
            list_journal.append(journal)
            
            plastic_category = doi_to_category.get(current_doi, "Unknown")
            list_category.append(plastic_category)

        else:
            print(f"Operation failed for DOI: {current_doi}")
            # Append placeholders
            list_abstract.append("None")
            list_doi.append(current_doi)
            list_title.append("None")
            list_date.append("None")
            list_journal.append("None")
            list_category.append("None")

    except Exception as e:
        # Handle cases where i might be invalid (though adjusted)
        print(f"Error at index {i}: {e}")
        list_abstract.append("None")
        list_doi.append(doi_series[i] if i <= max_index else "Invalid DOI")
        list_title.append("None")
        list_date.append("None")
        list_journal.append("None")
        list_category.append("None")

# Create and save the extracted data
df_abstracts = pd.DataFrame({
    "DOI": list_doi,
    "Title": list_title,
    "Abstract": list_abstract,
    "Date": list_date,
    "Journal": list_journal,
    "Plastic Category": list_category})

df_abstracts = df_abstracts.astype({
    "DOI": "str",
    "Title": "str",
    "Abstract": "str",
    "Date": "str",
    "Journal": "str",
    "Plastic Category": "str"
})

# Save abstracts
ABSTRACT_DIR = f"{QUERY_PATH}/Abstracts"
os.makedirs(ABSTRACT_DIR, exist_ok = True)
ABSTRACT_PATH = f"{ABSTRACT_DIR}/Abstracts_{start}_{end}.gzip"
df_abstracts.to_parquet(ABSTRACT_PATH, compression = "gzip", engine = "pyarrow")
print(f"Saved extracted abstracts from {start} to {end} in {ABSTRACT_PATH}")

In [None]:
# Merge all abstract batch files into one Parquet file.
all_abstracts_filename = "All_Abstracts.gzip"
output_path = os.path.join(ABSTRACT_DIR, all_abstracts_filename)

# Get all batch .gzip files in the directory, sorted for a stable order.
# Files whose name starts with "All_Abstracts" are earlier merged outputs and
# are skipped so their rows are not merged a second time.
input_files = sorted(
    path
    for path in glob.glob(os.path.join(ABSTRACT_DIR, "*.gzip"))
    if not os.path.basename(path).startswith("All_Abstracts")
)

if not input_files:
    raise ValueError(f"No abstract batch files found in {ABSTRACT_DIR}")

# Parquet files cannot be joined by appending their raw bytes (the result is
# not a readable Parquet file), so each batch is read as a table and the
# tables are concatenated instead.
merged_abstracts = pd.concat(
    [pd.read_parquet(path) for path in input_files], ignore_index = True)
merged_abstracts.to_parquet(output_path, compression = "gzip")

print(f"Successfully concatenated {len(input_files)} files into {output_path}")

In [None]:
# Merge all readable abstract batch files into one Parquet file, skipping
# (and reporting) any file that cannot be read.
ABSTRACT_DIR = f"{QUERY_PATH}/Abstracts"
os.makedirs(ABSTRACT_DIR, exist_ok = True)

# os.path.join keeps the path valid on every platform and avoids backslash
# escape sequences inside string literals
all_abstracts_path = os.path.join(ABSTRACT_DIR, "All_Abstracts2.parquet.gzip")

# Get all .gzip files (assuming they are Parquet files). Files whose name
# starts with "All_Abstracts" are merged outputs and are excluded so their
# rows are not merged a second time.
files = [
    f
    for f in glob.glob(os.path.join(ABSTRACT_DIR, "*.gzip"))
    if not os.path.basename(f).startswith("All_Abstracts")
]

# Read and concatenate Parquet files
dfs = []
for f in files:
    try:
        df_part = pd.read_parquet(f)  # Read as Parquet
        dfs.append(df_part)
        print(f"Successfully read: {f}")
    except Exception as e:
        # Deliberately broad: an unreadable file is reported and skipped
        print(f"FAILED TO READ {f}: {str(e)}")

if not dfs:
    raise ValueError("No valid files to concatenate.")

df = pd.concat(dfs, ignore_index = True)
df.to_parquet(all_abstracts_path, compression = "gzip")
print(f"Merged {len(dfs)} files into {all_abstracts_path}")