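Download the full PubMed 2024 baseline (the `.xml.gz` files) from https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/ and place the files in a `pubmed_2024/` folder inside the notebook's working directory.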

In [1]:
import pandas as pd
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
import warnings
from tqdm.autonotebook import tqdm, trange
import nltk
import time
import xml.etree.ElementTree as ET
# Fetch the NLTK "punkt" data package used for sentence splitting
# (sent_tokenize is imported above and called when abstracts are processed).
# NOTE(review): newer NLTK releases may expect the "punkt_tab" package instead —
# confirm against the installed NLTK version.
nltk.download('punkt')
import gzip
import torch
import os

  from tqdm.autonotebook import tqdm, trange
[nltk_data] Downloading package punkt to /home/kaiwenhe/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Load the 'all-MiniLM-L6-v2' sentence-embedding model by name.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Use the GPU when torch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

# Place the encoder on the selected device before any encoding happens.
model = model.to(device)

cuda


In [3]:
# Convert every PubMed baseline archive in `folder` into a parquet file of
# sentence embeddings: one row per sentence with its PMID, text and vector.
folder = 'pubmed_2024'
output_folder = './data'
xml_processed = 0


def _element_text(element):
    """Return all text inside ``element``, including text nested in child tags.

    ``Element.findtext`` / ``Element.text`` stop at the first child element, so
    a title or abstract containing nested inline tags (e.g. ``<i>``, ``<sub>``)
    would be truncated; ``itertext`` walks the whole subtree instead.

    Returns an empty string when ``element`` is ``None``.
    """
    if element is None:
        return ''
    return ''.join(element.itertext()).strip()


def _article_sentences(article):
    """Return ``(pmid, sentences)`` for one ``PubmedArticle`` element.

    ``sentences`` holds the sentence-tokenized abstract followed by the title
    (when present). It is empty when the article has neither.
    """
    pmid = article.findtext(".//PMID")
    title = _element_text(article.find(".//ArticleTitle"))

    # Structured abstracts are split over several AbstractText elements
    # (one per section); collect them all rather than only the first one.
    sections = article.findall(".//Abstract/AbstractText")
    if not sections:
        # Fall back to the first AbstractText anywhere in the article, which is
        # what a plain findtext(".//AbstractText") lookup would have selected.
        first_section = article.find(".//AbstractText")
        sections = [] if first_section is None else [first_section]
    abstract = ' '.join(text for text in map(_element_text, sections) if text)

    sentences = sent_tokenize(abstract) if abstract else []
    if title:
        sentences.append(title)
    return pmid, sentences


# Create the output directory up front so to_parquet cannot fail on a missing path.
os.makedirs(output_folder, exist_ok=True)

# Sort the listing so files are processed in a reproducible order.
for file in sorted(os.listdir(folder)):
    if not file.endswith('.xml.gz'):
        continue

    file_name = file.removesuffix('.xml.gz')
    try:
        start_time = time.time()
        file_path = f'{folder}/{file}'
        print(f'Processing file: {file_name}')

        # The archive only needs to stay open while the XML is being parsed.
        with gzip.open(file_path, 'rt', encoding='utf-8') as gz_file:
            tree = ET.parse(gz_file)
        root = tree.getroot()
        articles = root.findall(".//PubmedArticle")

        print(f'There are {len(articles)} articles.')

        count = 0
        pmid_list = []
        sentence_list = []
        embedding_list = []

        for article in articles:
            pmid, sentences = _article_sentences(article)
            if not sentences:
                warnings.warn(f"PMID {pmid} has no abstract or title")
                continue

            embeddings = model.encode(sentences, device=device)
            # Repeat the PMID once per sentence so the three lists stay aligned.
            pmid_list.extend([pmid] * len(sentences))
            sentence_list.extend(sentences)
            embedding_list.extend(embeddings)

            # `count` tracks articles that were actually embedded (skipped ones
            # are not counted), so the final milestone may not be printed.
            count += 1
            if count % 10000 == 0:
                print(f"in progress {count}/{len(articles)}")

        end_time = time.time()
        runtime = end_time - start_time
        print(f"The procedure took {runtime} seconds to run.")

        # Create a DataFrame with PMIDs, sentences, and embeddings
        data = {'PMID': pmid_list, 'sentence': sentence_list, 'embedding': embedding_list}
        df = pd.DataFrame(data)

        # Save the DataFrame to Parquet format
        parquet_file_path = f'{output_folder}/{file_name}.parquet'
        df.to_parquet(parquet_file_path, engine='pyarrow', index=False)
        xml_processed += 1
        print(f"Embeddings saved to {parquet_file_path}. {xml_processed} files were processed.")

    except Exception as e:
        # Best effort: report which archive failed and continue with the next one.
        print(f"An error occurred while processing {file}: {e}")
      
      

Processing file: pubmed24n0019
There are 30000 articles.
in progress 10000/30000
in progress 20000/30000
in progress 30000/30000
The procedure took 156.3284740447998 seconds to run.
Embeddings saved to ./data/pubmed24n0019.parquet. 1 files were processed.
