In [27]:
!pip install pandas pyarrow tqdm nltk sentence-transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [28]:
# Import necessary libraries
import os
import requests
from tqdm import tqdm
from xml.etree import ElementTree as ET
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import numpy as np
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import nltk
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer

# Download the NLTK sentence-tokenizer data used by sent_tokenize.
# 'punkt' serves older NLTK releases; newer releases load 'punkt_tab' instead,
# so both are requested (an unknown package id is reported by NLTK, not raised).
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /Users/jtr4v/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [29]:
# Read the PMIDs from the file, one identifier per line.
# Surrounding whitespace is stripped and blank lines are skipped so that no
# empty or padded identifier ends up in `pmids`.
pmid_file_path = './pmids.txt'
with open(pmid_file_path, 'r') as f:
    pmids = [line.strip() for line in f if line.strip()]

In [30]:
def fetch_pubmed_article(pmid, timeout=30):
    """Fetch one PubMed record in XML format through the NCBI E-utilities efetch endpoint.

    Args:
        pmid: PubMed identifier, sent as the ``id`` query parameter.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook indefinitely.

    Returns:
        The raw response body (bytes) when the server answers with HTTP 200;
        ``None`` for any other status code or when the request itself fails
        (connection error, timeout, ...).
    """
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    # Passing the query as `params` lets requests URL-encode the values.
    params = {"db": "pubmed", "id": pmid, "retmode": "xml"}
    try:
        response = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException:
        # Network-level failure: report it the same way as a non-200 answer.
        return None
    if response.status_code == 200:
        return response.content
    return None

In [31]:
# Load the pretrained sentence-embedding model that encodes the abstract sentences
model = SentenceTransformer(model_name_or_path='all-MiniLM-L6-v2')

In [32]:
# Fetch articles, split each abstract into sentences, embed every sentence,
# and persist the (PMID, sentence, embedding) rows as a Parquet file.
pmid_list = []
sentence_list = []
embedding_list = []

# NOTE(review): only the first two PMIDs are processed (`pmids[0:2]`); this looks
# like a development limit — widen the slice for a full run.
for pmid in tqdm(pmids[0:2], desc="Downloading PubMed Articles"):
    article_xml = fetch_pubmed_article(pmid)
    if not article_xml:
        # Nothing was returned for this PMID; move on to the next one.
        continue

    # Parse the XML and collect the text of every <AbstractText> element.
    # itertext() is used instead of .text because .text is None for an empty
    # element and stops at the first child tag, which would
    # either crash the join or silently truncate the abstract.
    root = ET.fromstring(article_xml)
    abstract_parts = [
        "".join(abstract_node.itertext()).strip()
        for abstract_node in root.findall(".//AbstractText")
    ]
    abstract = " ".join(part for part in abstract_parts if part)

    # Tokenize the abstract into sentences
    sentences = sent_tokenize(abstract)
    if not sentences:
        # Record without an abstract: there is nothing to embed.
        continue

    # Generate one embedding per sentence
    embeddings = model.encode(sentences)

    # Store the PMID (repeated per sentence), the sentences, and their embeddings
    pmid_list.extend([pmid] * len(sentences))
    sentence_list.extend(sentences)
    embedding_list.extend(embeddings)

# Create a DataFrame with PMIDs, sentences, and embeddings
data = {'PMID': pmid_list, 'sentence': sentence_list, 'embedding': embedding_list}
df = pd.DataFrame(data)

# Save the DataFrame to Parquet format; create the output directory first so the
# write does not fail on a fresh checkout where ./data does not exist yet.
parquet_file_path = './data/sentence_embeddings.parquet'
os.makedirs(os.path.dirname(parquet_file_path), exist_ok=True)
df.to_parquet(parquet_file_path, engine='pyarrow', index=False)

print(f"Embeddings saved to {parquet_file_path}")

Downloading PubMed Articles: 100%|██████████| 2/2 [00:00<00:00,  3.94it/s]

Embeddings saved to ./data/sentence_embeddings.parquet



