In [10]:
import nltk
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
import numpy as np
import os
import gzip
import xml.etree.ElementTree as ET
from tqdm import tqdm
import json
import ftplib    

In [11]:
# Download necessary NLTK data
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/jtr4v/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [12]:
# Define function to extract abstracts and PMIDs
def extract_abstracts_and_pmids(xml_file):
    abstracts_pmids = []
    
    with gzip.open(xml_file, 'rb') as f:
        tree = ET.parse(f)
        root = tree.getroot()
        
        for article in root.findall('.//PubmedArticle'):
            pmid = article.find('.//PMID').text
            abstract = article.find('.//Abstract/AbstractText')
            if pmid and abstract is not None and abstract.text:
                abstracts_pmids.append((pmid, abstract.text.strip()))
    
    return abstracts_pmids

In [13]:
# Download PubMed baseline data
def download_pubmed_baseline(num_files=1):
    ftp = ftplib.FTP("ftp.ncbi.nlm.nih.gov")
    ftp.login()
    ftp.cwd("/pubmed/baseline/")
    
    files = ftp.nlst()
    xml_files = [f for f in files if f.endswith(".xml.gz")][:num_files]
    
    os.makedirs("pubmed_data", exist_ok=True)
    
    for file in tqdm(xml_files, desc="Downloading files"):
        local_filename = os.path.join("pubmed_data", file)
        with open(local_filename, "wb") as f:
            ftp.retrbinary(f"RETR {file}", f.write)
    
    ftp.quit()

# Download 1 file for demonstration purposes
download_pubmed_baseline(num_files=1)

Downloading files: 100%|██████████| 1/1 [00:02<00:00,  2.27s/it]


In [14]:
# Extract all abstracts and PMIDs
all_abstracts_pmids = []
for file in os.listdir("pubmed_data"):
    if file.endswith(".xml.gz"):
        file_path = os.path.join("pubmed_data", file)
        all_abstracts_pmids.extend(extract_abstracts_and_pmids(file_path))

print(f"Total abstracts extracted: {len(all_abstracts_pmids)}")

Total abstracts extracted: 13415


In [15]:
# Initialize the SentenceTransformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

In [16]:
# Tokenize abstracts into sentences and create embeddings
embeddings_with_pmids = []
for pmid, abstract in tqdm(all_abstracts_pmids, desc="Embedding sentences"):
    sentences = sent_tokenize(abstract)
    sentence_embeddings = model.encode(sentences)
    for sentence, embedding in zip(sentences, sentence_embeddings):
        embeddings_with_pmids.append({'pmid': pmid, 'sentence': sentence, 'embedding': embedding.tolist()})    

Embedding sentences:   8%|▊         | 1124/13415 [02:42<29:42,  6.90it/s]  


KeyboardInterrupt: 

In [None]:
# Save the sentence embeddings with PMIDs to a JSON file
with open("pubmed_sentence_embeddings_with_pmids.json", 'w') as f:
    json.dump(embeddings_with_pmids, f)

print("Sentence embeddings with PMIDs saved to 'pubmed_sentence_embeddings_with_pmids.json'")

In [None]:
# Import necessary libraries
import os
import requests
from tqdm import tqdm
from xml.etree import ElementTree as ET

# Read the PMIDs from the file
pmid_file_path = './pmids.txt'
with open(pmid_file_path, 'r') as f:
    pmids = f.read().splitlines()


In [None]:
# Function to fetch a PubMed article in XML format by its PMID
def fetch_pubmed_article(pmid):
    url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id={pmid}&retmode=xml"
    response = requests.get(url)
    if response.status_code == 200:
        return response.content
    else:
        return None


In [None]:
# Fetch articles and save to a list
articles = []
for pmid in tqdm(pmids, desc="Downloading PubMed Articles"):
    article_xml = fetch_pubmed_article(pmid)
    if article_xml:
        articles.append(article_xml)

# Save the fetched articles to a file (optional)
output_dir = './pubmed_articles'
os.makedirs(output_dir, exist_ok=True)
for pmid, article_xml in zip(pmids, articles):
    with open(os.path.join(output_dir, f'{pmid}.xml'), 'wb') as f:
        f.write(article_xml)

# Confirm the number of downloaded articles
print(f"Downloaded {len(articles)} articles.")

# You can add more processing steps below as needed