In [None]:
!pip install pandas pyarrow tqdm nltk sentence-transformers huggingface_hub pyyaml

In [None]:
# Import necessary libraries
import os
import requests
from tqdm import tqdm
from xml.etree import ElementTree as ET
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import numpy as np
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
from huggingface_hub import HfApi, create_repo
import yaml
import nltk

# Download the NLTK data needed by sent_tokenize.
# Recent NLTK releases load the sentence tokenizer from the 'punkt_tab' resource,
# while older releases use the original 'punkt' resource. Requesting both keeps the
# notebook working with whichever NLTK version the unpinned install provides
# (a "not found" message for one of the two is harmless).
for nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

In [None]:
# Read the PMIDs from the file (one PMID per line)
pmid_file_path = './pmids.txt'
with open(pmid_file_path, 'r') as f:
    # Strip surrounding whitespace and drop blank lines so that a trailing newline
    # or stray spaces in the file never turn into an empty or malformed PMID request.
    pmids = [line.strip() for line in f if line.strip()]

In [None]:
# Function to fetch a PubMed article in XML format by its PMID
def fetch_pubmed_article(pmid, timeout=30):
    """Download one PubMed record as XML from the NCBI E-utilities efetch endpoint.

    Args:
        pmid: PubMed identifier of the article to fetch.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the whole download loop.

    Returns:
        The raw response body (bytes) when the server answers with HTTP 200;
        None for any other status code, a timeout, or a connection error.
    """
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    # Let requests build (and URL-encode) the query string.
    params = {"db": "pubmed", "id": pmid, "retmode": "xml"}
    try:
        response = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException:
        # Network-level failures are reported the same way as a non-200 answer,
        # so callers only need to handle the None case.
        return None
    if response.status_code == 200:
        return response.content
    return None

In [None]:
# Load the pretrained sentence-embedding model that encodes every sentence below
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [None]:
# Fetch articles and process them
def _extract_abstract(article_xml):
    """Return the abstract of a PubMed XML record as a single string ('' if it has none)."""
    root = ET.fromstring(article_xml)
    # itertext() also collects text inside nested markup (<i>, <sub>, ...) and never
    # yields None, whereas `.text` is None or truncated for such sections.
    sections = ("".join(node.itertext()).strip() for node in root.findall(".//AbstractText"))
    return " ".join(section for section in sections if section)


pmid_list = []
sentence_list = []
embedding_list = []
skipped_pmids = []  # PMIDs whose download failed or whose XML could not be parsed

# NOTE(review): requests are issued back-to-back with no throttling; confirm this
# stays within NCBI's E-utilities rate limits for the size of the PMID list.
for pmid in tqdm(pmids, desc="Downloading PubMed Articles"):
    article_xml = fetch_pubmed_article(pmid)
    if not article_xml:
        skipped_pmids.append(pmid)
        continue

    # Parse the XML to extract the abstract; one malformed response must not
    # abort the whole run.
    try:
        abstract = _extract_abstract(article_xml)
    except ET.ParseError:
        skipped_pmids.append(pmid)
        continue

    # Tokenize the abstract into sentences
    sentences = sent_tokenize(abstract)
    if not sentences:
        # The record has no abstract text, so there is nothing to embed.
        continue

    # Generate embeddings for each sentence
    embeddings = model.encode(sentences)

    # Store the PMIDs, sentences, and embeddings (one row per sentence)
    pmid_list.extend([pmid] * len(sentences))
    sentence_list.extend(sentences)
    embedding_list.extend(embeddings)

if skipped_pmids:
    print(f"Skipped {len(skipped_pmids)} PMIDs that could not be downloaded or parsed")

# Create a DataFrame with PMIDs, sentences, and embeddings
data = {'PMID': pmid_list, 'sentence': sentence_list, 'embedding': embedding_list}
df = pd.DataFrame(data)

# Save the DataFrame to Parquet format
parquet_file_path = './data/sentence_embeddings.parquet'
# Create the output directory first so the write cannot fail after all the downloading.
os.makedirs(os.path.dirname(parquet_file_path), exist_ok=True)
df.to_parquet(parquet_file_path, engine='pyarrow', index=False)

print(f"Embeddings saved to {parquet_file_path}")

In [None]:
# Generate metadata in venomx format
metadata = {
    'id': 'https://w3id.org/biolink/biolinkml/venomx',
    'name': 'PubMed Sentences',
    'title': 'PubMed Sentence Embeddings',
    'description': 'Embeddings for sentences extracted from a set of PubMed abstracts of interest. These are not embeddings of all of Pubmed. There is a separate embedding for each sentence in each abstract.',
    'license': 'https://opensource.org/licenses/BSD-3-Clause',
    'prefixes': {
        'venomx': 'https://w3id.org/biolink/biolinkml/venomx/',
        'linkml': 'https://w3id.org/linkml/'
    },
}

# Save the metadata to a YAML file
metadata_file_path = './data/metadata.yaml'
# Make sure the output directory exists so this cell also works when run on its own.
os.makedirs(os.path.dirname(metadata_file_path), exist_ok=True)
with open(metadata_file_path, 'w') as f:
    yaml.dump(metadata, f)

print(f"Metadata saved to {metadata_file_path}")

In [None]:
# Upload to Hugging Face
repo_id = "biomedical-translator/pubmed_sentence_embeddings"
this_notebook = "pubmed_sentence_embeddings.ipynb"
# exist_ok=True keeps this cell re-runnable: without it create_repo raises an error
# as soon as the dataset repository already exists.
create_repo(repo_id, repo_type="dataset", exist_ok=True)

api = HfApi()
# The data file, its metadata, and this notebook are published together.
files_to_upload = [parquet_file_path, metadata_file_path, this_notebook]

for file in files_to_upload:
    # Each file keeps its local relative path as its path inside the repository.
    api.upload_file(
        path_or_fileobj=file,
        path_in_repo=file,
        repo_id=repo_id,
        repo_type="dataset"
    )

print(f"Files uploaded to Hugging Face in repository: {repo_id}")
