**Install Dependencies**

In [None]:
!pip install transformers==4.22
!pip install sacremoses
!pip install biopython
!pip install sentence-transformers
!pip install openai

**BERTMeSH**

In [None]:
from transformers import AutoTokenizer, AutoModel

# Load the tokenizer and model published under the identifier
# "osanseviero/test_model_bertmesh"; both are used later to predict MeSH
# labels from article titles.
tokenizer_BERTMeSH = AutoTokenizer.from_pretrained("osanseviero/test_model_bertmesh")
# NOTE(review): trust_remote_code=True presumably allows custom model code
# shipped with the checkpoint to be executed locally -- confirm the source
# repository is trusted before running this cell.
model_BERTMeSH = AutoModel.from_pretrained("osanseviero/test_model_bertmesh", trust_remote_code=True)

**Get Documents from Pubmed**

In [None]:
from Bio import Entrez

def search_pubmed(query, num_results=5):
    """Search PubMed for ``query`` and return the matching PubMed IDs.

    Args:
        query: Search term forwarded to the Entrez ``esearch`` call.
        num_results: Maximum number of IDs to request (sent as ``retmax``).

    Returns:
        The ``IdList`` entry of the parsed esearch response.

    Raises:
        Whatever ``Entrez.esearch`` / ``Entrez.read`` raise; errors are not
        swallowed here.
    """
    Entrez.email = "leandra.budau@torontomu.ca"

    # Search query in Pubmed database
    handle = Entrez.esearch(db="pubmed", term=query, retmax=num_results)
    try:
        record = Entrez.read(handle)
    finally:
        # Always release the response, even when parsing fails.
        handle.close()

    # Retrieve the list of PubMed IDs (PMID)
    return record["IdList"]

**Get Title from Article based on PMID**

In [None]:
def pubmed_data(pmid):
    """Fetch the title of the PubMed article identified by ``pmid``.

    Args:
        pmid: PubMed ID passed to the Entrez ``efetch`` call.

    Returns:
        * the article title when the record contains one;
        * ``'Title not available'`` when the first article has no
          ``ArticleTitle`` entry;
        * ``''`` when the response holds no article or cannot be interpreted;
        * ``None`` when fetching or parsing the response raises.
    """
    Entrez.email = "leandra.budau@torontomu.ca"

    # Search for document in Pubmed Database
    handle = None
    try:
        handle = Entrez.efetch(db="pubmed", id=pmid, retmode="xml")
        record = Entrez.read(handle)
    except Exception as e:
        print(f"Error fetching data: {e}")
        return None
    finally:
        # Release the response whether or not parsing succeeded.
        if handle is not None:
            handle.close()

    # Get title from pubmed article. The default guarantees `title` is bound
    # even when the lookup below fails.
    title = ''
    try:
        articles = record['PubmedArticle']

        # Check if there is at least one article
        if articles:
            medline_citation = articles[0].get('MedlineCitation', {})
            article = medline_citation.get('Article', {})
            title = article.get('ArticleTitle', 'Title not available')

    except (KeyError, IndexError) as e:
        print(f"Error: {e}")
        print("Could not retrieve article information.")

    return title

**Sending Prompts to ChatGPT**

In [None]:
import os

from openai import OpenAI

# The API key is read from the OPENAI_API_KEY environment variable so that no
# secret is stored in the notebook. Any key that was ever committed to a
# notebook should be treated as leaked and revoked.
# A missing variable raises KeyError here, before any request is attempted.
client = OpenAI(
    api_key=os.environ["OPENAI_API_KEY"],
)

def get_completion(prompt, model="gpt-3.5-turbo"):
  """Send ``prompt`` as a single user message and return the reply text.

  Args:
    prompt: Text placed in the ``content`` of the one user message.
    model: Name of the chat model to query.

  Returns:
    The ``content`` of the first choice in the response.
  """
  chat_response = client.chat.completions.create(
    model=model,
    messages=[{"role": "user", "content": prompt}],
    temperature=0,
  )
  first_choice = chat_response.choices[0]
  return first_choice.message.content

**Splitting Queries by Operators**

In [None]:
import re

def split_query_to_frags(query):
  """Split ``query`` on the whole-word operator AND (any letter case).

  Returns:
    A list with the whitespace-trimmed pieces found between the operators.
  """
  and_operator = re.compile(r'\bAND\b', flags=re.IGNORECASE)
  return [piece.strip() for piece in and_operator.split(query)]

def split_frags_to_terms(fragments):
  """Break each query fragment into its OR-separated atomic terms.

  Parentheses and double quotes are removed from every fragment before it is
  split on the whole-word operator OR (any letter case).

  Args:
    fragments: Iterable of fragment strings. It is not modified.

  Returns:
    A list with one inner list per fragment, holding that fragment's
    whitespace-trimmed terms.
  """
  terms = []

  for fragment in fragments:
    # Work on a cleaned copy so the caller's list keeps its original strings.
    cleaned = fragment.replace('(', '').replace(')', '').replace('"', '')
    pieces = re.split(r'\bOR\b', cleaned, flags=re.IGNORECASE)
    terms.append([piece.strip() for piece in pieces])

  return terms

**Get MeSH Term Definitions**

In [None]:
import urllib.request
import json
import requests
from bs4 import BeautifulSoup

def get_mesh_id(mesh_term):
  """Look up ``mesh_term`` in the NCBI MeSH database and return the top ID.

  Args:
    mesh_term: Term to search for. It may be raw text or already
      percent-encoded (e.g. spaces given as ``%20``).

  Returns:
    The first ID in the relevance-sorted result list, or ``''`` when the term
    is empty, nothing matches, or the response lacks an ID list.
  """
  from urllib.parse import quote, unquote, urlencode

  if not mesh_term:
    return ''

  # Decode before encoding so pre-escaped input is not escaped twice.
  params = urlencode(
      {
          'db': 'mesh',
          'term': unquote(mesh_term),
          'retmode': 'json',
          'sort': 'relevance',
          'retmax': 5,
      },
      quote_via=quote,
  )
  search_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?' + params

  # The context manager closes the HTTP response once it has been read.
  with urllib.request.urlopen(search_url) as response:
    summary = json.loads(response.read().decode('utf-8'))

  id_list = summary.get('esearchresult', {}).get('idlist')
  return id_list[0] if id_list else ''

def get_mesh_term_description(mesh_term):
    """Return the scope note shown on the NCBI MeSH page for ``mesh_term``.

    The term is first resolved to a MeSH ID with ``get_mesh_id``; the page for
    that ID is then downloaded and the text of the element carrying the CSS
    class ``mesh_ds_scope_note`` is extracted.

    Returns:
        * the stripped scope-note text when found;
        * ``"No definition found for the given MeSH term ID"`` when the term
          has no ID or the page has no scope-note element;
        * ``"Error: <status>"`` when the page request does not return 200.
    """
    no_definition = "No definition found for the given MeSH term ID"

    mesh_id = get_mesh_id(mesh_term)
    if not mesh_id:
        # Without an ID the URL below would point at the generic MeSH page,
        # so there is nothing meaningful to download.
        return no_definition

    url = f"https://www.ncbi.nlm.nih.gov/mesh/{mesh_id}"
    response = requests.get(url)

    if response.status_code != 200:
        return f"Error: {response.status_code}"

    soup = BeautifulSoup(response.text, 'html.parser')
    definition_element = soup.find(class_='mesh_ds_scope_note')
    if definition_element:
        return definition_element.text.strip()
    return no_definition


**Semantic Similarity**

In [None]:
from sentence_transformers import SentenceTransformer, util
# Sentence-embedding model named 'all-MiniLM-L6-v2'; semantic_similarity()
# uses its encode() method to embed the query term and the MeSH definitions.
model = SentenceTransformer('all-MiniLM-L6-v2')

def semantic_similarity(sentence1, definitions):
  """Return the index of the entry in ``definitions`` closest to ``sentence1``.

  Each definition is encoded with the module-level ``model`` and compared to
  the encoding of ``sentence1`` via ``util.cos_sim``.

  Returns:
    Index of the highest-scoring definition. Index 0 is returned when
    ``definitions`` is empty or no score is greater than 0.
  """
  query_embedding = model.encode(sentence1, convert_to_tensor=True)
  winner_index = 0
  winner_score = 0

  for position, definition in enumerate(definitions):
    candidate_embedding = model.encode(definition, convert_to_tensor=True)
    score = util.cos_sim(query_embedding, candidate_embedding).item()
    # Strict comparison: on ties the earliest entry stays selected.
    if score > winner_score:
      winner_score = score
      winner_index = position

  return winner_index

**Read from Test Files**

In [None]:
import os

def preprocess(folder_path='/content/titles/'):
  """Read the topic number and title from every file in ``folder_path``.

  Each file is expected to hold the topic on its first line (prefixed with
  ``"Topic: "``) and the title on its third line (prefixed with
  ``"Title: "``); the second line is skipped.

  Args:
    folder_path: Folder containing the topic files.

  Returns:
    ``(topic_numbers, titles)``: two parallel lists ordered by file name.
    Both lists are empty when the folder does not exist. If another error
    occurs, it is printed and the entries read so far are returned.
  """
  topic_numbers = []
  titles = []

  try:
      # os.listdir gives no ordering guarantee; sort for reproducible runs.
      for filename in sorted(os.listdir(folder_path)):
          file_path = os.path.join(folder_path, filename)

          # Skip sub-folders (e.g. checkpoint folders) instead of failing on them.
          if not os.path.isfile(file_path):
              continue

          with open(file_path, 'r', encoding='utf-8') as file:
              topic_number = file.readline().strip().replace("Topic: ", "")
              file.readline()  # line between topic and title is ignored
              title = file.readline().strip().replace("Title: ", "")

          topic_numbers.append(topic_number)
          titles.append(title)

  except FileNotFoundError:
      print(f"Folder not found: {folder_path}")
  except Exception as e:
      print(f"An error occurred: {e}")

  return topic_numbers, titles

**Export Results**

In [None]:
def postprocess(topic_number, query, results_dir="/content/results/"):
  """Run ``query`` against PubMed and save the returned IDs to a text file.

  The IDs are written one per line to ``<results_dir>/<topic_number>.txt``.
  The encoded query and the ID list are also printed.

  Args:
    topic_number: Identifier used as the output file name.
    query: PubMed query string in plain (unescaped) form.
    results_dir: Folder receiving the result file; created when missing.
  """
  from urllib.parse import quote

  path = os.path.join(results_dir, str(topic_number) + ".txt")
  # Percent-encode every reserved character (quotes, brackets, '&', '#', ...)
  # so the query cannot break or truncate the request URL.
  escaped_query = quote(query, safe='')
  print (escaped_query)
  search_url = ('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
                '?db=pubmed&retmode=json&sort=relevance&term=' + escaped_query +
                '&retmax=10000')

  # The context manager closes the HTTP response once it has been read.
  with urllib.request.urlopen(search_url) as response:
    summary = json.loads(response.read().decode('utf-8'))
  pids = summary['esearchresult']['idlist']
  print (pids)

  os.makedirs(results_dir, exist_ok=True)
  with open(path, 'w') as file:
    for item in pids:
        file.write("%s\n" % item)

**Put Everything Together**

In [None]:
# Scratch lists reused for every atomic term; both are emptied after each term.
mesh_terms = []
mesh_definitions = []

topic_numbers, title = preprocess()

for n in range(len(title)):
  # Getting query from ChatGPT based on SLR title
  prompt = "Based on the following SLR title, please provide 5 complex pubmed Entrez formatted query without descriptions, in plain text, such that they may be used directly on Pubmed's website. Please do not include any MeSH terms: " + title[n]
  response = get_completion(prompt)

  # Only the first non-blank line of the reply is used as the query; an empty
  # reply would otherwise fail on the indexing below.
  response_lines = [line for line in (response or "").splitlines() if line.strip()]
  if not response_lines:
    print(title[n])
    print("No query returned by the model; skipping this topic.")
    continue
  response = response_lines[0]
  if response.startswith('1. '):
      response = response[3:]

  # Splitting query into fragments
  query_fragments = split_query_to_frags(response)

  # Splitting fragments into atomic terms
  query_terms = split_frags_to_terms(query_fragments)

  # Getting MeSH terms from atomic term
  for l in range (len(query_terms)):
    for k in range (len(query_terms[l])):
      pmids = (search_pubmed(query_terms[l][k]))

      for i in range(len(pmids)):
          result = pubmed_data(int(pmids[i]))
          # pubmed_data yields None/'' when no title could be retrieved;
          # there is nothing to label in that case.
          if not result:
            continue
          inputs = tokenizer_BERTMeSH([result], padding="max_length")
          labels = model_BERTMeSH(**inputs, return_labels=True)
          for j in range (len(labels[0])):
            if labels[0][j] not in mesh_terms:
              mesh_terms.append(labels[0][j])

      # Get MeSH term definitions
      for m in range (len(mesh_terms)):
        temp_mesh_term = mesh_terms[m].replace(" ", "%20")
        mesh_definitions.append(get_mesh_term_description(temp_mesh_term))

      if mesh_terms:
        # Semantic Similarity
        best_definition = semantic_similarity(query_terms[l][k], mesh_definitions)
        best_mesh_term = mesh_terms[best_definition]

        # Combine the free text term and MeSH term
        query_terms[l][k] = "(\"" + query_terms[l][k] + "\"" + " OR " + "\"" + best_mesh_term + "\"" + "[MeSH])"
      else:
        # No MeSH candidate was found: keep the quoted free-text term alone
        # rather than indexing into an empty list.
        query_terms[l][k] = "\"" + query_terms[l][k] + "\""

      # Clear lists
      mesh_terms.clear()
      mesh_definitions.clear()

  # Reformat query into one string
  final_query = " AND ".join(["(" + " OR ".join(row) + ")" for row in query_terms])

  print(title[n])
  print(response)
  print(final_query)
  postprocess(topic_numbers[n], final_query)