In [None]:
# Free-text clinical question; later cells parse it into PICO components and use those to query PubMed.
simple_clinical_question = "What is the most effective treatment for patients with chronic migraine compared to drug therapy in reducing pain severity?"


In [None]:
#First, install all of the requirements
!pip install requests
!pip install biopython
!pip install openai==0.28
!pip install transformers
!pip install numpy



In [None]:
import os

from Bio import Entrez

# Contact address sent with every Entrez request. It is read from the ENTREZ_EMAIL
# environment variable so that no personal address is stored in the notebook;
# when the variable is unset the previous value (an empty string) is kept.
Entrez.email = os.environ.get("ENTREZ_EMAIL", "")


In [None]:
import re

# A phrase is one or more alphabetic words; the lazy repetition lets it end at the first boundary.
_PICO_PHRASE = r'([A-Za-z]+(?:\s+[A-Za-z]+)*?)'
# A phrase ends where the next PICO cue word starts, at punctuation/digits, or at the end of the text.
_PICO_BOUNDARY = (
    r'(?=\s+(?:compared|vs|versus|in|lead\s+to|improve|reduce|increase)\b'
    r'|\s*[^A-Za-z\s]|\s*$)'
)


def extract_pico_from_question(question):
    """Heuristically split a clinical question into PICO components.

    Each component is the phrase that follows one of its cue words. The phrase
    stops at the next cue word (e.g. "compared", "in"), at the first
    non-alphabetic character, or at the end of the question, so a component no
    longer swallows the rest of the sentence or loses its last character.

    Args:
        question: The clinical question as free text. ``None`` or an empty
            string yields a result with every component set to ``None``.

    Returns:
        dict with the keys 'Patient', 'Intervention', 'Comparison' and
        'Outcome'; each value is the extracted phrase or ``None`` when no cue
        word for that component was found.

    Note:
        This is a keyword heuristic. 'Intervention' is whatever follows an
        intervention keyword, so for a question that names no intervention
        ("... treatment for patients ...") it can return the population phrase.
    """
    # Cue words that introduce each component (matched case-insensitively).
    cues = {
        'Patient': r'\b(?:for|in)\s+',
        'Intervention': r'\b(?:treatment|therapy|intervention|medication|drug)\s+',
        # "compared to"/"compared with": the preposition belongs to the cue, not to the comparator.
        'Comparison': r'\b(?:compared(?:\s+(?:to|with))?|vs\.?|versus)\s+',
        'Outcome': r'\b(?:lead\s+to|improve|reduce|increase|in|effectiveness)\s+',
    }

    # Every component starts out as None and is filled in only when its pattern matches.
    pico_components = dict.fromkeys(cues)
    if not question:
        return pico_components

    for component, cue in cues.items():
        match = re.search(cue + _PICO_PHRASE + _PICO_BOUNDARY, question, re.IGNORECASE)
        if match:
            pico_components[component] = match.group(1)

    return pico_components

# Run the extractor on the notebook's question and show the resulting component dict;
# the MeSH lookup cells below read their search text from pico_result.
pico_result = extract_pico_from_question(simple_clinical_question)
print(pico_result)


{'Patient': 'patients with chronic migraine compared to drug therapy in reducing pain severit', 'Intervention': 'for patients with chronic migraine compared to drug therapy in reducing pain severity', 'Comparison': 'to drug therapy in reducing pain severity', 'Outcome': 'reducing pain severity'}


In [None]:
# Placeholders only: both names are reassigned by the MeSH lookup cells below
# (query_terms becomes a list there, query the joined search string).
query = ""
query_terms = ""

In [None]:
# Map the Patient/Population phrase to MeSH headings using the term translation
# returned by an ESearch against the "mesh" database.
idList = []  # not referenced by any visible cell; kept in case later code expects the name
patient_text = pico_result['Patient']
mesh_terms = []
if patient_text:
    handle = Entrez.esearch(db="mesh", term=patient_text)
    try:
        record = Entrez.read(handle)
    finally:
        # Release the handle even when parsing the response fails.
        handle.close()
    for translation in record['TranslationSet']:
        # 'To' is an OR-joined expansion; keep only the parts tagged as MeSH terms.
        for term in translation['To'].split(' OR '):
            if '[MeSH Terms]' in term:
                heading = term.replace('[MeSH Terms]', '').replace('"', '').strip()
                if heading not in mesh_terms:  # a repeated heading adds nothing to a boolean query
                    mesh_terms.append(heading)
else:
    # The extractor leaves a component as None when none of its cue words matched.
    print("No Patient component was extracted; skipping the MeSH lookup.")
query_terms = list(mesh_terms)
# Population headings are combined with AND (the other component cells use OR).
query = " AND ".join(query_terms)
p_query = query
print(p_query)

drug therapy AND migraine disorders AND pain AND patients


In [None]:
# Map the Intervention phrase to MeSH headings using the term translation
# returned by an ESearch against the "mesh" database.
intervention_text = pico_result['Intervention']
mesh_terms = []
if intervention_text:
    handle = Entrez.esearch(db="mesh", term=intervention_text)
    try:
        record = Entrez.read(handle)
    finally:
        # Release the handle even when parsing the response fails.
        handle.close()
    # Extract MeSH terms from the result
    for translation in record['TranslationSet']:
        # 'To' is an OR-joined expansion; keep only the parts tagged as MeSH terms.
        for term in translation['To'].split(' OR '):
            if '[MeSH Terms]' in term:
                heading = term.replace('[MeSH Terms]', '').replace('"', '').strip()
                if heading not in mesh_terms:  # a repeated heading adds nothing to a boolean query
                    mesh_terms.append(heading)
else:
    # The extractor leaves a component as None when none of its cue words matched.
    print("No Intervention component was extracted; skipping the MeSH lookup.")

query_terms = list(mesh_terms)
query = " OR ".join(query_terms)
i_query = query
print(i_query)

drug therapy OR migraine disorders OR pain measurement OR patients


In [None]:
# Map the Comparison phrase to MeSH headings using the term translation
# returned by an ESearch against the "mesh" database.
comparison_text = pico_result['Comparison']
mesh_terms = []
if comparison_text:
    handle = Entrez.esearch(db="mesh", term=comparison_text)
    try:
        record = Entrez.read(handle)
    finally:
        # Release the handle even when parsing the response fails.
        handle.close()
    for translation in record['TranslationSet']:
        # 'To' is an OR-joined expansion; keep only the parts tagged as MeSH terms.
        for term in translation['To'].split(' OR '):
            if '[MeSH Terms]' in term:
                heading = term.replace('[MeSH Terms]', '').replace('"', '').strip()
                if heading not in mesh_terms:  # a repeated heading adds nothing to a boolean query
                    mesh_terms.append(heading)
else:
    # The extractor leaves a component as None when none of its cue words matched.
    print("No Comparison component was extracted; skipping the MeSH lookup.")
query_terms = list(mesh_terms)
query = " OR ".join(query_terms)
c_query = query
print(c_query)

drug therapy OR pain measurement


In [None]:
# Map the Outcome phrase to MeSH headings using the term translation
# returned by an ESearch against the "mesh" database.
outcome_text = pico_result['Outcome']
mesh_terms = []
if outcome_text:
    handle = Entrez.esearch(db="mesh", term=outcome_text)
    try:
        record = Entrez.read(handle)
    finally:
        # Release the handle even when parsing the response fails.
        handle.close()
    for translation in record['TranslationSet']:
        # 'To' is an OR-joined expansion; keep only the parts tagged as MeSH terms.
        for term in translation['To'].split(' OR '):
            if '[MeSH Terms]' in term:
                heading = term.replace('[MeSH Terms]', '').replace('"', '').strip()
                if heading not in mesh_terms:  # a repeated heading adds nothing to a boolean query
                    mesh_terms.append(heading)
else:
    # The extractor leaves a component as None when none of its cue words matched.
    print("No Outcome component was extracted; skipping the MeSH lookup.")
query_terms = list(mesh_terms)
query = " OR ".join(query_terms)
o_query = query
print(o_query)

pain measurement


In [None]:
# AND the per-component queries together, each in its own parentheses.
# A component whose MeSH lookup produced nothing is left out, so the PubMed
# query never contains an empty "()" group. With all four present the result
# is identical to "(P) AND (I) AND (C) AND (O)".
component_queries = [p_query, i_query, c_query, o_query]
final_query = " AND ".join(f"({part})" for part in component_queries if part)
print(final_query)

(drug therapy AND migraine disorders AND pain AND patients) AND (drug therapy OR migraine disorders OR pain measurement OR patients) AND (drug therapy OR pain measurement) AND (pain measurement)


In [None]:
# Run the combined query against PubMed and keep the returned article IDs.
handle = Entrez.esearch(db="pubmed", term=final_query)
try:
    record = Entrez.read(handle)
finally:
    # Release the handle even when parsing the response fails.
    handle.close()
# IdList holds only the IDs returned by this request, while Count is the total
# number of hits (the recorded run shows 20 IDs for 391 hits).
idlist = record['IdList']
print(idlist)
print(record['Count'])


['39334340', '38820488', '38814378', '38812648', '38549121', '37873925', '37463388', '37438124', '37389229', '37104222', '36668849', '35883701', '35674078', '35400198', '35400174', '35285422', '34997178', '34528122', '34472095', '34407654']
391


In [None]:
from Bio import Medline

# Download the MEDLINE-format records for the IDs found by the PubMed search.
records = []
if idlist:  # nothing to fetch when the search returned no IDs
    handle = Entrez.efetch(db="pubmed", id=idlist, rettype="medline", retmode="text")
    try:
        # Medline.parse is consumed into a list before the handle is closed.
        records = list(Medline.parse(handle))
    finally:
        handle.close()

In [None]:
# MEDLINE tags in the order of the article tuple:
# (title, abstract, journal, author, date_of_publication, keywords, mesh_terms).
_medline_tags = ("TI", "AB", "TA", "AU", "DP", "OT", "MH")

# One tuple per fetched record; a tag missing from a record is stored as "?".
articles = [
    tuple(record.get(tag, "?") for tag in _medline_tags)
    for record in records
]

In [None]:
# Number of article tuples collected above.
print(len(articles))


20


In [None]:
from transformers import BertTokenizer, BertModel
import torch

# Load the pretrained bert-base-uncased tokenizer and encoder. embed_text() below reads
# both through these module-level names at call time.
# NOTE(review): later cells rebind `tokenizer` and `model` to other checkpoints, so
# embed_text() must be called before those cells run (or this cell re-run first).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def embed_text(text):
    """Embed a piece of text with the module-level BERT tokenizer and model.

    Args:
        text: Text to embed. A falsy value (``None`` or an empty string) is not embedded.

    Returns:
        The model's ``pooler_output`` converted to a NumPy array, or ``None``
        when ``text`` is falsy. Input longer than 512 tokens is truncated.
    """
    if text:
        encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
        # Inference only, so gradient tracking is switched off.
        with torch.no_grad():
            result = model(**encoded)
        return result['pooler_output'].numpy()

    return None  # or return a zero vector or another placeholder


# Embed the abstract (tuple position 1) of every article that has one, dropping
# any embedding that comes back as None.
# NOTE(review): the search cell indexes `articles` with positions from this list, which
# is only valid while no article is skipped here - confirm if the "?" default changes.
vectors = []
for article in articles:
    abstract_text = article[1]
    if not abstract_text:
        continue
    embedding = embed_text(abstract_text)
    if embedding is not None:
        vectors.append(embedding)
print(f"Number of vectors: {len(vectors)}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Number of vectors: 20


In [None]:
!pip install faiss-cpu # Install the faiss library
import faiss
import numpy as np

# Convert vectors list to a 2D numpy array
vectors_matrix = np.vstack(vectors)

# Build the index
index = faiss.IndexFlatL2(vectors_matrix.shape[1])
index.add(vectors_matrix)


Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m51.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.9.0.post1


In [None]:
# Join extracted PICO components into a single query string
pico_res = " ".join([f"{k}: {v}" for k, v in pico_result.items() if v])

# Check if pico_res is valid before embedding
if pico_res and isinstance(pico_res, str) and len(pico_res) > 0:
    try:
        # Embed the extracted PICO components
        query_text = pico_res
        query_vector = embed_text(query_text)

        # Display results
        print(f"Query Embedding Successful!")
        print(f"Query Vector Shape: {query_vector.shape}")
        print(f"Query Text: {query_text}")

    except Exception as e:
        print(f"Error during text embedding: {str(e)}")
else:
    print("Warning: 'pico_res' is either empty or invalid. Please check its value.")

Query Embedding Successful!
Query Vector Shape: (1, 768)
Query Text: Patient: patients with chronic migraine compared to drug therapy in reducing pain severit Intervention: for patients with chronic migraine compared to drug therapy in reducing pain severity Comparison: to drug therapy in reducing pain severity Outcome: reducing pain severity


In [None]:
# Define the number of nearest neighbors you want to retrieve
if len(vectors) >= 5: k = 5
else: k = len(vectors)

# Search the index for the k-nearest vectors
D, I = index.search(query_vector, k)

# D contains the distances, and I contains the indices of the nearest vectors
nearest_articles = [articles[i] for i in I[0]]  # I[0] because I is a 2D array

In [None]:
# Now, print the nearest articles:
s =""
for idx, article in enumerate(nearest_articles):
    title, abstract, journal, author, date_of_publication, keywords, mesh_terms = article
    s = s + "Title: " + title + "\n"
    s = s + "Abstract: " + abstract + "\n"
    s = s + "Journal: " + journal + "\n"
    s = s + "Author: " + ", ".join(author) + "\n"
    s = s + "Date of publication: " + date_of_publication + "\n"
    s = s + "Keywords: " + ", ".join(keywords) + "\n"
    s = s + "Mesh terms: " + ", ".join(mesh_terms) + "\n"
print(s)

Title: Quantitative and Qualitative Pain Evaluation in Response to OnabotulinumtoxinA for Chronic Migraine: An Observational Real-Life Study.
Abstract: (1) Background: Randomized controlled trials and real-life studies demonstrated the efficacy of OnabotulinumtoxinA (OBT-A) for CM prevention. However, no studies specifically addressed its effect on pain's quantitative intensity and qualitative characteristics. (2) Methods: This is an ambispective study: a post-hoc retrospective analysis of real-life prospectively collected data from two Italian headache centers on CM patients treated with OBT-A over one year (i.e., Cy1-4). The primary endpoint was the changes in pain intensity (Numeric Rating Scale, NRS; the Present Pain Intensity (PPI) scale, the 6-point Behavioral Rating Scale (BRS-6)) and quality scale (the short-form McGill Pain Questionnaire (SF-MPQ)) scores. We also assessed the relationship between changes in intensity and quality of pain and disability scale (MIDAS; HIT-6) scor

In [None]:
!pip install transformers torch




In [None]:
import os

from huggingface_hub import login

# Log in to Hugging Face with a token read from the HF_TOKEN environment variable,
# so that no credential is stored in the notebook. The login is skipped when the
# variable is unset; the recorded output above notes that authentication is
# optional for public models.
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    login(token=hf_token)


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the generation model and its tokenizer.
# facebook/bart-large-cnn is an encoder-decoder checkpoint, so it is loaded with the
# seq2seq auto class; loaded through the causal-LM class, the recorded run only echoed
# the prompt back. Switch the auto class as well if you change to a decoder-only model.
model_name = "facebook/bart-large-cnn"  # Example, adjust the model name if necessary
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

def generate_with_llama(prompt):
    """Generate text for ``prompt`` with the model loaded in this cell.

    The name is historical: the checkpoint loaded above is BART, not LLaMA.

    Args:
        prompt: Input text. It is truncated to 512 tokens, so text at the end
            of a long prompt is not seen by the model.

    Returns:
        The decoded first generated sequence, without special tokens.
    """
    # Tokenize the input; max_length here bounds the INPUT sequence (longer input is truncated).
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True, max_length=512)

    # Generate; max_new_tokens bounds how many tokens the model may produce.
    # No temperature is passed: it only applies when sampling is enabled, which it is not here.
    outputs = model.generate(**inputs, max_new_tokens=512, num_return_sequences=1, no_repeat_ngram_size=2)

    # Decode the generated sequence
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# Build the prompt from the retrieved abstracts and the PICO question
prompt = f"Act as an evidenced-based clinical researcher. Using only the following PubMed Abstracts to guide your content ({s}), create an evidence-based medicine report that answers the following question: {pico_res}"

# Generate the answer
research_res = generate_with_llama(prompt)

# Show the question and the generated answer
print(pico_res)
print(research_res)

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]



Patient: patients with chronic migraine compared to drug therapy in reducing pain severit Intervention: for patients with chronic migraine compared to drug therapy in reducing pain severity Comparison: to drug therapy in reducing pain severity Outcome: reducing pain severity
Act as an evidenced-based clinical researcher. Using only the following PubMed Abstracts to guide your content (Title: Quantitative and Qualitative Pain Evaluation in Response to OnabotulinumtoxinA for Chronic Migraine: An Observational Real-Life Study.
Abstract: (1) Background: Randomized controlled trials and real-life studies demonstrated the efficacy of OnabotulinumtoxinA (OBT-A) for CM prevention. However, no studies specifically addressed its effect on pain's quantitative intensity and qualitative characteristics. (2) Methods: This is an ambispective study: a post-hoc retrospective analysis of real-life prospectively collected data from two Italian headache centers on CM patients treated with OBT-A over one y

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the BART model (or another summarization model such as T5)
model_name = "facebook/bart-large-cnn"  # Checkpoint intended for summarization tasks
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

def generate_summary(prompt):
    """Summarize ``prompt`` with the seq2seq model loaded in this cell.

    Args:
        prompt: Text to summarize; input beyond 1024 tokens is truncated.

    Returns:
        The decoded first generated sequence (at most 150 new tokens),
        without special tokens.
    """
    # Tokenize the input (a larger limit here, for longer source text)
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True, max_length=1024)

    # Generate the summary. No temperature is passed: it only applies when
    # sampling is enabled, which it is not here.
    outputs = model.generate(**inputs, max_new_tokens=150, num_return_sequences=1, no_repeat_ngram_size=2)

    # Decode the generated summary
    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return summary

# Build the prompt asking for a summary of the abstracts
prompt = f"Please summarize the following PubMed Abstracts: {s}"

# Generate the summary
summary_res = generate_summary(prompt)

# Show the generated summary
print("Summary:")
print(summary_res)


Summary:
OnabotulinumtoxinA (OBT-A) has been shown to reduce pain intensity in chronic migraine patients. This study was a retrospective analysis of real-life prospectively collected data from two Italian headache centers on CM patients treated with OBT-a over one year. Only throbbing, splitting and sickening qualities of pain collected in the SF-MPQ were reduced.


In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load the pretrained T5 model
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Text to summarize: the retrieved abstracts in `s`, with the "summarize: " task prefix
# that T5 uses to select summarization. Building the input here removes the dependence
# on whatever `prompt` an earlier cell happened to leave behind.
t5_input = "summarize: " + s

# Encode the text
inputs = tokenizer(t5_input, return_tensors="pt", truncation=True, padding=True, max_length=1024)

# Pass the whole encoding so the attention mask reaches generate() along with the input ids
summary_ids = model.generate(**inputs, max_length=100, min_length=20, length_penalty=2.0, num_beams=4, early_stopping=True)
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)


print("Résumé :", summary)


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Résumé : a meta-analysis has evaluated the efficacy and safety of lasmiditan for acute treatment of migraine in adults . a total of 152 patients were treated with OBT-A over a one-year period . results: throbbing, splitting, and sickening qualities of pain were reduced .
