In [58]:
!pip install sentence-transformers transformers torch accelerate gradio --quiet

In [59]:
!pip install faiss-cpu



In [60]:
import faiss

In [61]:
import os
import requests
import json
from sentence_transformers import SentenceTransformer
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import gradio as gr

In [62]:
import os
import requests

data_dir = "/content/pubmed_20k"
os.makedirs(data_dir, exist_ok=True)

url_train = "https://huggingface.co/datasets/armanc/pubmed-rct20k/resolve/main/train.jsonl"
file_train = os.path.join(data_dir, "pubmed-20k-train.jsonl")

if not os.path.exists(file_train):
    print("Downloading PubMed 20k RCT train dataset...")
    r = requests.get(url_train, stream=True)
    with open(file_train, "wb") as f:
        for chunk in r.iter_content(chunk_size=8192):
            f.write(chunk)
    print("Download complete!")
else:
    print("Dataset already exists!")

Dataset already exists!


In [63]:
import re
import json

def clean_text(text):
    # remove placeholder
    text = re.sub(r'[#@<>/▃]', '', text)
    # remove space
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'(\d)\s+\.', r'\1.', text)
    text = re.sub(r'\.\s+(\d)', r'.\1', text)
    text = text.strip()
    return text

def parse_pubmed_rct_jsonl(path):
    texts = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            data = json.loads(line)
            cleaned = clean_text(data["text"])
            texts.append(cleaned)
    return texts


In [64]:
embedding_model = SentenceTransformer("allenai-specter", device='cuda')
print("GPU available:", torch.cuda.is_available())

GPU available: True


In [65]:
texts_sample = texts[:32]  #only 32 paragraph

embeddings = []
batch_size = 8

for i in range(0, len(texts_sample), batch_size):
    batch_texts = texts_sample[i:i+batch_size]

    # create embedding
    batch_emb = embedding_model.encode(batch_texts, convert_to_tensor=True, device='cuda')

    #  diffusion
    batch_emb_denoised = diffusion_denoise(batch_emb)

    embeddings.append(batch_emb_denoised)

embeddings = torch.cat(embeddings)
print("Embeddings created with diffusion denoising")
print("Device of embeddings:", embeddings.device)
print("Shape of embeddings:", embeddings.shape)

Embeddings created with diffusion denoising
Device of embeddings: cuda:0
Shape of embeddings: torch.Size([32, 768])


In [66]:
emb_np = embeddings.detach().cpu().numpy()
dim = emb_np.shape[1]

# create index CPU
index = faiss.IndexFlatL2(dim)
index.add(emb_np)

print("VectorDB ready on CPU")
print("Number of vectors in index:", index.ntotal)

VectorDB ready on CPU
Number of vectors in index: 32


In [67]:
!pip install sacremoses



In [68]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

llm_model_name = "microsoft/BioGPT-Large"

tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
model = AutoModelForCausalLM.from_pretrained(llm_model_name).to('cuda')

generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=0
)

#  test
prompt = "Describe the function of the liver in humans."
output = generator(prompt, max_length=100, do_sample=True)
print(output[0]['generated_text'])

Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Both `max_new_tokens` (=256) and `max_length`(=100) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Describe the function of the liver in humans. < / FREETEXT > < / TITLE > ▃


In [69]:
def rag_medical(query, top_k=3):
    # gpu on embedding query
    q_emb = embedding_model.encode([query], convert_to_tensor=True, device='cuda')
    q_emb = diffusion_denoise(q_emb)

    # VectorDB (CPU index)
    D, I = index.search(q_emb.cpu().numpy(), top_k)
    retrieved_texts = [texts[i] for i in I[0]]

    # prompt
    prompt = (
        "Relevant medical excerpts (PubMed 20k RCT):\n\n" +
        "\n".join(retrieved_texts) +
        "\n\nGenerate a clear research summary or explanation (educational only, no medical advice):\n"
    )

    # LLM
    output = generator(
        prompt,
        max_length=150,
        do_sample=True,
        temperature=0.7
    )[0]['generated_text']

    return output

# test
query = "Latest treatments for type 2 diabetes complications"
print(rag_medical(query))


Both `max_new_tokens` (=256) and `max_length`(=150) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Relevant medical excerpts (PubMed 20k RCT):

Further , there was a clinically relevant reduction in the serum levels of IL- , IL- , TNF - , and hsCRP at weeks in the intervention group when compared to the placebo group .
The mean difference between treatment arms ( % CI ) was ( - ) , p ; ( - ) , p ; ( - ) , p ; and ( - ) , p , respectively .
These differences remained significant at weeks .

Generate a clear research summary or explanation (educational only, no medical advice):
 Mean difference between treatment arms (% CI) was –, p; (-), p; (-), p; and (-), p, respectively. For all the values, p < 0. 0 5. < / FREETEXT > < / PARAGRAPH > ▃ < PARAGRAPH > < FREETEXT > Relevance statement: The mean differences between treatment arms (% CI) were (-), p; (-), p; (-), p; and (-), p, respectively. For all the values, p < 0. 0 5. < / FREETEXT > < / PARAGRAPH > ▃ < PARAGRAPH > < FREETEXT > Conclusion: The present study demonstrates that a structured education programme is an effective strategy 

In [70]:
# Gradio
def ui(query):
    return rag_medical(query)

iface = gr.Interface(
    fn=ui,
    inputs=gr.Textbox(lines=2, placeholder="Enter a medical research query..."),
    outputs="textbox",
    title="Medical RAG + GenAI + Diffusion",
    description="Search PubMed 20k RCT using embeddings, diffusion, and LLM (educational only)."
)

iface.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://654da9312cf20c94dc.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


