Add `%load_ext cudf.pandas` before importing pandas to speed up operations using GPU

Add `%load_ext cudf.pandas` before importing pandas to speed up operations using GPU

print

In [23]:
import os
import pickle
import numpy as np
from openai import OpenAI
from transformers import AutoModel, AutoTokenizer
import torch
import gradio as gr

# Hugging Face access token.
# Read from the environment so the secret never lives in the source file or
# in a shared notebook. Set it before running, e.g. `export HF_TOKEN=hf_...`.
# NOTE(security): a token was previously hard-coded on this line; treat that
# token as leaked and revoke it in the Hugging Face account settings.
HF_TOKEN = os.environ.get("HF_TOKEN")
if not HF_TOKEN:
    # Warn instead of raising so the rest of the module can still be imported;
    # anything that needs the token will not authenticate until it is set.
    print("WARNING: HF_TOKEN environment variable is not set.")

# Paths to the preprocessed files (pickled text chunks and their embeddings)
chunks_file = "/content/chunks.pkl"
embeddings_file = "/content/embeddings (1).pkl"

# Model names: chat model used for answers, and the encoder used for queries
model_name = "openai/gpt-oss-120b"
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"

# Read the pickled chunks and their embeddings into memory.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
with open(chunks_file, "rb") as chunks_fh, open(embeddings_file, "rb") as embeddings_fh:
    chunks = pickle.load(chunks_fh)
    embeddings = pickle.load(embeddings_fh)

# Search function
def search_chunks(query_embedding, top_k=3):
    """Return the ``top_k`` chunks most similar to ``query_embedding``.

    Similarity is the cosine similarity between the query vector and every
    row of the module-level ``embeddings``; the chunks are taken from the
    module-level ``chunks`` at the same indices.

    Args:
        query_embedding: 1-D vector with the same dimensionality as the rows
            of ``embeddings``.
        top_k: Maximum number of chunks to return. Values ``<= 0`` yield an
            empty list.

    Returns:
        A list of at most ``top_k`` chunks, best match first.
    """
    # Guard explicitly: with top_k == 0 the slice ``[-0:]`` would select
    # every row instead of none.
    if top_k <= 0:
        return []

    matrix = np.asarray(embeddings)
    if matrix.size == 0:
        return []
    query = np.asarray(query_embedding)

    dot_products = np.dot(matrix, query)
    norms = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query)

    # Zero-norm vectors have no defined cosine similarity. Leave them at
    # -inf so they sort last; a plain division would give NaN, which
    # argsort places at the end and the reversal would then rank first.
    similarities = np.full(dot_products.shape, -np.inf)
    np.divide(dot_products, norms, out=similarities, where=norms > 0)

    top_indices = np.argsort(similarities)[::-1][:top_k]
    return [chunks[i] for i in top_indices]

# OpenAI-compatible client pointed at the Hugging Face router endpoint
client = OpenAI(base_url="https://router.huggingface.co/v1", api_key=HF_TOKEN)

# Tokenizer and encoder used to embed incoming questions
tokenizer = AutoTokenizer.from_pretrained(embedding_model_name)
model = AutoModel.from_pretrained(embedding_model_name)

def get_embedding(text):
    """Embed ``text`` with the module-level ``tokenizer`` and ``model``.

    Token embeddings from the last hidden state are averaged using the
    attention mask as weights, so padding tokens (present when several texts
    of different lengths are encoded together) do not dilute the result. For
    a single string every token is attended, so this equals a plain mean.

    Args:
        text: The text to embed (a string, or a list of strings).

    Returns:
        A NumPy array: the pooled embedding with size-1 dimensions squeezed
        out (a 1-D vector for a single input text).
    """
    encoded_input = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**encoded_input)

    token_embeddings = model_output.last_hidden_state
    # Broadcast the mask over the hidden dimension: 1 for real tokens, 0 for padding.
    mask = encoded_input["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
    summed = (token_embeddings * mask).sum(dim=1)
    # Clamp so an all-padding row cannot cause a division by zero.
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    sentence_embedding = summed / token_counts

    # .cpu() is a no-op on CPU tensors and keeps .numpy() valid on other devices.
    return sentence_embedding.squeeze().cpu().numpy()

def ask_question(question):
    """Answer ``question`` using retrieved chunks as context for the chat model.

    The question is embedded, the three most similar chunks are fetched with
    ``search_chunks``, and both the joined chunks and the question are sent
    to the chat-completions endpoint.

    Returns:
        The message content of the first completion choice.
    """
    retrieved = search_chunks(get_embedding(question), top_k=3)
    context_text = "\n\n".join(retrieved)

    system_message = {"role": "system", "content": "You are a helpful biology assistant."}
    user_message = {
        "role": "user",
        "content": f"Context:\n{context_text}\n\nQuestion: {question}",
    }

    response = client.chat.completions.create(
        model=model_name,
        messages=[system_message, user_message],
    )
    first_choice = response.choices[0]
    return first_choice.message.content

# Gradio callback
def chatbot(message, history):
    """Return the answer for ``message``; ``history`` is accepted but unused."""
    return ask_question(message)

# Build the widgets first, then wire them into the chat interface.
_conversation_pane = gr.Chatbot(height=500, label=" 🧬BioPandora", type='messages')
_question_box = gr.Textbox(
    placeholder="Type your biology question here...",
    container=False,
    autofocus=True,
    scale=7,
)
_example_questions = ["What is photosynthesis?", "Explain DNA replication.", "What is cell mitosis?"]

chat_ui = gr.ChatInterface(
    fn=chatbot,
    chatbot=_conversation_pane,
    textbox=_question_box,
    title="💬 BioPandora",
    description="An AI-powered biology assistant using GPT-OSS-120B.",
    theme="glass",
    examples=_example_questions,
)

# Launch the app only when this file is the entry point.
if __name__ == "__main__":
    chat_ui.launch()



It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://f0616a1643a698edd4.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
