In [14]:
#!pip install nltk scikit-learn gradio datasets

In [15]:
from datasets import load_dataset

# Load the MedQuad dataset by its repository id
DATASET_ID = "Amirkid/MedQuad-dataset"
dataset = load_dataset(DATASET_ID)


In [16]:
data = dataset['train']

# Even-indexed rows are treated as questions and odd-indexed rows as the
# matching answers. Read the 'text' column once instead of indexing the
# dataset row by row (each data[i] materializes a full row), then split
# by parity.
texts = list(data['text'])

# Drop a trailing unpaired row, if any, so questions[i] always has a
# corresponding answers[i] and later lookups cannot run past the end.
pair_count = len(texts) // 2
questions = texts[0:2 * pair_count:2]
answers = texts[1:2 * pair_count:2]


In [17]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download required NLTK data.
# 'punkt_tab' is the tokenizer resource that recent NLTK releases load for
# word_tokenize; 'punkt' is kept for older releases that still use it.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Preprocess function
def preprocess(text):
    """Normalize ``text`` into a space-separated string of content words.

    The text is lowercased and tokenized with NLTK's ``word_tokenize``;
    tokens that are not purely alphabetic or that are English stop words
    are dropped, and the remaining tokens are joined with single spaces.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string (empty if no token survives the filters).
    """
    # Build the stop-word set once and cache it on the function object:
    # calling stopwords.words('english') for every token rebuilds the list
    # each time, and a list membership test is linear in its length.
    stop_words = getattr(preprocess, "_stop_words", None)
    if stop_words is None:
        stop_words = frozenset(stopwords.words('english'))
        preprocess._stop_words = stop_words

    return ' '.join(
        word
        for word in word_tokenize(text.lower())
        if word.isalpha() and word not in stop_words
    )

# Clean every question with the same pipeline later applied to user queries
preprocessed_questions = list(map(preprocess, questions))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF model on the cleaned questions and, in the same pass, encode
# them; `vectorizer` is reused later to encode incoming user questions and
# `tfidf_matrix` is what those queries are compared against.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(preprocessed_questions)


In [19]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def find_most_relevant_answer(question, min_similarity=0.0):
    """Return the stored answer whose question best matches ``question``.

    The query is cleaned with ``preprocess``, encoded with the fitted
    ``vectorizer`` and compared to every row of ``tfidf_matrix`` by cosine
    similarity; the answer at the best-scoring index is returned.

    Args:
        question: The user's question as a string.
        min_similarity: Best score must be strictly greater than this value
            for an answer to be returned. The default of 0.0 only rejects
            queries that share no vocabulary with the stored questions.

    Returns:
        The matching answer text, or a fallback message when the query is
        empty or nothing scores above ``min_similarity``.
    """
    no_match = (
        "Sorry, I couldn't find a relevant answer in the MedQuad dataset. "
        "Please try rephrasing your question."
    )

    # Nothing to search for: avoid calling preprocess on None / blank input.
    if not question or not question.strip():
        return no_match

    question_tfidf = vectorizer.transform([preprocess(question)])
    # cosine_similarity yields a 1 x N matrix; flatten it so argmax is an
    # unambiguous index into `answers`.
    scores = np.asarray(cosine_similarity(question_tfidf, tfidf_matrix)).ravel()
    if scores.size == 0:
        return no_match

    best_index = int(np.argmax(scores))
    # When every score is 0, argmax would silently pick index 0 and return an
    # unrelated answer; report "no match" instead.
    if scores[best_index] <= min_similarity:
        return no_match
    return answers[best_index]


In [20]:
import gradio as gr

def answer_question(question):
    """Look up the best answer for ``question`` and prefix it with a label.

    Returns the text produced by ``find_most_relevant_answer`` on its own
    line after an ``Answer:`` header.
    """
    return "Answer:\n{}".format(find_most_relevant_answer(question))

# Build the Gradio UI: a two-line question box wired to answer_question,
# with the reply shown as plain text
question_box = gr.Textbox(lines=2, placeholder="Ask a medical question...")
interface = gr.Interface(
    fn=answer_question,
    inputs=question_box,
    outputs="text",
    title="Medical QA Chatbot",
    description="Ask any medical question and get an answer from the MedQuad dataset.",
)

# Start serving the app
interface.launch()


Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://93e9192e059fd7e232.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


