In [2]:
!pip install wikipedia

Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wikipedia
  Building wheel for wikipedia (setup.py) ... [?25l[?25hdone
  Created wheel for wikipedia: filename=wikipedia-1.4.0-py3-none-any.whl size=11679 sha256=5e13d4d375932f67cf9197cbae2aad6fad65fd6cb6512e280c6feb4c199ce661
  Stored in directory: /root/.cache/pip/wheels/5e/b6/c5/93f3dec388ae76edc830cb42901bb0232504dfc0df02fc50de
Successfully built wikipedia
Installing collected packages: wikipedia
Successfully installed wikipedia-1.4.0


In [3]:
import openai
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from sentence_transformers import SentenceTransformer, util
import wikipedia

# Initialize models
retriever_model = SentenceTransformer('multi-qa-mpnet-base-dot-v1')  # For semantic search
qa_model_name = "deepset/roberta-base-squad2"
tokenizer = AutoTokenizer.from_pretrained(qa_model_name)
qa_model = AutoModelForQuestionAnswering.from_pretrained(qa_model_name)

def search_wikipedia(query, num_results=3):
    """Search Wikipedia and return the top results."""
    try:
        search_results = wikipedia.search(query, results=num_results)
        documents = []
        for result in search_results:
            try:
                summary = wikipedia.summary(result, sentences=3)
                documents.append((result, summary))
            except wikipedia.exceptions.DisambiguationError as e:
                # Handle disambiguation
                continue
        return documents
    except Exception as e:
        print(f"Error fetching data from Wikipedia: {e}")
        return []

def retrieve_context(query, documents):
    """Retrieve the most relevant context using semantic search."""
    corpus = [doc[1] for doc in documents]
    embeddings = retriever_model.encode(corpus, convert_to_tensor=True)
    query_embedding = retriever_model.encode(query, convert_to_tensor=True)
    scores = util.pytorch_cos_sim(query_embedding, embeddings)
    best_index = scores.argmax().item()
    return documents[best_index]

def generate_answer(question, context):
    """Generate an answer using the QA model."""
    inputs = tokenizer(question, context, return_tensors="pt", truncation=True, max_length=512)
    outputs = qa_model(**inputs)
    start_scores = outputs.start_logits
    end_scores = outputs.end_logits

    start_idx = start_scores.argmax()
    end_idx = end_scores.argmax()
    answer = tokenizer.decode(inputs.input_ids[0][start_idx:end_idx+1])
    return answer

def astronomy_rag_qa():
    print("Welcome to the Astronomy RAG QA System!")
    print("Ask me any astronomy-related question. Type 'exit' to quit.")
    while True:
        question = input("\nYour Question: ")
        if question.lower() == 'exit':
            print("Goodbye!")
            break

        # Step 1: Retrieve documents from Wikipedia
        documents = search_wikipedia(question)
        if not documents:
            print("Sorry, I couldn't find relevant information. Please try a different question.")
            continue

        # Step 2: Retrieve the most relevant context
        title, context = retrieve_context(question, documents)
        print(f"\nUsing context from: {title}")

        # Step 3: Generate the answer
        answer = generate_answer(question, context)
        print(f"Answer: {answer}")

# Run the program
if __name__ == "__main__":
    astronomy_rag_qa()


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/8.71k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

Welcome to the Astronomy RAG QA System!
Ask me any astronomy-related question. Type 'exit' to quit.

Your Question: What are neutron stars?

Using context from: Neutron star
Answer:  the collapsed core of a massive supergiant star

Your Question: What are red dwarfs and brown dwarfs?

Using context from: Brown dwarf
Answer: <s>

Your Question: exit
Goodbye!
