# Question Answering with Hugging Face Transformers

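This notebook demonstrates how to use pre-trained Hugging Face Transformers models for extractive question answering: given a question and a context passage, the model selects the span of the passage that answers the question.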

## 1. Basic Question Answering with DistilBERT

In [None]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline

# DistilBERT question-answering checkpoint; the model, tokenizer and pipeline
# built here are reused by the later sections of the notebook.
model_name = "distilbert-base-cased-distilled-squad"
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Wrap the already-loaded model and tokenizer in a ready-to-call QA pipeline
qa_pipeline = pipeline(task="question-answering", model=model, tokenizer=tokenizer)

# Passage the questions are answered from
context = """
The Amazon rainforest, also known in English as Amazonia or the Amazon Jungle, 
is a moist broadleaf forest that covers most of the Amazon basin of South America. 
This basin encompasses 7,000,000 square kilometers (2,700,000 sq mi), of which 
5,500,000 square kilometers (2,100,000 sq mi) are covered by the rainforest.
"""

question = "How large is the Amazon basin?"

# The pipeline result is a dict; only its 'answer' and 'score' entries are shown
answer = qa_pipeline(context=context, question=question)
print(
    f"Question: {question}",
    f"Answer: {answer['answer']}",
    f"Confidence: {answer['score']:.4f}",
    sep="\n",
)

## 2. Multiple Questions on Same Context

In [None]:
# Ask several questions against the same `context` passage
questions = [
    "What is another name for the Amazon rainforest?",
    "How much area does the rainforest cover?",
    "What type of forest is the Amazon?",
    "Which continent is the Amazon rainforest located?"
]

print("Question Answering Results:", "=" * 50, sep="\n")

for i, question in enumerate(questions, start=1):
    result = qa_pipeline(context=context, question=question)
    # One numbered Q/A pair per question, followed by a blank separator line
    print(
        f"{i}. Q: {question}",
        f"   A: {result['answer']} (confidence: {result['score']:.3f})",
        "",
        sep="\n",
    )

## 3. Question Answering on a Custom Dataset

In [None]:
# Small hand-written dataset: each entry pairs one context passage with the
# questions to ask about it.
custom_data = [
    {
        "context": "Python is a high-level programming language created by Guido van Rossum. It was first released in 1991 and is known for its simple syntax and readability.",
        "questions": [
            "Who created Python?",
            "When was Python first released?",
            "What is Python known for?"
        ]
    },
    {
        "context": "Machine learning is a subset of artificial intelligence that enables computers to learn and make decisions from data without being explicitly programmed for every scenario.",
        "questions": [
            "What is machine learning?",
            "What does machine learning enable computers to do?",
            "Is machine learning part of artificial intelligence?"
        ]
    }
]

# Walk every entry and answer each of its questions with the shared pipeline
print("Custom Dataset Question Answering:", "=" * 50, sep="\n")

for i, data in enumerate(custom_data, start=1):
    # Header for this entry; only the first 100 characters of the passage are shown
    print(
        f"\nDataset {i}:",
        f"Context: {data['context'][:100]}...",
        "\nQuestions and Answers:",
        sep="\n",
    )

    for j, question in enumerate(data['questions'], start=1):
        result = qa_pipeline(context=data['context'], question=question)
        print(
            f"  {j}. Q: {question}",
            f"     A: {result['answer']} (score: {result['score']:.3f})",
            sep="\n",
        )

## 4. Manual Tokenization for Question Answering

In [None]:
import torch

# Manual approach without pipeline
def answer_question(question, context, model, tokenizer, max_answer_length=None):
    """Answer ``question`` from ``context`` by decoding the model's span logits.

    The question/context pair is tokenized, passed through ``model``, and the
    (start, end) token pair with the highest joint probability is selected
    among *valid* spans only: the end may not precede the start and, when the
    tokenizer reports which tokens belong to which sequence, both ends must
    lie inside the context. Picking the two positions independently can give
    an end before the start (an empty answer) or a span inside the question.

    Args:
        question: Question text.
        context: Passage to extract the answer from.
        model: Callable accepting the tokenizer output as keyword arguments and
            returning an object with ``start_logits`` and ``end_logits`` of
            shape ``(1, sequence_length)``.
        tokenizer: Callable tokenizer that also provides
            ``convert_ids_to_tokens`` and ``convert_tokens_to_string``.
        max_answer_length: Optional upper bound on the answer length in
            tokens. ``None`` (the default) leaves the length unbounded.

    Returns:
        A tuple ``(answer, confidence)``. ``confidence`` is the product of the
        softmax probabilities of the chosen start and end positions. When no
        valid span exists, ``("", 0.0)`` is returned.
    """
    # Calling the tokenizer directly replaces the deprecated ``encode_plus``.
    inputs = tokenizer(
        question,
        context,
        add_special_tokens=True,
        return_tensors="pt",
        max_length=512,
        truncation=True,
    )

    # Inference only: no gradients needed
    with torch.no_grad():
        outputs = model(**inputs)

    # Batch size is 1, so work on the single row of per-token probabilities.
    start_probs = torch.softmax(outputs.start_logits[0], dim=-1)
    end_probs = torch.softmax(outputs.end_logits[0], dim=-1)
    seq_len = start_probs.shape[0]
    device = start_probs.device

    # Sequence id 1 marks tokens of the second input (the context); question
    # tokens are 0 and special tokens are None.
    try:
        in_context = torch.tensor(
            [seq_id == 1 for seq_id in inputs.sequence_ids(0)],
            dtype=torch.bool,
            device=device,
        )
    except (AttributeError, ValueError):
        # Best effort: sequence ids are unavailable for this encoding
        # (presumably a non-fast tokenizer), so allow every position.
        in_context = torch.ones(seq_len, dtype=torch.bool, device=device)

    # span_length[s, e] is the token count of the span starting at s, ending at e.
    positions = torch.arange(seq_len, device=device)
    span_length = positions.unsqueeze(0) - positions.unsqueeze(1) + 1
    valid = (span_length >= 1) & in_context.unsqueeze(1) & in_context.unsqueeze(0)
    if max_answer_length is not None:
        valid = valid & (span_length <= max_answer_length)

    if not valid.any():
        return "", 0.0

    # Joint probability of every (start, end) pair; invalid pairs are pushed
    # below any real probability so argmax can never select them.
    span_scores = start_probs.unsqueeze(1) * end_probs.unsqueeze(0)
    span_scores = span_scores.masked_fill(~valid, -1.0)
    start_index, end_index = divmod(int(torch.argmax(span_scores)), seq_len)

    # Convert the selected ids back to tokens and then to a string
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0].tolist())
    answer = tokenizer.convert_tokens_to_string(tokens[start_index:end_index + 1])

    confidence = (start_probs[start_index] * end_probs[end_index]).item()

    return answer, confidence

# Try the pipeline-free helper on the `context` passage with a new question
question = "What does the Amazon basin encompass?"
answer, confidence = answer_question(
    question=question,
    context=context,
    model=model,
    tokenizer=tokenizer,
)

print(
    f"Question: {question}",
    f"Answer: {answer}",
    f"Confidence: {confidence:.4f}",
    sep="\n",
)

## 5. Comparing Different QA Models

In [None]:
# Compare different question answering models on one question/context pair
models_to_compare = [
    "distilbert-base-cased-distilled-squad",
    "bert-large-uncased-whole-word-masking-finetuned-squad",
    # The RoBERTa SQuAD 2.0 checkpoint is published under the "deepset"
    # namespace; the bare "roberta-base-squad2" id does not resolve on the Hub.
    "deepset/roberta-base-squad2"
]

test_context = """
Artificial Intelligence (AI) is intelligence demonstrated by machines, in contrast to 
the natural intelligence displayed by humans and animals. Leading AI textbooks define 
the field as the study of "intelligent agents": any device that perceives its environment 
and takes actions that maximize its chance of successfully achieving its goals.
"""

test_question = "What is artificial intelligence?"

print("Model Comparison Results:")
print("=" * 60)
print(f"Question: {test_question}\n")

# The loop variable is deliberately not called `model_name`: that name
# identifies the checkpoint behind the notebook-level `model`/`tokenizer`
# and must not be overwritten by this comparison.
for candidate_name in models_to_compare:
    # Loading and inference are guarded separately so the printed message
    # says which step failed; one bad model never aborts the comparison.
    try:
        qa_pipe = pipeline("question-answering", model=candidate_name)
    except Exception as e:
        print(f"Error loading {candidate_name}: {str(e)}")
        print("-" * 40)
        continue

    try:
        result = qa_pipe(question=test_question, context=test_context)
    except Exception as e:
        print(f"Error running {candidate_name}: {str(e)}")
    else:
        print(f"Model: {candidate_name}")
        print(f"Answer: {result['answer']}")
        print(f"Confidence: {result['score']:.4f}")
    print("-" * 40)