In [None]:
!pip install transformers datasets

In [None]:
from datasets import load_dataset

# Load the dataset
raw_dataset = load_dataset("toughdata/quora-question-answer-dataset")

# Preview the first few samples. Only the previewed rows are converted to
# pandas (not the whole split), and the frame is left as the cell's last
# expression so the notebook renders it as a table instead of printed text.
train_split = raw_dataset["train"]
train_split.select(range(min(5, len(train_split)))).to_pandas()

In [None]:
from transformers import AutoTokenizer

# Checkpoint that supplies the tokenizer used by the preprocessing step below
tokenizer_checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

# Tokenize the dataset
def preprocess_function(examples):
    """Tokenize a batch of question/answer pairs and attach span labels.

    The notebook fine-tunes ``AutoModelForQuestionAnswering``, whose head only
    produces a loss when ``start_positions`` and ``end_positions`` are present
    in the batch. Without them ``Trainer.train()`` stops with "the model did
    not return a loss". The rows read here only have ``question`` and
    ``answer`` text, so the answer is encoded as the second segment and the
    labels mark the first and last token of that segment.

    NOTE(review): this teaches the model to select the whole answer passage.
    If a dataset with a separate context and an answer span inside it is
    used instead, the labels must be computed from that span.

    Args:
        examples: Batch mapping with ``'question'`` and ``'answer'`` lists of
            equal length (the shape ``map(..., batched=True)`` passes in).

    Returns:
        The tokenizer output for the batch, extended with
        ``start_positions`` and ``end_positions`` (one integer per row).

    Requires a tokenizer whose output exposes ``sequence_ids`` (a "fast"
    tokenizer).
    """
    # Null cells would make the tokenizer raise; treat them as empty text.
    questions = [text if text is not None else "" for text in examples['question']]
    answers = [text if text is not None else "" for text in examples['answer']]

    encoded = tokenizer(questions, answers, padding="max_length", truncation=True)

    start_positions = []
    end_positions = []
    for row in range(len(questions)):
        # sequence_ids: None for special/padding tokens, 0 for question
        # tokens, 1 for answer tokens.
        segment_ids = encoded.sequence_ids(row)
        answer_token_idx = [pos for pos, seg in enumerate(segment_ids) if seg == 1]
        if answer_token_idx:
            start_positions.append(answer_token_idx[0])
            end_positions.append(answer_token_idx[-1])
        else:
            # No answer tokens survived (empty answer): point both labels at
            # index 0, the leading special token, as the "no answer" target.
            start_positions.append(0)
            end_positions.append(0)

    encoded["start_positions"] = start_positions
    encoded["end_positions"] = end_positions
    return encoded

# Apply the preprocessing to the loaded dataset, handing it batches of rows
tokenized_dataset = raw_dataset.map(function=preprocess_function, batched=True)

In [None]:
from datasets import DatasetDict

# Split the train dataset into train and validation sets.
# The split shuffles rows, so a fixed seed is passed to keep the same 90/10
# partition (and therefore comparable validation metrics) across kernel restarts.
train_test_split = tokenized_dataset["train"].train_test_split(test_size=0.1, seed=42)

# Create a DatasetDict; the held-out 'test' part of the split is exposed as 'validation'
dataset = DatasetDict({
    'train': train_test_split['train'],
    'validation': train_test_split['test']
})


In [None]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

# Instantiate the question-answering model from the pre-trained checkpoint
qa_checkpoint = "bert-base-uncased"
model = AutoModelForQuestionAnswering.from_pretrained(qa_checkpoint)

# Define training arguments
import inspect

# transformers renamed the `evaluation_strategy` argument to `eval_strategy`,
# and releases that no longer accept the old spelling raise a TypeError.
# The installed version is not pinned in this notebook, so use whichever
# keyword the installed TrainingArguments accepts.
_eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    **{_eval_strategy_kwarg: "epoch"},  # evaluate at the end of every epoch
)

# Wire the model, the hyper-parameters and both dataset splits into a Trainer
train_split_for_fit = dataset["train"]
validation_split_for_fit = dataset["validation"]
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split_for_fit,
    eval_dataset=validation_split_for_fit,
)

# Launch fine-tuning; the value returned by train() is the cell output
trainer.train()

In [None]:
# Evaluate on the eval dataset given to the Trainer; show the result as the cell output
eval_metrics = trainer.evaluate()
eval_metrics

In [None]:
# Persist the model and its tokenizer side by side so they can be reloaded together
qa_model_dir = "./qa_model"
model.save_pretrained(qa_model_dir)
tokenizer.save_pretrained(qa_model_dir)

In [None]:
from transformers import pipeline

# Build an inference pipeline from the model and tokenizer files stored in ./qa_model
saved_model_dir = "./qa_model"
qa_pipeline = pipeline(
    task="question-answering",
    model=saved_model_dir,
    tokenizer=saved_model_dir,
)

# Thin convenience wrapper around the pipeline
def answer_question(question, context):
    """Run ``qa_pipeline`` on a single question/context pair.

    Args:
        question: Question text, forwarded as the pipeline's ``question`` argument.
        context: Passage text, forwarded as the pipeline's ``context`` argument.

    Returns:
        Whatever ``qa_pipeline`` returns for these inputs; this notebook reads
        the ``'answer'`` entry of that result.
    """
    pipeline_inputs = {"question": question, "context": context}
    prediction = qa_pipeline(**pipeline_inputs)
    return prediction

# Demo: ask one question about a short hand-written passage
question = "What is the Quora Question Answer Dataset?"
context = "The Quora Question Answer Dataset is a resource for training AI models to understand and generate accurate responses."
answer = answer_question(question, context)

# Show the question followed by the answer text taken from the result
for report_line in (f"Question: {question}", f"Answer: {answer['answer']}"):
    print(report_line)