#  Install Required Libraries

In [1]:
!pip install transformers torch



In [20]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')


# Define Your Data

In [11]:
# Define your questions, context, and answers
context = "The capital of France is Paris. The largest planet is Jupiter. William Shakespeare wrote 'Hamlet'. The boiling point of water is 100 degrees Celsius."
questions = [
    "What is the capital of France?",
    "What is the largest planet?",
    "Who wrote 'Hamlet'?",
    "What is the boiling point of water?"
]

answers = [
    "Paris",
    "Jupiter",
    "William Shakespeare",
    "100 degrees Celsius"
]


# Import Necessary Libraries

In [39]:
import torch
from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering, Trainer, TrainingArguments
from transformers import EarlyStoppingCallback
from torch.utils.data import Dataset


# Tokenize the Data

In [28]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Function to find start and end positions of answers
def find_answer_positions(context, answer):
    start_idx = context.find(answer)
    end_idx = start_idx + len(answer)
    return start_idx, end_idx

# Create a list to hold the start and end positions
start_positions = []
end_positions = []

for answer in answers:
    start, end = find_answer_positions(context, answer)
    start_positions.append(start)
    end_positions.append(end)

# Tokenize the questions and context
encodings = tokenizer(
    questions,
    text_pair=[context] * len(questions),
    padding=True,
    truncation=True,
    return_tensors='pt'
)

# Add start and end positions to the encodings
encodings['start_positions'] = torch.tensor(start_positions)
encodings['end_positions'] = torch.tensor(end_positions)

# Create a Dataset Class

In [29]:
# Create a Dataset class
class QADataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

# Create the dataset
dataset = QADataset(encodings)

# Train the Model

In [41]:
# Set up training arguments without WandB
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=64,
    per_device_train_batch_size=2,
    logging_dir='./logs',
    report_to='none',  # Disable WandB logging
    dataloader_pin_memory=False  # Disable pin_memory
)

# Create the Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
)

# Train the model
trainer.train()

Step,Training Loss


TrainOutput(global_step=128, training_loss=0.00047957332571968436, metrics={'train_runtime': 4.1052, 'train_samples_per_second': 62.36, 'train_steps_per_second': 31.18, 'total_flos': 2809041650688.0, 'train_loss': 0.00047957332571968436, 'epoch': 64.0})

# Make Predictions

In [46]:
def answer_question(question):
    inputs = tokenizer(question, context, return_tensors='pt').to(device)  # Move inputs to the correct device
    outputs = model(**inputs)

    # Get the start and end logits
    start_logits = outputs.start_logits
    end_logits = outputs.end_logits

    # Find the positions of the highest logits
    start_index = torch.argmax(start_logits).item()
    end_index = torch.argmax(end_logits).item()

    # Extract the answer
    answer_tokens = inputs['input_ids'][0][start_index:end_index + 1]
    answer = tokenizer.decode(answer_tokens)

    return answer.strip()

In [47]:
# Test the function
print(answer_question("What is the capital of France?"))

' hamlet '. the boiling


In [48]:
# Test the function
print(answer_question("Who wrote 'Hamlet'?"))

' hamlet '. the boiling
