In [None]:
# Load model directly
! pip install pandas matplotlib seaborn wordcloud nltk transformers datasets torch

In [None]:
from datasets import load_dataset

dataset = load_dataset("alespalla/chatbot_instruction_prompts")

In [None]:
import pandas as pd

df = pd.DataFrame(dataset['train'])  # Assuming you want to visualize the 'train' split
print(df.info())
print(df.head())

In [None]:
def prepare_train_features(examples):
    tokenized_examples = tokenizer(
        examples['prompt'],
        examples['response'],
        truncation="only_second",  # Truncate only the second part (response)
        max_length=1024,            # Set the maximum sequence length
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length"       # Pad shorter sequences
    )
    sample_map = tokenized_examples.pop("overflow_to_sample_mapping")
    for key, values in examples.items():
        tokenized_examples[key] = [values[i] for i in sample_map]

    return tokenized_examples

tokenized_dataset = dataset.map(prepare_train_features, batched=True)



In [None]:
tokenized_dataset['train'][0]

In [None]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2")
model = AutoModelForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2")

In [46]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline

model_name = "deepset/roberta-base-squad2"

# a) Get predictions
nlp = pipeline('question-answering', model=model_name, tokenizer=model_name)
QA_input = {
    'question': 'Whatâ€™s the best way to fix my kitchen drain?',
    'context': 'hi'
}
res = nlp(QA_input)

In [47]:
res

{'score': 4.460881797285765e-08, 'start': 0, 'end': 2, 'answer': 'hi'}

In [None]:
from transformers import TrainingArguments, Trainer

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
)

# Create a Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test']  # or 'validation', if available
)

# Train the model
trainer.train()


In [None]:
# Save the model
model.save_pretrained('./fine_tuned_model')

# Evaluate the model
eval_results = trainer.evaluate()
print(eval_results)
