In [None]:
# Install the libraries this notebook uses: datasets, transformers (with its
# torch extra) and accelerate (-U upgrades it if already present).
# NOTE(review): no versions are pinned, so each run may pull different releases.
!pip install datasets
!pip install transformers[torch]
!pip install accelerate -U

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import Dataset, load_dataset
import pandas as pd

In [None]:
# Initialize tokenizer and model
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Register the chat-format markers as special tokens so each one maps to a
# single id instead of being split into sub-word pieces.
special_tokens_dict = {'bos_token': '<BOS>', 'eos_token': '<EOS>', 'sep_token': '<SEP>', 'pad_token': '<PAD>'}
tokenizer.add_special_tokens(special_tokens_dict)

model = GPT2LMHeadModel.from_pretrained(model_name)

# The checkpoint's config still carries the stock GPT-2 BOS/EOS ids and no pad
# id. Point it at the tokens registered above so generation from the fine-tuned
# (and later saved) model stops at <EOS> and pads with <PAD>.
model.config.bos_token_id = tokenizer.bos_token_id
model.config.eos_token_id = tokenizer.eos_token_id
model.config.pad_token_id = tokenizer.pad_token_id
if getattr(model, 'generation_config', None) is not None:
    # Releases that keep a separate generation config read the ids from there.
    model.generation_config.bos_token_id = tokenizer.bos_token_id
    model.generation_config.eos_token_id = tokenizer.eos_token_id
    model.generation_config.pad_token_id = tokenizer.pad_token_id

# Grow the embedding matrix to cover the newly added token ids. Kept as the
# last expression so the cell still displays the resized embedding layer.
model.resize_token_embeddings(len(tokenizer))

In [None]:
def load_and_prepare_data(file_path, tokenizer, max_length=512,
                          question_column='instruction', answer_column='output'):
    """Load CSV data and tokenize it as prompts for chatbot fine-tuning.

    Every row is rendered as
    ``<BOS> [User] {question} <SEP> [Bot] {answer} <EOS>`` and then tokenized,
    truncated and padded to exactly ``max_length`` tokens.

    Args:
        file_path: Forwarded unchanged to ``load_dataset('csv', data_files=...)``;
            this notebook passes a dict mapping split name to CSV path.
        tokenizer: Callable applied to the list of formatted prompts.
        max_length: Fixed sequence length used for truncation and padding.
        question_column: Name of the CSV column holding the user message.
        answer_column: Name of the CSV column holding the bot reply.

    Returns:
        The tokenized dataset with the two raw text columns removed.

    Raises:
        KeyError: If any split lacks ``question_column`` or ``answer_column``.
    """
    # Load the dataset from the CSV file(s)
    dataset = load_dataset('csv', data_files=file_path)

    # Fail early with a message naming the split, rather than deep inside map().
    for split_name, columns in dataset.column_names.items():
        missing = [col for col in (question_column, answer_column) if col not in columns]
        if missing:
            raise KeyError(f"split {split_name!r} is missing column(s): {missing}")

    def tokenize_qa(examples):
        """Format a batch of rows into prompt strings and tokenize them."""
        # NOTE(review): cell values are interpolated as-is, so a blank cell would
        # be rendered literally (e.g. as the text "None") — confirm the CSVs
        # contain no missing values.
        qa_pairs = [
            f"<BOS> [User] {q} <SEP> [Bot] {a} <EOS>"
            for q, a in zip(examples[question_column], examples[answer_column])
        ]
        return tokenizer(qa_pairs, padding='max_length', truncation=True, max_length=max_length)

    # Apply tokenization to each QA pair, then drop the raw text columns
    tokenized_datasets = dataset.map(tokenize_qa, batched=True)
    return tokenized_datasets.remove_columns([question_column, answer_column])


In [None]:
# CSV location for each split, keyed by split name.
# NOTE(review): both paths live under /content/drive and no mount step is
# visible here — confirm the drive is mounted before this runs.
file_paths = dict(
    train='/content/drive/MyDrive/df_train.csv',
    validation='/content/drive/MyDrive/df_val.csv',
)

In [None]:
# Tokenize every split listed in file_paths, then pull each split out by name
datasets = load_and_prepare_data(file_paths, tokenizer)
train_dataset, val_dataset = datasets['train'], datasets['validation']

In [None]:
# Training arguments
# Settings shared by both spellings of the evaluation-strategy keyword (below).
training_kwargs = dict(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=4,
    per_device_train_batch_size=4,
    # Checkpoint on the same schedule as evaluation: load_best_model_at_end can
    # only restore a checkpoint that was actually written, so saving less often
    # than evaluating would leave most evaluated steps unrecoverable.
    save_steps=500,
    save_total_limit=2,
    logging_dir='./logs',
    logging_steps=100,
    eval_steps=500,  # Evaluate every 500 steps
    load_best_model_at_end=True,
    metric_for_best_model="loss",
)

# The installs are unpinned, and the keyword selecting step-based evaluation was
# renamed across transformers releases: try the current name first and fall
# back to the older one if this release does not accept it.
try:
    training_args = TrainingArguments(eval_strategy="steps", **training_kwargs)
except TypeError:
    training_args = TrainingArguments(evaluation_strategy="steps", **training_kwargs)

# Initialize Trainer
# Batches are assembled by a language-modeling collator with masked-LM turned off.
lm_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=lm_collator,
)

In [None]:
# Train the model
# Left as the cell's last expression so the value returned by train() is shown
# as the cell output.
trainer.train()

In [None]:
# Write the model and the tokenizer into the same local directory so they can
# be reloaded together.
chatbot_dir = './gpt2_chatbot'
model.save_pretrained(chatbot_dir)
tokenizer.save_pretrained(chatbot_dir)

In [None]:
# Log in to the Hugging Face Hub before the push_to_hub calls that follow.
# No token is passed in code; notebook_login() is expected to prompt for one
# interactively — confirm.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
# NOTE(review): the repo id is an empty string — presumably a scrubbed
# placeholder; fill in the target "<user>/<repo>" before running.
tokenizer.push_to_hub("") # push the tokenizer to the Hugging Face Hub

In [None]:
# NOTE(review): the repo id is an empty string — presumably a scrubbed
# placeholder; fill in the target "<user>/<repo>" before running.
model.push_to_hub("") # push the model to the Hugging Face Hub