## FINE TUNING GPT-2

#### Install necessary packages

In [1]:
!pip install transformers
!pip install datasets
!pip install accelerate -U
!pip install transformers[torch]
!pip install accelerate -U

Collecting transformers
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m52.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m39.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m103.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m91.5 MB/s[0m eta [36m0:00:00[0m
Co

#### Download shakespeare dataset / upload already split dataset on colab

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#import gpt_2_simple as gpt2
import os
import requests


#### Finetuning the model

In [None]:
import torch
from transformers import GPT2LMHeadModel, GPT2Config, GPT2Tokenizer, GPT2LMHeadModel
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

# Load the GPT-2 model and tokenizer
model_name = "gpt2"
config = GPT2Config.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Define the dataset for training
train_file = "/content/combined.txt"
test_file = "/content/combined_test.txt"

def load_dataset(train_file, test_file, tokenizer):
    train_dataset = TextDataset(tokenizer=tokenizer, file_path=train_file, block_size=128)
    test_dataset = TextDataset(tokenizer=tokenizer, file_path=test_file, block_size=128)
    return train_dataset, test_dataset

train_dataset, test_dataset = load_dataset(train_file, test_file, tokenizer)

# Define the data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Set up the training arguments
training_args = TrainingArguments(
    output_dir="output",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    save_steps=10_000,
    save_total_limit=2,
    evaluation_strategy="epoch",
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Train the model
trainer.train()

# Save the fine-tuned model
#trainer.save_model("fine_tuned_gpt2")


Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]



Epoch,Training Loss,Validation Loss
1,2.2675,2.008694
2,2.0889,1.988575
3,1.9411,1.991719


TrainOutput(global_step=2163, training_loss=2.066492342585168, metrics={'train_runtime': 704.9097, 'train_samples_per_second': 24.531, 'train_steps_per_second': 3.068, 'total_flos': 1129565454336000.0, 'train_loss': 2.066492342585168, 'epoch': 3.0})

#### Save the model as well as te tokenizer

In [None]:
trainer.save_model("/content/drive/MyDrive/MOdel_params/fine_tuned_gpt2_shakespeare_11")
tokenizer.save_pretrained("/content/drive/MyDrive/MOdel_params/fine_tuned_gpt2_shakespeare_11")

Using pad_token, but it is not set yet.
Using pad_token, but it is not set yet.


('/content/drive/MyDrive/MOdel_params/fine_tuned_gpt2_shakespeare_11/tokenizer_config.json',
 '/content/drive/MyDrive/MOdel_params/fine_tuned_gpt2_shakespeare_11/special_tokens_map.json',
 '/content/drive/MyDrive/MOdel_params/fine_tuned_gpt2_shakespeare_11/vocab.json',
 '/content/drive/MyDrive/MOdel_params/fine_tuned_gpt2_shakespeare_11/merges.txt',
 '/content/drive/MyDrive/MOdel_params/fine_tuned_gpt2_shakespeare_11/added_tokens.json')




#### Test the fine tuned model

In [3]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the fine-tuned GPT-2 model and tokenizer
model_name = "/content/drive/MyDrive/MOdel_params/fine_tuned_gpt2_shakespeare_11"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Function to generate responses
def generate_response(prompt_text, model, tokenizer, max_length=50, num_return_sequences=1):
    input_ids = tokenizer.encode(prompt_text, return_tensors="pt")

    # Generate response
    output_sequences = model.generate(
        input_ids=input_ids,
        max_length=max_length,
        num_return_sequences=num_return_sequences,
        no_repeat_ngram_size=2,
        temperature=0.7,
        top_p=0.9,
    )

    # Decode the generated responses
    responses = []
    for response_id in output_sequences:
        response = tokenizer.decode(response_id, skip_special_tokens=True)
        responses.append(response)

    return responses



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
# Test the model with a prompt
prompt_text = "<s> I am so sad today .</s> >>>>>> <p>"
responses = generate_response(prompt_text, model, tokenizer)

for response in responses:
    print(response)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<s> I am so sad today.</s> >>>>>> <p>I am sad now., Come, come, I'll weep.<p>"
<d>You're not going to be able to sleep tonight?</
