In [2]:
# from datasets import load_dataset
import json
from sklearn.model_selection import train_test_split
from huggingface_hub import login
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset, DataLoader

## Data Preprocessing

In [3]:
def dataset_gen(text_file):
    with open(text_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    texts = [line.strip() for line in lines if len(line.strip()) > 0]
    train_texts, val_texts = train_test_split(texts, test_size=0.1)
    return train_texts, val_texts

In [4]:
train_texts, val_texts = dataset_gen('../../data/mahabharat.txt')

In [5]:
print(f"Number of training samples: {len(train_texts)}")
print(f"Number of validation samples: {len(val_texts)}")
print(f"Example training sample:\n{train_texts[0]}")

Number of training samples: 2327
Number of validation samples: 259
Example training sample:
"Afflicted by the god of love," Kichaka said, "I will come alone so that your five husbands will not know of our love affair."


## Fine Tuning

### Tokenize Data

In [6]:
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [7]:
def tokenize_data(texts):
    encodings = tokenizer(texts, padding="max_length", truncation=True, max_length=512, return_tensors="pt")
    encodings["labels"] = encodings["input_ids"].clone()
    return encodings

In [8]:
train_encodings = tokenize_data(train_texts)
val_encodings = tokenize_data(val_texts)

sample_length = 5
print(f"Example training input dimensions: {train_encodings['input_ids'][0].shape}")
print(f"Example word embedding:\nText: {train_texts[0].split(' ')[:sample_length]}\nEmbedding: {train_encodings['input_ids'][0][:sample_length+2]}")

Example training input dimensions: torch.Size([512])
Example word embedding:
Text: ['"Afflicted', 'by', 'the', 'god', 'of']
Embedding: tensor([    1, 35191, 17823,   416,   262,  5770,   286])


### Create Datasets

In [9]:
class TextDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}
        return item

In [10]:
train_dataset = TextDataset(train_encodings)
val_dataset = TextDataset(val_encodings)

### Fine tuning the model

In [11]:
model = GPT2LMHeadModel.from_pretrained(model_name)

In [16]:
training_args = TrainingArguments(
    output_dir="../../results",
    num_train_epochs=3,
    weight_decay=0.01,
    warmup_steps=500,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
)

In [17]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [18]:
trainer.train()

Step,Training Loss
500,1.1178
1000,0.8016
1500,0.736


TrainOutput(global_step=1746, training_loss=0.8649477701962746, metrics={'train_runtime': 1136.4326, 'train_samples_per_second': 6.143, 'train_steps_per_second': 1.536, 'total_flos': 1824079675392000.0, 'train_loss': 0.8649477701962746, 'epoch': 3.0})

In [19]:
model.save_pretrained("../../fine-tuned-gpt2")
tokenizer.save_pretrained("../../fine-tuned-gpt2")

('./fine-tuned-gpt2/tokenizer_config.json',
 './fine-tuned-gpt2/special_tokens_map.json',
 './fine-tuned-gpt2/vocab.json',
 './fine-tuned-gpt2/merges.txt',
 './fine-tuned-gpt2/added_tokens.json')

## Inference

In [25]:
model_path = "../../fine-tuned-gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

In [43]:
def generate_story(prompt):
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids
    attention_mask = input_ids.ne(tokenizer.pad_token_id)
    output = model.generate(input_ids, attention_mask=attention_mask, max_length=250, num_return_sequences=1, no_repeat_ngram_size=2, do_sample=True, top_k=50, top_p=0.95)
    return tokenizer.decode(output[0], skip_special_tokens=True)

In [50]:
story = "Once upon a time in the Mahabharat, Arjuna"
gen_story = generate_story(story)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [51]:
print(story, gen_story)

Once upon a time in the Mahabharat, Arjuna Once upon a time in the Mahabharat, Arjuna attacked Lord Krishna, the son of Panchala. After killing five hundred kshatriya warriors, they all came out of the battlefield with broken limbs and their heads tied by ropes. Lord Shiva addressed them, "O pious son, please give me protection from these fierce Bhishma's arrows. I have been slain by the sons of Pritha's son in battle. O son, please accept the blessings of victory and slay my sons, O Krishna!" The warrior who had been defeated, however, then began to fight with Arjun and killed his two horses and charioteer.
