In [8]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from torch.utils.data import Dataset
import torch

# Model V1

In [2]:
# Load pre-trained model and tokenizer
model = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Prepare your dataset
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="processed_dialogs.txt",
    block_size=128)

# Data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training arguments
training_args = TrainingArguments(
    output_dir="./gpt2-finetuned",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Train the model
trainer.train()

# Save the model
model.save_pretrained("./gpt2-finetuned")
tokenizer.save_pretrained("./gpt2-finetuned")

  0%|          | 0/15813 [00:00<?, ?it/s]

{'loss': 2.4152, 'grad_norm': 3.140174388885498, 'learning_rate': 4.8419022323404794e-05, 'epoch': 0.03}
{'loss': 2.3712, 'grad_norm': 2.992849826812744, 'learning_rate': 4.6838044646809586e-05, 'epoch': 0.06}
{'loss': 2.3226, 'grad_norm': 2.2799367904663086, 'learning_rate': 4.5257066970214385e-05, 'epoch': 0.09}
{'loss': 2.3184, 'grad_norm': 2.223583459854126, 'learning_rate': 4.367608929361918e-05, 'epoch': 0.13}
{'loss': 2.266, 'grad_norm': 2.1571054458618164, 'learning_rate': 4.209511161702397e-05, 'epoch': 0.16}
{'loss': 2.2801, 'grad_norm': 2.1511244773864746, 'learning_rate': 4.051413394042876e-05, 'epoch': 0.19}
{'loss': 2.2761, 'grad_norm': 2.174072504043579, 'learning_rate': 3.893315626383356e-05, 'epoch': 0.22}
{'loss': 2.2812, 'grad_norm': 1.7256395816802979, 'learning_rate': 3.735217858723835e-05, 'epoch': 0.25}
{'loss': 2.2461, 'grad_norm': 2.0350048542022705, 'learning_rate': 3.5771200910643144e-05, 'epoch': 0.28}
{'loss': 2.2678, 'grad_norm': 2.028801918029785, 'learni

('./gpt2-finetuned\\tokenizer_config.json',
 './gpt2-finetuned\\special_tokens_map.json',
 './gpt2-finetuned\\vocab.json',
 './gpt2-finetuned\\merges.txt',
 './gpt2-finetuned\\added_tokens.json')

In [4]:
# Load fine-tuned model and tokenizer
model = GPT2LMHeadModel.from_pretrained("./gpt2-finetuned")
tokenizer = GPT2Tokenizer.from_pretrained("./gpt2-finetuned")

# Prepare input
input_text = "<genre:comedy,romance><char:BIANCA><char:CAMERON>\nBIANCA: They're having a fight... again."

# Encode input
input_ids = tokenizer.encode(input_text, return_tensors='pt')

# Generate output
output = model.generate(input_ids, max_length=200, num_return_sequences=1, no_repeat_ngram_size=2)

# Decode and print the result
print(tokenizer.decode(output[0], skip_special_tokens=False))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<genre:comedy,romance><char:BIANCA><char:CAMERON>
BIANCA: They're having a fight... again.
CAMPBELL: I'm sorry, I didn't mean to...
BANDA: You're not going to be able to get out of here.


<char:"BIanCA"><char":CEMETER>

<family:biography,drama,history><family:"BENNY><b:BUD><d:DOUG>BUNNY: What's the matter?
DUDG: Nothing.  I just want to talk to you. I want you to know that I love you, and I don't want anyone to see that. You know, you're a good man. And I know you love me. But I can't let you go. It's not right. We're going back to the hotel. The hotel is closed


# Model V2

In [10]:
# Load pre-trained model and tokenizer
model = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

class DialogDataset(Dataset):
    def __init__(self, file_path, tokenizer, max_length):
        self.tokenizer = tokenizer
        self.max_length = max_length
        
        with open(file_path, "r") as f:
            self.dialogs = f.read().split("\n\n")
        
    def __len__(self):
        return len(self.dialogs)
    
    def __getitem__(self, idx):
        dialog = self.dialogs[idx]
        encodings = self.tokenizer(dialog, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')
        return {key: torch.squeeze(val) for key, val in encodings.items()}
    
# Prepare your dataset
train_dataset = DialogDataset(
    file_path="processed_dialogs.txt",
    tokenizer=tokenizer,
    max_length=128
)

# Data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training arguments
training_args = TrainingArguments(
    output_dir="./gpt2-finetunedV2",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Train the model
trainer.train()

# Save the model
model.save_pretrained("./gpt2-finetunedV2")
tokenizer.save_pretrained("./gpt2-finetunedV2")

  0%|          | 0/20775 [00:00<?, ?it/s]

{'loss': 2.3665, 'grad_norm': 4.240726470947266, 'learning_rate': 4.879663056558363e-05, 'epoch': 0.02}
{'loss': 2.2709, 'grad_norm': 2.9654688835144043, 'learning_rate': 4.759326113116727e-05, 'epoch': 0.05}
{'loss': 2.2172, 'grad_norm': 3.0672502517700195, 'learning_rate': 4.63898916967509e-05, 'epoch': 0.07}
{'loss': 2.2186, 'grad_norm': 3.00846004486084, 'learning_rate': 4.518652226233454e-05, 'epoch': 0.1}
{'loss': 2.2369, 'grad_norm': 3.286755323410034, 'learning_rate': 4.398315282791817e-05, 'epoch': 0.12}
{'loss': 2.2261, 'grad_norm': 2.3606505393981934, 'learning_rate': 4.277978339350181e-05, 'epoch': 0.14}
{'loss': 2.1868, 'grad_norm': 3.0841352939605713, 'learning_rate': 4.1576413959085445e-05, 'epoch': 0.17}
{'loss': 2.1673, 'grad_norm': 2.4785640239715576, 'learning_rate': 4.0373044524669075e-05, 'epoch': 0.19}
{'loss': 2.1476, 'grad_norm': 2.0501327514648438, 'learning_rate': 3.916967509025271e-05, 'epoch': 0.22}
{'loss': 2.1452, 'grad_norm': 2.8682167530059814, 'learning

('./gpt2-finetunedV2\\tokenizer_config.json',
 './gpt2-finetunedV2\\special_tokens_map.json',
 './gpt2-finetunedV2\\vocab.json',
 './gpt2-finetunedV2\\merges.txt',
 './gpt2-finetunedV2\\added_tokens.json')

In [11]:
# Load fine-tuned model and tokenizer
model = GPT2LMHeadModel.from_pretrained("./gpt2-finetunedV2")
tokenizer = GPT2Tokenizer.from_pretrained("./gpt2-finetunedV2")

# Prepare input
input_text = "<genre:comedy,romance><char:BIANCA><char:CAMERON>\nBIANCA: They're having a fight... again."

# Encode input
input_ids = tokenizer.encode(input_text, return_tensors='pt')

# Generate output
output = model.generate(input_ids, max_length=200, num_return_sequences=1, no_repeat_ngram_size=2)

# Decode and print the result
print(tokenizer.decode(output[0], skip_special_tokens=False))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<genre:comedy,romance><char:BIANCA><char:CAMERON>
BIANCA: They're having a fight... again.
CAMS: I'm sorry, I don't know.  I just thought I'd ask you something. I mean, you're a very nice guy, but you donï¿½t know how to handle it. Youïve got a lot of problems, and youïll have to deal with them. But Iïm not going to let you get away with it, do you?
BANCHO: No, no, not at all. Itïs just that I have a little problem with the way you treat me. And I think you should go to the police. Theyïre going after you. If you do, they're going back to jail. Thatïd be a good thing. The police are going out to get you, too. So I'll just go


# Model V2 - 3 Epochs

In [None]:
# Load pre-trained model and tokenizer
model = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

class DialogDataset(Dataset):
    def __init__(self, file_path, tokenizer, max_length):
        self.tokenizer = tokenizer
        self.max_length = max_length
        
        with open(file_path, "r") as f:
            self.dialogs = f.read().split("\n\n")
        
    def __len__(self):
        return len(self.dialogs)
    
    def __getitem__(self, idx):
        dialog = self.dialogs[idx]
        encodings = self.tokenizer(dialog, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')
        return {key: torch.squeeze(val) for key, val in encodings.items()}
    
# Prepare your dataset
train_dataset = DialogDataset(
    file_path="processed_dialogs.txt",
    tokenizer=tokenizer,
    max_length=128
)

# Data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training arguments
training_args = TrainingArguments(
    output_dir="./gpt2-finetunedV2-3",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Train the model
trainer.train()

# Save the model
model.save_pretrained("./gpt2-finetunedV2-3")
tokenizer.save_pretrained("./gpt2-finetunedV2-3")

  0%|          | 0/62325 [00:00<?, ?it/s]

{'loss': 2.3664, 'grad_norm': 4.189531326293945, 'learning_rate': 4.959887685519455e-05, 'epoch': 0.02}
{'loss': 2.2713, 'grad_norm': 2.9324004650115967, 'learning_rate': 4.9197753710389094e-05, 'epoch': 0.05}
{'loss': 2.2188, 'grad_norm': 3.052821159362793, 'learning_rate': 4.879663056558363e-05, 'epoch': 0.07}
{'loss': 2.2199, 'grad_norm': 2.948228597640991, 'learning_rate': 4.839550742077818e-05, 'epoch': 0.1}
{'loss': 2.2387, 'grad_norm': 3.591900110244751, 'learning_rate': 4.7994384275972725e-05, 'epoch': 0.12}
{'loss': 2.2278, 'grad_norm': 2.323040246963501, 'learning_rate': 4.759326113116727e-05, 'epoch': 0.14}
{'loss': 2.1895, 'grad_norm': 3.0196328163146973, 'learning_rate': 4.7192137986361816e-05, 'epoch': 0.17}
{'loss': 2.1706, 'grad_norm': 2.442878484725952, 'learning_rate': 4.6791014841556355e-05, 'epoch': 0.19}
{'loss': 2.1507, 'grad_norm': 1.990504264831543, 'learning_rate': 4.63898916967509e-05, 'epoch': 0.22}
{'loss': 2.1482, 'grad_norm': 2.855936050415039, 'learning_r

In [None]:
# Load fine-tuned model and tokenizer
model = GPT2LMHeadModel.from_pretrained("./gpt2-finetunedV2-3")
tokenizer = GPT2Tokenizer.from_pretrained("./gpt2-finetunedV2-3")

# Prepare input
input_text = "<genre:comedy,romance><char:BIANCA><char:CAMERON>\nBIANCA: They're having a fight... again."

# Encode input
input_ids = tokenizer.encode(input_text, return_tensors='pt')

# Generate output
output = model.generate(input_ids, max_length=200, num_return_sequences=1, no_repeat_ngram_size=2)

# Decode and print the result
print(tokenizer.decode(output[0], skip_special_tokens=False))