In [6]:
import tensorflow as tf
import pandas as pd
#import gpt_2_simple as gpt2
from datetime import datetime
from transformers import GPT2Tokenizer,GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch
import os
#import huggingface_hub

In [7]:
# Read the raw script table and add a "line" column holding
# "<Character>: <Dialogue>" for every row.
scripts = pd.read_csv('seinfeld_data/scripts.csv').assign(
    line=lambda frame: frame.Character + ': ' + frame.Dialogue
)


In [8]:
class seinfeld_lines(Dataset):
    """Token-id tensors for script lines, served one line per item.

    Every source string is wrapped as ``<|{control_code}|>{text}<|endoftext|>``,
    encoded with the tokenizer and stored as a 1-D ``torch`` tensor.

    Args:
        control_code: Tag interpolated into the ``<|...|>`` prefix of every
            line. It is formatted with ``str()``, so pass a short string.
        truncate: When true, only the first ``TRUNCATE_ROWS`` source rows are
            encoded.
        gpt2_type: Name handed to ``GPT2Tokenizer.from_pretrained`` when no
            ``tokenizer`` is supplied.
        max_length: Upper bound on the number of token ids stored per line;
            ``None`` disables the limit. A longer line keeps its first
            ``max_length - 1`` ids followed by its final id, so the encoded
            end-of-text marker survives (assumes the tokenizer encodes that
            marker as a single trailing id).
        texts: Iterable of strings to encode. Defaults to the notebook-level
            ``scripts['line']`` column.
        tokenizer: Object exposing ``encode(str)`` that returns a list of ids.
            Defaults to ``GPT2Tokenizer.from_pretrained(gpt2_type)``.

    Raises:
        ValueError: If ``max_length`` is given and is smaller than 1.
    """

    # Number of source rows kept when ``truncate`` is requested.
    TRUNCATE_ROWS = 20000

    def __init__(self, control_code, truncate=False, gpt2_type="gpt2", max_length=1024,
                 texts=None, tokenizer=None):
        if max_length is not None and max_length < 1:
            raise ValueError("max_length must be at least 1 (or None for no limit)")

        if tokenizer is None:
            tokenizer = GPT2Tokenizer.from_pretrained(gpt2_type)
        self.tokenizer = tokenizer

        if texts is None:
            texts = scripts['line']
        rows = list(texts)
        # Slice before encoding so rows that would be discarded are never tokenized.
        if truncate:
            rows = rows[:self.TRUNCATE_ROWS]

        self.lines = []
        for row in rows:
            token_ids = list(
                self.tokenizer.encode(f"<|{control_code}|>{row}<|endoftext|>")
            )
            if max_length is not None and len(token_ids) > max_length:
                token_ids = token_ids[:max_length - 1] + token_ids[-1:]
            self.lines.append(torch.tensor(token_ids))
        self.lines_count = len(self.lines)

    def __len__(self):
        """Return the number of encoded lines."""
        return self.lines_count

    def __getitem__(self, item):
        """Return the token-id tensor stored at position ``item``."""
        return self.lines[item]
    
# Build the training set. The first argument is the control code that
# seinfeld_lines interpolates into the "<|...|>" prefix of every encoded line,
# so it has to be a short tag: passing the whole scripts['line'] Series here
# would embed the Series' printed representation into every training example.
dataset = seinfeld_lines("seinfeld", truncate=True, gpt2_type="gpt2")


In [9]:
# Preview only the first few formatted lines; listing the whole column floods
# the cell output.
scripts['line'].head().tolist()

['JERRY: Do you know what this is all about? Do you know, why were here? To be out, this is out...and out is one of the single most enjoyable experiences of life. People...did you ever hear people talking about We should go out? This is what theyre talking about...this whole thing, were all out now, no one is home. Not one person here is home, were all out! There are people tryin to find us, they dont know where we are. (on an imaginary phone) Did you ring?, I cant find him. Where did he go? He didnt tell me where he was going. He must have gone out. You wanna go out you get ready, you pick out the clothes, right? You take the shower, you get all ready, get the cash, get your friends, the car, the spot, the reservation...Then youre standing around, whatta you do? You go We gotta be getting back. Once youre out, you wanna get back! You wanna go to sleep, you wanna get up, you wanna go out again tomorrow, right? Where ever you are in life, its my feeling, youve gotta go.',
 'JERRY: (poin

In [10]:
# Coerce every entry of the "line" column to str.
scripts['line'] = scripts['line'].astype(str)

In [11]:
# Label for this fine-tuning run and the GPT-2 size identifier.
run_name, model_size = 'fine_tuning_run_1', '124M'

In [12]:
# Show the final row of the script table.
scripts.tail(1)

Unnamed: 0.1,Unnamed: 0,Character,Dialogue,EpisodeNo,SEID,Season,line
54615,54615,JERRY,"Alright, hey, you've been great! See you in th...",23.0,S09E23,9.0,"JERRY: Alright, hey, you've been great! See yo..."


In [14]:
# Load the pretrained "gpt2" tokenizer (configured with padding_side="left")
# and the matching language-model head.
tokenizer = GPT2Tokenizer.from_pretrained(
    "gpt2",
    padding_side="left",
)
model = GPT2LMHeadModel.from_pretrained("gpt2")


ImportError: 
GPT2LMHeadModel requires the PyTorch library but it was not found in your environment.
However, we were able to find a TensorFlow installation. TensorFlow classes begin
with "TF", but are otherwise identically named to our PyTorch classes. This
means that the TF equivalent of the class you tried to import would be "TFGPT2LMHeadModel".
If you want to use TensorFlow, please use TF classes instead!

If you really do want to use PyTorch please go to
https://pytorch.org/get-started/locally/ and follow the instructions that
match your environment.


In [None]:

def _pack_tensor(new_tensor, packed_tensor, max_seq_len):
    """Try to append ``new_tensor`` to ``packed_tensor`` along dimension 1.

    Returns a ``(tensor, carry_on, remainder)`` triple:
      * nothing packed yet  -> ``(new_tensor, True, None)``
      * the result fits     -> ``(concatenation, True, None)``
      * it would exceed ``max_seq_len`` tokens
                            -> ``(packed_tensor, False, new_tensor)``
    """
    if packed_tensor is None:
        return new_tensor, True, None
    if new_tensor.size(1) + packed_tensor.size(1) > max_seq_len:
        return packed_tensor, False, new_tensor
    return torch.cat([packed_tensor, new_tensor], dim=1), True, None


def train(
    dataset, model, tokenizer,
    batch_size=16, epochs=8, lr=2e-5,
    max_seq_len=400, warmup_steps=200,
    gpt2_type="gpt2", output_dir=".", output_prefix="wreckgar",
    test_mode=False,save_model_on_epoch=False,
):
    """Fine-tune ``model`` on ``dataset`` with packed sequences and gradient accumulation.

    Items are drawn one at a time in shuffled order and concatenated into a
    single sequence until adding the next one would exceed ``max_seq_len``
    tokens. Each packed sequence is one forward/backward pass, and the
    optimizer steps once every ``batch_size`` passes.

    Args:
        dataset: Dataset whose items are 1-D token-id tensors.
        model: Model called as ``model(input_ids, labels=input_ids)`` whose
            first output is the loss.
        tokenizer: Accepted for interface compatibility; not used here.
        batch_size: Number of forward/backward passes accumulated per
            optimizer step. Gradients are summed, not averaged.
        epochs: Number of passes over ``dataset``.
        lr: Learning rate handed to ``AdamW``.
        max_seq_len: Maximum number of tokens in one packed sequence. A single
            item longer than this is still fed to the model on its own.
        warmup_steps: Warm-up steps for the linear schedule.
        gpt2_type: Accepted for interface compatibility; not used here.
        output_dir: Directory for per-epoch checkpoints.
        output_prefix: File-name prefix for per-epoch checkpoints.
        test_mode: Accepted for interface compatibility; not used here.
        save_model_on_epoch: When true, save ``model.state_dict()`` after
            every epoch as ``{output_prefix}-{epoch}.pt`` in ``output_dir``.

    Returns:
        The trained model, left on the device used for training.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Use the GPU when one is present instead of failing on CPU-only machines.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.train()

    train_dataloader = DataLoader(dataset, batch_size=1, shuffle=True)
    last_idx = len(train_dataloader) - 1

    # The schedule needs a positive step count: with a negative value the
    # learning rate is clamped to zero as soon as warm-up ends. Packing merges
    # several items per pass, so this is an upper bound on the real number of
    # optimizer steps and the rate decays without reaching zero.
    steps_per_epoch = (len(train_dataloader) + batch_size - 1) // batch_size
    total_steps = max(1, epochs * steps_per_epoch)

    optimizer = AdamW(model.parameters(), lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps
    )

    loss = 0
    pending_passes = 0  # backward passes accumulated since the last optimizer step
    input_tensor = None

    for epoch in range(epochs):
        print(f"Training epoch {epoch}")

        for idx, entry in enumerate(tqdm(train_dataloader)):
            input_tensor, carry_on, remainder = _pack_tensor(
                entry, input_tensor, max_seq_len
            )

            # Keep packing until the sequence is full or the epoch ends.
            if carry_on and idx != last_idx:
                continue

            batch = input_tensor.to(device)
            outputs = model(batch, labels=batch)
            loss = outputs[0]
            loss.backward()
            pending_passes += 1

            if pending_passes == batch_size:
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                pending_passes = 0

            # The item that did not fit opens the next packed sequence
            # (None when everything was consumed).
            input_tensor = remainder

        print(f"Epoch {epoch} last loss: {float(loss):.4f}")

        if save_model_on_epoch:
            os.makedirs(output_dir, exist_ok=True)
            torch.save(
                model.state_dict(),
                os.path.join(output_dir, f"{output_prefix}-{epoch}.pt"),
            )

    # Apply gradients left over from an incomplete accumulation window.
    if pending_passes:
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    return model

In [None]:
# Fine-tune with train()'s default hyper-parameters and rebind `model` to the result.
model = train(dataset=dataset, model=model, tokenizer=tokenizer)

In [None]:
def generate(
    model,
    tokenizer,
    prompt,
    entry_count=1,
    entry_length=100, #maximum number of generated tokens per entry
    top_p=0.9,
    temperature=.75,
):
    """Sample ``entry_count`` continuations of ``prompt`` using top-p sampling.

    For each entry, tokens are drawn one at a time from the model's
    next-token distribution after temperature scaling and top-p filtering.
    Sampling stops when an id from the encoding of ``<|endoftext|>`` is drawn
    or after ``entry_length`` tokens.

    Args:
        model: Model called as ``model(input_ids)`` whose first output is the
            logits tensor indexed ``[batch, position, vocabulary]``.
        tokenizer: Object providing ``encode(str)`` and ``decode(list_of_ids)``.
        prompt: Text that every generated entry starts from.
        entry_count: Number of entries to generate.
        entry_length: Maximum number of tokens sampled per entry.
        top_p: Cumulative-probability threshold for the top-p filter.
        temperature: Divisor applied to the logits; values <= 0 are treated
            as 1.0.

    Returns:
        A list of ``entry_count`` decoded strings. Entries that hit
        ``entry_length`` without sampling the end marker get ``<|endoftext|>``
        appended.

    Raises:
        ValueError: If ``prompt`` encodes to no tokens.
    """
    model.eval()
    generated_list = []

    filter_value = -float("Inf")

    # Encode once; neither value changes between sampling steps.
    eos_ids = tokenizer.encode("<|endoftext|>")
    prompt_ids = tokenizer.encode(prompt)
    if not prompt_ids:
        raise ValueError("prompt must encode to at least one token")

    # Build inputs on the model's device so generation works on CPU and GPU alike.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    with torch.no_grad():

        for entry_idx in trange(entry_count):

            entry_finished = False
            generated = torch.tensor(prompt_ids, device=device).unsqueeze(0)

            for i in range(entry_length):
                # Only the logits are needed, so no labels are passed and no
                # loss is computed.
                logits = model(generated)[0]
                logits = logits[:, -1, :] / (temperature if temperature > 0 else 1.0)

                sorted_logits, sorted_indices = torch.sort(logits, descending=True)
                cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

                # Mark tokens past the top_p threshold, shifted right by one so
                # the token that crosses the threshold is kept, and always
                # keep the most likely token.
                sorted_indices_to_remove = cumulative_probs > top_p
                sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[
                    ..., :-1
                ].clone()
                sorted_indices_to_remove[..., 0] = 0

                indices_to_remove = sorted_indices[sorted_indices_to_remove]
                logits[:, indices_to_remove] = filter_value

                next_token = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)
                generated = torch.cat((generated, next_token), dim=1)

                if next_token.item() in eos_ids:
                    entry_finished = True
                    break

            output_text = tokenizer.decode(generated.squeeze(0).tolist())
            if not entry_finished:
                # Close entries that ran out of budget with the end marker.
                output_text = f"{output_text}<|endoftext|>"
            generated_list.append(output_text)

    return generated_list

def text_generation(test_data):
    """Generate one entry per row of ``test_data``, prompted with that row's character.

    Uses the notebook-level ``model`` and ``tokenizer``; the model is moved to
    the CPU once before generating.

    Args:
        test_data: DataFrame with a ``Character`` column. Rows are read in
            order, so the frame does not need a 0..n-1 index.

    Returns:
        A list with one element per row, each being the list returned by
        ``generate`` for that row's character.
    """
    cpu_model = model.to('cpu')
    return [
        generate(cpu_model, tokenizer, character, entry_count=1)
        for character in test_data['Character']
    ]

In [None]:
def _character_lines(name):
    """Return the rows of ``scripts`` spoken by ``name``, re-indexed from 0."""
    return scripts[scripts.Character == name].reset_index()


# One DataFrame per main character. Each of these must be a DataFrame:
# a stray trailing comma after a call would bind a 1-tuple instead.
jerry = _character_lines('JERRY')
george = _character_lines('GEORGE')
elaine = _character_lines('ELAINE')
kramer = _character_lines('KRAMER')
frank = _character_lines('FRANK')
newman = _character_lines('NEWMAN')
estelle = _character_lines('ESTELLE')

In [None]:
# Two hand-written prompts; each generate() call returns a list of entries.
george_elaine = [
    generate(
        model,
        tokenizer,
        prompt='Elaine: (to George) have you tried the new dating app that matches you with other people who',
    ),
    generate(
        model,
        tokenizer,
        prompt="George: it didnt work because",
    ),
]
print(george_elaine)

In [None]:
# Reload the pretrained GPT-2 weights.
# NOTE: this rebinds `model`, discarding any fine-tuning done by earlier cells.
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Formatted script lines, kept for a future train/validation split.
processed_data = scripts['line']

# TODO: no validation split or evaluation is implemented yet.

# Training parameters for this run.
model_params = {
    'batch_size': 64,
    'learning_rate': 1e-5,
    'num_epochs': 1
}

# Fine-tune with the notebook's train() helper on the tokenized `dataset`.
# `model.train(...)` is not a training loop: on a torch module it only
# switches training mode and accepts a single flag.
model = train(
    dataset, model, tokenizer,
    batch_size=model_params['batch_size'],
    lr=model_params['learning_rate'],
    epochs=model_params['num_epochs'],
)

# Sample from the fine-tuned model with the notebook's generate() helper, which
# takes a text prompt and returns decoded strings. The model is moved to the
# CPU first, as text_generation() does.
generated_script = generate(
    model.to('cpu'),
    tokenizer,
    prompt="Generate an entire script for an episode of Seinfeld, set in a coffee shop, featuring Jerry, Elaine, and George.",
)
print(generated_script)


In [None]:
# Scratch call: invokes the model's own `generate` method with no arguments and
# shows whatever it returns as the cell output.
# NOTE(review): this is the method on `model`, not the notebook-level
# generate() helper defined above — confirm this cell is still needed.
model.generate()

In [None]:
# Generate one entry for every row of the ESTELLE frame.
text_generation(test_data=estelle)