In [43]:
import tensorflow as tf
import pandas as pd
#import gpt_2_simple as gpt2
from datetime import datetime
from transformers import GPT2Tokenizer,GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch
import os
#import huggingface_hub

In [44]:
scripts = pd.read_csv('seinfeld_data/scripts.csv')
scripts['line'] = scripts.Character + ': ' + scripts.Dialogue
#scripts.line[-1:] = scripts.line[-1:]  + ' <|endoftext|>'


In [63]:
class SeinfeldLines(tf.data.Dataset):
    def __init__(self, control_code, truncate=False, gpt2_type="gpt2", max_length=1024):
        self.tokenizer = GPT2Tokenizer.from_pretrained(gpt2_type)
        self.lines = []

        for row in scripts['line'].tolist():
            self.lines.append(self.tokenizer.encode(f"<|{control_code}|>{row}"))

        if truncate:
            self.lines = self.lines[:20000]

        # Pad the elements of the self.lines list with a special padding token
        max_length = max(len(line) for line in self.lines)
        self.lines = [line + [self.tokenizer.pad_token_id] * (max_length - len(line)) for line in self.lines]
        self.lines = tf.convert_to_tensor(self.lines, dtype=tf.int64)
        self.lines_count = len(self.lines)

    def __len__(self):
        return self.lines_count

    def __getitem__(self, item):
        return tf.gather(self.lines, item)

    def _inputs(self):
        return []

    def element_spec(self):
        return tf.TensorSpec(shape=(None,), dtype=tf.int64)

dataset = SeinfeldLines(scripts['line'], truncate=True, gpt2_type="gpt2")



ValueError: Can't convert Python sequence with mixed types to Tensor.

In [64]:
scripts.line.tolist()

['JERRY: Do you know what this is all about? Do you know, why were here? To be out, this is out...and out is one of the single most enjoyable experiences of life. People...did you ever hear people talking about We should go out? This is what theyre talking about...this whole thing, were all out now, no one is home. Not one person here is home, were all out! There are people tryin to find us, they dont know where we are. (on an imaginary phone) Did you ring?, I cant find him. Where did he go? He didnt tell me where he was going. He must have gone out. You wanna go out you get ready, you pick out the clothes, right? You take the shower, you get all ready, get the cash, get your friends, the car, the spot, the reservation...Then youre standing around, whatta you do? You go We gotta be getting back. Once youre out, you wanna get back! You wanna go to sleep, you wanna get up, you wanna go out again tomorrow, right? Where ever you are in life, its my feeling, youve gotta go.',
 'JERRY: (poin

In [54]:
scripts.line = scripts.line.astype(str)

In [5]:
run_name = 'fine_tuning_run_1'
model_size = '124M'

In [6]:
scripts[-1:]

Unnamed: 0.1,Unnamed: 0,Character,Dialogue,EpisodeNo,SEID,Season,line
54615,54615,JERRY,"Alright, hey, you've been great! See you in th...",23.0,S09E23,9.0,"JERRY: Alright, hey, you've been great! See yo..."


In [7]:
#Get the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2',padding_side='left')
model = GPT2LMHeadModel.from_pretrained('gpt2')


In [29]:

def train(
    dataset, model, tokenizer,
    batch_size=16, epochs=1, lr=2e-5,
    max_seq_len=400, warmup_steps=200,
    gpt2_type="gpt2", output_dir=".", output_prefix="wreckgar",
    test_mode=False,save_model_on_epoch=False,
):
    acc_steps = 100
    device=tf.device('device:GPU:0')
    model.train()

    optimizer = AdamW(model.parameters(), lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=-1
    )

    train_dataloader = DataLoader(dataset, batch_size=1, shuffle=True)
    loss=0
    accumulating_batch_count = 0
    input_tensor = None

    for epoch in range(epochs):

        print(f"Training epoch {epoch}")
        print(loss)
        for idx, entry in tqdm(enumerate(train_dataloader)):
            (input_tensor, carry_on, remainder) = pack_tensor(entry, input_tensor, 768)

            if carry_on and idx != len(train_dataloader) - 1:
                continue

            input_tensor = tf.convert_to_tensor(input_tensor)
            outputs = model(input_tensor, labels=input_tensor)
            loss = outputs[0]
            loss.backward()

            if (accumulating_batch_count % batch_size) == 0:
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                model.zero_grad()

            accumulating_batch_count += 1
            input_tensor = None
        if save_model_on_epoch:
            torch.save(
                model.state_dict(),
                os.path.join(output_dir, f"{output_prefix}-{epoch}.pt"),
            )
    return model

In [37]:
def train(
    dataset, model, tokenizer,
    batch_size=16, epochs=1, lr=2e-5,
    max_seq_len=400, warmup_steps=200,
    gpt2_type="gpt2", output_dir=".", output_prefix="wreckgar",
    test_mode=False, save_model_on_epoch=False,
):
    acc_steps = 100
    model.train()

    # Use the Adam optimizer
    optimizer = tf.keras.optimizers.Adam(learning_rate=lr)

    # Use the PolynomialDecay learning rate scheduler
    scheduler = tf.keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=lr,
        decay_steps=warmup_steps,
        end_learning_rate=0.0
    )

    # Create a dataset from the training data
    train_dataset = tf.data.Dataset.from_tensor_slices(dataset)

    # Batch the data and prefetch the next batch
    train_dataset = train_dataset.batch(batch_size).prefetch(1)

    loss = 0
    accumulating_batch_count = 0

    for epoch in range(epochs):
        print(f"Training epoch {epoch}")
        print(loss)
        for input_tensor in tqdm(train_dataset):
            # Place the tensors and model on the GPU
            with tf.device('/device:GPU:0'):
                input_tensor = tf.convert_to_tensor(input_tensor, dtype=tf.int64)
                outputs = model(input_tensor, labels=input_tensor)
                loss = outputs[0]
                loss.backward()

            if (accumulating_batch_count % batch_size) == 0:
                optimizer.apply_gradients(zip(model.trainable_variables, model.trainable_variables))
                scheduler.step()
                model.zero_grad()

            accumulating_batch_count += 1

        if save_model_on_epoch:
            # Save the model weights
            model.save_weights(os.path.join(output_dir, f"{output_prefix}-{epoch}.h5"))

    return model


In [38]:
model = train(dataset, model, tokenizer)

ValueError: Attempt to convert a value (tensor([   27,    91,    15,   220,   220,   220,   220,   220,   220,   220,
          449,  1137, 18276,    25,  2141,   345,   760,   644,   428,   318,
          477,   546,    30,  2141,  2644,   198,    16,   220,   220,   220,
          220,   220,   220,   220,   449,  1137, 18276,    25,   357,  4122,
          278,   379,  6850,   274, 10147,     8,  4091,    11,   284,   502,
        42303,   198,    17,   220,   220,   220,   220,   220,   220,   220,
          220,   220,   220,   220,   220,   220,   220,   220,   220,   220,
          220,   220,   220,   220,   220,   220,   220,   220,   220,   220,
          220,   220,   220,   220,   220, 22319, 49697,    25,  4231,   345,
          832,    30,   198,    18,   220,   220,   220,   220,   220,   220,
          220,   220,   220,   220,   220,   449,  1137, 18276,    25,   921,
          466,   286,  1781,  1949,   319,    11,   618,   345,  2822,    30,
          198,    19,   220,   220,   220,   220,   220,   220,   220, 22319,
        49697,    25,  3363,    11,   340,   373, 14032,    11,   314,  8288,
          340,    11,   314, 17666,   986,   198,   220,   220,   220,   220,
          220,   220,   220,   220,   220,   220,   220,   220,   220,   220,
          220,   220,   220,   220,   220,   220,   220,   220,   220,   220,
          220,   220,   220,   220,   220,   220,  2644,   220,   220,   220,
          220,   220,   220,   220,   220,   220,   220,   220,   220,   220,
          220,   220,   220,   220,   220,   220,   220,   220,   220,   220,
          220,   198, 49489,  1157,   220,   220,   220,   449,  1137, 18276,
           25,  5675, 12402,  8295,   532,   836,   470,  8711,   597,   286,
          285,   986,   198, 49489,  1065,   220,   220,   220,   220,   220,
          220,   220,   220,   220,   220,   220,  4810, 39960,  1137,   513,
           25,   921, 10110,   532,   314,  1101,  8066,  2005,   345,    13,
          198, 49489,  1485,   220,   220,   220,   449,  1137, 18276,    25,
        14690,    11,   314,   836,   470,  1282,   866,   284,   810,   345,
          476,   986,   198, 49489,  1415,   220,   220,   220, 19348,  9795,
           25, 40967,    11,  1001, 47187,    11,   326,   338,   340,    13,
         3914,   338,   467,  1106,   198, 49489,  1314,   220,   220,   220,
          449,  1137, 18276,    25, 40967,    11, 17207,    11,   345,  1053,
          587,  1049,     0,  4091, 27406,   986,   198,  5376,    25,  1627,
           11, 22313,    25,   642,  3510,  1433,    11,   288,  4906,    25,
         2134,    91,    29,    41,  1137, 18276,    25,  2141,   345,   760,
          644,   428,   318,   477,   546,    30,  2141,   345,   760,    11,
         1521,   547,   994,    30,  1675,   307,   503,    11,   428,   318,
          503,   986,   392,   503,   318,   530,   286,   262,  2060,   749,
        20050,  6461,   286,  1204,    13,  4380,   986, 20839,   345,  1683,
         3285,   661,  3375,   546,   775,   815,   467,   503,    30,   770,
          318,   644,   484,   260,  3375,   546,   986,  5661,  2187,  1517,
           11,   547,   477,   503,   783,    11,   645,   530,   318,  1363,
           13,  1892,   530,  1048,   994,   318,  1363,    11,   547,   477,
          503,     0,  1318,   389,   661,  1949,   259,   284,  1064,   514,
           11,   484, 17666,   760,   810,   356,   389,    13,   357,   261,
          281, 26726,  3072,     8,  7731,   345,  5858, 21747,   314, 18548,
         1064,   683,    13,  6350,   750,   339,   467,    30,   679, 42547,
         1560,   502,   810,   339,   373,  1016,    13,   679,  1276,   423,
         3750,   503,    13,   921, 18869,   467,   503,   345,   651,  3492,
           11,   345,  2298,   503,   262,  8242,    11,   826,    30,   921,
         1011,   262, 14643,    11,   345,   651,   477,  3492,    11,   651,
          262,  5003,    11,   651,   534,  2460,    11,   262,  1097,    11,
          262,  4136,    11,   262, 24048,   986,  6423,   345,   260,  5055,
         1088,    11,   644,  8326,   345,   466,    30,   921,   467,   775,
        17753,   307,  1972,   736,    13,  4874,   345,   260,   503,    11,
          345, 18869,   651,   736,     0,   921, 18869,   467,   284,  3993,
           11,   345, 18869,   651,   510,    11,   345, 18869,   467,   503,
          757,  9439,    11,   826,    30,  6350,  1683,   345,   389,   287,
         1204,    11,   663,   616,  4203,    11,   345,   303, 17753,   467,
           13, 50256])) with an unsupported type (<class 'torch.Tensor'>) to a Tensor.

In [None]:
def generate(
    model,
    tokenizer,
    prompt,
    entry_count=1,
    entry_length=100, #maximum number of words
    top_p=0.9,
    temperature=.75,
):
    model.eval()
    generated_num = 0
    generated_list = []

    filter_value = -float("Inf")

    with torch.no_grad():

        for entry_idx in trange(entry_count):

            entry_finished = False
            generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)

            for i in range(entry_length):
                outputs = model(generated, labels=generated)
                loss, logits = outputs[:2]
                logits = logits[:, -1, :] / (temperature if temperature > 0 else 1.0)

                sorted_logits, sorted_indices = torch.sort(logits, descending=True)
                cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

                sorted_indices_to_remove = cumulative_probs > top_p
                sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[
                    ..., :-1
                ].clone()
                sorted_indices_to_remove[..., 0] = 0

                indices_to_remove = sorted_indices[sorted_indices_to_remove]
                logits[:, indices_to_remove] = filter_value

                next_token = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)
                generated = torch.cat((generated, next_token), dim=1)

                if next_token in tokenizer.encode("<|endoftext|>"):
                    entry_finished = True

                if entry_finished:

                    generated_num = generated_num + 1

                    output_list = list(generated.squeeze().numpy())
                    output_text = tokenizer.decode(output_list)
                    generated_list.append(output_text)
                    break
            
            if not entry_finished:
              output_list = list(generated.squeeze().numpy())
              output_text = f"{tokenizer.decode(output_list)}<|endoftext|>" 
              generated_list.append(output_text)
                
    return generated_list

#Function to generate multiple sentences. Test data should be a dataframe
def text_generation(test_data):
  generated_lines = []
  for i in range(len(test_data)):
    x = generate(model.to('cpu'), tokenizer, test_data['Character'][i], entry_count=1)
    generated_lines.append(x)
  return generated_lines

In [None]:
#Run the functions to generate the lines
jerry = scripts[scripts.Character=='JERRY'].reset_index()
george = scripts[scripts.Character=='GEORGE'].reset_index()
elaine = scripts[scripts.Character=='ELAINE'].reset_index()
kramer = scripts[scripts.Character=='KRAMER'].reset_index(),
frank = scripts[scripts.Character=='FRANK'].reset_index()
newman = scripts[scripts.Character=='NEWMAN'].reset_index()
estelle = scripts[scripts.Character=='ESTELLE'].reset_index()

In [None]:
george_elaine=[]
george_elaine.append(generate(model,tokenizer,prompt='Elaine: (to George) have you tried the new dating app that matches you with other people who'))
george_elaine.append(generate(model, tokenizer, prompt="George: it didnt work because"))
print(george_elaine)

100%|██████████| 1/1 [00:14<00:00, 14.58s/it]
100%|██████████| 1/1 [00:12<00:00, 12.88s/it]

[['Elaine: (to George) have you tried the new dating app that matches you with other people who are interested in you? George: (to her) uh uh uh. I\'ve been looking for a girl with some sort of artistic flair who is willing to talk to you, but I don\'t know. I\'m just a big guy who likes to chat, and I want to make sure that I\'m getting the same kind of response that you get. (pause) This is so odd, George. You\'ve seen a lot of young men come up to me and say, "Hey,<|endoftext|>'], ["George: it didnt work because of a bug in the controller...the game couldn't do anything about it, so it was basically just a cheap way to have a tutorial on how to play.\n\n\nmatthew: i had to change the whole setup because the controller was just too much. the only thing i had to change was the controller itself, so it made me feel less alone. i had to do a lot of crazy things to make this a game that was so fun to play.\n\n\nvlad:<|endoftext|>"]]





In [None]:
# Import the huggingface library
#import huggingface

# Load the GPT-2 model
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Preprocess your text data
processed_data = scripts['line']

# Split the data into training and validation sets
#train_data, val_data = train_test_split(processed_data, test_size=0.2)

# Choose a model architecture and training parameters
model_params = {
    'batch_size': 64,
    'learning_rate': 1e-5,
    'num_epochs': 1
}

# Train the model on the training data
model.train(processed_data, model_params , tokenizer)

# Evaluate the model's performance on the validation set
#val_loss = model.evaluate(val_data)
#print(f'Validation Loss: {val_loss:.4f}')

# Generate an entire script for an episode of Seinfeld
generated_script = model.generate("Generate an entire script for an episode of Seinfeld, set in a coffee shop, featuring Jerry, Elaine, and George.")
print(generated_script)


TypeError: train() takes from 1 to 2 positional arguments but 4 were given

In [None]:
model.generate()

<__main__.seinfeld_lines at 0x7f5b978eb460>

In [None]:
text_generation(estelle)

100%|██████████| 1/1 [00:03<00:00,  3.22s/it]
100%|██████████| 1/1 [00:02<00:00,  2.59s/it]
100%|██████████| 1/1 [00:02<00:00,  2.64s/it]
100%|██████████| 1/1 [00:02<00:00,  2.61s/it]
100%|██████████| 1/1 [00:02<00:00,  2.61s/it]
100%|██████████| 1/1 [00:02<00:00,  2.20s/it]
100%|██████████| 1/1 [00:02<00:00,  2.58s/it]
100%|██████████| 1/1 [00:01<00:00,  1.55s/it]
100%|██████████| 1/1 [00:02<00:00,  2.58s/it]
100%|██████████| 1/1 [00:02<00:00,  2.59s/it]
100%|██████████| 1/1 [00:02<00:00,  2.58s/it]
100%|██████████| 1/1 [00:02<00:00,  2.59s/it]
100%|██████████| 1/1 [00:02<00:00,  2.59s/it]
100%|██████████| 1/1 [00:02<00:00,  2.58s/it]
100%|██████████| 1/1 [00:02<00:00,  2.61s/it]
100%|██████████| 1/1 [00:02<00:00,  2.71s/it]
100%|██████████| 1/1 [00:02<00:00,  2.59s/it]
100%|██████████| 1/1 [00:02<00:00,  2.59s/it]
100%|██████████| 1/1 [00:02<00:00,  2.57s/it]
100%|██████████| 1/1 [00:02<00:00,  2.56s/it]
100%|██████████| 1/1 [00:00<00:00,  7.79it/s]
100%|██████████| 1/1 [00:02<00:00,

[['ESTELLE VS. HIGH-TECH (DESIGNER: ROBERT CRUZELMAN)\n\nVALERIE VS. MATT SCAL<|endoftext|>'],
 ['ESTELLE, who has been in "management" as by appointment, from now on is an "owner/operator," to whom she has no say. She<|endoftext|>'],
 ["ESTELLE: All right, let's go.\n\nJUROR: Yes, please. I want you to watch it.\n\nSAVAGE<|endoftext|>"],
 ['ESTELLE: You listen to her do?"\n\nMOYER: Yes.\n\n[giggles]\n\nKRAMER: Yes,<|endoftext|>'],
 ['ESTELLE: Tonight on Freak Show with Jay Leno, we talk about a new album called NEONER AND ASIAN WOMEN, about co-author<|endoftext|>'],
 ["ESTELLE, HILLARY (R), BURKE/JERRY: You do what you do best. Nobody's perfect.<|endoftext|>"],
 ['ESTELLE: I do know that this is very funny, when I heard this, I said, of course, if they give you money, you get nothing<|endoftext|>'],
 ['ESTELLE, Mich. — Nick Griffin can imagine himself at home in the shiny new Volkswagen Polo.<|endoftext|>'],
 ['ESTELLE: What do you think? Is this with us?\n\nSTEWART: Good. First, I wa