## Setup

In [2]:
import os
import time
import chardet
import re
import ast
import pickle
import itertools
import pandas as pd
import numpy as np
import torch
import math
import matplotlib.pyplot as plt
from transformers import GPT2Tokenizer, GPT2Model, GPT2LMHeadModel
from transformers import DataCollatorForLanguageModeling, DataCollatorWithPadding
from transformers import TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import pipeline, set_seed
from transformers.trainer_callback import EarlyStoppingCallback

pd.set_option("display.max_colwidth", None)

In [3]:
# Check if GPU is available and if so, use it
print(torch.cuda.is_available())
print(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

False
cpu


In [4]:
DATASET_FILENAME = "movie_conversations.pkl"
MAX_TOKEN_COUNT = 1024
BASE_MODEL_NAME = "gpt2"
dataset_generated = False

if os.path.exists(DATASET_FILENAME):
    dataset_generated = True

In [5]:
# Define tokenizer and model here to avoid mismatchs or reinitialization later on
gpt2_tokenizer = GPT2Tokenizer.from_pretrained(BASE_MODEL_NAME)

# Set pad token for the tokenizer
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token
assert gpt2_tokenizer.pad_token == gpt2_tokenizer.eos_token

# Add user and agent role tags as special tokens
gpt2_tokenizer.add_special_tokens({
    "additional_special_tokens": ["<USER>", "<AGENT>"]
})

2

## Preprocessing

In [6]:
if not dataset_generated:
    # Detect encoding of the movie lines file
    cwd = os.getcwd()

    # with open(cwd + "/movie_lines/movie_lines.txt", "rb") as f:
    #     result = chardet.detect(f.read())
    #     movie_lines_encoding = result["encoding"]

    # with open(cwd + "/movie_lines/movie_conversations.txt", "rb") as f:
    #     result = chardet.detect(f.read())
    #     movie_conversations_encoding = result["encoding"]

    # print(movie_lines_encoding)
    # print(movie_conversations_encoding)
    movie_lines_encoding = "Windows-1252"
    movie_conversations_encoding = "ascii"

In [7]:
if not dataset_generated:
    # Collect individual movie lines
    with open("/content/movie_lines.txt", "r", encoding=movie_lines_encoding) as f:
        content = f.read()

    lines = content.split("\n")

    # Print first 5 lines and verify length is correct
    print(lines[:5])

    # Remove last element of lines because its an empty string
    lines = lines[:-1]
    print(len(lines))
    print(lines)

In [8]:
def clean_text(t):
    # Remove everything that comes after INT or EXT (these are scene descriptors and not dialogue)
    t = re.sub(r'(INT|EXT).*$', '', t)

    # Remove the tags and the � character
    t = re.sub(r'<[/]?[UuIiBb]>', '', t)
    t = re.sub('�', '', t)

    return t

test_text = [
    "You got a telegram from head�quarters today.",
    "Is there <U>anything</U> we can do?!",
    "<i>Mr. Cigliuti?</i>",
    "<i>You'll</i> take him.",
    "And his 'egghead' son!  We'll give 'em a <u>rough</u> <u>reception</u>, won't we?",
    "What are you doing? What are thev doino? ~7C INT. SARRIS' SHIP	h37C",
    "Top of the World at the Stratosphere. It's completely secure. <b> INT. FBI JET -- NEXT </b> Cosgrove and Espinoza listen to this conversation."
]


for t in test_text:
    print(clean_text(t))

You got a telegram from headquarters today.
Is there anything we can do?!
Mr. Cigliuti?
You'll take him.
And his 'egghead' son!  We'll give 'em a rough reception, won't we?
What are you doing? What are thev doino? ~7C 
Top of the World at the Stratosphere. It's completely secure.  


In [9]:
if not dataset_generated:
    # Initialize containers for values to put in dataframe
    line_numbers_dict = {}
    missing_line_ids = []
    is_empty = 0

    for line in lines:
        # Split on whitespace
        split = line.split(" ")
        line_number = split[0]

        # Extract the text after the last "+" character
        l = re.split(r'\+\s+(?=[^+]*$)', line)[-1]
        cleaned = clean_text(l)
        if len(cleaned) < 1:
            missing_line_ids.append(line_number)
            is_empty += 1
        else:
            line_numbers_dict[line_number] = cleaned

    print(f"Total empty entries: {is_empty}")

    # Create dataframe from extracted values
    print(dict(itertools.islice(line_numbers_dict.items(), 10)))

In [10]:
if not dataset_generated:
    # Collect movie conversation lists
    with open("/content/movie_conversations.txt", "r", encoding=movie_conversations_encoding) as f:
        content = f.read()

    lines = content.split("\n")

    # Print first 5 lines and verify length is correct
    print(lines[:5])

    # Remove last element of lines because its an empty string
    lines = lines[:-1]
    print(len(lines))

In [11]:
if not dataset_generated:
    # Initialize containers for values to put in dataframe
    conversation_lines = []

    for line in lines:
        # Split on whitespace
        split = line.split(" ")
        speaker1_id = split[0]
        speaker2_id = split[2]

        # One movie script has scene descriptors stored as character lines, so skip these
        if speaker1_id == "u4464" or speaker2_id == "u4464":
            continue

        # Extract the text after the last "+" character
        l = re.split(r'\+\s+(?=[^+]*$)', line)[-1]
        l = ast.literal_eval(l)

        # Check if the conversation has one of the missing line ids, if so, skip that conversation
        has_missing_content = any(item in l for item in missing_line_ids)
        if not has_missing_content:
            conversation_lines.append(l)

    print(conversation_lines[:10])

    # Create dataframe from extracted values
    movie_conversations = pd.DataFrame({"conversation_lines": conversation_lines})
    display(movie_conversations.head())
    movie_conversations.info()

In [12]:
# Function for turning movie lines into multi-turn conversations for training
def create_conversation_turns(row):
    conversation_list = row["conversation_lines"]
    convo = ""
    # For each line, add it to the conversation with a role label
    for i, line_id in enumerate(conversation_list):
        movie_line = line_numbers_dict[line_id]

        # Ensure the conversations end with agent response
        if i == len(conversation_list) - 1 and i % 2 == 0:
            continue

        if i%2 == 0:
            convo += f"<USER> {movie_line} "
        else:
            convo += f"<AGENT> {movie_line} "

    return convo

In [13]:
if not dataset_generated:
    # Try it on a sample for testing
    sample = movie_conversations.copy().iloc[:2]
    sample["conversation"] = sample.apply(create_conversation_turns, axis=1)
    display(sample)

    # Call create_conversation_turns on every row of the dataframe
    movie_conversations["conversation"] = movie_conversations.apply(create_conversation_turns, axis=1)

In [14]:
if not dataset_generated:
    display(movie_conversations.head())
    print(movie_conversations["conversation"].head(1).values[0])

In [15]:
if not dataset_generated:
    # Drop conversation_lines because it's not needed anymore
    movie_conversations.drop(columns=["conversation_lines"], inplace=True)

In [16]:
if not dataset_generated:
    # Check for duplicates
    duplicates = movie_conversations[movie_conversations.duplicated(keep='first')]
    print(f"Found {len(duplicates)} duplicates")
    print(duplicates.head(10))

    movie_conversations.drop_duplicates(inplace=True)
    movie_conversations.info()

In [17]:
if not dataset_generated:
    # Get the token count for each conversation so we can split the ones that are too long
    movie_conversations["token_count"] = movie_conversations["conversation"].apply(lambda x: len(gpt2_tokenizer.encode(x)))
    movie_conversations.head()

In [18]:
# Examine distribution of token count
if not dataset_generated:
    display(movie_conversations["token_count"].describe())
    plt.hist(movie_conversations["token_count"])
    plt.show()

In [19]:
# Drop all conversations that exceed the token limit
if not dataset_generated:
    movie_conversations = movie_conversations[movie_conversations["token_count"] < MAX_TOKEN_COUNT]
    movie_conversations.info()
    movie_conversations.drop(columns=["token_count"], inplace=True)

In [20]:
# Test splitting on last agent line
if not dataset_generated:
    test_string = movie_conversations["conversation"].head(1).values[0]
    print(test_string)
    splits = test_string.split('<AGENT>')
    response = splits[-1].strip()
    input = '<AGENT>'.join(splits[:-1]).strip() + "<AGENT> "
    print("Context:\n", input)
    print("Response:\n", response)
    for split in splits:
        print(split)


In [21]:
# For the test and evaluation sets, cut off the last agent response to create expected output to compare against for eval
def create_response(text):
    splits = text.split("<AGENT>")
    input = "<AGENT>".join(splits[:-1]).strip() + " <AGENT>"
    response = splits[-1].strip()

    return pd.Series([input, response])

if not dataset_generated:
    movie_conversations[["prompt", "completion"]] = movie_conversations["conversation"].apply(create_response)
    display(movie_conversations.head())
    movie_conversations.drop(columns=["conversation"], inplace=True)


In [22]:
# Serialize dataset for faster loading
if not dataset_generated:
    with open(DATASET_FILENAME, "wb") as f:
        print("Writing dataframe to file")
        pickle.dump(movie_conversations, f)
else:
    with open(DATASET_FILENAME, "rb") as f:
        print("Loading dataframe from file")
        movie_conversations = pickle.load(f)

display(movie_conversations.head())
print("Prompt:")
print(movie_conversations["prompt"].head(1).values[0])
print("Completion:")
print(movie_conversations["completion"].head(1).values[0])

Loading dataframe from file


Unnamed: 0,prompt,completion
0,"<USER> Can we make this quick? Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad. Again. <AGENT> Well, I thought we'd start with pronunciation, if that's okay with you. <USER> Not the hacking and gagging and spitting part. Please. <AGENT>",Okay... then how 'bout we try out some French cuisine. Saturday? Night?
1,<USER> You're asking me out. That's so cute. What's your name again? <AGENT>,Forget it.
2,"<USER> No, no, it's my fault -- we didn't have a proper introduction --- <AGENT> Cameron. <USER> The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser. My sister. I can't date until she does. <AGENT>",Seems like she could get a date easy enough...
3,<USER> Why? <AGENT>,"Unsolved mystery. She used to be really popular when she started high school, then it was just like she got sick of it or something."
4,"<USER> Gosh, if only we could find Kat a boyfriend... <AGENT>",Let me see what I can do.


Prompt:
<USER> Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again. <AGENT> Well, I thought we'd start with pronunciation, if that's okay with you. <USER> Not the hacking and gagging and spitting part.  Please. <AGENT>
Completion:
Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?


In [23]:
# Convert dataframe to dataset
movie_conversations_dataset = Dataset.from_pandas(movie_conversations, preserve_index=False)

In [24]:
# Define method for tokenizing prompt and completion
def tokenize_function(dataset):
    input_data = gpt2_tokenizer(dataset["prompt"], return_tensors="pt", padding='max_length', max_length=1024)
    labels_data = gpt2_tokenizer(dataset["completion"], return_tensors="pt", padding='max_length', max_length=1024)
    return {
        "input_ids": input_data["input_ids"],
        "attention_mask": input_data["attention_mask"],
        "labels": labels_data["input_ids"]
    }

movie_conversations_tokenized = movie_conversations_dataset.map(tokenize_function, batched=True)
print(movie_conversations_tokenized)

Map:   0%|          | 0/82601 [00:00<?, ? examples/s]

Dataset({
    features: ['prompt', 'completion', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 82601
})


In [25]:
# Split dataset into test, evaluation, and training sets
train_test_split = movie_conversations_tokenized.train_test_split(train_size=0.8, seed=42)
train_data = train_test_split["train"]
temp_data = train_test_split["test"]

eval_test_split = temp_data.train_test_split(train_size=0.5, seed=42)
eval_data = eval_test_split["train"]
test_data = eval_test_split["test"]

print(train_data)
print(eval_data)
print(test_data)

Dataset({
    features: ['prompt', 'completion', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 66080
})
Dataset({
    features: ['prompt', 'completion', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 8260
})
Dataset({
    features: ['prompt', 'completion', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 8261
})


In [26]:
# Examine tokenized example
sample = train_data[0]
print(sample["prompt"])
print(sample["input_ids"])
print(sample["attention_mask"])
print(sample["completion"])
print(sample["labels"])

<USER> We had a fight.  Some things got broken, the dog went crazy, she left me.  Wouldn't say where she was going. <AGENT> What was the fight about? <USER> Take a guess. <AGENT>
[50257, 1135, 550, 257, 1907, 13, 220, 2773, 1243, 1392, 5445, 11, 262, 3290, 1816, 7165, 11, 673, 1364, 502, 13, 220, 43048, 470, 910, 810, 673, 373, 1016, 13, 50258, 2061, 373, 262, 1907, 546, 30, 50257, 12322, 257, 4724, 13, 50258, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256

## Fine-tuning

In [27]:
# Create data collator for language modeling
data_collator = DataCollatorForLanguageModeling(tokenizer=gpt2_tokenizer, mlm=False)

In [28]:
# Configure training argumets for the trainer and method for evaluation
checkpoint_dir_name = "gpt2-trainer"
training_args = TrainingArguments(
    output_dir=checkpoint_dir_name,
    #overwrite_output_dir=True,
    evaluation_strategy="epoch",
    #save_strategy="epoch",
    #per_device_eval_batch_size=8,
    #per_device_train_batch_size=8,
    num_train_epochs=2,
    per_device_train_batch_size=4,
    learning_rate=5e-5,
    #bf16=True,
    #fp16=True,
    #load_best_model_at_end=True,
    #gradient_accumulation_steps=16,
    no_cuda=False
)


In [29]:
# Create smaller datasets for testing
small_train_dataset = train_data.shuffle(seed=42).select(range(10000))
small_eval_dataset = eval_data.shuffle(seed=42).select(range(5000))

In [30]:
# Reset the model
gpt2_model = GPT2LMHeadModel.from_pretrained(BASE_MODEL_NAME)
gpt2_model.resize_token_embeddings(len(gpt2_tokenizer))

# Run the trainer
trainer = Trainer(
    model=gpt2_model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=small_eval_dataset,
    data_collator=data_collator,
)

#trainer.train()

You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 50259. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc


In [31]:
# Export the trained model
output_dir = "./trained-model/"
gpt2_model.save_pretrained(output_dir)
gpt2_tokenizer.save_pretrained(output_dir)

('./trained-model/tokenizer_config.json',
 './trained-model/special_tokens_map.json',
 './trained-model/vocab.json',
 './trained-model/merges.txt',
 './trained-model/added_tokens.json')

In [32]:
# Response generation
sample_prompt = test_data[1]["prompt"]
sample_prompt = "<User> Is there a god?"
device = 'cuda' if torch.cuda.is_available() else 'cpu'
gpt2_model = gpt2_model.to(device)

input_ids = gpt2_tokenizer.encode(sample_prompt, return_tensors="pt")
input_ids = input_ids.to(device)
attention_mask = torch.ones(input_ids.shape, device=device)
output = gpt2_model.generate(
    input_ids,
    attention_mask=attention_mask,
    temperature=0.3,
    max_length=50,
    num_return_sequences=3,
    #num_beams=5,
    repetition_penalty=1.5,
    top_p=0.9,
    do_sample=True
)

#print(output)
# generated_text = gpt2_tokenizer.decode(output[0], skip_special_tokens=True)
# print(f"Prompt: {sample_prompt}")
# print(f"Generated Response: {generated_text}")
print(f"Prompt: {sample_prompt}")
for i, sequence in enumerate(output):
    generated_text = gpt2_tokenizer.decode(sequence, skip_special_tokens=True)
    #generated_response = generated_text.replace(sample_prompt, "").strip()
    print(f"Generated Response {i+1}: {generated_text}\n")


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Prompt: <User> Is there a god?
Generated Response 1: <User> Is there a god?

Generated Response 2: <User> Is there a god?

Generated Response 3: <User> Is there a god?



In [33]:
# Create small dataset for testing the model's performance with quantitative metrics
small_test_dataset = test_data.shuffle(seed=42).select(range(5000))

In [34]:
# Run the trainer in evaluation mode
training_args = TrainingArguments(
    output_dir='gpt2-eval',
    per_device_eval_batch_size=8,
    logging_dir='./logs',
)

trainer = Trainer(
    model=gpt2_model,
    args=training_args,
    eval_dataset=small_test_dataset,
)

# Get loss from the results
#eval_results = trainer.evaluate()
eval_loss = eval_results["eval_loss"]
print(eval_results)

NameError: name 'eval_results' is not defined

In [None]:
print(eval_results)

{'eval_loss': 17.202417373657227, 'eval_runtime': 132.568, 'eval_samples_per_second': 37.716, 'eval_steps_per_second': 4.715}


In [None]:
# Calculate perplexity from loss
perplexity = math.exp(eval_loss)
print(perplexity)

29574331.784926858


In [None]:
#Simulating sending a request (start the timer)
start_time = time.time()

#Simulating receiving a response (stop the timer)
end_time = time.time()

#Calculating the elapsed time
response_time = end_time - start_time

#Printing the response time
print(f"Response time: {response_time:.2f} seconds")

## Building the chatbot

In [37]:
loaded_model = GPT2LMHeadModel.from_pretrained("models/final-model/")
loaded_tokenizer = GPT2Tokenizer.from_pretrained("models/final-model/")

In [124]:
# Define a helper function from removing unfinished sentences from generated text
def remove_unfinished_sentences(response):
    # Define punctuation to look for
    punctuation = set(".!?\"")

    # Check if the last character in the response is punctuation. If it isn't, find the last punctuation
    # and remove everything after it, otherwise do nothing
    if response[-1] in punctuation:
        return response
    else:
        last_punctuation = max(response.rfind(p) for p in punctuation)
        pruned = response[:last_punctuation+1]

    return pruned

response = "What did he say? He said something like, \"You know what's going on in this house?\" And then...he said: \"...you can't do that.\" That's the worst thing about it -- and if they didn’t tell me where we were at all--I would never"
r2 = "That is not true! You're talking about an old friend and he bought it for himself -- in a way that would have been impossible without him... He was very wealthy indeed... And he wanted to get rid Of his own money too - so we made him our partner on all matters pertaining To America"
pruned_response = remove_unfinished_sentences(r2)
print(pruned_response)

That is not true! You're talking about an old friend and he bought it for himself -- in a way that would have been impossible without him... He was very wealthy indeed...


In [185]:
# Response generation
index = 45
sample_prompt = test_data[index]["prompt"]
test_data_completion = test_data[index]["completion"]
#sample_prompt = "<USER> Who are you? <AGENT> 
device = 'cuda' if torch.cuda.is_available() else 'cpu'
#gpt2_model = gpt2_model.to(device)

input_ids = loaded_tokenizer.encode(sample_prompt, return_tensors="pt")
context_length = input_ids.shape[1]
print(f"Prompt length: {context_length}")
#input_ids = input_ids.to(device)
attention_mask = torch.ones(input_ids.shape, device=device)
# 0.4-0.6 temp works best
temperatures = [i/10 for i in range(1, 10)]
print(temperatures)
for temp in temperatures:
    print("Responses using temperature ", temp)
    output = loaded_model.generate(
        input_ids,
        attention_mask=attention_mask,
        temperature=temp,
        max_length=context_length+50,
        num_return_sequences=3,
        #num_beams=5,
        repetition_penalty=1.5,
        top_p=0.9,
        do_sample=True
    )

    # generated_text = gpt2_tokenizer.decode(output[0], skip_special_tokens=True)
    # print(f"Prompt: {sample_prompt}")
    # print(f"Generated Response: {generated_text}")
    print(f"Prompt: {sample_prompt}\n")
    for i, sequence in enumerate(output):
        generated_text = loaded_tokenizer.decode(sequence, skip_special_tokens=False)
        normalized_prompt = ' '.join(sample_prompt.split())
        normalized_text = ' '.join(generated_text.split())
        without_prompt = normalized_text.replace(normalized_prompt, "").strip()
        cleaned_response = remove_unfinished_sentences(re.sub(r'( <USER>).*$', '', without_prompt))
        print(f"Generated Response {i+1}: {cleaned_response}\n")
        if len(cleaned_response) < 2:
            print("RAW: ", generated_text)

    print(f"Original test data completion: {test_data_completion}")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Prompt length: 22
[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
Responses using temperature  0.1


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Prompt: <USER> Did he give any reason? <AGENT> He suggested I ask you. <USER> Ask me? <AGENT>

Generated Response 1: No, sir! You're not asking for anything at all -- just a question of whether or NOT to answer it...

Generated Response 2: No, sir! You're not asking for anything at all -- just a chance to get some sleep and then we'll be back in the morning...

Generated Response 3: No, sir! You're not asking for anything at all -- just a chance to get some sleep and talk about it with someone who's really good-looking... or who has the right attitude...

Original test data completion: Yes. He said, "Ask your wife." I don't know why he said that.
Responses using temperature  0.2


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Prompt: <USER> Did he give any reason? <AGENT> He suggested I ask you. <USER> Ask me? <AGENT>

Generated Response 1: No, sir! You're not asking for anything at all -- just a question of time and place... What's your name again...?

Generated Response 2: No, sir! You're not going to answer my question if it's a lie or something... and then tell him that I'm sorry for what happened -- but don't tell the truth about anything at all because of your own feelings...

Generated Response 3: No, sir! You're not going to tell anyone about it -- but if we do find out what happened... then maybe you'll know something for sure...

Original test data completion: Yes. He said, "Ask your wife." I don't know why he said that.
Responses using temperature  0.3


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Prompt: <USER> Did he give any reason? <AGENT> He suggested I ask you. <USER> Ask me? <AGENT>

Generated Response 1: No, no...

Generated Response 2: Yes, sir! You're not asking for anything at all -- just a chance to get some sleep and rest from the old man who's been living in his house since we left him here last night...

Generated Response 3: No, sir! You're not asking for anything at all -- just a little bit of advice on how to get outta here and into the world without a gun or something...

Original test data completion: Yes. He said, "Ask your wife." I don't know why he said that.
Responses using temperature  0.4


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Prompt: <USER> Did he give any reason? <AGENT> He suggested I ask you. <USER> Ask me? <AGENT>

Generated Response 1: No, sir...I'm not sure of that one! You're a very good friend to him and his family -- but why would anyone want it so bad for them if they found out about my affair with your mother-in law?!

Generated Response 2: No, but we're talking about a guy who's been through some of the most difficult times in his life and is very interested to see what happens next...what if it doesn't work out for him or not at all...?

Generated Response 3: No, no... What are they talking about here?! They're trying to kill us! We can't let them do that -- we've got a lot of work ahead and it's not going well for the kids or anybody else in this town...

Original test data completion: Yes. He said, "Ask your wife." I don't know why he said that.
Responses using temperature  0.5


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Prompt: <USER> Did he give any reason? <AGENT> He suggested I ask you. <USER> Ask me? <AGENT>

Generated Response 1: Why do they call him a "pig"?

Generated Response 2: No, sir...I don't know what to say! What do we have here?! A man's life is at stake in this town and...what are the consequences of our actions if they go against him...?

Generated Response 3: Yes, sir -- but why do we have to go through all this again if it's not the best thing for us and our children! We don't want that stuff in here anymore!!

Original test data completion: Yes. He said, "Ask your wife." I don't know why he said that.
Responses using temperature  0.6


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Prompt: <USER> Did he give any reason? <AGENT> He suggested I ask you. <USER> Ask me? <AGENT>

Generated Response 1: No, no...just what was your name and address on the package that Mr O'Connell sent for him last night at the Mallory's in New York City...?

Generated Response 2: No, sir! You're a gentleman and that's what I'm asking for... For the sake of your reputation -- why don't we take him to see if there are more than two hundred people at this table?

Generated Response 3: No, not really... just because it's a question of honor to be asked like that -- and for the first time in my life we're not going back on our word as much anymore!

Original test data completion: Yes. He said, "Ask your wife." I don't know why he said that.
Responses using temperature  0.7


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Prompt: <USER> Did he give any reason? <AGENT> He suggested I ask you. <USER> Ask me? <AGENT>

Generated Response 1: No, not at all! You can't do this to him alone...I'll get out of here -- but let's go first and see what happens next before it gets too late...

Generated Response 2: Yes, sir... if there was anything else on your mind that would make sense to the Commander-in chief of Starfleet -- not just because it is a personal matter but also so we can work together as an organization and -- oh yes!

Generated Response 3: Yes, but no one else was ever asked to do this thing before -- except maybe your father-inlaw and his brother who tried it on the other side of town last week... They're dead now!

Original test data completion: Yes. He said, "Ask your wife." I don't know why he said that.
Responses using temperature  0.8


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Prompt: <USER> Did he give any reason? <AGENT> He suggested I ask you. <USER> Ask me? <AGENT>

Generated Response 1: Why did they call him "Joe"?

Generated Response 2: Yes, sir... No, no!

Generated Response 3: No, sir -- there was no way out! It's a very dangerous game to be in on...

Original test data completion: Yes. He said, "Ask your wife." I don't know why he said that.
Responses using temperature  0.9
Prompt: <USER> Did he give any reason? <AGENT> He suggested I ask you. <USER> Ask me? <AGENT>

Generated Response 1: It's a very personal thing for him -- to get into the past and be angry when you see it happening is quite upsetting, so...

Generated Response 2: No, sir... Why don't we come and get him right away at the clinic or somewhere else where there's something to talk about before it happens again --? Tell us what happened here in this house when they asked for money...?

Generated Response 3: No, no... What's wrong with asking him where his money is from again?! You kno

In [83]:
t = "Yeah. <USER> How much do you want to pay me for this, Mr Lombardo? <AGENT> I don't know...I'm not sure what it is that's bothering us so far here but we're going in the right direction and if there are any problems"
t2 = "<USER> Can't you see?  She just wanted her little girl back. <AGENT> Who wanted her little girl back? <USER> The drowning woman.  Anne... But it was too late.  Her little girl was already gone. <AGENT> She died? <USER> She grew up. <AGENT>"
t3 = "<USER>  Can't you see?  She just wanted her little girl back.  <AGENT>  Who wanted her little girl back?  <USER>  The drowning woman.  Anne... But it was too late.  Her little girl was already gone.  <AGENT>  She died?  <USER>  She grew up.  <AGENT>  And what did she do to deserve that, anyway...?  <USER>  I don’t know! It's all a lie -- but there is something about the way things are going with this town and"
t3_normalized = ' '.join(t3.split())
print(t2)
t2_normalized = ' '.join(t2.split())
print(t2_normalized)
print(t3_normalized)
without_prompt = t3_normalized.replace(t2_normalized, "").strip()
print(without_prompt)
cleaned_response = re.sub(r'( <USER>).*$', '', without_prompt)
print(cleaned_response)

<USER> Can't you see?  She just wanted her little girl back. <AGENT> Who wanted her little girl back? <USER> The drowning woman.  Anne... But it was too late.  Her little girl was already gone. <AGENT> She died? <USER> She grew up. <AGENT>
<USER> Can't you see? She just wanted her little girl back. <AGENT> Who wanted her little girl back? <USER> The drowning woman. Anne... But it was too late. Her little girl was already gone. <AGENT> She died? <USER> She grew up. <AGENT>
<USER> Can't you see? She just wanted her little girl back. <AGENT> Who wanted her little girl back? <USER> The drowning woman. Anne... But it was too late. Her little girl was already gone. <AGENT> She died? <USER> She grew up. <AGENT> And what did she do to deserve that, anyway...? <USER> I don’t know! It's all a lie -- but there is something about the way things are going with this town and
And what did she do to deserve that, anyway...? <USER> I don’t know! It's all a lie -- but there is something about the way th

In [44]:
def generate(input, device):
    # Generate a response
    input_ids = loaded_tokenizer.encode(input, return_tensors="pt")
    #input_ids.to(device)
    #gpt2_model.to(device)
    output = loaded_model.generate(input_ids, max_length=100, num_return_sequences=1, temperature=0.3)
    generated_text = loaded_tokenizer.decode(output[0], skip_special_tokens=True)

    return generated_text

In [45]:
print(generate("<USER> Tell me a story.", device='cuda'))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


 Tell me a story.  I'm not sure.  Tell me a story.  I'm not sure.  Tell me a story.  I'm not sure.  Tell me a story.  I'm not sure.  Tell me a story.  I'm not sure.  Tell me a story.  I'm not sure.  Tell me a story.  I'm not sure.  Tell


In [49]:
# generator = pipeline('text-generation', model='gpt2-medium')
# set_seed(42)

generator_fine_tuned = pipeline('text-generation', model=loaded_model, tokenizer=loaded_tokenizer)

In [None]:
print(generator("<USER> Hi, how are you? <AGENT>", max_length=20, num_return_sequences=5))


In [56]:
generator_fine_tuned("<USER> Why is the sky blue? <AGENT>", max_length=50, num_return_sequences=5, temperature=0.8)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "<USER> Why is the sky blue? <AGENT> It's because there is something very urgent between us...  What?  Something urgent. I've been thinking over a lot of things.  You know.  Yes. "},
 {'generated_text': "<USER> Why is the sky blue? <AGENT> I don't know.  You said we had to come to your house, didn't you?  I don't know.  I'm serious, man.  You're not crazy"},
 {'generated_text': "<USER> Why is the sky blue? <AGENT> Your eyes are blue.  How'd you die...?  I didn't die.  Well, there were some things...  Yeah?  You knew something about death...!"},
 {'generated_text': "<USER> Why is the sky blue? <AGENT> Because of what we're looking at.  How is it that the sky is blue?  Because of how deep you go.  How deep?  So deep.  And"},
 {'generated_text': "<USER> Why is the sky blue? <AGENT> It's not blue -- it's a brownish brown that happens to be blue to me.  Then why is it so blue?  Because there's something else.  Something? "}]