## Setup

In [105]:
import os
import chardet
import re
import ast
import pickle
import itertools
import pandas as pd
import numpy as np
import evaluate
import matplotlib.pyplot as plt
from transformers import GPT2Tokenizer
from sklearn.model_selection import train_test_split
from datasets import Dataset
from evaluate import load
from transformers import pipeline, set_seed

# Show full cell contents (no truncation) when dataframes are rendered
pd.options.display.max_colwidth = None

In [82]:
# Pickle file that caches the preprocessed conversations between runs
DATASET_FILENAME = "movie_conversations.pkl"
# Conversations with more tokens than this are split during preprocessing
MAX_TOKEN_LENGTH = 1024
# True when the cache file already exists; the preprocessing cells are skipped in that case
dataset_generated = os.path.exists(DATASET_FILENAME)

## Preprocessing

In [83]:
if not dataset_generated:
    # Working directory; the cells that read the raw corpus build their paths from it
    cwd = os.getcwd()

    # The encodings are hard-coded instead of being detected on every run.
    # Detection was done with chardet on the raw bytes of each file, kept here for reference:
    #
    #     with open(cwd + "/movie_lines/movie_lines.txt", "rb") as f:
    #         movie_lines_encoding = chardet.detect(f.read())["encoding"]
    #
    #     with open(cwd + "/movie_lines/movie_conversations.txt", "rb") as f:
    #         movie_conversations_encoding = chardet.detect(f.read())["encoding"]
    #
    # NOTE(review): presumably the values below are what chardet reported - confirm if the files change
    movie_conversations_encoding = "ascii"
    movie_lines_encoding = "Windows-1252"

In [84]:
if not dataset_generated:
    # Read the raw movie lines file, one record per line
    with open(cwd + "/movie_lines/movie_lines.txt", "r", encoding=movie_lines_encoding) as f:
        content = f.read()

    lines = content.split("\n")

    # Print first 5 lines as a sanity check
    print(lines[:5])

    # A file that ends with a newline leaves an empty string as the last element.
    # Drop it only in that case, so a file without a trailing newline keeps its last record.
    if lines[-1] == "":
        lines = lines[:-1]
    print(len(lines))

In [85]:
if not dataset_generated:
    # Maps line id -> (character id, utterance text)
    line_numbers_dict = {}

    # Assumes the Cornell Movie-Dialogs layout of five fields joined by " +++$+++ ":
    #   line id, character id, movie id, character name, text
    # TODO confirm against the corpus README if the data source changes.
    field_separator = r"\s\+{3}\$\+{3}\s*"

    for line in lines:
        # Stop after four splits so a "+" inside the utterance stays in the text field.
        # Splitting on the last "+" of the line instead would truncate text such as
        # "A+ on it" and would not split at all for text such as "2+2".
        fields = re.split(field_separator, line, maxsplit=4)
        if len(fields) != 5:
            raise ValueError(f"Malformed movie line (expected 5 fields): {line!r}")

        line_number, character_id, _movie_id, _character_name, text = fields
        line_numbers_dict[line_number] = (character_id, text)

    # Preview the first 10 parsed entries
    print(dict(itertools.islice(line_numbers_dict.items(), 10)))

In [86]:
if not dataset_generated:
    # Read the raw conversations file, one conversation record per line
    with open(cwd + "/movie_lines/movie_conversations.txt", "r", encoding=movie_conversations_encoding) as f:
        content = f.read()

    lines = content.split("\n")

    # Print first 5 lines as a sanity check
    print(lines[:5])

    # A file that ends with a newline leaves an empty string as the last element.
    # Drop it only in that case, so a file without a trailing newline keeps its last record.
    if lines[-1] == "":
        lines = lines[:-1]
    print(len(lines))

In [87]:
if not dataset_generated:
    # One (speaker1_id, speaker2_id, list of line ids) record per conversation
    conversation_records = []

    for line in lines:
        tokens = line.split(" ")

        # The text after the last "+" is a Python-literal list of line ids
        raw_line_ids = re.split(r'\+\s+(?=[^+]*$)', line)[-1]
        conversation_records.append((tokens[0], tokens[2], ast.literal_eval(raw_line_ids)))

    # Assemble the dataframe in one step from the collected records
    movie_conversations = pd.DataFrame(
        conversation_records,
        columns=["speaker1_id", "speaker2_id", "conversation_lines"],
    )
    display(movie_conversations.head())
    movie_conversations.info()

In [88]:
def create_conversation_turns(row, line_lookup=None):
    """Turn a conversation's line ids into a multi-turn training string.

    Roles are assigned by position, not by speaker id: even-indexed lines become
    ``<USER>`` turns and odd-indexed lines become ``<AGENT>`` turns. A trailing
    ``<USER>`` line is dropped so that every conversation ends with an agent response.

    Args:
        row: Mapping or dataframe row with a "conversation_lines" entry holding
            the ordered line ids of the conversation.
        line_lookup: Optional mapping of line id -> (character id, text).
            Defaults to the notebook-level ``line_numbers_dict``.

    Returns:
        The conversation as a single string, one "<ROLE>: text \\n" segment per kept line.
        The string is empty when no line is kept.

    Raises:
        KeyError: If a line id is not present in the lookup.
    """
    if line_lookup is None:
        line_lookup = line_numbers_dict

    conversation_list = row["conversation_lines"]
    last_index = len(conversation_list) - 1
    turns = []

    for i, line_id in enumerate(conversation_list):
        text = line_lookup[line_id][1]
        is_user_turn = i % 2 == 0

        # Ensure the conversation ends with an agent response
        if is_user_turn and i == last_index:
            continue

        role = "<USER>" if is_user_turn else "<AGENT>"
        turns.append(f"{role}: {text} \n")

    return "".join(turns)

In [89]:
if not dataset_generated:
    # Try it on a small sample first. Slice and then copy, so that `sample` is an
    # independent frame; adding a column to a slice of a copy can trigger
    # SettingWithCopyWarning on pandas versions without copy-on-write.
    sample = movie_conversations.iloc[:2].copy()
    sample["conversation"] = sample.apply(create_conversation_turns, axis=1)
    display(sample)

    # Call create_conversation_turns on every row of the dataframe
    movie_conversations["conversation"] = movie_conversations.apply(create_conversation_turns, axis=1)

    # Drop speaker id columns because they aren't needed anymore
    movie_conversations = movie_conversations.drop(columns=["speaker1_id", "speaker2_id"])

In [90]:
if not dataset_generated:
    # Show the first rows, then the first conversation as plain text so the line breaks are visible
    display(movie_conversations.head())
    first_conversation = movie_conversations["conversation"].iloc[0]
    print(first_conversation)

In [91]:
if not dataset_generated:
    # Get the token count for each conversation so we can split the ones that are too long
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
    movie_conversations["token_count"] = movie_conversations["conversation"].apply(lambda x: len(tokenizer.encode(x)))

    # A bare expression nested inside `if` is not rendered by the notebook, so display explicitly
    display(movie_conversations.head())

In [92]:
if not dataset_generated:
    # Index labels of the conversations whose token count exceeds the limit
    exceeds_limit = movie_conversations["token_count"] > MAX_TOKEN_LENGTH
    over_token_limit = movie_conversations.loc[exceeds_limit].index
    print(over_token_limit)

In [93]:
if not dataset_generated:
    def _label_turns(line_ids):
        """Render line ids as alternating <USER>/<AGENT> turns, dropping a trailing user line."""
        turns = []
        for i, line_id in enumerate(line_ids):
            text = line_numbers_dict[line_id][1]

            # Ensure the conversation ends with an agent response
            if i == len(line_ids) - 1 and i % 2 == 0:
                continue

            role = "<USER>" if i % 2 == 0 else "<AGENT>"
            turns.append(f"{role}: {text} \n")
        return "".join(turns)

    # Recompute the over-limit rows from the current frame instead of relying on index
    # labels captured in an earlier cell. Those labels go stale once the frame is
    # re-indexed below, so recomputing makes this cell safe to run again.
    over_token_limit = movie_conversations.loc[movie_conversations["token_count"] > MAX_TOKEN_LENGTH].index

    new_entries = []

    # For each conversation that is too long, split its line list in half
    for idx in over_token_limit:
        # `idx` is an index label, so look it up by label
        lines_to_split = movie_conversations.loc[idx, "conversation_lines"]
        split_idx = len(lines_to_split) // 2

        for half in (lines_to_split[:split_idx], lines_to_split[split_idx:]):
            convo = _label_turns(half)
            new_entries.append({
                "conversation_lines": half,
                "conversation": convo,
                # Count tokens for the new row as well. Without this the row holds NaN
                # and always passes the later "over the token limit" check.
                "token_count": len(tokenizer.encode(convo)),
            })

    # Drop the originals and append the new entries from splitting
    if new_entries:
        movie_conversations = pd.concat(
            [movie_conversations.drop(index=over_token_limit), pd.DataFrame(new_entries)],
            axis=0,
            ignore_index=True,
        )

    movie_conversations.info()

In [94]:
if not dataset_generated:
    # Rows appended by the split step may carry no token count. NaN never compares
    # greater than the limit, so count tokens for those rows before checking.
    missing_count = movie_conversations["token_count"].isna()
    if missing_count.any():
        movie_conversations.loc[missing_count, "token_count"] = (
            movie_conversations.loc[missing_count, "conversation"]
            .apply(lambda text: len(tokenizer.encode(text)))
        )

    # Check that no conversations are over the token limit (an empty frame is expected)
    display(movie_conversations[movie_conversations["token_count"] > MAX_TOKEN_LENGTH])

In [97]:
# The line id lists and token counts were only needed for splitting; remove them before caching
if not dataset_generated:
    movie_conversations.drop(labels=["conversation_lines", "token_count"], axis="columns", inplace=True)

In [98]:
# Cache the processed dataframe on disk so later runs can skip preprocessing
if dataset_generated:
    with open(DATASET_FILENAME, "rb") as cache_file:
        print("Loading dataframe from file")
        # NOTE: unpickling can execute arbitrary code; only load a cache file this notebook wrote
        movie_conversations = pickle.load(cache_file)
else:
    with open(DATASET_FILENAME, "wb") as cache_file:
        print("Writing dataframe to file")
        pickle.dump(movie_conversations, cache_file)

# Preview the frame and print the first conversation as plain text
display(movie_conversations.head())
print(movie_conversations["conversation"].iloc[0])

Loading dataframe from file


Unnamed: 0,conversation
0,"<USER>: Can we make this quick? Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad. Again. \n<AGENT>: Well, I thought we'd start with pronunciation, if that's okay with you. \n<USER>: Not the hacking and gagging and spitting part. Please. \n<AGENT>: Okay... then how 'bout we try out some French cuisine. Saturday? Night? \n"
1,<USER>: You're asking me out. That's so cute. What's your name again? \n<AGENT>: Forget it. \n
2,"<USER>: No, no, it's my fault -- we didn't have a proper introduction --- \n<AGENT>: Cameron. \n<USER>: The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser. My sister. I can't date until she does. \n<AGENT>: Seems like she could get a date easy enough... \n"
3,"<USER>: Why? \n<AGENT>: Unsolved mystery. She used to be really popular when she started high school, then it was just like she got sick of it or something. \n"
4,"<USER>: Gosh, if only we could find Kat a boyfriend... \n<AGENT>: Let me see what I can do. \n"


<USER>: Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again. 
<AGENT>: Well, I thought we'd start with pronunciation, if that's okay with you. 
<USER>: Not the hacking and gagging and spitting part.  Please. 
<AGENT>: Okay... then how 'bout we try out some French cuisine.  Saturday?  Night? 



In [107]:
# Test splitting on last agent line
test_string = movie_conversations["conversation"].head(1).values[0]
print(test_string)

splits = test_string.split('<AGENT>:')
response = splits[-1].strip()
# Named `context` rather than `input` so the builtin input() is not shadowed for the rest of the kernel session
context = '<AGENT>:'.join(splits[:-1]).strip() + "<AGENT>:"

print("Input:\n", context)
print("Response:\n", response)
for part in splits:
    print(part)


<USER>: Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again. 
<AGENT>: Well, I thought we'd start with pronunciation, if that's okay with you. 
<USER>: Not the hacking and gagging and spitting part.  Please. 
<AGENT>: Okay... then how 'bout we try out some French cuisine.  Saturday?  Night? 

Input:
 <USER>: Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again. 
<AGENT>: Well, I thought we'd start with pronunciation, if that's okay with you. 
<USER>: Not the hacking and gagging and spitting part.  Please.<AGENT>:
Response:
 Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?
<USER>: Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again. 

 Well, I thought we'd start with pronunciation, if that's okay with you. 
<USER>: Not the h

In [108]:
def create_response(text):
    """Split a conversation into a prompt and the expected final agent reply.

    Everything before the last '<AGENT>:' marker, stripped and with the marker
    re-appended, becomes the context. Everything after it, stripped, becomes the
    response used as the reference for evaluation.

    Returns:
        pd.Series of [context, response]. If the text contains no '<AGENT>:' marker,
        the context is just "<AGENT>:" and the response is the whole stripped text.
    """
    # NOTE(review): stripping the history removes the " \n" that precedes "<AGENT>:"
    # in the conversation format - confirm the prompt is meant to differ from that format.
    history, _, last_reply = text.rpartition('<AGENT>:')
    context = history.strip() + "<AGENT>:"

    return pd.Series([context, last_reply.strip()])

# Derive the prompt/reference pair for every conversation and attach both as new columns
context_and_response = movie_conversations["conversation"].apply(create_response)
movie_conversations[["context", "response"]] = context_and_response
movie_conversations.head()


Unnamed: 0,conversation,context,response
0,"<USER>: Can we make this quick? Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad. Again. \n<AGENT>: Well, I thought we'd start with pronunciation, if that's okay with you. \n<USER>: Not the hacking and gagging and spitting part. Please. \n<AGENT>: Okay... then how 'bout we try out some French cuisine. Saturday? Night? \n","<USER>: Can we make this quick? Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad. Again. \n<AGENT>: Well, I thought we'd start with pronunciation, if that's okay with you. \n<USER>: Not the hacking and gagging and spitting part. Please.<AGENT>:",Okay... then how 'bout we try out some French cuisine. Saturday? Night?
1,<USER>: You're asking me out. That's so cute. What's your name again? \n<AGENT>: Forget it. \n,<USER>: You're asking me out. That's so cute. What's your name again?<AGENT>:,Forget it.
2,"<USER>: No, no, it's my fault -- we didn't have a proper introduction --- \n<AGENT>: Cameron. \n<USER>: The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser. My sister. I can't date until she does. \n<AGENT>: Seems like she could get a date easy enough... \n","<USER>: No, no, it's my fault -- we didn't have a proper introduction --- \n<AGENT>: Cameron. \n<USER>: The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser. My sister. I can't date until she does.<AGENT>:",Seems like she could get a date easy enough...
3,"<USER>: Why? \n<AGENT>: Unsolved mystery. She used to be really popular when she started high school, then it was just like she got sick of it or something. \n",<USER>: Why?<AGENT>:,"Unsolved mystery. She used to be really popular when she started high school, then it was just like she got sick of it or something."
4,"<USER>: Gosh, if only we could find Kat a boyfriend... \n<AGENT>: Let me see what I can do. \n","<USER>: Gosh, if only we could find Kat a boyfriend...<AGENT>:",Let me see what I can do.


In [109]:
# Split dataset into training (60%), evaluation (20%), and test (20%) sets
train_data, temp_data = train_test_split(movie_conversations, train_size=0.6, random_state=42)
# DataFrame.info() prints its summary and returns None, so wrapping it in print() adds a stray "None"
train_data.info()

# Split the remaining 40% evenly between evaluation and test
eval_data, test_data = train_test_split(temp_data, train_size=0.5, random_state=42)
eval_data.info()
test_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 49863 entries, 30425 to 15795
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   conversation  49863 non-null  object
 1   context       49863 non-null  object
 2   response      49863 non-null  object
dtypes: object(3)
memory usage: 1.5+ MB
None
<class 'pandas.core.frame.DataFrame'>
Index: 16621 entries, 80426 to 70559
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   conversation  16621 non-null  object
 1   context       16621 non-null  object
 2   response      16621 non-null  object
dtypes: object(3)
memory usage: 519.4+ KB
None
<class 'pandas.core.frame.DataFrame'>
Index: 16621 entries, 39297 to 24595
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   conversation  16621 non-null  object
 1   context       16621 non-null  object

In [110]:
# Wrap each split in a Dataset, leaving the shuffled pandas index out
train_dataset, eval_dataset, test_dataset = (
    Dataset.from_pandas(split_frame, preserve_index=False)
    for split_frame in (train_data, eval_data, test_data)
)

train_dataset

Dataset({
    features: ['conversation', 'context', 'response'],
    num_rows: 49863
})

## Fine-tuning

## Building the chatbot

In [43]:

# Text-generation pipeline on the pretrained gpt2-medium checkpoint
generator = pipeline(task='text-generation', model='gpt2-medium')
# Seed after the pipeline is built, as before, so the generation below starts from the same RNG state
set_seed(seed=42)


In [44]:
# Sample five short continuations of a simple prompt
prompt = "User: Hi, how are you? Agent:"
generator(prompt, max_length=20, num_return_sequences=5)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "User: Hi, how are you? Agent: Ok, I guess it's time to show myself"},
 {'generated_text': "User: Hi, how are you? Agent: What's going on?! Player: Ok, I"},
 {'generated_text': 'User: Hi, how are you? Agent: Hi! How would you like to play? What'},
 {'generated_text': "User: Hi, how are you? Agent: I'm fine... Agent: What do you need"},
 {'generated_text': "User: Hi, how are you? Agent: Good, what happens is, I'm a person"}]