In [82]:
import os
import chardet
import re
import ast
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from transformers import GPT2Tokenizer
from pandarallel import pandarallel

# Start the pandarallel worker pool (8 processes, with a progress bar) used by
# the later `parallel_apply` call.
pandarallel.initialize(nb_workers=8, progress_bar=True)

# Never truncate long text columns when dataframes are displayed.
pd.set_option("display.max_colwidth", None)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [83]:
# Pickle file that caches the fully processed conversations dataframe.
DATASET_FILENAME = "movie_conversations.pkl"

# True when the cache already exists, in which case the expensive
# preprocessing cells below are skipped.
dataset_generated = os.path.exists(DATASET_FILENAME)

In [84]:
if not dataset_generated:
    # Sniff the text encoding of both corpus files from their raw bytes so
    # the later text-mode reads can decode them correctly.
    cwd = os.getcwd()

    detected_encodings = []
    for relative_path in ("/movie_lines/movie_lines.txt", "/movie_lines/movie_conversations.txt"):
        with open(cwd + relative_path, "rb") as f:
            result = chardet.detect(f.read())
        detected_encodings.append(result["encoding"])

    movie_lines_encoding, movie_conversations_encoding = detected_encodings

    print(movie_lines_encoding)
    print(movie_conversations_encoding)

In [85]:
if not dataset_generated:
    # Read the raw movie lines file using the encoding detected earlier
    with open(cwd + "/movie_lines/movie_lines.txt", "r", encoding=movie_lines_encoding) as f:
        content = f.read()

    lines = content.split("\n")

    # Print first 5 lines as a sanity check on the raw format
    print(lines[:5])

    # A file that ends with a newline yields one trailing empty string.
    # Drop it only when it really is empty, so a file WITHOUT a final
    # newline does not silently lose its last record.
    if lines and lines[-1] == "":
        lines = lines[:-1]
    print(len(lines))


In [86]:
if not dataset_generated:
    # Assumes the Cornell Movie-Dialogs record layout
    #   lineID +++$+++ characterID +++$+++ movieID +++$+++ name +++$+++ text
    # TODO confirm against the corpus README.
    FIELD_DELIMITER = "+++$+++"

    # Initialize containers for values to put in dataframe
    line_numbers = []
    character_ids = []
    character_lines = []

    for line in lines:
        # Split on the first four delimiters only. Searching for the last
        # "+" in the whole line (the previous approach) truncated any
        # utterance that itself contained "+" followed by whitespace,
        # e.g. "2 + 2 is 4" became "2 is 4".
        fields = line.split(FIELD_DELIMITER, 4)
        line_numbers.append(fields[0].strip())
        character_ids.append(fields[1].strip())

        # The final field is the spoken text; drop the whitespace that
        # followed the delimiter (an empty utterance becomes "").
        character_lines.append(fields[-1].lstrip())

    # Create dataframe from extracted values
    movie_lines = pd.DataFrame(
        {
            "line_id": line_numbers,
            "character_id": character_ids,
            "line": character_lines,
        }
    )
    display(movie_lines.head(10))
    movie_lines.info()

In [87]:
if not dataset_generated:
    # Read the raw movie conversations file using the encoding detected earlier
    with open(cwd + "/movie_lines/movie_conversations.txt", "r", encoding=movie_conversations_encoding) as f:
        content = f.read()

    lines = content.split("\n")

    # Print first 5 lines as a sanity check on the raw format
    print(lines[:5])

    # A file that ends with a newline yields one trailing empty string.
    # Drop it only when it really is empty, so a file WITHOUT a final
    # newline does not silently lose its last record.
    if lines and lines[-1] == "":
        lines = lines[:-1]
    print(len(lines))

In [88]:
if not dataset_generated:
    # Per-conversation columns, filled in one pass over the raw records
    speaker1_ids, speaker2_ids, conversation_lines = [], [], []

    for line in lines:
        # Tokens 0 and 2 of a space-split record are the two speaker ids
        tokens = line.split(" ")
        speaker1_ids.append(tokens[0])
        speaker2_ids.append(tokens[2])

        # Everything after the last "+" is a Python-style list literal of
        # line ids; parse it into a real list
        id_list_literal = re.split(r'\+\s+(?=[^+]*$)', line)[-1]
        conversation_lines.append(ast.literal_eval(id_list_literal))

    # Assemble the dataframe column by column
    movie_conversations = pd.DataFrame(
        {
            "speaker1_id": speaker1_ids,
            "speaker2_id": speaker2_ids,
            "conversation_lines": conversation_lines,
        }
    )
    display(movie_conversations.head())
    movie_conversations.info()

In [89]:
def create_conversation_turns(row, lines_df=None):
    """Render one conversation row as multi-turn training text.

    Each utterance becomes one line of the form ``"<USER>: text \\n"`` when it
    was spoken by ``row["speaker1_id"]`` and ``"<AGENT>: text \\n"`` otherwise,
    in the order given by ``row["conversation_lines"]``.

    Parameters
    ----------
    row : mapping (e.g. a dataframe row)
        Must provide ``"speaker1_id"`` and ``"conversation_lines"`` (an
        ordered list of line ids).
    lines_df : pandas.DataFrame, optional
        Lookup table with ``"line_id"``, ``"character_id"`` and ``"line"``
        columns. Defaults to the notebook-level ``movie_lines`` dataframe,
        which only exists when the dataset is being regenerated.

    Returns
    -------
    str
        The concatenated turns; an empty string for an empty conversation.

    Raises
    ------
    IndexError
        If a line id of the conversation is absent from the lookup table.
    """
    if lines_df is None:
        lines_df = movie_lines

    speaker1 = row["speaker1_id"]
    conversation_list = row["conversation_lines"]

    # Scan the lines table once per conversation rather than once per
    # utterance, then answer every lookup from a small dict.
    matches = lines_df[lines_df["line_id"].isin(conversation_list)]
    lookup = {}
    for line_id, character_id, text in zip(
        matches["line_id"], matches["character_id"], matches["line"]
    ):
        # setdefault keeps the first occurrence if a line id is duplicated
        lookup.setdefault(line_id, (character_id, text))

    turns = []
    for line_id in conversation_list:
        if line_id not in lookup:
            raise IndexError(f"line_id {line_id!r} not found in the movie lines table")
        character_id, text = lookup[line_id]
        role = "<USER>" if character_id == speaker1 else "<AGENT>"
        turns.append(f"{role}: {text} \n")

    return "".join(turns)

In [90]:
if not dataset_generated:
    # Render every conversation row into its multi-turn text via pandarallel.
    # For a quick sanity check, run plain `.apply(create_conversation_turns,
    # axis=1)` on `movie_conversations.iloc[:2]` instead.
    conversation_texts = movie_conversations.parallel_apply(create_conversation_turns, axis=1)
    movie_conversations["conversation"] = conversation_texts

In [91]:
if dataset_generated:
    # A cached dataset already exists: load it instead of rebuilding.
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(DATASET_FILENAME, "rb") as f:
        movie_conversations = pickle.load(f)
else:
    # First run: persist the freshly built dataframe for later sessions.
    with open(DATASET_FILENAME, "wb") as f:
        pickle.dump(movie_conversations, f)

In [92]:
# Preview the first rows, then print the first conversation as plain text so
# its line breaks are rendered.
display(movie_conversations.head())
first_conversation = movie_conversations["conversation"].iloc[0]
print(first_conversation)

Unnamed: 0,speaker1_id,speaker2_id,conversation_lines,conversation
0,u0,u2,"[L194, L195, L196, L197]","<USER>: Can we make this quick? Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad. Again. \n<AGENT>: Well, I thought we'd start with pronunciation, if that's okay with you. \n<USER>: Not the hacking and gagging and spitting part. Please. \n<AGENT>: Okay... then how 'bout we try out some French cuisine. Saturday? Night? \n"
1,u0,u2,"[L198, L199]",<USER>: You're asking me out. That's so cute. What's your name again? \n<AGENT>: Forget it. \n
2,u0,u2,"[L200, L201, L202, L203]","<USER>: No, no, it's my fault -- we didn't have a proper introduction --- \n<AGENT>: Cameron. \n<USER>: The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser. My sister. I can't date until she does. \n<AGENT>: Seems like she could get a date easy enough... \n"
3,u0,u2,"[L204, L205, L206]","<AGENT>: Why? \n<USER>: Unsolved mystery. She used to be really popular when she started high school, then it was just like she got sick of it or something. \n<AGENT>: That's a shame. \n"
4,u0,u2,"[L207, L208]","<USER>: Gosh, if only we could find Kat a boyfriend... \n<AGENT>: Let me see what I can do. \n"


<USER>: Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again. 
<AGENT>: Well, I thought we'd start with pronunciation, if that's okay with you. 
<USER>: Not the hacking and gagging and spitting part.  Please. 
<AGENT>: Okay... then how 'bout we try out some French cuisine.  Saturday?  Night? 

