In [43]:
import os
import chardet
import re
import ast
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from transformers import GPT2Tokenizer
from pandarallel import pandarallel

# Start pandarallel with 8 worker processes and a progress bar; this call is
# what makes DataFrame.parallel_apply (used later in this notebook) available.
pandarallel.initialize(progress_bar=True, nb_workers=8)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [None]:
# File that holds the pickled conversations dataset built by this notebook
DATASET_FILENAME = "movie_conversations.pkl"

# True when that file is already on disk; the generation cells below are
# guarded by this flag and do nothing in that case
dataset_generated = os.path.exists(DATASET_FILENAME)

In [42]:
if not dataset_generated:
    # Raw corpus files are located relative to the current working directory
    cwd = os.getcwd()

    # Guess the encoding of the movie lines file from its raw bytes
    with open(cwd + "/movie_lines/movie_lines.txt", "rb") as f:
        movie_lines_encoding = chardet.detect(f.read())["encoding"]

    # Same detection for the movie conversations file
    with open(cwd + "/movie_lines/movie_conversations.txt", "rb") as f:
        movie_conversations_encoding = chardet.detect(f.read())["encoding"]

    # Show both results so the detected encodings are visible in the output
    print(movie_lines_encoding)
    print(movie_conversations_encoding)

Windows-1252
ascii


In [44]:
if not dataset_generated:
    # Collect individual movie lines, decoding with the encoding detected earlier
    with open(cwd + "/movie_lines/movie_lines.txt", "r", encoding=movie_lines_encoding) as f:
        content = f.read()

    # Split on "\n" only: str.splitlines() would also break on other separator
    # characters, which could cut a single record in two
    lines = content.split("\n")

    # Print first 5 lines as a quick check of the record format
    print(lines[:5])

    # A file that ends with a newline leaves one empty string at the end of the
    # split. Drop it only when it is really there, so a file without a final
    # newline does not silently lose its last record.
    if lines[-1] == "":
        lines = lines[:-1]
    print(len(lines))


['L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!', 'L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!', 'L985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.', 'L984 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ She okay?', "L925 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Let's go."]
304713


In [45]:
if not dataset_generated:
    # Each record uses "+++$+++" between its fields, e.g.
    #   L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!
    # i.e. line id, character id, two further fields, then the spoken text.
    MOVIE_LINE_SEPARATOR = "+++$+++"
    MOVIE_LINE_FIELD_COUNT = 5

    # Initialize containers for values to put in dataframe
    line_numbers = []
    character_ids = []
    character_lines = []

    for line in lines:
        # Limit the split to the first four separators so that everything after
        # them is kept as the text. Splitting on the last "+" instead would
        # truncate any utterance that itself contains a "+" character.
        fields = line.split(MOVIE_LINE_SEPARATOR, MOVIE_LINE_FIELD_COUNT - 1)
        if len(fields) != MOVIE_LINE_FIELD_COUNT:
            raise ValueError(
                f"Malformed movie line (expected {MOVIE_LINE_FIELD_COUNT} fields): {line!r}"
            )

        line_numbers.append(fields[0].strip())
        character_ids.append(fields[1].strip())

        # Only leading whitespace (the padding after the separator) is removed;
        # trailing whitespace in the text is left as it appears in the file
        character_lines.append(fields[4].lstrip())

    # Create dataframe from extracted values
    movie_lines = pd.DataFrame(
        list(zip(line_numbers, character_ids, character_lines)),
        columns=["line_id", "character_id", "line"],
    )
    display(movie_lines.head(10))
    movie_lines.info()

Unnamed: 0,line_id,character_id,line
0,L1045,u0,They do not!
1,L1044,u2,They do to!
2,L985,u0,I hope so.
3,L984,u2,She okay?
4,L925,u0,Let's go.
5,L924,u2,Wow
6,L872,u0,Okay -- you're gonna need to learn how to lie.
7,L871,u2,No
8,L870,u0,I'm kidding. You know how sometimes you just ...
9,L869,u0,Like my fear of wearing pastels?


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 304713 entries, 0 to 304712
Data columns (total 3 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   line_id       304713 non-null  object
 1   character_id  304713 non-null  object
 2   line          304713 non-null  object
dtypes: object(3)
memory usage: 7.0+ MB


In [46]:
if not dataset_generated:
    # Collect movie conversation lists, decoding with the encoding detected earlier
    with open(cwd + "/movie_lines/movie_conversations.txt", "r", encoding=movie_conversations_encoding) as f:
        content = f.read()

    # Split on "\n" only: str.splitlines() would also break on other separator
    # characters, which could cut a single record in two
    lines = content.split("\n")

    # Print first 5 lines as a quick check of the record format
    print(lines[:5])

    # A file that ends with a newline leaves one empty string at the end of the
    # split. Drop it only when it is really there, so a file without a final
    # newline does not silently lose its last record.
    if lines[-1] == "":
        lines = lines[:-1]
    print(len(lines))

["u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L194', 'L195', 'L196', 'L197']", "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L198', 'L199']", "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L200', 'L201', 'L202', 'L203']", "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L204', 'L205', 'L206']", "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L207', 'L208']"]
83097


In [47]:
if not dataset_generated:
    # Pattern that splits a record right after its final "+" character, leaving
    # the trailing field (the list of line ids) as the last piece
    last_field_pattern = re.compile(r'\+\s+(?=[^+]*$)')

    # Column values collected while walking the records
    speaker1_ids, speaker2_ids, conversation_lines = [], [], []

    for record in lines:
        # Space-separated tokens: position 0 and 2 hold the two speaker ids
        tokens = record.split(" ")
        speaker1_ids.append(tokens[0])
        speaker2_ids.append(tokens[2])

        # The trailing field is a Python list literal; turn it into a real list
        id_list_literal = last_field_pattern.split(record)[-1]
        conversation_lines.append(ast.literal_eval(id_list_literal))

    # Assemble the three columns into a dataframe and preview it
    conversation_rows = list(zip(speaker1_ids, speaker2_ids, conversation_lines))
    movie_conversations = pd.DataFrame(
        conversation_rows,
        columns=["speaker1_id", "speaker2_id", "conversation_lines"],
    )
    display(movie_conversations.head())
    movie_conversations.info()

Unnamed: 0,speaker1_id,speaker2_id,conversation_lines
0,u0,u2,"[L194, L195, L196, L197]"
1,u0,u2,"[L198, L199]"
2,u0,u2,"[L200, L201, L202, L203]"
3,u0,u2,"[L204, L205, L206]"
4,u0,u2,"[L207, L208]"


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83097 entries, 0 to 83096
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   speaker1_id         83097 non-null  object
 1   speaker2_id         83097 non-null  object
 2   conversation_lines  83097 non-null  object
dtypes: object(3)
memory usage: 1.9+ MB


In [48]:
# Most recently indexed movie_lines frame together with its lookup table, so
# the table is built once per frame instead of scanning the frame per line id.
_line_lookup_cache = {"frame": None, "lookup": None}


def _get_line_lookup(lines_df):
    """Return a dict mapping line_id -> (character_id, line text) for lines_df.

    The mapping is cached for as long as the same DataFrame object is passed.
    Rebinding ``movie_lines`` to a new frame rebuilds it; changes made in place
    to an already indexed frame are not detected.
    """
    if _line_lookup_cache["frame"] is not lines_df:
        lookup = {}
        rows = zip(lines_df["line_id"], lines_df["character_id"], lines_df["line"])
        for line_id, character_id, text in rows:
            # setdefault keeps the first row for a repeated id, the same row a
            # boolean-mask selection followed by .values[0] would return
            lookup.setdefault(line_id, (character_id, text))
        _line_lookup_cache["frame"] = lines_df
        _line_lookup_cache["lookup"] = lookup
    return _line_lookup_cache["lookup"]


# Function for turning movie lines into multi-turn conversations for training
def create_conversation_turns(row):
    """Render one conversation row as a multi-turn training string.

    Args:
        row: mapping or Series with "speaker1_id" and "conversation_lines"
            (an iterable of line ids, in speaking order).

    Returns:
        One string with a line per utterance. Utterances by ``speaker1_id`` are
        prefixed with "<USER>: ", all others with "<AGENT>: "; each utterance
        is followed by " \\n". An empty conversation gives "".

    Raises:
        KeyError: if a line id is not present in the global ``movie_lines``.
    """
    speaker1 = row["speaker1_id"]
    # Reads the module-level movie_lines frame built earlier in the notebook
    lookup = _get_line_lookup(movie_lines)

    turns = []
    for line_id in row["conversation_lines"]:
        if line_id not in lookup:
            raise KeyError(f"line_id {line_id!r} not found in movie_lines")
        character_id, text = lookup[line_id]

        role = "<USER>" if character_id == speaker1 else "<AGENT>"
        turns.append(f"{role}: {text} \n")

    return "".join(turns)

In [51]:
if not dataset_generated:
    # To smoke-test first, run create_conversation_turns on a small slice such
    # as movie_conversations.iloc[:2] before launching the full job.

    # Render every conversation row in parallel, then attach the resulting
    # strings to the dataframe as the "conversation" column
    conversation_text = movie_conversations.parallel_apply(create_conversation_turns, axis=1)
    movie_conversations["conversation"] = conversation_text

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=10388), Label(value='0 / 10388')))…

In [None]:
if not dataset_generated:
    # First run: persist the freshly built conversations so that later runs
    # can skip the generation cells
    with open(DATASET_FILENAME, "wb") as f:
        pickle.dump(movie_conversations, f)
else:
    # Cached run: pickle.load takes only the file object and returns the stored
    # object, so the result must be bound to movie_conversations here.
    # NOTE: unpickling can execute arbitrary code; only load a file that this
    # notebook wrote itself.
    with open(DATASET_FILENAME, "rb") as f:
        movie_conversations = pickle.load(f)