In [9]:
import os
import chardet
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from transformers import GPT2Tokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Work out which text encoding each corpus file uses, so later cells can
# decode them correctly when reading in text mode.
cwd = os.getcwd()


def _sniff_encoding(path):
    """Return the encoding name chardet reports for the raw bytes of ``path``."""
    with open(path, "rb") as raw_file:
        return chardet.detect(raw_file.read())["encoding"]


movie_lines_encoding = _sniff_encoding(cwd + "/movie_lines/movie_lines.txt")
movie_conversations_encoding = _sniff_encoding(cwd + "/movie_lines/movie_conversations.txt")

# Show both detected encodings for a quick visual check
print(movie_lines_encoding)
print(movie_conversations_encoding)

Windows-1252
ascii


In [5]:
# Read the individual movie lines, decoding with the encoding detected earlier.
with open(os.path.join(cwd, "movie_lines", "movie_lines.txt"), "r", encoding=movie_lines_encoding) as f:
    content = f.read()

# One record per line of the file
lines = content.split("\n")

# Print first 5 lines as a quick sanity check of the record format
print(lines[:5])

# When the file ends with a newline, split("\n") leaves one trailing empty
# string. Drop it only in that case, so a file without a final newline does
# not silently lose its last real record.
if lines[-1] == "":
    lines = lines[:-1]

# Verify the number of records is as expected
print(len(lines))


['L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!', 'L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!', 'L985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.', 'L984 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ She okay?', "L925 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Let's go."]
304713


In [6]:
# Parse every raw record in `lines` and build the `movie_lines` dataframe.
#
# Each record consists of five fields joined by the literal token " +++$+++ ":
#   line id, character id, movie id, character name, utterance text
# Splitting on that token (rather than on single spaces or on the last "+")
# keeps multi-word character names intact and does not truncate utterances
# that themselves contain a "+" character.

# Matches the field separator; the trailing space is optional at end of line so
# that a record with an empty utterance and no trailing space still parses.
FIELD_SEPARATOR = re.compile(r" \+\+\+\$\+\+\+(?: |$)")
EXPECTED_FIELDS = 5


def parse_movie_line(raw_line):
    """Split one raw record into ``(line_id, character_id, character_name, text)``.

    Parameters
    ----------
    raw_line : str
        A single record, e.g.
        ``"L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!"``.

    Returns
    -------
    tuple of str
        The line id, character id, character name and utterance text.
        The movie id field is parsed but not returned.

    Raises
    ------
    ValueError
        If the record does not contain exactly five separator-delimited fields.
    """
    # maxsplit keeps any separator-looking text inside the utterance untouched
    fields = FIELD_SEPARATOR.split(raw_line, maxsplit=EXPECTED_FIELDS - 1)
    if len(fields) != EXPECTED_FIELDS:
        raise ValueError(
            f"Malformed movie line (expected {EXPECTED_FIELDS} fields): {raw_line!r}"
        )
    line_id, character_id, _movie_id, character_name, text = fields
    # Leading whitespace after the separator is not part of the utterance
    return line_id, character_id, character_name, text.lstrip()


parsed_lines = [parse_movie_line(line) for line in lines]

# Per-column lists, kept under their original names
line_numbers = [record[0] for record in parsed_lines]
character_ids = [record[1] for record in parsed_lines]
character_names = [record[2] for record in parsed_lines]
character_lines = [record[3] for record in parsed_lines]

# Create dataframe from extracted values
movie_lines = pd.DataFrame(
    parsed_lines,
    columns=["line_id", "character_id", "character_name", "line"],
)
display(movie_lines.head(10))
movie_lines.info()

Unnamed: 0,line_id,character_id,character_name,line
0,L1045,u0,BIANCA,They do not!
1,L1044,u2,CAMERON,They do to!
2,L985,u0,BIANCA,I hope so.
3,L984,u2,CAMERON,She okay?
4,L925,u0,BIANCA,Let's go.
5,L924,u2,CAMERON,Wow
6,L872,u0,BIANCA,Okay -- you're gonna need to learn how to lie.
7,L871,u2,CAMERON,No
8,L870,u0,BIANCA,I'm kidding. You know how sometimes you just ...
9,L869,u0,BIANCA,Like my fear of wearing pastels?


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 304713 entries, 0 to 304712
Data columns (total 4 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   line_id         304713 non-null  object
 1   character_id    304713 non-null  object
 2   character_name  304713 non-null  object
 3   line            304713 non-null  object
dtypes: object(4)
memory usage: 9.3+ MB


In [7]:
# Read the conversation records, decoding with the encoding detected earlier.
with open(os.path.join(cwd, "movie_lines", "movie_conversations.txt"), "r", encoding=movie_conversations_encoding) as f:
    content = f.read()

# One record per line of the file
lines = content.split("\n")

# Print first 5 lines as a quick sanity check of the record format
print(lines[:5])

# When the file ends with a newline, split("\n") leaves one trailing empty
# string. Drop it only in that case, so a file without a final newline does
# not silently lose its last real record.
if lines[-1] == "":
    lines = lines[:-1]

# Verify the number of records is as expected
print(len(lines))

["u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L194', 'L195', 'L196', 'L197']", "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L198', 'L199']", "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L200', 'L201', 'L202', 'L203']", "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L204', 'L205', 'L206']", "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L207', 'L208']"]
83097


In [8]:
# Build the `movie_conversations` dataframe from the raw records in `lines`:
# the two speaker ids plus the text that follows the final "+" of each record.
# That trailing text is stored as-is (a string), not parsed into a Python list.

# Splits right after the last "+" (and the whitespace following it) in a record
last_field_pattern = re.compile(r'\+\s+(?=[^+]*$)')

# Show what gets extracted from the first record as a quick check
print(last_field_pattern.split(lines[0])[-1])

# Space-separated tokens of every record; tokens 0 and 2 are the speaker ids
tokenized_records = [record.split(" ") for record in lines]

speaker1_ids = [tokens[0] for tokens in tokenized_records]
speaker2_ids = [tokens[2] for tokens in tokenized_records]
conversation_lines = [last_field_pattern.split(record)[-1] for record in lines]

# Assemble the dataframe column by column
movie_conversations = pd.DataFrame(
    {
        "speaker1_id": speaker1_ids,
        "speaker2_id": speaker2_ids,
        "conversation_lines": conversation_lines,
    }
)
display(movie_conversations.head())
movie_conversations.info()

['L194', 'L195', 'L196', 'L197']


Unnamed: 0,speaker1_id,speaker2_id,conversation_lines
0,u0,u2,"['L194', 'L195', 'L196', 'L197']"
1,u0,u2,"['L198', 'L199']"
2,u0,u2,"['L200', 'L201', 'L202', 'L203']"
3,u0,u2,"['L204', 'L205', 'L206']"
4,u0,u2,"['L207', 'L208']"


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83097 entries, 0 to 83096
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   speaker1_id         83097 non-null  object
 1   speaker2_id         83097 non-null  object
 2   conversation_lines  83097 non-null  object
dtypes: object(3)
memory usage: 1.9+ MB
