In [1]:
import re

def normalize_line(line):
    line = re.sub(r"[\"-]+", r" ", line)
    line = re.sub(r"[ ]+", r" ", line)
    return line

movie_lines = ""
with open("./movie_lines.tsv", "r") as file_in:
    for movie_line in file_in:
        movie_line = normalize_line(movie_line)
        movie_lines += movie_line

In [2]:
movie_lines[:100]

'L1045\tu0\tm0\tBIANCA\tThey do not!\nL1044\tu2\tm0\tCAMERON\tThey do to!\nL985\tu0\tm0\tBIANCA\tI hope so.\nL984\tu2'

In [3]:
import pandas as pd
from io import StringIO
frame = None
frame = pd.read_csv(
    StringIO(movie_lines),
    sep = "\t",
    header = None,
    parse_dates = False,
    error_bad_lines = False,
    warn_bad_lines = False)

In [4]:
frame

Unnamed: 0,0,1,2,3,4
0,L1045,u0,m0,BIANCA,They do not!
1,L1044,u2,m0,CAMERON,They do to!
2,L985,u0,m0,BIANCA,I hope so.
3,L984,u2,m0,CAMERON,She okay?
4,L925,u0,m0,BIANCA,Let's go.
5,L924,u2,m0,CAMERON,Wow
6,L872,u0,m0,BIANCA,Okay you're gonna need to learn how to lie.
7,L871,u2,m0,CAMERON,No
8,L870,u0,m0,BIANCA,I'm kidding. You know how sometimes you just b...
9,L869,u0,m0,BIANCA,Like my fear of wearing pastels?


In [5]:
from nltk import sent_tokenize
import unidecode

preprocessed_sentences = []

def preprocess_sentence(sentence):
    sentence = unidecode.unidecode(sentence)
    sentence = re.sub(r"[-+]?(\d+([.,]\d*)?|[.,]\d+)([eE][-+]?\d+)?", r"#", sentence)
    sentence = re.sub(r"<[/]?[a-z0-9]+>", r"", sentence)
    sentence = re.sub(r"&quot;", r"", sentence)
    sentence = re.sub(r"&amp;", r"", sentence)
    sentence = re.sub(r"&lt;", r"", sentence)
    sentence = re.sub(r"&gt;", r"", sentence)
    sentence = re.sub(r"&emdash;", r"", sentence)
    sentence = re.sub(r"`", r"'", sentence)
    sentence = re.sub(r"[ ]+['][ ]+", r"'", sentence)
    sentence = re.sub(r"[ ]+['](?=[a-zA-Z ])", r" ", sentence)
    sentence = re.sub(r"(?<=[a-zA-Z])['][ ]+", r" ", sentence)
    sentence = re.sub(r"[\.]{2,}", r" ", sentence)
    sentence = re.sub(r"[$%&=|~<>/_\^\[\]{}():;,+*!?]+", r" ", sentence)
    sentence = re.sub(r"[ ]+", r" ", sentence)
    sentence = sentence.strip()
    sentence = re.sub(r"(?<=[a-zA-Z])[']$", r"", sentence)
    sentence = re.sub(r"^['](?=[a-zA-Z])", r"", sentence)
    sentence = re.sub(r"[\.][']$", r"", sentence)
    sentence = re.sub(r"['][\.]$", r"", sentence)
    sentence = re.sub(r"^[ ]", r"", sentence)
    sentence = re.sub(r"[ ]$", r"", sentence)
    sentence = re.sub(r"[\.]$", r"", sentence)
    sentence = sentence.strip()
    return sentence
            
def preprocess_and_append_line(line):
    sentences = sent_tokenize(line)
    for sentence in sentences:
        sentence = sentence.strip()
        sentence = preprocess_sentence(sentence)
        if (sentence != ""):
            preprocessed_sentences.append(sentence)
    
frame[~pd.isnull(frame[4])][4].apply(preprocess_and_append_line)

0         None
1         None
2         None
3         None
4         None
5         None
6         None
7         None
8         None
9         None
10        None
11        None
12        None
13        None
14        None
15        None
16        None
17        None
18        None
19        None
20        None
21        None
22        None
23        None
24        None
25        None
26        None
27        None
28        None
29        None
          ... 
304513    None
304514    None
304515    None
304516    None
304517    None
304518    None
304519    None
304520    None
304521    None
304522    None
304523    None
304524    None
304525    None
304526    None
304527    None
304528    None
304529    None
304530    None
304531    None
304532    None
304533    None
304534    None
304535    None
304536    None
304537    None
304538    None
304539    None
304540    None
304541    None
304542    None
Name: 4, Length: 304286, dtype: object

In [6]:
chars = set()
for s in preprocessed_sentences:
    for c in s:
        chars.add(c)

chars

{' ',
 '#',
 "'",
 '.',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z'}

In [7]:
preprocessed_sentences[:500]

['They do not',
 'They do to',
 'I hope so',
 'She okay',
 "Let's go",
 'Wow',
 "Okay you're gonna need to learn how to lie",
 'No',
 "I'm kidding",
 'You know how sometimes you just become this persona',
 "And you don't know how to quit",
 'Like my fear of wearing pastels',
 'The real you',
 'What good stuff',
 "I figured you'd get to the good stuff eventually",
 'Thank God',
 'If I had to hear one more story about your coiffure',
 'Me',
 'This endless blonde babble',
 "I'm like boring myself",
 'What crap',
 'do you listen to this crap',
 'No',
 "Then Guillermo says If you go any lighter you're gonna look like an extra on #",
 'You always been this selfish',
 'But',
 "Then that's all you had to say",
 'Well no',
 'You never wanted to go out with me did you',
 'I was',
 'I looked for you back at the party but you always seemed to be occupied',
 'Tons',
 'Have fun tonight',
 'I believe we share an art instructor',
 'You know Chastity',
 'Looks like things worked out tonight huh',
 'Hi'

In [8]:
from sklearn.model_selection import train_test_split
indices = range(len(preprocessed_sentences))
seed = 2092093
train, test, train_indices, test_indices = train_test_split(
    preprocessed_sentences,
    indices,
    train_size = 0.8,
    test_size = 0.2,
    random_state = seed)
test, valid, test_indices, valid_indices = train_test_split(
    test,
    test_indices,
    train_size = 0.5,
    test_size = 0.5,
    random_state = seed)

In [9]:
len(preprocessed_sentences)

511741

In [10]:
len(train_indices)

409392

In [11]:
len(test_indices)

51174

In [12]:
len(valid_indices)

51175

In [13]:
len(train_indices) / len(preprocessed_sentences)

0.7999984367091947

In [14]:
len(test_indices) / len(preprocessed_sentences)

0.09999980458864934

In [15]:
len(valid_indices) / len(preprocessed_sentences)

0.10000175870215597

In [16]:
def write_data(data, file_path):
    with open(file_path, "w") as file_out:
        for line in data:
            file_out.write(line + "\n")

write_data(train, "train.txt")
write_data(test, "test.txt")
write_data(valid, "valid.txt")