In [1]:
# Load the raw script: one entry per non-blank line, surrounding whitespace removed.
lines = []
with open("./sherlock_script_lines.txt", "r") as file_in:
    for line in file_in:
        # strip() already removes the trailing newline. Slicing off the last
        # character first (line[:-1]) would delete a real character whenever
        # the final line of the file is not newline-terminated.
        line = line.strip()
        if line:
            lines.append(line)

In [2]:
# Preview the first 100 loaded script lines.
lines[:100]

['How fresh?',
 "Fine. We'll start with the riding crop.",
 "I need to know what bruises form in the next twenty minutes. A man's alibi depends on it. Text me.",
 "Are you wearing lipstick? You weren't wearing lipstick before.",
 'Sorry, you were saying?',
 "Black, two sugars, please. I'll be upstairs.",
 "Mike, can I borrow your phone? There's no signal on mine.",
 'I prefer to text.',
 'Oh. Thank you.',
 'Afghanistan or Iraq?',
 'Which was it - Afghanistan or Iraq?',
 'Ah, Molly, coffee. Thank you. What happened to the lipstick?',
 "Really? I thought it was a big improvement. Your mouth's too small now.",
 'How do you feel about the violin?',
 "I play the violin when I'm thinking. Sometimes I don't talk for days on end. Would that bother you? Potential flatmates should know the worst about each other.",
 "I did. Told Mike this morning that I must be a difficult man to find a flatmate for. Now here he is just after lunch with an old friend, clearly just home from military service in A

In [3]:
from nltk import sent_tokenize
import unidecode
import re

# Accumulator for every non-empty cleaned sentence; preprocess_and_append_line
# appends to this list in place.
preprocessed_sentences = []

def preprocess_sentence(sentence):
    """Reduce one sentence to words separated by single spaces.

    The text is first passed through ``unidecode.unidecode`` and then run
    through two ordered lists of regex substitutions, with the outer
    whitespace trimmed between and after them. The order of the rules
    matters: later rules rely on the output of earlier ones.

    Args:
        sentence: The raw sentence string.

    Returns:
        The cleaned sentence; may be the empty string.
    """
    # Rules applied to the whole string before the first trim.
    body_rules = (
        # Numeric literals (optional sign, decimals, exponent) become '#'.
        (r"[-+]?(\d+([.,]\d*)?|[.,]\d+)([eE][-+]?\d+)?", r"#"),
        # Lower-case markup tags and a handful of entity strings are dropped.
        (r"<[/]?[a-z0-9]+>", r""),
        (r"&quot;", r""),
        (r"&amp;", r""),
        (r"&lt;", r""),
        (r"&gt;", r""),
        (r"&emdash;", r""),
        # Backticks are treated as apostrophes.
        (r"`", r"'"),
        # Double quotes and hyphens turn into spaces.
        (r"[\"-]+", r" "),
        # An apostrophe surrounded by spaces is glued to its neighbours.
        (r"[ ]+['][ ]+", r"'"),
        # An apostrophe opening a word (space before it) is removed.
        (r"[ ]+['](?=[a-zA-Z ])", r" "),
        # An apostrophe closing a word (space after it) is removed.
        (r"(?<=[a-zA-Z])['][ ]+", r" "),
        # Runs of two or more dots turn into a space.
        (r"[\.]{2,}", r" "),
        # Remaining punctuation and symbols turn into a space.
        (r"[$%&=|~<>/_\^\[\]{}():;,+*!?]+", r" "),
        # Collapse runs of spaces.
        (r"[ ]+", r" "),
    )
    # Rules that only touch the start or end of the trimmed string.
    edge_rules = (
        (r"(?<=[a-zA-Z])[']$", r""),
        (r"^['](?=[a-zA-Z])", r""),
        (r"[\.][']$", r""),
        (r"['][\.]$", r""),
        (r"^[ ]", r""),
        (r"[ ]$", r""),
        # Finally drop a single trailing full stop.
        (r"[\.]$", r""),
    )

    text = unidecode.unidecode(sentence)
    for pattern, replacement in body_rules:
        text = re.sub(pattern, replacement, text)
    text = text.strip()
    for pattern, replacement in edge_rules:
        text = re.sub(pattern, replacement, text)
    return text.strip()
            
def preprocess_and_append_line(line):
    """Tokenise ``line`` into sentences and store the cleaned, non-empty ones.

    Each sentence returned by ``sent_tokenize`` is stripped, cleaned with
    ``preprocess_sentence`` and, unless the result is empty, appended to the
    module-level ``preprocessed_sentences`` list. Nothing is returned.
    """
    for raw_sentence in sent_tokenize(line):
        cleaned = preprocess_sentence(raw_sentence.strip())
        if cleaned == "":
            # Nothing left after cleaning (e.g. pure punctuation) - skip it.
            continue
        preprocessed_sentences.append(cleaned)

# Run every loaded script line through sentence splitting and cleaning;
# results accumulate in preprocessed_sentences.
for line in lines:
    preprocess_and_append_line(line)

In [4]:
# Distinct characters that survive preprocessing, as a quick check of the
# cleaned alphabet.
chars = {symbol for sentence in preprocessed_sentences for symbol in sentence}

chars

{' ',
 '#',
 "'",
 '.',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z'}

In [5]:
# Preview the first 500 cleaned sentences.
preprocessed_sentences[:500]

['How fresh',
 'Fine',
 "We'll start with the riding crop",
 'I need to know what bruises form in the next twenty minutes',
 "A man's alibi depends on it",
 'Text me',
 'Are you wearing lipstick',
 "You weren't wearing lipstick before",
 'Sorry you were saying',
 'Black two sugars please',
 "I'll be upstairs",
 'Mike can I borrow your phone',
 "There's no signal on mine",
 'I prefer to text',
 'Oh',
 'Thank you',
 'Afghanistan or Iraq',
 'Which was it Afghanistan or Iraq',
 'Ah Molly coffee',
 'Thank you',
 'What happened to the lipstick',
 'Really',
 'I thought it was a big improvement',
 "Your mouth's too small now",
 'How do you feel about the violin',
 "I play the violin when I'm thinking",
 "Sometimes I don't talk for days on end",
 'Would that bother you',
 'Potential flatmates should know the worst about each other',
 'I did',
 'Told Mike this morning that I must be a difficult man to find a flatmate for',
 'Now here he is just after lunch with an old friend clearly just home fr

In [6]:
from sklearn.model_selection import train_test_split
# Split the sentences 80% / 10% / 10% into train / test / valid. The original
# position of every sentence is split alongside it so each subset can be
# traced back to preprocessed_sentences.
indices = range(len(preprocessed_sentences))
seed = 2092093

# Stage 1: keep 80% for training and set 20% aside as a hold-out pool.
train, holdout, train_indices, holdout_indices = train_test_split(
    preprocessed_sentences,
    indices,
    train_size=0.8,
    test_size=0.2,
    random_state=seed,
)

# Stage 2: halve the hold-out pool into the test and validation sets.
test, valid, test_indices, valid_indices = train_test_split(
    holdout,
    holdout_indices,
    train_size=0.5,
    test_size=0.5,
    random_state=seed,
)

In [7]:
# Total number of cleaned sentences.
len(preprocessed_sentences)

7677

In [8]:
# Number of sentences in the training split.
len(train_indices)

6141

In [9]:
# Number of sentences in the test split.
len(test_indices)

768

In [10]:
# Number of sentences in the validation split.
len(valid_indices)

768

In [11]:
# Share of all sentences in the training split (train_size = 0.8 was requested).
len(train_indices) / len(preprocessed_sentences)

0.7999218444704963

In [12]:
# Share of all sentences in the test split (half of the 20% hold-out).
len(test_indices) / len(preprocessed_sentences)

0.10003907776475186

In [13]:
# Share of all sentences in the validation split (other half of the 20% hold-out).
len(valid_indices) / len(preprocessed_sentences)

0.10003907776475186

In [14]:
def write_data(data, file_path):
    """Write every item of ``data`` to ``file_path``, one item per line.

    The file is opened in "w" mode, so existing content is replaced. A
    newline is appended after each item, including the last one.

    Args:
        data: Iterable of strings to write.
        file_path: Destination path.
    """
    with open(file_path, "w") as sink:
        sink.writelines(entry + "\n" for entry in data)

# Persist each split to its own text file, in train / test / valid order.
for split_sentences, split_file in (
    (train, "train.txt"),
    (test, "test.txt"),
    (valid, "valid.txt"),
):
    write_data(split_sentences, split_file)