# Creating Datasets for "A Doll's House" by Ibsen

## 1. Isolating spoken word from stage directions

In [None]:
import sklearn
import csv
import pandas as pd
from collections import Counter
import re
import string
import nltk

Once the regex is added, I will be able to separate out the stage directions, all of which are enclosed in parentheses. The following script reads in the file, locates the exact beginning of the play, and then pulls out the character name. The goal is to eventually separate the character name from the associated speech and output them as separate columns in a .csv file.

In [None]:
# Read the entire play into memory as one string. The context manager closes
# the file even if read() raises, which a manual open()/close() pair does not.
# NOTE(review): no encoding is passed, so open() uses the platform default --
# pass encoding=... once the encoding of dollshouse.txt is confirmed.
with open('dollshouse.txt', 'r') as filein:
    full_text = filein.read()

# Stage directions can run across paragraph breaks, so they have to be removed
# with the regex while the text is still one string, before it is split.


In [None]:
# Remove the parenthesised stage directions. re.DOTALL lets "." match newlines,
# so a direction that spans several lines is removed in one lazy match. This
# matches the same text as the alternation (.|\n)+? without wrapping every
# single character in a capturing group.
spoken = re.sub(r'\(.+?\)', '', full_text, flags=re.DOTALL)

# A blank line separates one block of the script from the next.
split_text = spoken.split("\n\n")

# Skip the first nine blocks so that spoken_text begins at the first line of
# the script.
# NOTE(review): 9 is specific to this copy of dollshouse.txt; presumably it was
# located with split_text.index("\nACT I") -- re-check if the source file changes.
spoken_text = split_text[9:]

Cleaning and lowercasing the play script

In [None]:
# doll_cleaned is a list with one cleaned entry per speech so that the later
# cells can iterate over speeches. Concatenating everything into one string
# would run the speeches together and make those loops visit single characters.

# Translation table that turns every ASCII punctuation mark into a space.
punctuation_to_space = str.maketrans(string.punctuation, " " * len(string.punctuation))

doll_cleaned = []
for line in spoken_text:
    # Swapping punctuation for spaces (rather than deleting it) keeps words apart
    # around unusual runs such as "Torvald--?"; then lowercase and trim the edges.
    cleanedline = line.translate(punctuation_to_space).lower().strip()
    if cleanedline:  # drop blocks that held nothing but punctuation/whitespace
        doll_cleaned.append(cleanedline)

# Preview the first few cleaned speeches rather than printing the whole play.
print(doll_cleaned[:5])

## 2. Getting the names and associated lines for each character

Getting the names of each character that speaks

In [None]:
# Speaker label for each speech: the first whitespace-separated word.
# Assumes doll_cleaned is an iterable of speech strings, one per speech --
# iterating over a single string would yield individual characters, none of
# which passes the length filter below, and leave this list empty.
#
# Limitation: only the first word is taken, so a multi-word name such as
# "Mrs. Linde" is not captured in full.
# TODO: match against a list of known character names instead.
name = []

for speech in doll_cleaned:
    # Same length filter as the speech-content cell, so both lists stay parallel.
    if len(speech) > 1:
        character_line = speech.split()  # splits each speech into words
        # split() returns [] for whitespace-only text; record an empty label
        # instead of raising IndexError so the entry still lines up with its
        # (empty) list of speech words.
        name.append(character_line[0] if character_line else "")

Getting the speech content of each character that speaks, minus their name

In [None]:
# Words of each speech with the leading speaker label dropped. Entries of one
# character or less are skipped, the same filter the name extraction applies,
# so the two lists stay parallel.
character_speech_list = [
    spoken_words.split()[1:]
    for spoken_words in doll_cleaned
    if len(spoken_words) > 1
]

print(character_speech_list)

Writing out to a .csv file

In [None]:
headers = ['character_name', 'character_speech']

# zip() silently stops at the shorter input, so check that every speaker label
# has a matching speech before writing anything.
assert len(name) == len(character_speech_list), "names and speeches are misaligned"

# newline='' lets the csv module control line endings itself.
with open('characters_and_speeches.csv', 'w', newline='') as outfile:
    csvout = csv.writer(outfile)
    csvout.writerow(headers)
    # Each speech is a list of words; join it back into text so the column holds
    # the speech itself rather than the repr of a Python list.
    csvout.writerows(
        (speaker, " ".join(words))
        for speaker, words in zip(name, character_speech_list)
    )

## 3. Getting the counts for each word in the playscript

Getting the count of each word in the play script. I still need to create a "stop list" for the subject-based words; once that exists, the following script will change.

In [None]:
# Subject-specific words to exclude from the word counts (placeholder list).
# The speech tokens are lowercased and have punctuation replaced by spaces
# during cleaning, so the entries are stored in that same normalised form;
# capitalised or punctuated entries such as 'Mrs.' could never match a token.
# NOTE(review): 'torvik' is carried over from the original list; presumably
# 'torvald' was intended -- confirm.
stop_words = ['linde', 'mrs', 'torvik']

In [None]:
# Tally how often each word occurs across all speeches.
word_counts = Counter()
for speech_words in character_speech_list:
    word_counts.update(speech_words)  # adds one per occurrence of each word

print(word_counts)