# Creating Datasets for "A Doll's House" by Ibsen

## 1. Isolating spoken word from stage directions

In [1]:
import sklearn
import csv
import pandas as pd
from collections import Counter
import re
import string
import nltk

Once regex is added, I will be able to separate out the stage directions. All stage directions are within parentheses. The following script reads in the file, finds the exact beginning of the play, and then pulls the character name. The goal is to eventually separate the character name from the connected speech and output them as separate columns in a .csv file.

In [2]:
# Read the entire play into one string. The context manager guarantees the
# file handle is closed even if read() raises.
with open('dollshouse.txt', 'r') as filein:
    full_text = filein.read()

# Stage directions can run across paragraphs, so the regex that removes them
# has to be applied to the full text before it is split into paragraphs.


In [3]:
# Anything between parentheses (newlines included) is treated as a stage
# direction; the lazy quantifier stops at the first closing parenthesis.
stage_direction_pattern = re.compile(r'\((.|\n)+?\)')

# "Mrs. Linde." at the start of a line is collapsed into the single token
# "MrsLinde." so the speaker label is one word.
linda_pattern = re.compile(r'^Mrs\. Linde\.', re.MULTILINE)

# Remove the stage directions first, then normalise the Mrs. Linde label.
spoken = linda_pattern.sub('MrsLinde.', stage_direction_pattern.sub('', full_text))

# Paragraphs are separated by blank lines.
split_text = spoken.split("\n\n")

# Skip the first nine paragraphs; index 9 is where the first line of the
# script begins.
spoken_text = split_text[9:]

Cleaning and lowercasing the play script

In [4]:
# Translation table that maps every character in string.punctuation to a
# single space, so each line is cleaned in one pass instead of one
# replace()/lower() round-trip per punctuation character.
punctuation_to_space = str.maketrans(string.punctuation, " " * len(string.punctuation))

# Each entry is the paragraph with punctuation blanked out, lowercased, and
# terminated with a newline.
doll_cleaned = [
    line.translate(punctuation_to_space).lower() + "\n"
    for line in spoken_text
]
print(doll_cleaned)

# NOTE: the script contains some unusual punctuation runs (e.g. "Torvald--?");
# every punctuation character in such a run becomes its own space.
# Alternative considered: tokenise with nltk.tokenize.word_tokenize and keep
# only the tokens for which str.isalpha() is true.

['nora  hide the christmas tree carefully  helen  be sure the children\ndo not see it until this evening  when it is dressed   how much \n', 'porter  sixpence \n', 'nora  there is a shilling  no  keep the change   yes  he is in  \n', 'helmer   is that my little lark twittering out\nthere \n', 'nora   yes  it is \n', 'helmer  is it my little squirrel bustling about \n', 'nora  yes \n', 'helmer  when did my squirrel come home \n', 'nora  just now   come in here  torvald  and see what i have bought \n', 'helmer  don t disturb me   bought  did you say  all these things  has\nmy little spendthrift been wasting money again \n', 'nora  yes but  torvald  this year we really can let ourselves go\na little  this is the first christmas that we have not needed to\neconomise \n', 'helmer  still  you know  we can t spend money recklessly  nora  yes \ntorvald  we may be a wee bit more reckless now  mayn t we  just a tiny\nwee bit  you are going to have a big salary and earn lots and lots of\nmoney \n

## 2. Getting the names and associated lines for each character

Getting the names of each character that speaks

In [5]:
# The first word of each cleaned speech is taken as the speaker's name.
# Entries that split into fewer than two words are skipped.
name = [
    words[0]
    for words in (speech.split() for speech in doll_cleaned)
    if len(words) > 1
]

Getting the speech content of each character that speaks, minus their name

In [6]:
# Collect the words of each speech with the leading speaker name removed.
#
# The filter is on the number of words, the same rule the `name` list uses.
# Filtering on the character count of `speech` instead would let one-word or
# whitespace-only entries through, adding a speech with no matching name and
# shifting every later (name, speech) pair when the two lists are zipped.
character_speech_list = []
for speech in doll_cleaned:
    character_line = speech.split()
    if len(character_line) > 1:
        # everything after the first word (the speaker's name)
        character_speech_list.append(character_line[1:])

Writing out to a .csv file

In [7]:
# Column titles for the output file.
headers = ['character_name', 'character_speech']

# Pair each speaker name with the word list of the corresponding speech.
name_speech_rows = zip(name, character_speech_list)

# newline='' lets the csv module control line endings itself.
# Each speech is a list of words, so csv.writer stores its str() form.
with open('characters_and_speeches.csv', 'w', newline='') as outfile:
    csvout = csv.writer(outfile)
    csvout.writerows([headers, *name_speech_rows])

## 3. Getting the counts for each word in the playscript

Getting the counts of each word in the play script. I still need to create a "stop list" for the subject-based words, and then the following script will change.

In [8]:
# Subject-specific words (character names, titles, and the stray "s"/"t"
# fragments left behind when apostrophes are replaced by spaces).
stop_words = [
    'torvald',
    'helmer',
    'nora',
    'rank',
    'linde',
    'mrs',
    'nils',
    'krogstad',
    'anne',
    's',
    't',
    'christine',
    'emmy',
]

In [9]:
# Tally how often each word occurs across all speeches. Counter.update()
# adds one per element of the iterable it is given, so no manual
# get()-and-increment is needed.
# NOTE: stop_words is not applied here; every word is counted.
word_counts = Counter()
for line in character_speech_list:
    word_counts.update(line)

print(word_counts)

Counter({'you': 986, 'i': 953, 'to': 612, 'it': 568, 'the': 520, 'that': 480, 'and': 469, 'is': 423, 'a': 391, 'of': 366, 'have': 311, 'me': 301, 't': 267, 'in': 260, 'what': 237, 'no': 228, 'for': 215, 'but': 210, 'my': 205, 'be': 189, 'was': 188, 'do': 184, 'not': 181, 'so': 180, 'your': 179, 'he': 177, 'as': 174, 'yes': 166, 'will': 164, 'are': 146, 'all': 128, 's': 127, 'can': 126, 'nora': 126, 'am': 126, 'torvald': 125, 'there': 122, 'at': 116, 'with': 112, 'now': 109, 'must': 108, 'this': 104, 'know': 100, 'think': 94, 'one': 93, 'little': 92, 'if': 92, 'here': 87, 'well': 86, 'don': 85, 'had': 85, 'then': 82, 'on': 82, 'him': 81, 'would': 80, 'come': 79, 'has': 79, 'we': 79, 'tell': 78, 'about': 75, 'how': 71, 'very': 69, 'when': 65, 'been': 64, 'shall': 63, 'never': 62, 'see': 61, 'just': 60, 'thing': 57, 'any': 56, 'from': 56, 'go': 55, 'only': 55, 'should': 53, 'husband': 53, 'who': 52, 'christine': 51, 'his': 51, 'out': 50, 'like': 50, 'look': 50, 'say': 49, 'something': 49,