# Creating Datasets for "A Doll's House" by Ibsen

## 1. Isolating spoken word from stage directions

In [1]:
import sklearn
import csv
import pandas as pd
from collections import Counter
import re
import string
import nltk

Opening the file and using regex to remove the stage directions from the play script. All stage directions exist within parentheses, both within and outside spoken text. A second regex then matches the speaker label of one cast member, "Mrs. Linde.", at the start of a line and combines her name into a single word ("MrsLinde."), so that the first word of every speech is the speaker's name. This helps with the parsing of the document.

In [2]:
# Read the whole play script into memory. The context manager closes the file
# even if read() raises, which the manual open()/close() pair did not guarantee.
# NOTE(review): no encoding is passed, so the platform default is used --
# consider passing encoding=... once the file's actual encoding is confirmed.
with open('dollshouse.txt', 'r') as filein:
    full_text = filein.read()

In [3]:
# Remove stage directions: every parenthesised span, including spans that run
# across several lines. With DOTALL, '.' also matches a newline, so '.+?' is the
# direct equivalent of the per-character alternation '(.|\n)+?' without the
# capturing group and alternation overhead.
spoken = re.sub(r'\(.+?\)', '', full_text, flags=re.DOTALL)

# Collapse the two-word speaker label "Mrs. Linde." at the start of a line into
# the single token "MrsLinde." so the label is one word like the other speakers.
linda_pattern = re.compile(r'^Mrs\. Linde\.', re.MULTILINE)
spoken = linda_pattern.sub('MrsLinde.', spoken)

# Paragraphs are separated by a blank line.
split_text = spoken.split("\n\n")

# Index of the first paragraph that is kept; everything before it is skipped.
# Per the original note this is where the first line of the script begins
# (located with split_text.index("\nACT I")).
# NOTE(review): the value is specific to this dollshouse.txt -- re-check it if
# the input file changes.
FIRST_SCRIPT_PARAGRAPH = 9
spoken_text = split_text[FIRST_SCRIPT_PARAGRAPH:]

Cleaning and lowercasing the play script

In [4]:
# Translation table mapping every character in string.punctuation to a single
# space, so all punctuation is replaced in one pass per paragraph instead of one
# replace() call (plus a redundant lower()) per punctuation character.
punctuation_to_space = str.maketrans(
    string.punctuation, " " * len(string.punctuation)
)

# One cleaned, lowercased entry per paragraph of spoken_text, each terminated
# with a newline.
doll_cleaned = [
    line.translate(punctuation_to_space).lower() + "\n"
    for line in spoken_text
]
#print(doll_cleaned)

# from nltk.tokenize import word_tokenize
# tokens = word_tokenize(text)
# # remove all tokens that are not alphabetic
# words = [word for word in tokens if word.isalpha()]

# there's some weird punctuation happening in this script (i.e. Torvald--?)

## 2. Getting the names and associated lines for each character

Getting the names of each character that speaks

In [5]:
# Tokenise each cleaned speech on whitespace. For every speech with more than
# one token, keep the first token (the speaker label) followed by a single
# trailing space; speeches with zero or one token are skipped.
name = [
    tokens[0] + " "
    for tokens in (speech.split() for speech in doll_cleaned)
    if len(tokens) > 1
]
        
#print(name)
#print(len(updated_name))

Getting the speech content of each character that speaks. Their name will be included within this section. Names will be filtered out when getting the counts for each word, so the inclusion of the name here is inconsequential.

In [11]:
# Keep only speeches that contain more than one word (a speaker label plus at
# least one further word). This is the same test the `name` list is built with.
# The two lists are later paired positionally with zip(), so they must select
# exactly the same entries of doll_cleaned; filtering on character length here
# would let one-word or whitespace-only entries through and shift every later
# speech onto the wrong speaker.
character_speech_list = [
    speech.strip()
    for speech in doll_cleaned
    if len(speech.split()) > 1
]


#print(character_speech_list)
# print(len(character_speech_list))

Writing out to a .csv file. The output file will contain 2 columns (character name and the associated line). The output file will have 1288 rows containing textual data. While the character name column contains repeated values (the cast of characters, which will eventually become the class when applying logistic regression), the associated line column should contain entirely unique values.

In [12]:
headers = ['character_name', 'character_speech']

# newline='' leaves line-ending handling to the csv module.
with open('characters_and_speeches-English.csv', 'w', newline='') as outfile:
    csvout = csv.writer(outfile)
    csvout.writerow(headers)
    # Pair each speaker with the speech at the same position; zip() stops at
    # the shorter of the two lists.
    for speaker, speech_text in zip(name, character_speech_list):
        csvout.writerow((speaker, speech_text))

In [13]:
# Load the CSV written above back into a DataFrame and preview the first rows.
csv_path = "characters_and_speeches-English.csv"
english_dataset = pd.read_csv(csv_path)
english_dataset.head(n=5)

Unnamed: 0,character_name,character_speech
0,nora,nora hide the christmas tree carefully helen...
1,porter,porter sixpence
2,nora,nora there is a shilling no keep the change...
3,helmer,helmer is that my little lark twittering out...
4,nora,nora yes it is


## 3. Getting the counts for each word in the playscript

Getting the counts of each word in the play script. I still need to create a "stop list" for the subject-based words, after which the following script will change.

In [9]:
# One list of whitespace-separated words per speech (a list of lists).
character_words = [
    speech_text.split() for speech_text in character_speech_list
]

In [10]:
# Words excluded from the word counts below (checked with `not in`).
# A few entries ('darling', 'death', 'anne') are listed twice; that has no
# effect on a membership test.
topical_words = [
    'torvald', 'helmer', 'nora', 'rank', 'linde', 'mrs', 'nils', 'krogstad',
    'anne', 's', 't', 'mrslinde', 'christine', 'emmy', 'husband', 'children',
    'father', 'mother', 'man', 'doctor', 'home', 'money', 'letter', 'poor',
    'papa', 'christmas', 'live', 'love', 'tomorrow', 'evening', 'happy', 'impossible',
    'himself', 'herself', 'woman', 'sake', 'darling', 'mr', 'tonight', 'darling',
    'fancy', 'goodnight', 'fun', 'ma', 'child', 'goodbye', 'yesterday', 'helen',
    'death', 'forgotten', 'reason', 'death', 'champagne', 'loved', 'tarantella', 'tree',
    'promise', 'forgiven', 'anne', 'doll', 'skylark', 'lawyer', 'paper', 'brother',
    'sister', 'morality', 'punished', 'destroyed', 'mistress', 'wedlock', 'liar', 'nurse',
    'wife', 'bank', 'maid', 'family', 'jealous', 'mamma', 'women', 'men',
    'maiden', 'office',
]

# will need a better way of cleaning punctuation so apostrophes and hyphenated words do not get erased
#print(character_words)

# Build a set once so each exclusion check is a hash lookup instead of a scan
# of the whole topical_words list for every word in the script.
topical_lookup = set(topical_words)

# Count every word across all speeches, skipping the excluded words. Counter
# consumes the words in script order, so keys appear in first-seen order.
word_counts = Counter(
    word
    for line in character_words
    for word in line
    if word not in topical_lookup
)

#print(word_counts)