In [1]:
import json
import re
from collections import Counter
from pprint import pprint

from bs4 import BeautifulSoup

In [2]:
# Open the script (saved as an HTML file) and parse it.
# The context manager closes the file handle as soon as parsing is finished,
# instead of leaving it open for the lifetime of the kernel.
with open('little_women.html') as html_file:
    soup = BeautifulSoup(html_file, 'html.parser')

# Extract info from html version of script

## All characters

In [3]:
# Left offsets, as written in the inline style of each div, at which
# character/speaker names appear in the script.
_CHAR_NAME_OFFSETS = (
    'left:252.00px',
    'left:190.00px',
    'left:152.00px',
    'left:377.00px',
    'left:401.00px',
    'left:426.00px',
    'left:422.00px',
    'left:415.00px',
    'left:429.00px',
    'left:416.00px',
    'left:405.00px',
    'left:388.00px',
    'left:432.00px',
    'left:159.00px',
)


def char_names(style_att):
    """Check whether a style attribute places text where speaker names appear.

    Parameters
    ----------
    style_att : str or None
        Value of a tag's ``style`` attribute. ``None`` (or an empty string)
        is accepted so the function can be used directly as an attribute
        filter even for tags that carry no ``style`` attribute at all.

    Returns
    -------
    bool
        True if the style contains one of the known speaker-name offsets,
        False otherwise.
    """
    if not style_att:
        # No style at all: it cannot be positioned at a speaker-name offset.
        return False
    return any(offset in style_att for offset in _CHAR_NAME_OFFSETS)

In [4]:
# Collect the bs4 tag objects for every <div> whose style attribute contains
# one of the positions accepted by char_names().
all_char = soup.find_all('div', attrs={'style': char_names})

# Build the set of unique character names: take the text of each tag and
# strip any leading/trailing whitespace.
characters = {tag.text.strip() for tag in all_char}

In [5]:
# display the set of unique speaker labels collected above
characters

{'AMY',
 "AMY (CONT'D)",
 'AMY (O.S.)',
 'ANNIE',
 'ASA MELVIN',
 'AUNT MARCH',
 "AUNT MARCH (CONT'D)",
 'AUNT MARCH (O.S.)',
 'BETH',
 "BETH (CONT'D)",
 'BETH (O.S.)',
 'DAISY AND DEMI',
 'DASHWOOD',
 "DASHWOOD (CONT'D)",
 'DASHWOOD (O.S.)',
 'DASHWOOD GIRL #1',
 'DASHWOOD GIRL #2',
 'DASHWOOD GIRL #3',
 'DOCTOR',
 "DOCTOR (CONT'D)",
 'FATHER',
 'FRED',
 'FRIEDRICH',
 "FRIEDRICH (CONT'D)",
 'FRIEDRICH (O.S.)',
 'HANNAH',
 'JO',
 "JO (CONT'D)",
 'JO (O.S.)',
 'JO (V.O.)',
 'JO/AMY/MEG',
 'JOHN',
 "JOHN (CONT'D)",
 'JON',
 'LAURIE',
 "LAURIE (CONT'D)",
 'LAURIE (O.S.)',
 'MARMEE',
 "MARMEE (CONT'D)",
 'MARMEE (O.S.)',
 'MARMEE (V.O.)',
 'MEG',
 "MEG (CONT'D)",
 'MEG (O.S.)',
 'MEG/JO/BETH/AMY',
 'MR. BROOKE',
 'MR. DASHWOOD',
 "MR. DASHWOOD (CONT'D)",
 'MR. DASHWOOD (V.O.)',
 'MR. LAURENCE',
 "MR. LAURENCE (CONT'D)",
 'MR.MARCH',
 'MRS. DASHWOOD',
 'MRS. HUMMEL',
 'MRS. KIRKE',
 'OLIVIA',
 'RAILROAD PORTER',
 'SALES CLERK',
 'SALLIE',
 "SALLIE (CONT'D)",
 'SCHOOL GIRL #1',
 'SCHOOL GIRL

## All dialogue

In [6]:
# Left offsets, as written in the inline style of each div, at which dialogue
# (or a slug line) can appear in the script.
_DIALOGUE_OFFSETS = (
    # middle
    'left:252.00px',
    'left:180.00px',
    # right
    'left:426.00px',
    'left:333.00px',
    'left:377.00px',
    'left:401.00px',
    'left:422.00px',
    'left:415.00px',
    'left:429.00px',
    'left:416.00px',
    'left:332.00px',
    'left:405.00px',
    'left:388.00px',
    'left:325.00px',
    'left:367.00px',
    'left:297.00px',
    'left:423.00px',
    # left
    'left:190.00px',
    'left:201.00px',
    'left:197.00px',
    'left:204.00px',
    'left:199.00px',
    'left:152.00px',
    'left:109.00px',
    'left:178.00px',
    'left:159.00px',
    'left:192.00px',
    # locations/years
    'left:108.00px',
)


def check_style(style_att):
    """Check whether a style attribute places text where dialogue can appear.

    Parameters
    ----------
    style_att : str or None
        Value of a tag's ``style`` attribute. ``None`` (or an empty string)
        is accepted so the function can be used directly as an attribute
        filter even for tags that carry no ``style`` attribute at all.

    Returns
    -------
    bool
        True if the style contains one of the known dialogue/slug-line
        offsets, False otherwise.
    """
    if not style_att:
        # No style at all: it cannot be positioned at a dialogue offset.
        return False
    return any(offset in style_att for offset in _DIALOGUE_OFFSETS)
        

In [7]:
# Collect the bs4 tag objects for every <div> whose style attribute contains
# one of the positions where dialogue may appear (see check_style()).
all_dialogue = soup.find_all('div', attrs={'style': check_style})

In [8]:
# Scripts are formatted such that scenes have slug lines and descriptions, and
# then a character/speaker name with dialogue beneath it. The dialogue may
# fall across several lines and may be preceded by a parenthetical.
#
# This script sometimes has characters talking over each other, so two or
# more character names may appear one right after the other ("dual dialogue").
#
# The loop below walks every candidate div once:
#   1. slug lines update the running scene info (slug line, year, time period,
#      scene index);
#   2. a speaker name followed by another speaker name starts a dual-dialogue
#      block whose left/right columns are collected separately;
#   3. any other speaker name starts a single-speaker block whose dialogue is
#      collected from the middle column.

# How many divs are scanned starting at a speaker name (the name's own div
# plus the following 46) when grouping the lines that belong to that speaker.
LOOKAHEAD = 47

SLUG_LINE_OFFSET = 'left:108.00px'        # where the slug lines are located
SINGLE_DIALOGUE_OFFSET = 'left:180.00px'  # dialogue when one character speaks
DUAL_LEFT_OFFSET = 'left:109.00px'        # left column of dual dialogue
DUAL_RIGHT_OFFSETS = ('left:333.00px',    # right column of dual dialogue
                      'left:332.00px',
                      'left:325.00px',
                      'left:297.00px')


def _dialogue_record(speaker, lines, slug_line, year, time_period, scene_index):
    """Bundle one speaker's dialogue lines with the current scene info."""
    return {
        'speaker': speaker.title(),
        'text': " ".join(lines),
        'slug_line': slug_line,
        'year': year,
        'time_period': time_period,
        'scene_index': scene_index,
    }


script = []
time_period = 'present'
scene_index = -1
# Scene info and dialogue buffers are defined up front so that dialogue found
# before the first slug line (or a dual-dialogue block that opens the script)
# cannot hit an undefined name. year == 0 is the same "no year found"
# placeholder used for slug lines without a year.
slug_line = ''
year = 0
left, right, middle = [], [], []

for i, line in enumerate(all_dialogue):  # loop over the dialogue-bearing divs
    text = line.text.strip()

    # Never look past the end of the list when scanning the following divs.
    stop = min(LOOKAHEAD, len(all_dialogue) - i)

    # Scene info from the slug line is interesting because the script cuts
    # back and forth through time.
    if SLUG_LINE_OFFSET in str(line):
        # Not every div at this offset is a slug line.
        if 'INT.' in text or 'EXT.' in text:
            slug_line = text
            scene_index += 1  # numbers the scenes
            years_found = re.findall(r"(18\d{2})", slug_line)
            # The timeline label carries over from the previous scene unless
            # this slug line names one explicitly.
            if 'FICTION' in slug_line:
                time_period = 'fiction?'
            elif 'THE PRESENT' in slug_line:
                time_period = 'present'
            elif 'THE PAST' in slug_line:
                time_period = 'past'
            # 0 is a placeholder for checking/fixing later.
            year = int(years_found[0]) if years_found else 0

    if text not in characters:
        continue  # not a speaker name: nothing to group on this div

    # Two speaker names in a row signal dual dialogue. The bounds check keeps
    # a speaker name on the very last div from raising IndexError.
    next_is_speaker = (i + 1 < len(all_dialogue)
                       and all_dialogue[i + 1].text.strip() in characters)

    if next_is_speaker:
        char_left = text                                # left speaker name
        char_right = all_dialogue[i + 1].text.strip()   # right speaker name
        left, right = [], []
        for j in range(i + 2, i + stop):  # lines after the two names
            following = all_dialogue[j]
            following_text = following.text.strip()
            if following_text in characters:
                break  # stop at the next speaker name
            markup = str(following)
            if DUAL_LEFT_OFFSET in markup:
                left.append(following_text)
            elif any(offset in markup for offset in DUAL_RIGHT_OFFSETS):
                right.append(following_text)

        # Both records are appended even when a column is empty; records with
        # empty text are expected to be filtered out afterwards.
        script.append(_dialogue_record(char_left, left, slug_line, year,
                                       time_period, scene_index))
        script.append(_dialogue_record(char_right, right, slug_line, year,
                                       time_period, scene_index))
    else:
        left, right, middle = [], [], []
        for j in range(i + 1, i + stop):  # lines following the name
            following = all_dialogue[j]
            following_text = following.text.strip()
            if following_text in characters:
                break  # stop at the next speaker name
            if SINGLE_DIALOGUE_OFFSET in str(following):
                # Dialogue split over several lines is gathered in one list.
                middle.append(following_text)

        if middle:
            script.append(_dialogue_record(text, middle, slug_line, year,
                                           time_period, scene_index))

# Clean extracted info

## Remove (says nothing) lines

In [9]:
# Drop records whose text is empty (twice a SPEAKER appears but the
# parenthetical is 'says nothing').
script = [entry for entry in script if entry['text'] != ""]

## Add dialogue index

In [10]:
# Give every record a key/value pair approximating the order of dialogue.
for position, entry in enumerate(script):
    entry['dialogue_index'] = position

## Fix years

In [11]:
# The loop that builds script sets year to 0 whenever the slug line has no
# year in it; list those records to see which scenes are affected.
[entry for entry in script if entry['year'] == 0]

[{'speaker': 'Aunt March',
  'text': 'Amy? Come here.',
  'slug_line': 'INT. AUNT MARCH’S HOUSE. DAY.',
  'year': 0,
  'time_period': 'past',
  'scene_index': 82,
  'dialogue_index': 881},
 {'speaker': 'Amy',
  'text': 'Yes?',
  'slug_line': 'INT. AUNT MARCH’S HOUSE. DAY.',
  'year': 0,
  'time_period': 'past',
  'scene_index': 82,
  'dialogue_index': 882},
 {'speaker': 'Aunt March',
  'text': 'Come, sit!',
  'slug_line': 'INT. AUNT MARCH’S HOUSE. DAY.',
  'year': 0,
  'time_period': 'past',
  'scene_index': 82,
  'dialogue_index': 883},
 {'speaker': "Aunt March (Cont'D)",
  'text': 'If you are very good, one day this ring will belong to you.',
  'slug_line': 'INT. AUNT MARCH’S HOUSE. DAY.',
  'year': 0,
  'time_period': 'past',
  'scene_index': 82,
  'dialogue_index': 884},
 {'speaker': 'Amy',
  'text': 'Really?',
  'slug_line': 'INT. AUNT MARCH’S HOUSE. DAY.',
  'year': 0,
  'time_period': 'past',
  'scene_index': 82,
  'dialogue_index': 885},
 {'speaker': 'Aunt March',
  'text': 'Yo

In [12]:
# Only three slug lines ended up with year == 0, few enough to look up in the
# script and patch by hand. Each entry maps the slug line as extracted to the
# (year, completed slug line) it should carry:
#
# 1) INT. AUNT MARCH’S HOUSE. DAY.
#    looks like 1862
# 2) THE PAST. EXT./INT. MARCH HOUSE. MEG & JO’S ROOM. SPRING.
#    the slug line spilled over onto a second line of the pdf, so the missing
#    'DAY. 1865' is appended by hand
# 3) THE PRESENT IS NOW THE PAST. OR MAYBE FICTION. EXT. TRAIN.
#    the slug line spilled over onto a second line of the pdf, so the missing
#    'EVENING. 1869' is appended by hand
YEAR_FIXES = {
    'INT. AUNT MARCH’S HOUSE. DAY.':
        (1862, 'INT. AUNT MARCH’S HOUSE. DAY. 1862'),
    'THE PAST. EXT./INT. MARCH HOUSE. MEG & JO’S ROOM. SPRING.':
        (1865, 'THE PAST. EXT./INT. MARCH HOUSE. MEG & JO’S ROOM. SPRING. DAY. 1865'),
    'THE PRESENT IS NOW THE PAST. OR MAYBE FICTION. EXT. TRAIN.':
        (1869, 'THE PRESENT IS NOW THE PAST. OR MAYBE FICTION. EXT. TRAIN. EVENING. 1869'),
}

for entry in script:
    if entry['year'] != 0:
        continue  # year was found in the slug line; nothing to fix
    fix = YEAR_FIXES.get(entry['slug_line'])
    if fix is not None:
        entry['year'], entry['slug_line'] = fix

## Add scene location

In [13]:
# Ordered (keywords, place) rules. A record gets the place of the FIRST rule
# for which any keyword occurs in its slug line, so the order matters: e.g.
# 'AUNT MARCH’S HOUSE' has to be tested before the catch-all 'MARCH'.
_PLACE_RULES = (
    (('TAILOR', 'SCHOOL', 'HUMMEL', 'UNION'), 'inside in concord'),
    (('GARDINER', 'MOFFAT'), 'fancy party'),
    (('AUNT MARCH’S HOUSE',), "aunt march's house"),
    (('MEG MARCH’S HOUSE', 'MEG’S HOUSE'), "meg's house"),
    (('BEACH', 'SEASHORE'), 'beach'),
    (('CONCORD TOWN ROAD', 'RIVER', 'WOODS', 'TRAIN'), 'outside in concord'),
    (('FRANCE', 'PARIS'), 'paris'),
    (('PUBLISHING',), 'publishing house'),
    (('BOARDING HOUSE',), 'boarding house'),
    (('LAURENCE',), 'laurence house'),
    (('NEW YORK', 'GERMAN BEER HALL'), 'new york'),
    (('MARCH', 'BETH', 'JO'), 'march house'),
)


def _place_for(slug_line):
    """Return the lower-case place label for one slug line ('idk' if none)."""
    for keywords, place in _PLACE_RULES:
        if any(keyword in slug_line for keyword in keywords):
            return place
    return 'idk'


def assign_place(dialogue_info):
    """Add a lower-case ``'place'`` key to every dialogue record, in place.

    Parameters
    ----------
    dialogue_info : iterable of dict
        Dialogue records; each must have a ``'slug_line'`` string.

    Returns
    -------
    str or None
        The place assigned to the last record (kept for backward
        compatibility), or None when ``dialogue_info`` is empty.
    """
    last_place = None  # stays None for empty input instead of raising
    for record in dialogue_info:
        last_place = _place_for(record['slug_line'])
        record['place'] = last_place
    return last_place

In [14]:
# adds a 'place' key to every record of script in place;
# the value displayed below is the place of the last record
assign_place(script)

'publishing house'

## Add time of day

In [15]:
# Ordered (keywords, time_of_day) rules. A record gets the label of the FIRST
# rule for which any keyword occurs in its slug line. Afternoon and morning
# are collapsed into 'day', evening into 'night'. A few full slug lines
# without an explicit time of day are listed verbatim.
_TIME_OF_DAY_RULES = (
    (('NIGHT', 'BALLROOM'), 'night'),
    (('DAY',
      'EXT. / INT. MARCH HOUSE. 1868.',
      'INT. NEW YORK. PUBLISHING OFFICE. 1868.',
      'AUNT'), 'day'),
    (('AFTERNOON', 'AFTEROON'), 'day'),  # 'AFTEROON' kept as in the original check
    (('EVENING', 'JO & MEG’S ROOM. 1861.'), 'night'),
    (('MORNING', 'MARCH HOUSE'), 'day'),
)


def _time_of_day_for(slug_line):
    """Return 'day'/'night' for one slug line, or 'unknown' if no rule hits."""
    for keywords, label in _TIME_OF_DAY_RULES:
        if any(keyword in slug_line for keyword in keywords):
            return label
    return 'unknown'


def assign_time_of_day(dialogue_info):
    """Add a ``'time_of_day'`` key to every dialogue record, in place.

    Parameters
    ----------
    dialogue_info : iterable of dict
        Dialogue records; each must have a ``'slug_line'`` string.

    Returns
    -------
    str or None
        The label assigned to the last record (kept for backward
        compatibility), or None when ``dialogue_info`` is empty.
    """
    last_label = None  # stays None for empty input instead of raising
    for record in dialogue_info:
        last_label = _time_of_day_for(record['slug_line'])
        record['time_of_day'] = last_label
    return last_label

In [16]:
# adds a 'time_of_day' key to every record of script in place;
# the value displayed below is the label of the last record
assign_time_of_day(script)

'day'

## Clean speakers (inconsistent naming and parentheticals)

In [17]:
def clean_speakers(dialogue_info):
    """Normalise the ``'speaker'`` of every dialogue record, in place.

    First the trailing parenthetical -- (CONT'D), (V.O.) or (O.S.) -- is
    removed, then names the script uses inconsistently are mapped to a single
    spelling. Nothing is returned.

    Notes on the voice-overs that get folded into the plain speaker name:
    Marmee (V.O.) is reading a letter from Mr. March; Jo (V.O.) is first
    reading a play, then (2nd/3rd) reading a letter to Laurie; Mr. Dashwood
    (V.O.) reads his letter to Jo.
    """
    # speakers that were inconsistently referred to in the script
    canonical_names = {
        'Dashwood': 'Mr. Dashwood',
        'Mr.March': 'Mr. March',
        'Father': 'Mr. March',
        'Jon': 'Jo',
        'Mr. Brooke': 'John Brooke',
        'John': 'John Brooke',
    }
    for record in dialogue_info:
        bare_name = re.sub(r'( \(.*\))', '', record['speaker'])
        record['speaker'] = canonical_names.get(bare_name, bare_name)

In [18]:
# rewrites the 'speaker' of every record of script in place (no return value)
clean_speakers(script)

# Check extracted info

## Full script (list of dictionaries for each line of dialogue)

In [19]:
# spot check: pretty-print only the last record of script
pprint(script[-1])

{'dialogue_index': 1218,
 'place': 'publishing house',
 'scene_index': 118,
 'slug_line': 'THE PRESENT. INT. PUBLISHING HOUSE. NEW YORK CITY. DAY. 1870.',
 'speaker': 'Jo',
 'text': 'I’ve decided. I want to own my own book.',
 'time_of_day': 'day',
 'time_period': 'present',
 'year': 1870}


In [20]:
#Check speaker and dialogue
# [(i['speaker'],i['text']) for i in script]

## Lines per speaker

In [21]:
# Tally the dialogue records per speaker and list them, most frequent first.
char_line_count = Counter(entry['speaker'] for entry in script)
char_line_count.most_common()

[('Jo', 344),
 ('Amy', 193),
 ('Laurie', 153),
 ('Meg', 131),
 ('Marmee', 81),
 ('Beth', 71),
 ('Friedrich', 47),
 ('Mr. Dashwood', 40),
 ('Aunt March', 40),
 ('John Brooke', 22),
 ('Mr. Laurence', 18),
 ('Hannah', 14),
 ('Mr. March', 10),
 ('Sallie', 7),
 ('Mrs. Kirke', 6),
 ('Annie', 5),
 ('Doctor', 5),
 ('Mrs. Dashwood', 4),
 ('School Girl #1', 3),
 ('Asa Melvin', 3),
 ('School Girl #2', 2),
 ('Fred', 2),
 ('Susan Robbins', 2),
 ('Soldier', 2),
 ('Sales Clerk', 1),
 ('Daisy And Demi', 1),
 ('Olivia', 1),
 ('Viola', 1),
 ('Young Man', 1),
 ('Mrs. Hummel', 1),
 ('Meg/Jo/Beth/Amy', 1),
 ('Railroad Porter', 1),
 ('School Girl #3', 1),
 ('Servant', 1),
 ('Jo/Amy/Meg', 1),
 ('Dashwood Girl #1', 1),
 ('Dashwood Girl #2', 1),
 ('Dashwood Girl #3', 1)]

### Main characters

In [22]:
# speakers treated as main characters; the commented-out names below are
# further candidates that are currently left out of the list
main_characters= ['Jo', 'Amy', 'Meg','Beth',
#                   'Laurie','Marmee',
#                   'Aunt March','Friedrich', 'Mr. Dashwood'
                 ]

## Lines per scene

In [23]:
# Sanity check: number of dialogue records per slug line.
Counter(entry['slug_line'] for entry in script)

Counter({'INT. NEW YORK. PUBLISHING OFFICE. 1868.': 23,
         'EXT./INT. BOARDING HOUSE. NYC. DAY. 1868.': 14,
         'EXT. PARIS PROMENADE. DAY. 1868.': 34,
         'INT. TAILOR SHOP. AFTERNOON. 1868.': 9,
         'EXT. MEG MARCH’S HOUSE. DAY. 1868.': 2,
         'EXT. / INT. MARCH HOUSE. 1868.': 1,
         'INT. NEW YORK THEATRE. EVENING. 1868.': 2,
         'INT. GERMAN BEER HALL. NIGHT. 1868.': 2,
         'THE PAST. INT. CONCORD. MARCH HOUSE. JO & MEG’S ROOM. 1861.': 24,
         "INT. GARDINER'S NEW YEAR’S PARTY. HALLWAY. NIGHT. 1861.": 3,
         'INT. GARDINER’S NEW YEAR’S PARTY. CONCORD. NIGHT. 1861.': 25,
         'EXT. GARDINER’S NEW YEAR’S PARTY. PORCH. NIGHT. 1861.': 1,
         'INT. GARDINER’S NEW YEAR’S PARTY. HALLWAY. NIGHT. 1861.': 9,
         'INT./EXT. MARCH HOUSE. NIGHT. 1861.': 19,
         'THE PRESENT. EXT./INT. NEW YORK BOARDING HOUSE. NIGHT. 1868.': 1,
         'INT. PARIS. BALLROOM. 1868.': 26,
         'INT. BOARDING HOUSE DRAWING ROOM. NYC. DAY. 18

## Lines per location

In [24]:
# Sanity check: are the settings grouped reasonably?
Counter(entry['place'] for entry in script)

Counter({'publishing house': 69,
         'boarding house': 82,
         'paris': 150,
         'inside in concord': 35,
         "meg's house": 15,
         'march house': 521,
         'new york': 16,
         'fancy party': 69,
         'laurence house': 83,
         'outside in concord': 64,
         "aunt march's house": 62,
         'beach': 53})

## Lines grouped by time

In [25]:
# Sanity check: number of dialogue records per year.
Counter(entry['year'] for entry in script)

Counter({1868: 152,
         1861: 191,
         1869: 350,
         1862: 394,
         1865: 40,
         1866: 25,
         1867: 22,
         1870: 45})

In [26]:
# Sanity check: number of dialogue records per time period.
Counter(entry['time_period'] for entry in script)

Counter({'present': 480, 'past': 732, 'fiction?': 7})

In [27]:
# Sanity check: number of dialogue records per time of day.
# Morning, afternoon and evening had too few scenes of their own, so they
# were collapsed into day/night.
Counter(entry['time_of_day'] for entry in script)

Counter({'day': 960, 'night': 259})

# Write to json

In [28]:
# Write script (the list of dictionaries) to a JSON file.
# Requires the json module to be imported in the notebook's import cell.
# The encoding is pinned so the file is written the same way on every
# platform rather than depending on the locale default.
with open('little_women.json', 'w', encoding='utf-8') as f:
    json.dump(script, f)