This notebook contains the pipeline for processing French TEI files.

In [1]:
import os
import pandas as pd
from os import listdir
from os import path
from os.path import isfile, join
import numpy as np
import string 
from bs4 import BeautifulSoup as bs
import string
import json
import re
import copy
from collections import Counter

In [2]:
def create_character_cast(play_soup):
    """
    The function builds a lookup of the dramatic characters declared in the cast list of the play.
    Every 'castitem' element contributes one entry keyed by the text of its first 'role' element.
    Params:
        play_soup - play xml turned into beautiful soup object.
    Returns:
        character_dict - a dictionary where keys are the role texts and values are dictionaries with
                         "alternative_names" (the id sliced out of the serialized role tag) and
                         "collective_number" (an int, or None if the cast item has no
                         'collective_number' element).
    """
    character_dict = {}
    for cast_item in play_soup.find_all('castitem'):
        first_role = cast_item.find_all('role')[0]
        role_markup = str(first_role)
        # the id is whatever sits between 'id=' and the ' sex' attribute of the serialized tag
        id_attribute = role_markup[role_markup.find('id='):role_markup.find(' sex')]
        xml_id = id_attribute.replace('"', '').split('=')[-1]
        # in case there is a collective number
        collective_tags = cast_item.find_all('collective_number')
        if len(collective_tags) == 0:
            collective_number = None
        else:
            collective_number = int(collective_tags[0].get_text())
        character_dict[first_role.get_text()] = {"alternative_names": xml_id,
                                                 "collective_number": collective_number}
    return character_dict

In [3]:
def get_scene_status(scene):
    """
    The function tells a "regular" scene (i.e., as presented in the publication) from an "extra" one
    introduced by our custom markup reflecting Iarkho's division into scenes.
    The decision is based on the text between 'type=' and the first '>' of the serialized scene:
    the part after its last '=' (quotes removed) is searched for the substring 'extra'.
    Params:
        scene - beautiful soup object of a scene xml.
    Returns:
        scene_status - a string that can be either "regular" or "extra"
    """
    markup = str(scene)
    type_start = markup.find('type=')
    tag_end = markup.find('>')
    type_value = markup[type_start:tag_end].replace('"', '').split('=')[-1]

    return 'extra' if 'extra' in type_value else 'regular'

In [4]:
def handle_scene_name_and_count(scene, sc_num, extra_scene_number):
    """
    The function looks up the scene status and derives the number of the scene from it. An extra scene keeps
    the number of its main scene and gets a suffix, e.g. 1.1 is the first extra scene of the main scene 1.
    Params:
        scene - beautiful soup object of a scene xml (passed on to get_scene_status).
        sc_num - number of the preceding scene; may be an int or a string such as '1.2'.
        extra_scene_number - the suffix to use if this scene turns out to be an extra scene.
    Returns:
        scene_status - the status returned by get_scene_status.
        sc_num - for an extra scene a string '<main scene>.<extra_scene_number>', otherwise the main scene
                 number increased by one (an int).
        extra_scene_number - increased by one after an extra scene, reset to 1 otherwise.
    """
    # drop the extra-scene suffix, if any, to recover the main scene number
    main_scene_number = int(float(sc_num))
    scene_status = get_scene_status(scene)
    if scene_status != 'extra':
        return scene_status, main_scene_number + 1, 1
    extra_label = '{}.{}'.format(main_scene_number, extra_scene_number)

    return scene_status, extra_label, extra_scene_number + 1

In [5]:
def find_speakers(scene):
    """
    The function lists the speakers of a scene in the order of their utterances, taken from the 'who'
    attribute of every 'sp' element. A speaker appears once per utterance. If 'who' contains commas,
    it is split and every (stripped) part is listed separately.
    Params:
        scene - a beautiful soup object of the scene xml.
    Returns:
        speakers_lst - a list of speakers in the scene.
    """
    speakers_lst = []
    for utterance in scene.find_all('sp'):
        who = utterance['who']
        who_text = str(who)
        if ',' in who_text:
            # several speakers share this utterance
            speakers_lst.extend(part.strip() for part in who_text.split(','))
        else:
            speakers_lst.append(who)

    return speakers_lst

In [6]:
def check_cast_vs_speakers(scene_cast_lst, speakers, scene):
    """
    The function is a quality check for cases when a dramatic character speaks in a scene
    but is not listed in the scene cast. Nothing is reported when the scene cast is empty.
    Params:
        scene_cast_lst - a list of dramatic characters who are present in the scene.
        speakers - a list of speakers in the scene.
        scene - a beautiful soup object of the scene xml.
    Returns:
        No return, prints an error with the characters who speak but are not listed as present,
        the scene cast, and the first 70 characters of the scene.
    """
    listed_characters = set(scene_cast_lst)
    unlisted_speakers = set(speakers).difference(listed_characters)
    if len(unlisted_speakers) == 0 or len(scene_cast_lst) == 0:
        return
    print('\tERROR.', 'Speak but do not appear in scene cast:',
          unlisted_speakers,
          'Beginning of the scene:',
          listed_characters, str(scene)[:70])

In [7]:
def tackle_name(character_cast_dict, scene_cast, scene):
    """
    The function finds the dramatic characters whose lowercased name occurs in the scene cast string.
    Params:
        character_cast_dict - a dictionary with dramatic characters for the play.
        scene_cast - a string that contains information about the dramatic characters in the scene.
        scene - not used by this function.
    Returns:
        sorted_characters - a list of the matching dramatic characters ordered by the position of their
                            first occurrence in the scene_cast string.
    """
    first_occurrence = {}
    for name in character_cast_dict:
        position = scene_cast.find(name.lower())
        if position >= 0:
            first_occurrence[name] = position
    # the sort is stable, so names found at the same position keep the order of the cast dictionary
    sorted_characters = sorted(first_occurrence, key=first_occurrence.get)

    return sorted_characters

In [8]:
def count_characters(scene_summary_dict):
    """
    The function parses scene_summary_dict with information about number of utterances by each character
    and identifies the total number of speakers and the percentage of non-speakers.
    All three summary fields ('num_utterances', 'num_speakers', 'perc_non_speakers') are ignored, so the
    function gives the same answer whether or not the summary has already been completed.
    Params:
        scene_summary_dict: a dictionary where keys are dramatic characters and values are number of utterances.
    Returns:
        num_speakers - a number of speaking dramatic characters in the scene.
        perc_non_speakers - percentage of non-speaking dramatic characters in the scene, rounded to 3 digits;
                            0.0 if the summary contains no characters at all.
    """
    meta_fields = ('num_utterances', 'num_speakers', 'perc_non_speakers')
    utterance_counts = [count for name, count in scene_summary_dict.items() if name not in meta_fields]
    if not utterance_counts:
        # no characters: avoid dividing by zero
        return 0, 0.0
    num_speakers = sum(1 for count in utterance_counts if count != 0)
    num_non_speakers = len(utterance_counts) - num_speakers
    perc_non_speakers = round((num_non_speakers / len(utterance_counts)) * 100, 3)

    return num_speakers, perc_non_speakers

In [9]:
def check_if_no_change(current_scene_cast, previous_cast, scene_status):
    """
    The function checks if the cast for the new scene is different from the previous scene.
    Params:
        current_scene_cast - a list of characters in the current scene.
        previous_cast - a list of characters in the previous scene.
        scene_status - the status of the current scene determined so far.
    Returns:
        scene_status - 'no_change' if compare_two_scenes reports that the cast did not change,
                       otherwise the scene_status that was passed in.
    """
    # compare_two_scenes returns 'no_change' or None, so fall back to the incoming status on None
    return compare_two_scenes(current_scene_cast, previous_cast) or scene_status

In [10]:
def compare_two_scenes(cast_one, cast_two):
    """
    The function tells whether two scenes have the same dramatic characters, ignoring order and repetitions.
    Params:
        cast_one - a list of characters in scene one.
        cast_two - a list of characters in scene two.
    Returns:
        no_change_scene - 'no_change' if the two casts are equal as sets, None otherwise.
    """
    same_cast = set(cast_one) == set(cast_two)

    return 'no_change' if same_cast else None

In [11]:
def parse_scenes(scenes, character_cast_dictionary):
    """
    The function goes through a list of scenes and fills complete_scene_info with information
    about each scene: the characters and their utterance counts, the number of utterances, the number
    of speakers, and the percentage of non-speaking characters.
    Params:
        scenes - a list of scenes of one act.
        character_cast_dictionary - a dictionary with the dramatic characters of the play, passed on
                                    to count_utterances.
    Returns:
        complete_scene_info - a dictionary where keys are scene names of the form
                              '<scene number>_<scene status>' and values are the scene summaries.
    """
    other_meta_fields = ['num_speakers', 'perc_non_speakers', 'num_utterances']
    complete_scene_info = {}
    scene_names = []
    sc_num = 0
    extra_scene_number = 1
    for scene in scenes:
        scene_status, sc_num, extra_scene_number = handle_scene_name_and_count(scene, sc_num, extra_scene_number)
        # Take the cast of the scene processed just before this one. Testing scene_names instead of
        # `sc_num != 1` keeps an act that opens with an extra scene (sc_num '0.1') from indexing
        # into an empty list.
        if scene_names:
            previous_cast = [name for name in complete_scene_info[scene_names[-1]].keys()
                             if name not in other_meta_fields]
        else:
            previous_cast = []
        scene_summary, scene_cast = count_utterances(scene, character_cast_dictionary, previous_cast, scene_status)
        # the summary holds only characters at this point, so the sum is the number of utterances
        scene_summary['num_utterances'] = sum(scene_summary.values())
        scene_summary['num_speakers'], scene_summary['perc_non_speakers'] = count_characters(scene_summary)
        if float(sc_num) > 1:
            current_cast = [key for key in scene_summary.keys() if key not in other_meta_fields]
            scene_status = check_if_no_change(current_cast, previous_cast, scene_status)
        scene_name = str(sc_num) + '_' + str(scene_status)
        complete_scene_info[scene_name] = scene_summary
        scene_names.append(scene_name)

    return complete_scene_info

In [12]:
def remove_excluded_characters(character_cast_dict, scene_cast_string, scene):
    """
    The function removes characters who are not present in the scene, as marked by 'excepté', 'moins'.
    Everything from the earliest marker onwards is cut off the scene cast string and searched for the
    characters to exclude. Markers are matched case-insensitively.
    Params:
        character_cast_dict - a dictionary with dramatic characters for the play.
        scene_cast_string - a string that contains the characters who are present in the scene.
        scene - passed on to tackle_name.
    Returns:
        scene_cast_string - without the part that starts at the first marker, if applicable.
        characters_to_exclude - a list of the characters found after the marker; empty if there is no marker.
    """
    markers = ['excepté', 'moins']
    lowered_cast = scene_cast_string.lower()
    marker_positions = [lowered_cast.find(marker) for marker in markers]
    marker_positions = [position for position in marker_positions if position != -1]
    if not marker_positions:
        return scene_cast_string, []
    # Cut once, at the earliest marker, so that one marker's exclusions are not
    # overwritten when the other marker is absent.
    cut = min(marker_positions)
    # tackle_name looks for lowercased names, so hand it the lowercased tail
    characters_to_exclude = tackle_name(character_cast_dict, lowered_cast[cut:], scene)

    return scene_cast_string[:cut], characters_to_exclude

In [13]:
def identify_scene_cast(scene, scene_status, character_cast_dict):
    """
    The function cuts the string with the dramatic characters of the scene out of the serialized scene xml
    and separates the dramatic characters who should be excluded from the cast, i.e., after "excepté" or
    "moins."
    Params:
        scene - a beautiful soup object of the scene xml.
        scene_status - if the status contains "extra" or the scene markup contains "complex_scene", the cast
                        is read from the markup between 'cast="' and 'type', e.g., cast="FILIPIN, ORONTE,"
                        otherwise, it is the text between the first '>' and the first '<sp',
                        e.g., SCENE I. Filipin, Oronte.
        character_cast_dict - a dictionary where keys are dramatic characters and values are their alternative
                              names and collective numbers.
    Returns:
        scene_cast - a lowercased string that contains the dramatic characters present in the scene.
        excluded_characters - a list of characters who should be removed from the scene cast.
    """
    markup = str(scene)
    cast_is_in_markup = 'extra' in scene_status or 'complex_scene' in markup
    if cast_is_in_markup:
        cast_start, cast_end = markup.find('cast="'), markup.find('type')
    else:
        cast_start, cast_end = markup.find('>') + 1, markup.find('<sp')
    raw_cast = markup[cast_start:cast_end].lower()
    # remove excluded characters
    scene_cast, excluded_characters = remove_excluded_characters(character_cast_dict, raw_cast, scene)

    return scene_cast, excluded_characters

In [14]:
def handle_preceding_scene_characters(scene_cast, previous_cast, characters_current_scene, excluded_characters):
    """
    A scene would often mention that some of the characters are the same as the ones from a previous scene.
    If the scene cast contains 'précédent', 'precedent' or 'même', the characters of the previous scene
    (minus the excluded ones) are appended to the characters listed for the current scene.
    Params:
        scene_cast - a string that contains the dramatic characters listed for the scene.
        previous_cast - a list of dramatic characters who appeared in the previous scene.
        characters_current_scene - a list of dramatic characters that are listed for the scene.
        excluded_characters - a list of characters who are listed as excluded.
    Returns:
        updated_characters - a new list with the characters of the scene, followed by the ones carried over
                            from the previous scene, if applicable.
    """
    carry_over_markers = ('précédent', 'precedent', 'même')
    refers_to_previous = any(marker in scene_cast for marker in carry_over_markers)
    if refers_to_previous:
        carried_over = [name for name in previous_cast if name not in excluded_characters]
    else:
        carried_over = []

    return characters_current_scene + carried_over

In [15]:
def extract_utterances(character_cast_dict, scene):
    """
    The function translates the speakers returned by find_speakers for the scene into the names
    used as keys of the character cast dictionary.
    Params:
        character_cast_dict - a dictionary where keys are dramatic characters and values are dictionaries
                              holding their 'alternative_names'.
        scene - a beautiful soup object of the scene xml.
    Returns:
        utterance_lst - a list of dramatic characters, one entry per speaker returned by find_speakers.
                        A speaker that matches no 'alternative_names' value raises KeyError.
    """
    # look up by alternative name
    name_by_alternative = {info['alternative_names']: name for name, info in character_cast_dict.items()}

    return [name_by_alternative[speaker] for speaker in find_speakers(scene)]

In [16]:
def count_handler(characters, utterance_lst):
    """
    The function counts how many utterances each dramatic character of the scene pronounces.
    Params:
        characters - a list of dramatic characters present in the scene; if it is empty, the distinct
                     speakers are used instead.
        utterance_lst - a list of speakers who make utterances in the given scene.
    Returns:
        scene_info - a dictionary where keys are dramatic characters and values are the number of utterances
                     they make. If there are fewer than two characters, the dictionary holds a single entry:
                     the first speaker of utterance_lst with the count 1.
    """
    # if no cast is given, use the speakers for scene cast
    cast = set(utterance_lst) if len(characters) == 0 else characters
    if len(cast) <= 1:
        # NOTE(review): the count is fixed at 1 here regardless of how often the speaker appears
        return {utterance_lst[0]: 1}

    return {character: utterance_lst.count(character) for character in cast}

In [17]:
def count_utterances(scene, character_cast_dict, previous_cast, scene_status):
    """
    The function counts the number of utterances each dramatic character makes in a given scene.
    Params:
        scene - a beautiful soup object of the scene xml.
        character_cast_dict - a dictionary where keys are dramatic characters and values are their alternative
                              names and collective numbers.
        previous_cast - a list of dramatic characters who appeared in the previous scene.
        scene_status - whether a scene is regular or extra.
    Returns:
        scene_info - a dictionary where keys are characters and values are the number of utterances.
        characters - the list of dramatic characters identified for the scene.
    """
    scene_cast, excluded_characters = identify_scene_cast(scene, scene_status, character_cast_dict)
    listed_characters = tackle_name(character_cast_dict, scene_cast, scene)
    # account for dramatic characters from a previous scene re-appearing in the new scene.
    characters = handle_preceding_scene_characters(
        scene_cast, previous_cast, listed_characters, excluded_characters)
    utterance_lst = extract_utterances(character_cast_dict, scene)
    # run a quality check
    check_cast_vs_speakers(characters, utterance_lst, scene)

    # count how many utterances each speaker makes
    return count_handler(characters, utterance_lst), characters

In [18]:
def process_summary(soup, character_cast_dictionary):
    """
    The function parses the scenes of every act of the play.
    Params:
        soup - a beautiful soup object of the play xml.
        character_cast_dictionary - a dictionary with the dramatic characters of the play.
    Returns:
        act_info - a dictionary where keys are 'act_1', 'act_2', ... (in the order the 'div1' elements of
                   type 'act' are found) and values are the outputs of parse_scenes for the 'div2' elements
                   of type 'scene', 'extra_scene' or 'complex_scene' of that act.
    """
    acts = soup.find_all('div1', {'type': 'act'})

    return {
        'act_' + str(act_num): parse_scenes(
            act.find_all('div2', {'type': ['scene', 'extra_scene', 'complex_scene']}),
            character_cast_dictionary)
        for act_num, act in enumerate(acts, 1)
    }

In [19]:
def number_present_characters(play_dictionary):
    """
    The function calculates the number of characters present in the play. A character who is listed in the cast
    but appears in no scene summary doesn't count. A character with a collective number counts as that many.
    Params:
        play_dictionary - a dictionary with data for the play: 'characters' holds the cast and 'play_summary'
                          holds, per act and scene, the characters present in the scene.
    Returns:
        total_number_present_characters - int.
    """
    meta_fields = ('num_utterances', 'num_speakers', 'perc_non_speakers')
    all_present_characters = {
        name
        for act_scenes in play_dictionary['play_summary'].values()
        for scene_summary in act_scenes.values()
        for name in scene_summary.keys()
        if name not in meta_fields
    }
    cast = play_dictionary['characters']
    total_number_present_characters = 0
    for character in set(cast).intersection(all_present_characters):
        coll_number = cast[character]['collective_number']
        # if there is a collective number for this character, it stands for that many characters
        total_number_present_characters += int(coll_number) if coll_number else 1

    return total_number_present_characters

In [20]:
def estimate_number_scenes(scene_summary):
    """
    The function calculates the number of scenes per text and per Iarkho (i.e., as marked by actual dramatic
    character entrances and exits).
    Params:
        scene_summary - a dictionary where keys are acts and values are dictionaries keyed by scene names.
    Returns:
        total_number_scenes_per_text - number of scenes as they are printed, i.e., scene names
                                       that do not contain 'extra'.
        total_number_scenes_iarkho - number of scenes per Iarkho, which he calls mobility coefficient (MC),
                                     i.e., scene names that do not contain 'no_change'.
    """
    scene_names = [name for act_scenes in scene_summary.values() for name in act_scenes.keys()]
    # get the number of scenes as it is printed in the text
    total_number_scenes_per_text = sum(1 for name in scene_names if 'extra' not in name)
    # count scenes as marked by actual entrances and exits
    total_number_scenes_iarkho = sum(1 for name in scene_names if 'no_change' not in name)

    return total_number_scenes_per_text, total_number_scenes_iarkho

In [21]:
def number_speaking_no_change_case(previous_scene, no_change_scene):
    """
    The function handles such instances when there is no change in the character cast between two scenes, therefore,
    according to Iarkho's methodology, they should be counted as one scene. A character counts as speaking
    if he/she has utterances in at least one of the two scenes, and as non-speaking if he/she has none in both.

    Params:
        previous_scene - the summary of the first of the two scenes between which no change of cast happens;
                         its characters are the ones that are counted.
        no_change_scene - the summary of the second of the two scenes.
    Returns:
        num_speaking - number of speaking characters in the combined scene.
        perc_non_speaking - percentage of non-speaking characters in the combined scene, rounded to 3 digits.
    """
    meta_fields = ("num_utterances", "num_speakers", "perc_non_speakers")
    characters = [name for name in previous_scene.keys() if name not in meta_fields]
    speaking = {name for name in characters
                if previous_scene[name] > 0 or no_change_scene[name] > 0}
    silent_in_some_scene = {name for name in characters
                            if previous_scene[name] == 0 or no_change_scene[name] == 0}
    # silent in one scene but speaking in the other still counts as speaking
    num_non_speaking = len(silent_in_some_scene - speaking)
    num_speaking = len(speaking)
    perc_non_speaking = round((num_non_speaking / (num_non_speaking + num_speaking)) * 100, 3)

    return num_speaking, perc_non_speaking

In [22]:
def combine_no_change_scenes(play_summary):
    """
    The function keeps track of which scenes should not be counted as separate scenes because no change of cast
    happens but instead should be combined. Every scene whose name contains 'no_change' is paired with the scene
    that precedes it in the same act, and the speakers are counted for the pair.
    Params:
        play_summary - a dictionary where keys are acts, values are dictionaries keyed by scene names,
        and the scene values are dramatic characters with the number of utterances they make.
    Returns:
        which_to_exclude - a list of tuples (act, no change scene, preceding scene).
        speakers - number of speakers in each combined pair of scenes.
        perc_non_speakers - percentage of the non-speaking characters in each combined pair of scenes.
    """
    which_to_exclude, speakers, perc_non_speakers = [], [], []
    for act, act_scenes in play_summary.items():
        seen_scene_names = []
        for scene_name in list(act_scenes.keys()):
            if 'no_change' in scene_name:
                preceding_name = seen_scene_names[-1]
                num_speaking, perc_non_speaking = number_speaking_no_change_case(
                    act_scenes[preceding_name], act_scenes[scene_name])
                speakers.append(num_speaking)
                perc_non_speakers.append(perc_non_speaking)
                which_to_exclude.append((act, scene_name, preceding_name))
            seen_scene_names.append(scene_name)

    return which_to_exclude, speakers, perc_non_speakers

In [23]:
def remove_combined_scenes(play_dict, values_to_exclude):
    """
    The function removes info about scenes that we have previously combined and calculated combined data
    for in cases when there was no change in character cast. The acts of play_dict are replaced in place.
    Params:
        play_dict - a dictionary where keys are acts and values are dictionaries keyed by scene names.
        values_to_exclude - a list of tuples where the first value is the act and the other values are scenes.

    Returns:
        play_dict - the same dictionary, without the excluded scenes.
    """
    for act, *combined_scenes in values_to_exclude:
        play_dict[act] = {scene_name: summary
                          for scene_name, summary in play_dict[act].items()
                          if scene_name not in combined_scenes}

    return play_dict

In [24]:
def preprocess_play_summary(play_summary_copy):
    """
    The function combines two scenes between which no change of character cast happens and collects the number of
    speaking characters and percentage of non-speaking characters with such scenes combined into one.
    The values of the combined scenes come first, followed by the values of the remaining scenes.
    Params:
        play_summary_copy - copy of the play summary; the combined scenes are removed from it.
    Returns:
        speakers - a list of numbers of speakers.
        perc_non_speakers - a list of percentages of non-speakers, rounded to 3 digits for the remaining scenes.
    """
    values_to_exclude, speakers, perc_non_speakers = combine_no_change_scenes(play_summary_copy)
    remaining_summary = remove_combined_scenes(play_summary_copy, values_to_exclude)
    for act_scenes in remaining_summary.values():
        for scene_summary in act_scenes.values():
            speakers.append(scene_summary['num_speakers'])
            perc_non_speakers.append(round(scene_summary['perc_non_speakers'], 3))

    return speakers, perc_non_speakers

In [25]:
def speech_distribution_iarkho(play_summary_copy):
    """
    The function creates speech distribution per Iarkho, i.e., the number of speaking characters by number of scenes.
    Params:
        play_summary_copy - a copy of the play summary, passed on to preprocess_play_summary.
    Returns:
        speech_distribution - a list of tuples where the 0 element is the number of speaking characters
                              and the 1 element is the number of scenes with such number of speaking characters,
                              sorted by the number of speaking characters.
        speech_types - the output of percentage_of_each_speech_type for the speech distribution.
        av_perc_non_speakers - the mean percentage of non-speakers, rounded to 3 digits.
    """
    speakers, perc_non_speakers = preprocess_play_summary(play_summary_copy)
    scenes_per_number_of_speakers = Counter(speakers)
    speech_distribution = sorted(scenes_per_number_of_speakers.items(), key=lambda entry: entry[0])
    speech_types = percentage_of_each_speech_type(speech_distribution)
    av_perc_non_speakers = round(np.mean(perc_non_speakers), 3)

    return speech_distribution, speech_types, av_perc_non_speakers

In [26]:
def sigma_iarkho(variants, weights):
    """
    The function allows calculating standard range following iarkho's procedure: the square root of the
    weighted mean of the squared differences between the variants and their weighted mean.
    Parameters:
        variants - a list with distinct variants in the ascending order, e.g. [1, 2, 3, 4, 5]
        weights - a list of weights corresponding to these variants, e.g. [20, 32, 18, 9, 1]
    Returns:
        sigma - standard range per iarkho
    """
    weighted_mean = np.average(variants, weights=weights)
    squared_deviations = []
    for variant in variants:
        squared_deviations.append((variant - weighted_mean) ** 2)
    weighted_variance = np.average(squared_deviations, weights=weights)

    return weighted_variance ** 0.5

In [27]:
def process_speakers_features(soup, play_data, metadata_dict):
    """
    Iarkho's features described in Iarkho's work on the evolution of 5-act tragedy in verse.
    Params:
        soup - not used by this function.
        play_data - a dictionary with information about the play, including 'play_summary'.
        metadata_dict - a dictionary where we are storing play features; it is updated in place.
    Returns:
        metadata_dict - updated with the number of present characters, the numbers of scenes, the speech
                        distribution, the percentages of speech types, the average percentage of non-speakers,
                        and sigma per Iarkho (rounded to 3 digits).
    """
    play_summary = play_data['play_summary']
    metadata_dict['num_present_characters'] = number_present_characters(play_data)
    scenes_per_text, scenes_per_iarkho = estimate_number_scenes(play_summary)
    metadata_dict['num_scenes_text'] = scenes_per_text
    metadata_dict['num_scenes_iarkho'] = scenes_per_iarkho
    # work on a copy as the no change scenes are removed from the summary during processing
    distribution, speech_types, non_speakers = speech_distribution_iarkho(copy.deepcopy(play_summary))
    metadata_dict['speech_distribution'] = distribution
    for feature_name, speech_type in (('percentage_monologues', 'perc_monologue'),
                                      ('percentage_duologues', 'perc_duologue'),
                                      ('percentage_non_duologues', 'perc_non_duologue'),
                                      ('percentage_above_two_speakers', 'perc_over_two_speakers')):
        metadata_dict[feature_name] = speech_types[speech_type]
    metadata_dict['av_percentage_non_speakers'] = non_speakers
    numbers_of_speakers = [entry[0] for entry in distribution]
    numbers_of_scenes = [entry[1] for entry in distribution]
    metadata_dict['sigma_iarkho'] = round(sigma_iarkho(numbers_of_speakers, numbers_of_scenes), 3)

    return metadata_dict

In [28]:
def percentage_of_each_speech_type(speech_distribution):
    """
    The function calculates the percentage of each speech type (monologue, duologue, non-duologue (meaning not two
    speakers), and over-two speakers) of the total across all speech types.
    Params:
        speech_distribution - a list of pairs: number of speakers and the number of scenes with that
                              number of speakers.
    Returns:
        speech_types - a dictionary with percentages (rounded to 2 digits) corresponding to each speech type.
    """
    total_scenes = np.sum([entry[1] for entry in speech_distribution])

    def percentage_of_scenes(has_wanted_number_of_speakers):
        # share of the scenes whose number of speakers satisfies the condition
        matching_scenes = np.sum([entry[1] for entry in speech_distribution
                                  if has_wanted_number_of_speakers(entry[0])])
        return np.round((matching_scenes / total_scenes) * 100, 2)

    speech_types = {
        'perc_monologue': percentage_of_scenes(lambda num_speakers: num_speakers == 1),
        'perc_duologue': percentage_of_scenes(lambda num_speakers: num_speakers == 2),
        'perc_non_duologue': percentage_of_scenes(lambda num_speakers: num_speakers != 2),
        'perc_over_two_speakers': percentage_of_scenes(lambda num_speakers: num_speakers > 2),
    }

    return speech_types

In [29]:
def total_utterances(play_soup):
    """
    The function outputs the total number of utterances in the play, i.e., the number of 'sp' elements.
    Params:
        play_soup - a beautiful soup object of the play xml.

    Returns:
        total_utterances_in_play - total number of utterances in a play.
    """
    all_utterances = play_soup.find_all('sp')

    return len(all_utterances)

In [30]:
def count_all_verse_lines(soup):
    """
    The function counts the verse lines of the play: all 'l' elements except those whose 'part' attribute
    is "m" or "i", so that a line split into several parts is not counted more than once.
    Params:
        soup - a beautiful soup object of the play xml.
    Returns:
        num_verse_lines - the number of 'l' elements that are not marked with part "m" or "i".
    """
    # The elements with part "m" and part "i" are two separate subsets of all 'l' elements, so
    # subtracting the counts gives the same number as filtering the full list, without
    # comparing every line against every excluded line.
    num_all_lines = len(soup.find_all('l'))
    num_middle_parts = len(soup.find_all('l', {"part": "m"}))
    num_initial_parts = len(soup.find_all('l', {"part": "i"}))
    num_verse_lines = num_all_lines - num_middle_parts - num_initial_parts

    return num_verse_lines

In [31]:
def process_features_verse(play_soup, play_data, metadata_dict):
    """
    Iarkho's features described in the work on Corneille's comedies and tragedies.
    Params:
        play_soup - a beautiful soup object of the play xml.
        play_data - not used by this function.
        metadata_dict - a dictionary where we are storing play features; it is updated in place.
    Returns:
        metadata_dict - updated with 'total_utterances', 'num_verse_lines', and 'dialogue_vivacity'
                        (utterances per verse line, rounded to 3 digits).
    """
    num_utterances = total_utterances(play_soup)
    metadata_dict['total_utterances'] = num_utterances
    num_verse_lines = count_all_verse_lines(play_soup)
    metadata_dict['num_verse_lines'] = num_verse_lines
    metadata_dict['dialogue_vivacity'] = round(num_utterances / num_verse_lines, 3)
    return metadata_dict

In [32]:
def add_play_info(metadata, custom_flag=False, play_soup=None):
    """
    Collect the title, author and creation date of the play. We can provide our own metadata or use
    the TEI metadata.
    Params:
        metadata - used only if custom_flag is True: an indexable whose first row holds the title at
                   position 0, the author at positions 1 and 2 (presumably last name and first name),
                   and the creation date at position 3.
        custom_flag - True to read from metadata, False to read from the play xml.
        play_soup - a beautiful soup object of the play xml, used if custom_flag is False. If it is not
                    given, the notebook-level variable `soup` is used, as in earlier versions of
                    this function.
    Returns:
        play_data - a dictionary with 'title', 'author', and 'creation_date'.
    """
    play_data = {}
    if custom_flag:
        play_data['title'] = metadata[0][0]
        first_name = metadata[0][2]
        # a float here is a missing first name (NaN); isinstance also covers numpy floats
        if not isinstance(first_name, float):
            play_data['author'] = str(metadata[0][1] + ', ' + first_name).strip()
        else:
            play_data['author'] = metadata[0][1].strip()
        play_data['creation_date'] = metadata[0][3]
    else:
        if play_soup is None:
            # backward compatibility: older calls relied on a global `soup` being defined
            play_soup = soup
        play_data['title'] = play_soup.find('titlepart').get_text()
        play_data['author'] = play_soup.find('docauthor')['id']
        play_data['creation_date'] = int(play_soup.find('docdate').get_text().replace('.', ''))

    return play_data

In [33]:
def percentage_of_scenes_discont_change(play_soup, play_data, metadata_dict):
    """
    The function calculates the number and the percentage of scenes with a discontinuous change of
    dramatic characters, i.e., scenes in which not a single dramatic character of the preceding scene
    re-appears, e.g., scene 1. FILIPIN, ANGELIQUE. scene 2. ORONTE. Scenes are compared in the order
    of play_data['play_summary'], and the comparison continues across act boundaries.
    Params:
        play_soup - a beautiful soup object of the play text (not used by this function).
        play_data - a dictionary with information about the play; its 'play_summary' maps
                    act -> scene -> per-scene dictionary keyed by dramatic characters plus the
                    bookkeeping keys 'num_speakers', 'perc_non_speakers' and 'num_utterances'.
        metadata_dict - a dictionary where we are storing play features; must already hold
                        'num_scenes_iarkho', which is used as the denominator.
    Returns:
        metadata_dict - updated in place with number_scenes_with_discontinuous_change_characters and
                        percentage_scenes_with_discontinuous_change_characters.
    """
    bookkeeping_keys = {'num_speakers', 'perc_non_speakers', 'num_utterances'}
    total_scenes = metadata_dict['num_scenes_iarkho']
    previous_cast = None  # None until the first scene has been seen
    discontinuous_count = 0
    for act_summary in play_data['play_summary'].values():
        for scene_summary in act_summary.values():
            current_cast = set(scene_summary.keys()) - bookkeeping_keys
            # The very first scene has nothing to be compared with.
            if previous_cast is not None and previous_cast.isdisjoint(current_cast):
                discontinuous_count += 1
            previous_cast = current_cast
    percentage = round((discontinuous_count / total_scenes) * 100, 3)
    metadata_dict.update({
        'number_scenes_with_discontinuous_change_characters': discontinuous_count,
        'percentage_scenes_with_discontinuous_change_characters': percentage,
    })

    return metadata_dict

In [34]:
def additional_metadata(play_soup, play_data):
    """
    Compute all play features by running every feature function in turn.
    Params:
        play_soup - a beautiful soup object of the play text.
        play_data - a dictionary with information about the play.
    Returns:
        features - a dictionary with the features accumulated by all the steps.
    """
    # Each step receives the dictionary returned by the previous one, so the order matters:
    # percentage_of_scenes_discont_change reads 'num_scenes_iarkho', which is presumably
    # written by process_speakers_features - TODO confirm.
    feature_steps = (
        process_speakers_features,
        process_features_verse,
        percentage_of_scenes_discont_change,
    )
    features = {}
    for step in feature_steps:
        features = step(play_soup, play_data, features)

    return features

In [35]:
def process_play(file_name, metadata_df, custom_flag):
    """
    The function parses a TEI xml file and creates a summary with features and metadata for the play.
    Params:
        file_name - a string, path to the xml file with the play text.
        metadata_df - a dataframe containing the info about the plays; used only when custom_flag
                      is True, in which case it must have the columns 'index', 'title',
                      'last_name', 'first_name' and 'date'.
        custom_flag - if True, title/author/date come from metadata_df (the row whose 'index'
                      equals the file name without directory and extension); otherwise they are
                      read from the TEI document itself.
    Returns:
        play_data - a dictionary with detailed play summary by scenes, metadata, and features
    Raises:
        IndexError - if custom_flag is True and metadata_df has no row for this play.
    """
    print(file_name)
    with open(file_name, 'r') as file:
        soup = bs(file, 'lxml')
    if custom_flag:
        # Derive the index from the file name itself, so that the input folder does not have to
        # be called 'TEI_files/'.
        play_index = path.splitext(path.basename(file_name))[0]
        play_meta = metadata_df[metadata_df['index']==play_index][['title', 'last_name',
                                                                  'first_name', 'date']].values
        if len(play_meta) == 0:
            raise IndexError("no metadata row with index '{}' for file {}".format(play_index,
                                                                                  file_name))
        play_data = add_play_info(play_meta, custom_flag)
    else:
        # The TEI front matter is read here, from the document parsed above, because the
        # two-argument call add_play_info(metadata, custom_flag) is not given this document.
        play_data = {
            'title': soup.find('titlepart').get_text(),
            'author': soup.find('docauthor')['id'],
            'creation_date': int(soup.find('docdate').get_text().replace('.', '')),
        }
    play_data['characters'] = create_character_cast(soup)
    play_data['play_summary'] = process_summary(soup, play_data['characters'])
    play_data['metadata'] = additional_metadata(soup, play_data)

    return play_data

In [36]:
def process_all_plays(input_directory, output_path, custom_flag=False, metadata_path=None):
    """
    The function allows to process all xml files in a specified directory.
    Params:
        input_directory - the path to the folder containing the xml files.
        output_path - existing directory in which the json summaries will be saved.
        custom_flag - if True, play metadata is taken from the file at metadata_path.
        metadata_path - path to the metadata file, a tab-delimited txt file with information about
                        all plays; required when custom_flag is True.
    Returns:
        no returns, the files will be saved in output_path directory.
    Raises:
        ValueError - if custom_flag is True and metadata_path is not provided.
    """
    # Only names that end in '.xml' are plays; a substring test would also pick up backups such
    # as 'F_1.xml.bak'. Sorting makes the processing order reproducible between runs.
    all_files = sorted(f for f in listdir(input_directory) if f.endswith('.xml'))
    if custom_flag:
        if metadata_path is None:
            raise ValueError('metadata_path is required when custom_flag is True')
        metadata_df = pd.read_csv(metadata_path, sep='\t')
    else:
        metadata_df = pd.DataFrame()
    for file in all_files:
        # join() works whether or not the directories were given with a trailing separator.
        play_data_dict = process_play(join(input_directory, file), metadata_df, custom_flag)
        json_name = join(output_path, file[:-len('.xml')] + '.json')
        with open(json_name, 'w') as fp:
            json.dump(play_data_dict, fp, ensure_ascii=False, indent=2)

In [37]:
# Batch run: summarise every play in TEI_files/ into Test/, using the custom metadata table.
process_all_plays('TEI_files/', 'Test/',
                  custom_flag=True,
                  metadata_path='French_Comedies.tsv')

TEI_files/F_95.xml
TEI_files/F_56.xml
TEI_files/F_136.xml
TEI_files/F_137.xml
TEI_files/F_123.xml
TEI_files/F_57.xml
TEI_files/F_94.xml
TEI_files/F_82.xml
TEI_files/F_96.xml
TEI_files/F_41.xml
TEI_files/F_69.xml
TEI_files/F_135.xml
TEI_files/F_120.xml
TEI_files/F_68.xml
TEI_files/F_54.xml
TEI_files/F_40.xml
TEI_files/F_78.xml
TEI_files/F_50.xml
TEI_files/F_131.xml
TEI_files/F_119.xml
TEI_files/F_45.xml
TEI_files/F_84.xml
TEI_files/F_53.xml
TEI_files/F_47.xml
TEI_files/F_46.xml
TEI_files/F_21.xml
TEI_files/F_155.xml
TEI_files/F_169.xml
TEI_files/F_3.xml
TEI_files/F_237.xml
TEI_files/F_20.xml
TEI_files/F_34.xml
TEI_files/F_22.xml
TEI_files/F_36.xml
TEI_files/F_156.xml
TEI_files/F_142.xml
TEI_files/F_235.xml
TEI_files/F_1.xml
TEI_files/F_143.xml
TEI_files/F_37.xml
TEI_files/F_23.xml
TEI_files/F_27.xml
TEI_files/F_33.xml
TEI_files/F_153.xml
TEI_files/F_147.xml
TEI_files/F_5.xml
TEI_files/F_185.xml
TEI_files/F_32.xml
TEI_files/F_26.xml
TEI_files/F_18.xml
TEI_files/F_30.xml
TEI_files/F_24.xm

In [38]:
# Spot check on a single play; the returned summary is displayed as the cell output.
process_play('TEI_files/F_187.xml',
             metadata_df=pd.read_csv('French_Comedies.tsv', sep='\t'),
             custom_flag=True)

TEI_files/F_187.xml


{'title': "L'Égoïsme",
 'author': "Cailhava D'estendoux, Jean-François",
 'creation_date': 1777,
 'characters': {'MONSIEUR DE FLORIMON': {'alternative_names': 'FLORIMON',
   'collective_number': None},
  'MADAME DE FLORIMON': {'alternative_names': 'MADAME FLORIMON',
   'collective_number': None},
  'PHILÉMON': {'alternative_names': 'PHILÉMON', 'collective_number': None},
  'LE CHEVALIER': {'alternative_names': 'LE CHEVALIER',
   'collective_number': None},
  'POLIDOR': {'alternative_names': 'POLIDOR', 'collective_number': None},
  'CONSTANCE': {'alternative_names': 'CONSTANCE', 'collective_number': None},
  'MARTON': {'alternative_names': 'MARTON', 'collective_number': None},
  'LA PIERRE': {'alternative_names': 'LA PIERRE', 'collective_number': None},
  'CLERMON': {'alternative_names': 'CLERMON', 'collective_number': None},
  'DURAND': {'alternative_names': 'DURAND', 'collective_number': None},
  'LE NOTAIRE': {'alternative_names': 'LE NOTAIRE', 'collective_number': None},
  'DOMESTIQ