In this notebook, we process TEI files.

In [1]:
import os
import pandas as pd
from os import listdir
from os import path
from os.path import isfile, join
import numpy as np
import string 
from bs4 import BeautifulSoup as bs
import string
import json
import re
from collections import Counter

In [2]:
def create_character_cast(play_soup):
    """
    The function collects the dramatic characters declared in the parsed play.
    Params:
        play_soup - parsed play; every 'person' tag found in it becomes one entry.
    Returns:
        character_dict - a dictionary where keys are the text of the first 'persname' tag of each person and
                         values hold the xml:id string ("alternative_names") and the integer read from the
                         'collective_number' tag ("collective_number", None when that tag is absent).
    """
    character_dict = {}
    for person in play_soup.find_all('person'):
        markup = str(person)
        # the identifier is read from the serialized opening tag: from 'xml:id=' up to the first '>'
        opening_tag = markup[markup.find('xml:id='):markup.find('>')]
        identifier = opening_tag.replace('\"', '').split('=')[-1]
        # a collective character (e.g. a group) may carry the number of people it stands for
        number_tags = person.find_all('collective_number')
        group_size = int(number_tags[0].get_text()) if len(number_tags) != 0 else None
        character_name = person.find_all('persname')[0].get_text()
        character_dict[character_name] = {"alternative_names": identifier,
                                          "collective_number": group_size}
    return character_dict

In [3]:
def handle_scene_name_and_count(scene, sc_num, extra_scene_number):
    """
    The function determines the status of a scene and the number under which it is stored. An extra scene keeps
    the number of the current main scene and gets a suffix, e.g. 1.1 is the first extra scene of main scene 1.
    Params:
        scene - the scene as it is passed to get_scene_status.
        sc_num - number assigned to the preceding scene (an int, or a string such as '1.2' after an extra scene);
                 only its integer part is used.
        extra_scene_number - the suffix to give to the next extra scene of the current main scene.
    Returns:
        scene_status - the value returned by get_scene_status ('extra' or 'regular').
        sc_num - '<main>.<extra>' (a string) for an extra scene, otherwise the main number increased by one (an int).
        extra_scene_number - the suffix for the following extra scene; reset to 1 after a non-extra scene.
    """
    main_number = int(float(sc_num))
    scene_status = get_scene_status(scene)
    if scene_status != 'extra':
        # a new main scene starts: advance the counter and restart the numbering of extra scenes
        return scene_status, main_number + 1, 1
    extra_label = '{}.{}'.format(main_number, extra_scene_number)

    return scene_status, extra_label, extra_scene_number + 1

In [4]:
def check_utterance(scene):
    """
    The function looks for utterances whose serialized markup contains more than one '#'.
    Params:
        scene - a scene; its 'sp' tags are inspected.
    Returns:
        utterance_dict - a dictionary mapping every space-separated piece of the text between the first '#'
                         and the first '">' of such an utterance (e.g. '#a') to that whole text (e.g. '#a #b').
    """
    utterance_dict = {}
    for utterance in scene.find_all('sp'):
        markup = str(utterance)
        if markup.count('#') <= 1:
            continue
        # the slice starts at the first '#' and stops at the first '">'
        joint_speakers = markup[markup.find('#'):markup.find('\">')]
        utterance_dict.update(dict.fromkeys(joint_speakers.split(' '), joint_speakers))

    return utterance_dict

In [5]:
def multi_word_name(character_cast_dict):
    """
    The function tells whether the cast contains a name that consists of several words.
    Params:
        character_cast_dict - a dictionary whose keys are character names.
    Returns:
        True if at least one key contains a space, False otherwise.
    """
    return any(' ' in name for name in character_cast_dict.keys())

In [6]:
def count_utterances(scene, character_cast_dict, previous_cast, scene_status):
    """
    The function identifies the characters present in a scene and counts the utterances of each of them.
    The cast heading is, for extra and complex scenes, the serialized markup between 'cast="' and 'type';
    for all other scenes it is the text of the first 'stage' tag. The heading is lowercased before matching.
    Params:
        scene - the scene to process.
        character_cast_dict - a dictionary where keys are character names and values contain the character's
                              identifier under "alternative_names".
        previous_cast - a list with the names of the characters of the preceding scene.
        scene_status - status of the scene, e.g. 'regular' or 'extra'.
    Returns:
        scene_info - a dictionary where keys are the characters of the scene and values are their numbers of
                     utterances; a scene with a single character is recorded as one utterance.
        scene_characters - a list with the names of all characters of the scene (named in the heading or
                           carried over from the previous scene), without repetitions.
    Raises:
        IndexError - if no character could be identified for the scene.
    """
    # Headings such as "Те же" (the same), "Прежние" (the previous ones) or "Все" (everybody) mean that the
    # previous cast stays on stage. The heading is lowercased, so the markers must be lowercase as well.
    carry_over_markers = ('те', 'прежние', 'все')
    scene_markup = str(scene)
    if scene_status.count('extra') != 0 or scene_markup.count('complex_scene') != 0:
        scene_cast = scene_markup[scene_markup.find('cast=\"'):scene_markup.find('type')].lower()
    else:
        scene_cast = scene.find_all('stage')[0].get_text().lower()
    if any(marker in scene_cast for marker in carry_over_markers):
        characters = list(previous_cast)
    else:
        characters = []
    named_characters = [name for name in character_cast_dict.keys() if name.lower() in scene_cast]
    utterance_dictionary = check_utterance(scene)
    # make sure to include previous cast in case some of the characters are the same
    updated_characters = named_characters + characters
    scene_info = {}
    if len(updated_characters) > 1:
        for character in updated_characters:
            in_scene = '#' + character_cast_dict[character]['alternative_names']
            # utterances spoken jointly with other characters are stored under the combined 'who' value
            if in_scene in utterance_dictionary:
                additional_utterances = len(scene.find_all('sp', {'who': utterance_dictionary[in_scene]}))
            else:
                additional_utterances = 0
            scene_info[character] = additional_utterances + len(scene.find_all('sp', {'who': in_scene}))
    elif len(updated_characters) == 1:
        scene_info[updated_characters[0]] = 1
    else:
        raise IndexError('no character of the cast list was recognised in the scene heading: '
                         + repr(scene_cast))
    # report the full cast of the scene (order preserved, repetitions dropped)
    scene_characters = list(dict.fromkeys(updated_characters))

    return scene_info, scene_characters

In [7]:
def get_scene_status(scene):
    """
    The function reads the status of a scene from its serialized opening tag.
    Params:
        scene - the scene; only its string form is used.
    Returns:
        scene_status - 'extra' if the text between 'type=' and the first '>' contains 'extra',
                       'regular' otherwise.
    """
    markup = str(scene)
    # keep what follows the last '=' inside the opening tag, with the quotes removed
    type_value = markup[markup.find('type='):markup.find('>')].replace('\"', '').split('=')[-1]

    return 'extra' if 'extra' in type_value else 'regular'

In [8]:
def compare_two_scenes(cast_one, cast_two):
    """
    The function tells whether two scenes have the same dramatic characters, ignoring order and repetitions.
    Params:
        cast_one - a list of characters in scene one.
        cast_two - a list of characters in scene two.
    Returns:
        'no_change_scene' if both casts contain the same characters, None otherwise.
    """
    same_cast = set(cast_one) == set(cast_two)

    return 'no_change_scene' if same_cast else None

In [9]:
def check_if_no_change(scene_names, scene_cast, complete_scene_info, scene_status):
    """
    The function checks if the cast for the new scene is different from the previous scene.
    Params:
        scene_names - a list of scenes names in the order they appear in the text.
        scene_cast - a list of characters in the scene.
        complete_scene_info - a dictionary where keys are scene_names and
                                values are characters and their utterance counts.
        scene_status - whether a scene is regular or extra.
    Returns:
        scene_status - a string, changed to 'no_change_scene' if the character cast did not change;
                       returned as received for the first scene or when the cast changed.
    """
    # entries of a scene summary that are statistics, not characters
    bookkeeping_keys = ('num_speakers', 'perc_non_speakers', 'num_utterances')
    if len(scene_names) > 0:
        # compare the current scene cast with the cast of the previous scene
        previous_cast = [name for name in complete_scene_info[scene_names[-1]].keys()
                         if name not in bookkeeping_keys]
        no_change = compare_two_scenes(scene_cast, previous_cast)
        if no_change:
            scene_status = no_change

    return scene_status

In [10]:
def count_characters(scene_summary_dict):
    """
    The function derives the number of speakers and the share of silent characters from the utterance counts
    of a scene. The 'num_utterances' entry is not treated as a character.
    Params:
        scene_summary_dict: a dictionary where keys are dramatic characters and values are number of utterances.
    Returns:
        num_speakers - the number of characters with a non-zero number of utterances.
        perc_non_speakers - percentage of characters with zero utterances, rounded to 3 decimals.
    """
    utterance_counts = [count for name, count in scene_summary_dict.items() if name != 'num_utterances']
    num_speakers = sum(1 for count in utterance_counts if count != 0)
    num_silent = sum(1 for count in utterance_counts if count == 0)
    perc_non_speakers = round((num_silent / len(utterance_counts)) * 100, 3)

    return num_speakers, perc_non_speakers

In [11]:
def parse_scenes(scenes, character_cast_dictionary):
    """
    The function goes through the scenes of one act and builds a summary for each of them: the characters
    with their utterance counts, the total number of utterances, the number of speakers and the percentage
    of non-speaking characters.
    Params:
        scenes - a list of scenes in the order they appear in the act.
        character_cast_dictionary - a dictionary with the cast of the play, passed on to count_utterances.
    Returns:
        complete_scene_info - a dictionary where keys are '<scene number>_<scene status>' and values are the
                              scene summaries described above.
    """
    # entries of a scene summary that are statistics, not characters
    bookkeeping_keys = ('num_speakers', 'perc_non_speakers', 'num_utterances')
    complete_scene_info = {}
    scene_names = []
    sc_num = 0
    extra_scene_number = 1
    for scene in scenes:
        scene_status, sc_num, extra_scene_number = handle_scene_name_and_count(scene, sc_num, extra_scene_number)
        # The first scene of the act has no predecessor. This is decided by whether a scene has already been
        # stored, not by the scene number, because an act may open with an extra scene (numbered '0.1').
        if scene_names:
            previous_cast = [name for name in complete_scene_info[scene_names[-1]].keys()
                             if name not in bookkeeping_keys]
        else:
            previous_cast = []

        scene_summary, scene_cast = count_utterances(scene, character_cast_dictionary, previous_cast, scene_status)
        scene_summary['num_utterances'] = sum(scene_summary.values())
        scene_summary['num_speakers'], scene_summary['perc_non_speakers'] = count_characters(scene_summary)
        scene_status = check_if_no_change(scene_names, scene_cast, complete_scene_info, scene_status)
        scene_name = str(sc_num) + '_' + str(scene_status)
        scene_names.append(scene_name)
        complete_scene_info[scene_name] = scene_summary

    return complete_scene_info

In [12]:
def process_summary(soup, character_cast_dictionary):
    """
    The function builds the scene-by-scene summary of every act of the play.
    Params:
        soup - parsed play; acts are the 'div' tags of type 'act', scenes are the 'div' tags of type
               'scene', 'extra_scene' or 'complex_scene' inside an act.
        character_cast_dictionary - a dictionary with the cast of the play, passed on to parse_scenes.
    Returns:
        a dictionary where keys are 'act_<number>' (numbered from 1) and values are the outputs of parse_scenes.
    """
    scene_types = ['scene', 'extra_scene', 'complex_scene']
    all_acts = soup.find_all('div', {'type': 'act'})

    return {'act_' + str(number): parse_scenes(act.find_all('div', {'type': scene_types}),
                                               character_cast_dictionary)
            for number, act in enumerate(all_acts, 1)}

In [13]:
def number_present_characters(play_dictionary):
    """
    The function counts the characters that are actually present in at least one scene. A character listed in
    the cast who never appears in a scene summary is not counted; a character with a collective number
    contributes that number instead of one.
    Params:
        play_dictionary - a dictionary with the play data: 'characters' (the cast) and 'play_summary'
                          (acts -> scenes -> scene summaries).
    Returns:
        total_number_present_characters - int.
    """
    statistics = ('num_utterances', 'num_speakers', 'perc_non_speakers')
    # every name that occurs in any scene summary, statistics excluded
    seen_in_scenes = {name
                      for act in play_dictionary['play_summary'].values()
                      for scene_summary in act.values()
                      for name in scene_summary.keys()
                      if name not in statistics}
    cast = play_dictionary['characters']
    total_number_present_characters = 0
    for name in seen_in_scenes.intersection(cast):
        group_size = cast[name]['collective_number']
        # a missing (or zero) collective number stands for a single person
        total_number_present_characters += int(group_size) if group_size else 1

    return total_number_present_characters

In [14]:
def estimate_number_scenes(scene_summary):
    """
    The function calculates the number of scenes per text and per Iarkho (i.e., as marked by actual dramatic
    character entrances and exits).
    NOTE(review): this notebook defines estimate_number_scenes a second time in a later cell; the later
    definition is the one in effect after all cells have run, so only one of the two should be kept.
    Params:
        scene_summary - a dictionary where keys are acts and values are dictionaries whose keys are scene names
                        of the form '<number>_<status>', e.g. '1_regular', '1.1_extra', '2_no_change_scene'.
    Returns:
        total_number_scenes_per_text - number of scenes as they are printed: scenes marked 'regular' plus
                                       scenes marked 'no_change' whose number is a main scene number.
        total_number_scenes_iarkho - number of scenes per Iarkho, which he calls mobility coefficient (MC):
                                     all scenes that are not marked 'no_change'.
    """
    total_number_scenes_per_text = 0
    total_number_scenes_iarkho = 0
    for act_scenes in scene_summary.values():
        for scene_name in act_scenes.keys():
            no_change = scene_name.count('no_change') > 0
            # extra scenes are numbered '<main>.<extra>', printed scenes have a plain number
            main_scene_number = '.' not in scene_name.split('_')[0]
            # a printed scene stays a printed scene when its status was replaced by 'no_change_scene'
            if scene_name.count('regular') > 0 or (no_change and main_scene_number):
                total_number_scenes_per_text += 1
            # count scenes as marked by actual entrances and exits
            if not no_change:
                total_number_scenes_iarkho += 1

    return total_number_scenes_per_text, total_number_scenes_iarkho

In [15]:
def sigma_iarkho(variants, weights):  
    """ 
    The function allows calculating standard range following iarkho's procedure. 
    Parameters: 
        variants - a list with distinct variants in the ascending order, e.g. [1, 2, 3, 4, 5] 
        weights - a list of weights corresponding to these variants, e.g. [20, 32, 18, 9, 1] 
    Returns: 
        sigma - standard range per iarkho 
    """  
    weighted_mean_variants = np.average(variants, weights=weights)  
    differences_squared = [(variant - weighted_mean_variants)**2 for variant in variants] 
    weighted_mean_difference = np.average(differences_squared, weights=weights)  
    sigma = weighted_mean_difference**0.5  
      
    return sigma 

In [16]:
def speech_distribution_iarkho(play_summary):
    """
    The function creates the speech distribution per Iarkho, i.e., the number of scenes for every number of
    speaking characters, together with two statistics derived from the same scene summaries.
    Params:
        play_summary - a dictionary where keys are acts and values are dictionaries of scene summaries
                       containing 'num_speakers' and 'perc_non_speakers'.
    Returns:
        speech_distribution - a list of tuples (number of speaking characters, number of scenes with that many
                              speakers), sorted by the number of speakers.
        speech_types - the output of percentage_of_each_speech_type for this distribution.
        av_perc_non_speakers - the mean of the scene percentages of non-speakers, rounded to 3 decimals.
    """
    scene_summaries = [summary for act in play_summary.values() for summary in act.values()]
    speakers_per_scene = [summary['num_speakers'] for summary in scene_summaries]
    silent_shares = [round(summary['perc_non_speakers'], 3) for summary in scene_summaries]
    scenes_by_speakers = Counter(speakers_per_scene)
    speech_distribution = sorted(scenes_by_speakers.items(), key=lambda pair: pair[0])
    speech_types = percentage_of_each_speech_type(speech_distribution)
    av_perc_non_speakers = round(np.mean(silent_shares), 3)

    return speech_distribution, speech_types, av_perc_non_speakers

In [17]:
def estimate_number_scenes(scene_summary):
    """
    The function calculates the number of scenes per text and per Iarkho (i.e., as marked by actual dramatic
    character entrances and exits).
    NOTE(review): this cell defines estimate_number_scenes again; a function with the same name already exists
    in an earlier cell and is replaced by this one when the notebook runs top to bottom. Keep only one.
    Params:
        scene_summary - a dictionary where keys are acts and values are dictionaries whose keys are scene names
                        of the form '<number>_<status>', e.g. '1_regular', '1.1_extra', '2_no_change_scene'.
    Returns:
        total_number_scenes_per_text - number of scenes as they are printed: scenes marked 'regular' plus
                                       scenes marked 'no_change' whose number is a main scene number.
        total_number_scenes_iarkho - number of scenes per Iarkho, which he calls mobility coefficient (MC):
                                     all scenes that are not marked 'no_change'.
    """
    total_number_scenes_per_text = 0
    total_number_scenes_iarkho = 0
    for act_scenes in scene_summary.values():
        for scene_name in act_scenes.keys():
            no_change = scene_name.count('no_change') > 0
            # extra scenes are numbered '<main>.<extra>', printed scenes have a plain number
            main_scene_number = '.' not in scene_name.split('_')[0]
            # a printed scene stays a printed scene when its status was replaced by 'no_change_scene'
            if scene_name.count('regular') > 0 or (no_change and main_scene_number):
                total_number_scenes_per_text += 1
            # count scenes as marked by actual entrances and exits
            if not no_change:
                total_number_scenes_iarkho += 1

    return total_number_scenes_per_text, total_number_scenes_iarkho

In [18]:
def process_speakers_features(soup, play_data, metadata_dict):
    """
    Iarkho's features described in Iarkho's work on the evolution of 5-act tragedy in verse.
    Params:
        soup - parsed play; not used by this stage, kept so that all feature stages share one signature.
        play_data - a dictionary with the play data; 'play_summary' and the cast are read from it.
        metadata_dict - a dictionary to which the features are added.
    Returns:
        metadata_dict - the same dictionary with the number of present characters, the numbers of scenes
                        (per text and per Iarkho), the speech distribution, the percentages of speech types,
                        the average percentage of non-speakers and sigma (rounded to 3 decimals).
    """
    play_summary = play_data['play_summary']
    metadata_dict['num_present_characters'] = number_present_characters(play_data)
    # each helper walks over the whole play summary, so it is evaluated once and its results are reused
    num_scenes_text, num_scenes_iarkho = estimate_number_scenes(play_summary)
    metadata_dict['num_scenes_text'] = num_scenes_text
    metadata_dict['num_scenes_iarkho'] = num_scenes_iarkho
    speech_distribution, speech_types, av_perc_non_speakers = speech_distribution_iarkho(play_summary)
    metadata_dict['speech_distribution'] = speech_distribution
    metadata_dict['percentage_monologues'] = speech_types['perc_monologue']
    metadata_dict['percentage_duologues'] = speech_types['perc_duologue']
    metadata_dict['percentage_non_duologues'] = speech_types['perc_non_duologue']
    metadata_dict['percentage_above_two_speakers'] = speech_types['perc_over_two_speakers']
    metadata_dict['av_percentage_non_speakers'] = av_perc_non_speakers
    metadata_dict['sigma_iarkho'] = round(sigma_iarkho(
                                    [item[0] for item in speech_distribution],
                                    [item[1] for item in speech_distribution]), 3)

    return metadata_dict

In [19]:
def percentage_of_each_speech_type(speech_distribution):
    """
    The function calculates which percentage of all scenes falls into each speech type: monologue (one
    speaker), duologue (two speakers), non-duologue (any number of speakers other than two) and over two
    speakers. Percentages are rounded to 2 decimals.
    Params:
        speech_distribution - a list of pairs (number of speakers, number of scenes with that many speakers).
    Returns:
        speech_types - a dictionary with percentages corresponding to each speech type.
    """
    total_scenes = np.sum([entry[1] for entry in speech_distribution])

    def share_of_scenes(speaker_filter):
        # percentage of scenes whose number of speakers passes the filter
        matching = [entry[1] for entry in speech_distribution if speaker_filter(entry[0])]
        return np.round((np.sum(matching) / total_scenes) * 100, 2)

    speech_types = {}
    speech_types['perc_monologue'] = share_of_scenes(lambda speakers: speakers == 1)
    speech_types['perc_duologue'] = share_of_scenes(lambda speakers: speakers == 2)
    speech_types['perc_non_duologue'] = share_of_scenes(lambda speakers: speakers != 2)
    speech_types['perc_over_two_speakers'] = share_of_scenes(lambda speakers: speakers > 2)

    return speech_types

In [20]:
def total_utterances(play_soup):
    """
    The function counts the utterances of a play.
    Params:
        play_soup - parsed play.

    Returns:
        the number of 'sp' tags found in play_soup.
    """
    return len(play_soup.find_all('sp'))

In [21]:
def count_all_verse_lines(soup):
    """
    The function counts the verse lines of a play. An 'l' tag whose 'part' attribute is "M" or "F" is a
    non-initial fragment of a verse line shared between utterances and is not counted, so a shared line
    is counted once.
    Params:
        soup - parsed play.
    Returns:
        num_verse_lines - the number of 'l' tags whose 'part' attribute is neither "M" nor "F".
    """
    # one pass over the lines; the attribute is checked directly instead of testing each line for
    # membership in a second list of tags
    non_initial_parts = ('M', 'F')
    num_verse_lines = sum(1 for line in soup.find_all('l') if line.get('part') not in non_initial_parts)

    return num_verse_lines

In [22]:
def verse_split_between_scenes(soup):
    """
    The function counts the scenes that are "open" at their end: the last verse line of the scene is an
    unfinished part of a split line, or one of its last ten lines is marked with 'interscene'.
    Params:
        soup - parsed play; scenes are the 'div' tags of type 'scene', 'extra_scene' or 'complex_scene'.
    Returns:
        counts - a dictionary with the numbers of scenes ending in a split verse line
                 ('scenes_with_split_verse'), with 'interscene' among the last ten lines
                 ('scenes_split_rhymes'), with both ('both'), with at least one of the two ('open'), and the
                 corresponding percentages of all scenes rounded to 3 decimals. A scene without verse lines
                 is counted as a scene but in none of the categories; without scenes all percentages are 0.0.
    """
    scenes = soup.find_all('div', {'type': ['scene', 'extra_scene', 'complex_scene']})
    counts = {'scenes_with_split_verse': 0, 'scenes_split_rhymes': 0, 'both': 0, 'open': 0}
    for scene in scenes:
        verse_lines = scene.find_all('l')
        if not verse_lines:
            # nothing to inspect, e.g. a scene that consists of stage directions only
            continue
        last_ten_lines = str(verse_lines[-10:])
        last_line = str(verse_lines[-1])
        # first attribute value of the opening tag of the last line (the 'part' value when it is present)
        verse = last_line[last_line.find("\""):last_line.find('>')].replace('\"', '').split(' ')[0]
        split_verse = 'M' in verse or 'I' in verse
        split_rhyme = 'interscene' in last_ten_lines
        if split_verse:
            counts['scenes_with_split_verse'] += 1
        if split_rhyme:
            counts['scenes_split_rhymes'] += 1
        if split_verse and split_rhyme:
            counts['both'] += 1
        if split_verse or split_rhyme:
            counts['open'] += 1

    def percentage_of_scenes(number):
        # guard against a play without scene divisions
        return round((number / len(scenes)) * 100, 3) if len(scenes) > 0 else 0.0

    counts['percentage_scene_split_verse'] = percentage_of_scenes(counts['scenes_with_split_verse'])
    counts['percentage_scene_rhymes'] = percentage_of_scenes(counts['scenes_split_rhymes'])
    counts['percentage_scenes_rhymes_split_verse'] = percentage_of_scenes(counts['both'])
    counts['percentage_open_scenes'] = percentage_of_scenes(counts['open'])

    return counts

In [23]:
def process_features_verse(play_soup, play_data, metadata_dict):
    """
    Iarkho's features described in the work on Corneille's comedies and tragedies.
    Params:
        play_soup - parsed play.
        play_data - a dictionary with the play data; not used by this stage, kept so that all feature
                    stages share one signature.
        metadata_dict - a dictionary to which the features are added.
    Returns:
        metadata_dict - the same dictionary with the total number of utterances, the number of verse lines,
                        dialogue vivacity (utterances per verse line, rounded to 3 decimals) and the counts
                        and percentages produced by verse_split_between_scenes.
    """
    metadata_dict['total_utterances'] = total_utterances(play_soup)
    metadata_dict['num_verse_lines'] = count_all_verse_lines(play_soup)
    metadata_dict['dialogue_vivacity'] = round(
                                         metadata_dict['total_utterances'] /
                                         metadata_dict['num_verse_lines'], 3)
    # the scene statistics require a pass over every scene of the play, so they are computed once
    split_counts = verse_split_between_scenes(play_soup)
    metadata_dict['num_scenes_with_split_verse_lines'] = split_counts['scenes_with_split_verse']
    metadata_dict['num_scenes_with_split_rhymes'] = split_counts['scenes_split_rhymes']
    metadata_dict['percentage_scene_split_verse'] = split_counts['percentage_scene_split_verse']
    metadata_dict['percentage_scene_split_rhymes'] = split_counts['percentage_scene_rhymes']
    metadata_dict['num_scenes_with_split_rhymes_verses'] = split_counts['both']
    metadata_dict['num_open_scenes'] = split_counts['open']
    metadata_dict['percentage_open_scenes'] = split_counts['percentage_open_scenes']
    metadata_dict['percentage_scenes_rhymes_split_verse'] = split_counts['percentage_scenes_rhymes_split_verse']

    return metadata_dict

In [24]:
def splitting_verse_line(scene):
    """
    The function cuts serialized markup at every opening tag of a whole verse line ('<l>') or of the initial
    part of a shared verse line ('<l part="I">').
    Params:
        scene - a string with the markup.
    Returns:
        a list of strings: the text before the first such tag followed by one chunk per tag.
    """
    line_start = re.compile('<l>|<l part="I">')

    return line_start.split(scene)

In [25]:
def estimate_verse_line_splitting_stage_directions(play_soup):
    """
    The function counts the stage directions that interrupt a verse line.
    Params:
        play_soup - parsed play; its string form is cut into chunks by splitting_verse_line.
    Returns:
        total_num - the number of '</stage>' closing tags that occur, within a chunk, before the last
                    '</l>' of that chunk. The text preceding the first verse line is ignored.
    """
    chunks = splitting_verse_line(str(play_soup))
    total_num = 0
    for chunk in chunks[1:]:
        # the verse line ends where the last closing </l> of the chunk starts
        closing_tags = list(re.finditer(r'</l>', chunk))
        verse_line = chunk[:closing_tags[-1].span()[0]]
        total_num += verse_line.count('</stage>')

    return total_num

In [26]:
def count_number_word_tokens(play_soup):
    """
    The function counts the word tokens of all stage directions of a play.
    Params:
        play_soup - parsed play; the text of every 'stage' tag is tokenized.
    Returns:
        total_number_tokens - the number of whitespace-separated tokens that remain after the characters of
                              string.punctuation have been removed from the text of the stage directions.
    """
    # Only punctuation is stripped. Letters and line breaks are left alone: deleting characters such as
    # 's', 't', 'a', 'g', 'e' or '\n' would erase short words and glue together words from adjacent lines.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    total_number_tokens = 0
    for sd in play_soup.find_all('stage'):
        total_number_tokens += len(sd.get_text().translate(strip_punctuation).split())

    return total_number_tokens

In [27]:
def process_stage_directions_features(play_soup, play_data, metadata_dict):
    """
    Sperantov's stage-directions features
    Params:
        play_soup - parsed play.
        play_data - a dictionary with the play data; not used by this stage.
        metadata_dict - a dictionary to which the features are added; it must already contain
                        'num_verse_lines'.
    Returns:
        metadata_dict - the same dictionary with the number of stage directions, their frequency per 100
                        verse lines, the number of word tokens in them, their average length in tokens, the
                        number of verse-splitting stage directions and its value per 100 verse lines.
                        Ratios are rounded to 3 decimals.
    """
    num_directions = len(play_soup.find_all('stage'))
    metadata_dict['num_stage_directions'] = num_directions
    num_verse_lines = metadata_dict['num_verse_lines']
    metadata_dict['stage_directions_frequency'] = round((num_directions / num_verse_lines) * 100, 3)

    num_tokens = count_number_word_tokens(play_soup)
    metadata_dict['num_word_tokens_in_stage_directions'] = num_tokens
    metadata_dict['average_length_of_stage_direction'] = round(num_tokens / num_directions, 3)

    num_splitting = estimate_verse_line_splitting_stage_directions(play_soup)
    metadata_dict['num_verse_splitting_stage_directions'] = num_splitting
    metadata_dict['degree_of_verse_prose_interaction'] = round((num_splitting / num_verse_lines) * 100, 3)

    return metadata_dict

In [28]:
def add_play_info(metadata, custom_flag=False, soup=None):
    """
    Create the play dictionary with title, author and creation date. We can provide our own metadata or use
    the metadata stored in the TEI file.
    Params:
        metadata - with custom_flag, a sequence whose first row holds title, the two parts of the author's
                   name and the creation date (in this order); ignored otherwise.
        custom_flag - True to use metadata, False to read the TEI file.
        soup - parsed play, required when custom_flag is False. If it is not given, a variable named soup
               defined at notebook level is used when it exists (the previous behaviour).
    Returns:
        play_data - a dictionary with 'title', 'author' and 'creation_date'.
    Raises:
        ValueError - if the TEI metadata is requested and no parsed play is available.
    """
    play_data = {}
    if custom_flag:
        play_data['title'] = metadata[0][0]
        play_data['author'] = metadata[0][1] + ', ' + metadata[0][2]
        play_data['creation_date'] = metadata[0][3]
    else:
        if soup is None:
            # legacy fallback: earlier versions silently relied on a notebook-level variable
            soup = globals().get('soup')
        if soup is None:
            raise ValueError('a parsed play (soup) is required to read the metadata from the TEI file')
        play_data['title'] = soup.find_all('title', {'type': 'main'})[0].get_text()
        play_data['author'] = soup.find_all('author')[0].get_text()
        # the year is the first whitespace-separated piece of the date text, quotes removed
        written = soup.find_all('date', {'type': 'written'})[0].get_text()
        play_data['creation_date'] = int(written.split()[0].replace('\"', ''))

    return play_data

In [29]:
def percentage_of_scenes_discont_change(play_soup, play_data, metadata_dict):
    """
    The function counts the scenes whose cast has no character in common with the cast of the scene before
    them (the scenes of all acts are taken as one sequence).
    Params:
        play_soup - parsed play; not used by this stage.
        play_data - a dictionary with the play data; 'play_summary' is read from it.
        metadata_dict - a dictionary to which the features are added; it must already contain
                        'num_scenes_iarkho'.
    Returns:
        metadata_dict - the same dictionary with the number of such scenes and their percentage of
                        'num_scenes_iarkho', rounded to 3 decimals.
    """
    number_scenes = metadata_dict['num_scenes_iarkho']
    statistics = ['num_speakers', 'perc_non_speakers', 'num_utterances']
    casts = [[name for name in scene_summary.keys() if name not in statistics]
             for act in play_data['play_summary'].values()
             for scene_summary in act.values()]
    # pair every scene with its predecessor and look for pairs without a shared character
    discontinuous = sum(1 for previous, current in zip(casts, casts[1:])
                        if len(set(current).intersection(set(previous))) == 0)
    metadata_dict['number_scenes_with_discontinuous_change_characters'] = discontinuous
    metadata_dict['percentage_scenes_with_discontinuous_change_characters'] = round(
        (discontinuous / number_scenes) * 100, 3)

    return metadata_dict

In [30]:
def additional_metadata(play_soup, play_data):
    """
    Process all play features in stages. Every stage receives the dictionary returned by the previous one,
    so the order matters: later stages read values written by earlier ones.
    Params:
        play_soup - parsed play.
        play_data - a dictionary with the play data.
    Returns:
        the dictionary with the features of all stages.
    """
    stages = (process_speakers_features,
              process_features_verse,
              process_stage_directions_features,
              percentage_of_scenes_discont_change)
    features = {}
    for stage in stages:
        features = stage(play_soup, play_data, features)

    return features

In [31]:
def _play_info_from_tei(soup):
    """Read title, author and year of writing from the header of a parsed TEI play."""
    written = soup.find_all('date', {'type': 'written'})[0].get_text()
    return {'title': soup.find_all('title', {'type': 'main'})[0].get_text(),
            'author': soup.find_all('author')[0].get_text(),
            # the year is the first whitespace-separated piece of the date text, quotes removed
            'creation_date': int(written.split()[0].replace('\"', ''))}


def process_play(file_name, metadata_df, custom_flag):
    """
    The function parses a TEI file and creates a summary with features and metadata for the play.
    Params:
        file_name - a string, path of the xml file with the play; the file is read as UTF-8.
        metadata_df - a dataframe containing the info about the plays; used only with custom_flag. The play
                      is looked up in the 'index' column by the file name without directory and extension.
        custom_flag - True to take title, author and creation date from metadata_df, False to read them from
                      the TEI file.
    Returns:
        play_data - a dictionary with detailed play summary by scenes, metadata, and features
    Raises:
        IndexError - if custom_flag is set and metadata_df has no row for the play.
    """
    print(file_name)
    with open(file_name, 'r', encoding='utf-8') as file:
        soup = bs(file, 'lxml')
    if custom_flag:
        # 'TEI_files/R_18.xml' -> 'R_18', whatever the directory is called
        play_index = path.splitext(path.basename(file_name))[0]
        play_rows = metadata_df[metadata_df['index'] == play_index]
        if len(play_rows) == 0:
            raise IndexError('no metadata row with index ' + repr(play_index) + ' for ' + file_name)
        play_meta = play_rows[['title', 'last_name', 'first_name', 'creation_date']].values
        play_data = add_play_info(play_meta, custom_flag)
    else:
        # read the header from the document parsed above rather than from any notebook-level variable
        play_data = _play_info_from_tei(soup)
    play_data['characters'] = create_character_cast(soup)
    play_data['play_summary'] = process_summary(soup, play_data['characters'])
    play_data['metadata'] = additional_metadata(soup, play_data)

    return play_data

In [32]:
def process_all_plays(input_directory, output_path, custom_flag=False, metadata_path=None):
    """
    The function allows to process all files in a specified directory.
    Params:
        input_directory - the path to the folder containing the xml files; only names ending in '.xml'
                          are processed.
        output_path - directory in which the json summaries will be saved; it is created if it is missing.
        custom_flag - True to take title, author and date of every play from the metadata file.
        metadata_path - path to the metadata file, a tab-delimited txt file with information about all plays;
                        read only with custom_flag.
    Returns:
        no returns, one UTF-8 encoded '<play>.json' per '<play>.xml' is saved in output_path directory.
    """
    all_files = [f for f in listdir(input_directory) if f.endswith('.xml')]
    if custom_flag:
        metadata_df = pd.read_csv(metadata_path, sep='\t')
    else:
        metadata_df = pd.DataFrame()
    if output_path:
        os.makedirs(output_path, exist_ok=True)
    for file in all_files:
        # join the paths so that the directories work with or without a trailing separator
        play_data_dict = process_play(os.path.join(input_directory, file), metadata_df, custom_flag)
        json_name = os.path.join(output_path, file[:-len('.xml')] + '.json')
        # the summaries contain non-ASCII text (ensure_ascii=False), so the encoding is fixed explicitly
        with open(json_name, 'w', encoding='utf-8') as fp:
            json.dump(play_data_dict, fp, ensure_ascii=False, indent=2)
In [33]:
# Process a single play; with custom_flag=True its title, author and creation date are taken from the
# tab-separated metadata file instead of the TEI file. The returned dictionary is shown as the cell output.
process_play('TEI_files/R_18.xml', pd.read_csv('Russian_Comedies.txt', sep='\t'), True)

TEI_files/R_18.xml


{'title': 'Gore ot uma',
 'author': 'Aleksandr, Griboedov',
 'creation_date': 1824,
 'characters': {'Лиза': {'alternative_names': 'lizanka',
   'collective_number': None},
  'София': {'alternative_names': 'sofija', 'collective_number': None},
  'Фамусов': {'alternative_names': 'famusov', 'collective_number': None},
  'Молчалин': {'alternative_names': 'molchalin', 'collective_number': None},
  'Слуга 1-6': {'alternative_names': 'sluga_1-6', 'collective_number': None},
  'Чацкий': {'alternative_names': 'chatskij', 'collective_number': None},
  'Слуга 2-3': {'alternative_names': 'sluga_2-3', 'collective_number': None},
  'Скалозуб': {'alternative_names': 'skalozub', 'collective_number': None},
  'Главный слуга': {'alternative_names': 'glavnij_sluga',
   'collective_number': None},
  'Наталья Дмитриевна': {'alternative_names': 'natalja_dmitrievna',
   'collective_number': None},
  'Платон Михайлович': {'alternative_names': 'platon_mihajlovich',
   'collective_number': None},
  '1-я княжна'

In [34]:
# Process every xml file found in TEI_files/ and save one json summary per play in Test/, using the
# tab-separated metadata file for title, author and creation date (custom_flag=True).
process_all_plays('TEI_files/', 'Test/', True, 'Russian_Comedies.txt')

TEI_files/R_22.xml
TEI_files/R_18.xml
TEI_files/R_16.xml
TEI_files/R_17.xml
TEI_files/R_13.xml
TEI_files/R_10.xml
TEI_files/R_6.xml
TEI_files/R_1.xml
TEI_files/R_3.xml
TEI_files/R_2.xml
