This notebook contains the pipeline for extracting features from the Russian comedies in the .txt format.

In [1]:
import os
import pandas as pd
from os import listdir
from os import path
from os.path import isfile, join
import re
import numpy as np
import string 
from collections import Counter
import json

In [2]:
metadata_file = pd.read_csv('../Russian_Comedies.txt', sep='\t')

In [3]:
metadata_file.head()

Unnamed: 0,Index,Title,Last_Name,First_Name,Date,Translation,Number_of_Acts,URL,original_file_format,derived_format
0,R_1,Samoliubivyi stikhotvorets,Nikolev,Nikolai,1775,0,5.0,https://dracor.org/api/corpora/rus/play/nikole...,TEI,
1,R_2,Khvastun,Kniazhnin,Iakov,1785,0,5.0,https://dracor.org/api/corpora/rus/play/knyazh...,TEI,
2,R_3,Chudaki,Kniazhnin,Iakov,1790,0,5.0,https://dracor.org/api/corpora/rus/play/knyazh...,TEI,
3,R_4,Prestupnik ot igry ili bratom prodannaia sestra,Efim’ev,Dmitrii,1788,0,5.0,,pdf,txt
4,R_5,Smekh i gore,Klushin,Aleksandr,1792,0,5.0,http://az.lib.ru/k/klushin_a_i/text_0030.shtml,,txt


In [66]:
def split_text(play_file, old_ortho_flag=True):
    if old_ortho_flag:
        acting_characters = 'ДѢЙСТВУЮЩІЕ'
        act = 'ДѢЙСТВІЕ'
    else:
        acting_characters = 'ДЕЙСТВУЮЩИЕ'
        act = 'ДЕЙСТВИЕ'
        
    cast_start_index = play_file.find('{} ЛИЦА'.format(acting_characters)) 
    if cast_start_index == -1:
        cast_start_index = play_file.find('{} <ЛИЦА>'.format(acting_characters))
        if cast_start_index == -1:
            cast_start_index = play_file.find('<{}> ЛИЦА'.format(acting_characters))
            if cast_start_index == -1:
                cast_start_index = play_file.find('ДѢЙСТВУЮЩIЯ ЛИЦА'.format(acting_characters))
    cast_end_index = play_file.find(act)
    cast_text = play_file[cast_start_index:cast_end_index].split('ЛИЦА')[1]
    play_text = play_file[cast_end_index:]
    
    return cast_text, play_text

In [5]:
def identify_character_names(line):
    """
    The function identifies which character names are present in the string
    Params:
        line - each line from string from the text with charcters (split at '\n')
    Returns:
        characters - a list of character names or 0 if not present in that line
    """
    pattern = r'[А-Я+Ѣ+І]+.\w[А-Я+Ѣ+І]+.\w[А-Я+Ѣ+І]+.\w[А-Я+Ѣ+І]+.\w[А-Я+Ѣ+І]+|[А-Я+Ѣ+І]+.\w[А-Я+Ѣ+І]+ [А-Я+Ѣ+І] |[А-Я+Ѣ+І]+.\w[А-Я+Ѣ+І]+.\w[А-Я+Ѣ+І]+.\w[А-Я+Ѣ+І]+|[А-Я+Ѣ+І]+.\w[А-Я+Ѣ+І]+.\w[А-Я+Ѣ+І]+|[А-Я+Ѣ+І]+.\w[А-Я+Ѣ+І]+'
    characters = [name.group() 
                 for name in re.finditer(pattern, 
                 line)]
    if len(characters) > 0:
        return characters
    else:
        return 0

In [6]:
def handle_alternative_names(line):
    cast_dictionary = {}
    character_name = identify_character_names(line)[0].strip()
    alternative_names = line[line.find('<alternative_name')+len('<alternative_name'):
                            line.find('>')].strip().split(', ')
    
    return character_name, alternative_names

In [7]:
def make_reverse_dictionary(cast_dictionary):
    """
    The function allows us to reverse a cast dictionary, so that we could look up the character names in reverse, e.g.
    from 'COUNT' to 'COUNT VIAZEMSKII'.
    Params:
        cast_dictionry -  dictionary where keys are character names as they appear in the cast list and 
                          values are character names as they appear in the play text.
    Returns:
        reverse_cast_dictionary - the dictionary with keys and values reversed.
    """
    reverse_cast_dictionary = {}
    for item in cast_dictionary.items():
        if item[1]['alternative_names']:
            for name in item[1]['alternative_names']:
                reverse_cast_dictionary[name] = item[0]
            
    return reverse_cast_dictionary

In [8]:
def get_collective_number(line):
    collective_dictionary = {}
    name = identify_character_names(line)[0]
    match = re.findall(r'<collective_number \d>', line)
    collective_number = match[-1].split(' ')[-1]
    
    return name, collective_number.replace('>', '')

In [9]:
def identify_character_cast(cast_string):
    cast = {}
    for line in cast_string.split('\n'):
        characters = identify_character_names(line)
        if characters != 0:
            if line.find('<alternative_name')!= -1 and line.find('<collective_number ') != -1:
                character_name, alternative_name  = handle_alternative_names(line)
                _, collective_number = get_collective_number(line)
                cast[character_name] = {'alternative_names': alternative_name, 
                                            'collective_number': collective_number}
            elif line.find('<alternative_name')!= -1:
                character_name, alternative_name  = handle_alternative_names(line)
                cast[character_name] = {'alternative_names':alternative_name,
                                             'collective_number': None}
            elif line.find('<collective_number ') != -1:
                character_name, collective_number = get_collective_number(line)
                cast[character_name] = {'alternative_names': None,
                                        'collective_number': collective_number}
                
            else:
                cast[characters[0]] =  {'alternative_names': None,
                                        'collective_number': None}

    return cast

In [10]:
def split_a_scene(scene_string):
    starting_with_cast = scene_string[scene_string.find('<cast '):]
    scene_itself = starting_with_cast.replace(starting_with_cast[:starting_with_cast.find('>')]+'>','') 
    scene_cast = starting_with_cast[:starting_with_cast.find('>')].replace('<cast ', '').strip().split(', ')
    
    return scene_cast, scene_itself

In [11]:
def quality_check_cast(cast_list, character_cast_dictionary, reverse_character_cast):
    for name in cast_list:
        if name not in character_cast_dictionary:
            if name in reverse_character_cast:
                print('Name found in reverse dictionary', name)

In [12]:
def get_scene_status(scene):
    if scene.count('SCENE>')== 0 and scene.count('<no_change_SCENE>')==0:
        scene_status = 'regular'
    elif scene.count('<no_change_SCENE>')>0:
        scene_status = 'no_change'
    elif scene.count('SCENE>')>0 and scene.count('<no_change_SCENE>')==0:
        scene_status = 'extra'
    
    return scene_status

In [13]:
def count_utterances(scene_cast, utterances, character_cast_dictionary, reverse_character_cast):
    scene_info = {}
    # if there is only one character in a scene, he will have one utterance
    if len(scene_cast) == 1:
            scene_info[scene_cast[0]] = 1
    else:
        for name in scene_cast:
            utterance_count = utterances.count(name)
            if utterance_count != 0:
                scene_info[name] = utterance_count
            # in case the character appears in the text under a different name
            else:
                try:
                    alt_names = character_cast_dictionary[name]['alternative_names']
                except KeyError:
                    alt_names = character_cast_dictionary[reverse_character_cast[name]]['alternative_names']
                #in case there are alternative names
                if alt_names:
                    # there may be a few alternative names associated with a character, 
                    # make sure they are all counted as one
                    speaker_total = 0
                    for alt in alt_names: 
                        try:
                            speaker_total+=utterances.count(alt)
                        except KeyError:
                            speaker_total+=utterances.count(reverse_character_cast[alt])
                    scene_info[name] = speaker_total
                else:
                    scene_info[name] = utterances.count(name)
                    
    return scene_info                 

In [14]:
def compare_two_scenes(cast_one, cast_two):
    if set(cast_one) == set(cast_two):
        no_change_scene = 'no_change_scene'
    else:
        no_change_scene = None
    
    return no_change_scene

In [15]:
def count_speaking_characters(scene_summary_dict, scene_cast):
    num_speakers = len([item[0] for item in scene_summary_dict.items() if item[1] != 0 and item[0] in scene_cast])
    
    return num_speakers

In [62]:
def parse_scenes(scenes, name_pattern, character_cast_dictionary, reverse_character_cast):
    sc_num = 0
    complete_scene_info = {} 
    scene_names = []
    extra_scene_number = 1
    statuses = []
    for scene in scenes:
        # add this logic to account for issues caused by extra scenes tracking
        sc_num = int(float(sc_num))
        scene_status = get_scene_status(scene)
        if scene_status == 'extra':
            sc_num = str(sc_num)+ '.'+str(extra_scene_number)
            extra_scene_number+=1
        else:
            sc_num +=1
            extra_scene_number = 1
        # split a scene string into two substrings, one with cast, the other - without
        scene_cast, scene_itself = split_a_scene(scene)
        # check if the cast for the new scene is different from the previous scene
        if len (scene_names) >0:
            no_change = compare_two_scenes(scene_cast, complete_scene_info[scene_names[-1]])
            if no_change:
                scene_status = no_change            
        #check to make sure all character names are in scene cast as they appear in the play cast
        quality_check_cast(scene_cast, character_cast_dictionary, reverse_character_cast)
        scene_names.append(str(sc_num)+'_'+str(scene_status))
        utterances =  [name.group().strip() 
                      for name in re.finditer(name_pattern, scene_itself)]
        scene_summary = count_utterances(scene_cast, 
                                         utterances,
                                         character_cast_dictionary, 
                                         reverse_character_cast)
        scene_summary['num_utterances'] = sum(list(scene_summary.values()))
        scene_summary['num_speakers'] = count_speaking_characters(scene_summary, scene_cast)
        complete_scene_info[str(sc_num)+'_'+str(scene_status)] =  scene_summary
    
    return complete_scene_info

In [17]:
def parse_play(play_text, name_pattern, number_acts, old_ortho_flag, 
               character_cast_dictionary, reverse_character_cast):
    if old_ortho_flag == True:
        rus_act = 'ДѢЙСТВІЕ'
        rus_scene = 'ЯВЛЕНІЕ'
    else:
        rus_act = 'ДЕЙСТВІЕ'
        rus_scene = 'ЯВЛЕНИЕ'
    acts = play_text.split(rus_act)[1:]
    if len(acts)!= number_acts:
        print('The number of acts is not {}.'.format(number_acts))
    else:
        act_info = {}
        for act_num, act in enumerate(acts, 1):
            scenes = re.split('{}|<extra'.format(rus_scene),act)[1:]
            act_info['act'+'_'+str(act_num)] = parse_scenes(scenes, 
                                                            name_pattern, 
                                                            character_cast_dictionary,
                                                            reverse_character_cast)
        
        return act_info

In [18]:
def splitting_verse_line(scene):
    splits= re.split('<end_verse_line>|<end_verse_line_interscene_rhyme>', scene)
    
    return splits

In [19]:
def remove_numbers(input_string):
    if input_string.isalpha() is False:
        numbers = re.findall('\d+',input_string)
        for num in numbers:
            input_string = input_string.replace(num, '')
            
    return input_string

In [20]:
def replace_tags(line):
    for _ in range(line.count('<stage>')):
        stage_direction_text = line[line.find('<stage>')+len('<stage>'):line.find('</stage>')]
        line = line.replace('<stage>'+stage_direction_text+'</stage>', ' STAGE ')
        
    return line

In [21]:
def clean_stage_direction(line):
    entities = re.findall(r'ЯВЛЕНІЕ +\w+|ЯВЛЕНИЕ +\w+|[А-Я+Ѣ+І]+.\w[А-Я+Ѣ+І]+.\w[А-Я+Ѣ+І]+|[А-Я+Ѣ+І]+.\w[А-Я+Ѣ+І]+', line)
    punctuation = [symb for symb in string.punctuation + '—' + '\n' + '\t']
    tags = ['extra_SCENE', 'cast', 'no_change_SCENE', 'intermedia', 'stage separator',
            'speaker_clarification', 'speaking_character_no_utterance']              
    for item in tags + punctuation + entities:
        line = line.replace(item, ' ')
        
    return line

In [22]:
def remove_start_end_stage_directions(verse_line):
    verse_line = verse_line.strip()
    while verse_line[:5]=='STAGE' or verse_line[-5:]=='STAGE':
        if verse_line[-5:]=='STAGE':
            verse_line = verse_line[:-5].strip()
        elif verse_line[:5]=='STAGE':
            verse_line = verse_line[5:].strip()
    return verse_line

In [23]:
def estimate_verse_line_splitting_stage_directions(text_string):
    # split verse lines
    splits = splitting_verse_line(text_string)
    number_splitting_stage_directions = 0
    for split in splits:
        # remove any numbers that could appear in the string
        line = remove_start_end_stage_directions(
                                                clean_stage_direction(
                                                replace_tags(
                                                remove_numbers(split))))
        number_splitting_stage_directions+= line.count('STAGE')
        #print('start_line', line, line.count('STAGE'))
        
    return number_splitting_stage_directions

In [24]:
def count_number_word_tokens(play_text):
    total_number_word_tokens = 0
    indices = []
    for index_pair in zip([i.span()[1] for i in re.finditer(r'<stage>', play_text)], 
                   [i.span()[0] for i in re.finditer(r'</stage>', play_text)]):
        indices.append(index_pair)
    for index_pair in indices:
        stage_direction = play_text[index_pair[0]:index_pair[1]]
        punctuation = [symb for symb in string.punctuation + '—' + '\n' + '\t']
        for punct in punctuation:
            stage_direction = stage_direction.replace(punct, '')
        num_tokens = len(stage_direction.strip().split(' '))
        total_number_word_tokens += num_tokens   
        
    return total_number_word_tokens

In [25]:
def check_end_of_scene(scene_string):
    if scene_string[-16:] != '<end_verse_line>': 
        if scene_string[-33:] != '<end_verse_line_interscene_rhyme>':
            return True
        else:
            return False
    else:
        return False

In [26]:
def tackle_alternative_scene(play_text):
    play_text = play_text.replace('<ЯВЛЕНІЕ>', 'ЯВЛЕНІЕ')
    play_text = play_text.replace('<ЯВЛЕНИЕ>', 'ЯВЛЕНИЕ')
    
    return play_text

In [27]:
def verse_split_between_scenes(play_text, old_ortho_flag=True):
    if old_ortho_flag:
        rus_scene = 'ЯВЛЕНІЕ'
    else:
        rus_scene = 'ЯВЛЕНИЕ'
    # make sure the scenes with alternative mark up (where scenes are not using the word ЯВЛЕНІЕ in the text)
    play_text = tackle_alternative_scene(play_text)
    scenes_split_verses = 0
    scenes_rhymes = 0
    both = 0
    scenes = re.split('{}|<extra'.format(rus_scene),play_text)
    for scene in scenes[1:]:
        if scene.count('<end_verse_line_interscene_rhyme>') > 0:
            scenes_rhymes +=1
        scene_cleaned = replace_tags(remove_numbers(scene)).strip()
        if check_end_of_scene(scene_cleaned):
                entities = re.findall(r'ЯВЛЕНІЕ +\w+|ЯВЛЕНИЕ +\w+|[А-Я+Ѣ+І]+.\w[А-Я+Ѣ+І]+.\w[А-Я+Ѣ+І]+|[А-Я+Ѣ+І]+.\w[А-Я+Ѣ+І]+', scene)
                symbols = [symb for symb in string.punctuation + '—' + '\n' + '\t' 
                           if symb not in ['_', '<', '>']] + ['STAGE'] + entities
                for symbol in symbols:
                    scene_cleaned = scene_cleaned.replace(symbol, '').strip()
                if check_end_of_scene(scene_cleaned):
                    scenes_split_verses+=1
                    #print(scene_cleaned[-33:])
                    if scene_cleaned.count('<end_verse_line_interscene_rhyme>') > 0:
                        both+=1
    return scenes_split_verses, scenes_rhymes, both

In [28]:
def estimate_number_scenes(scene_summary):
    """
    The function calcualtes the number of scenes per text and per Iarkho (i.e., as marked by actual dramatic character
    entrances and exits).
    Params:
        scene_summary - a dictionary output of the parse_play function.
    Returns:
        total_number_scenes_per_text - number of scenes as they are printed
        total_number_scenes_iarkho - number of scnes per Iarkho, which he calls mobility coefficient (MC)
    """
    total_number_scenes_per_text = 0
    total_number_scenes_iarkho = 0
    for key in scene_summary.keys():
        # get the number of scenes as it is printed in the text
        total_number_scenes_per_text+=len([scene for scene in scene_summary[key].keys() if scene.count('regular')>0])
        # count scenes as marked by actual entrances and exits                                  
        total_number_scenes_iarkho+=len([scene for scene in scene_summary[key].keys() if scene.count('no_change')==0])
    
    return total_number_scenes_per_text, total_number_scenes_iarkho

In [29]:
def sigma_iarkho(variants, weights):  
    """ 
    The function allows calculating standard range following iarkho's procedure. 
    Parameters: 
        variants - a list with distinct variants in the ascending order, e.g. [1, 2, 3, 4, 5] 
        weights - a list of weights corresponding to these variants, e.g. [20, 32, 18, 9, 1] 
    Returns: 
        sigma - standard range per iarkho 
    """  
    weighted_mean_variants = np.average(variants, weights=weights)  
    differences_squared = [(variant - weighted_mean_variants)**2 for variant in variants] 
    weighted_mean_difference = np.average(differences_squared, weights=weights)  
    sigma = weighted_mean_difference**0.5  
      
    return sigma 


In [30]:
def parse_play_summary(play_summary):
    """
    The function parses the dictionary with play_summary produced by parse_play function
    and outputs total number of utterances and total number of non-duologues
    Params:
        play_summary - a dictionary output by parse_play function.
        
    Returns:
        total_utterances_in_play - total number of utterances in a play.
        total_non_duologues - total number of scenes with non-duologues (i.e., less or more than two speaking
        characters)
    """
    total_utterances_in_play = 0
    total_non_duologues = 0
    for key in play_summary.keys():
        for scene in play_summary[key].keys():
            total_utterances_in_play += play_summary[key][scene]['num_utterances']
            if play_summary[key][scene]['num_speakers'] != 2:
                total_non_duologues +=1
    
    return total_utterances_in_play, total_non_duologues

In [31]:
def number_present_characters(play_dictionary):
    all_present_characters = set()
    for key in play_dictionary['play_summary'].keys():
        for scene in play_dictionary['play_summary'][key]:
            for item in play_dictionary['play_summary'][key][scene].keys():
                if item != 'num_utterances' and item != 'num_speakers':
                    all_present_characters.add(item)
    total_number_present_characters = 0
    for character in play_dictionary['characters'].keys():
        alt_names = play_dictionary['characters'][character]['alternative_names']
        # in case there are alternative names
        if alt_names:
            possible_names = [character] + alt_names
        else:
            possible_names = [character]
        if len(set(possible_names).intersection(set(all_present_characters))) > 0:
            coll_number = play_dictionary['characters'][character]['collective_number']
            # if there is a collective number for this character
            if coll_number:
                total_number_present_characters += int(coll_number)
            else:
                total_number_present_characters += 1
                
    return total_number_present_characters

In [32]:
def speech_distribution_iarkho(play_summary):
    """
    The function creates speech distrubution per Iarkho, i.e., the number of speaking characters by number of scenes.
    Params:
        play_summary - a dictionary output by parse_play function.
    Returns:
        speech_distribution - a list of tuples were the 0 element is the number of speaking characters
                              and the 1 element is the number of scenes with such number of speaking characters.
    """
    speakers = []
    for key in play_summary.keys():
        for scene in play_summary[key]:
            speakers.append((play_summary[key][scene]['num_speakers']))
    counter = Counter
    counted = counter(speakers)
    speech_distribution = sorted(counted.items(), key=lambda pair: pair[0], reverse=False)
    
    return speech_distribution

In [33]:
def additional_metadata(play_string, play_data, old_ortho_flag):
    metadata_dict = {}
    metadata_dict['num_verse_lines'] = play_string.count('<end_verse_line>') + play_string.count('<end_verse_line_interscene_rhyme>')
    metadata_dict['num_scenes_with_split_verse_lines'] = verse_split_between_scenes(play_string, old_ortho_flag)[0]
    metadata_dict['num_scenes_with_split_rhymes'] = verse_split_between_scenes(play_string, old_ortho_flag)[1]
    metadata_dict['num_scenes_with_split_rhymes_verses'] = verse_split_between_scenes(play_string, old_ortho_flag)[2]
    metadata_dict['num_open_scenes'] = (metadata_dict['num_scenes_with_split_verse_lines'] + 
                                           metadata_dict['num_scenes_with_split_rhymes'] - 
                                           metadata_dict['num_scenes_with_split_rhymes_verses'])
    metadata_dict['num_present_characters'] = number_present_characters(play_data)
    metadata_dict['num_scenes_text'] = estimate_number_scenes(play_data['play_summary'])[0]
    metadata_dict['num_scenes_iarkho'] = estimate_number_scenes(play_data['play_summary'])[1]
    metadata_dict['total_utterances'] = parse_play_summary(play_data['play_summary'])[0]
    metadata_dict['dialogue_vivacity'] = round(metadata_dict['total_utterances'] / 
                                                       metadata_dict['num_verse_lines'], 3)
    metadata_dict['num_no_duologues'] = parse_play_summary(play_data['play_summary'])[1]
    metadata_dict['percentage_non_duologues'] = round((metadata_dict['num_no_duologues'] / 
                                                        metadata_dict['num_scenes_iarkho']) *100, 3)
    metadata_dict['percentage_scene_split_rhyme'] = round((metadata_dict['num_scenes_with_split_rhymes'] / 
                                                     metadata_dict['num_scenes_iarkho'])*100, 3)
    metadata_dict['percentage_scene_split_verse'] = round((metadata_dict['num_scenes_with_split_verse_lines'] / 
                                                     metadata_dict['num_scenes_iarkho'])*100, 3)
    metadata_dict['percentage_open_scenes'] = round((metadata_dict['num_open_scenes']/
                                                    metadata_dict['num_scenes_iarkho']) * 100, 3)
    metadata_dict['percentage_scenes_rhymes_split_verse'] = round((metadata_dict['num_scenes_with_split_rhymes_verses']/
                                                    metadata_dict['num_scenes_iarkho']) * 100, 3)
    metadata_dict['speech_distribution'] = speech_distribution_iarkho(play_data['play_summary'])
    metadata_dict['sigma_iarkho'] = round(sigma_iarkho(
                                        [item[0] for item in metadata_dict['speech_distribution']],
                                        [item[1] for item in metadata_dict['speech_distribution']]), 3)
    # stage-directions related features
    metadata_dict['num_stage_directions'] = play_string.count('<stage>')
    metadata_dict['stage_directions_frequency'] = round((metadata_dict['num_stage_directions'] /
                                                       metadata_dict['num_verse_lines']) * 100, 3)
    metadata_dict['num_word_tokens_in_stage_directions'] = count_number_word_tokens(play_string)
    metadata_dict['average_length_of_stage_direction'] = round(metadata_dict['num_word_tokens_in_stage_directions']/
                                                               metadata_dict['num_stage_directions'], 3)
    metadata_dict['num_verse_splitting_stage_directions'] = estimate_verse_line_splitting_stage_directions(play_string)
    metadata_dict['degree_of_verse_prose_interaction'] = round((metadata_dict['num_verse_splitting_stage_directions'] /
                                                              metadata_dict['num_verse_lines']) * 100, 3)
    
    return metadata_dict

In [51]:
def process_play(file_name, metadata_df):
    """
    The function parses a txt file and creates a summary with features and metadata for the play.
    Params:
        file_name - a string, name of the file with the play text.
        metadata_df - a dataframe containing the info about the play.
    Returns:
        play_data - a dictionary with detailed play summary by scenes, metadata, and features
    """
    play_meta = metadata_df[metadata_df.Index==file_name.replace(
                                    '.txt', '')][['Title', 'Last_Name', 'First_Name', 'Date']].values
    file = open(file_name, 'r') 
    expected_acts = int(metadata_df[metadata_df['Index']==file_name.replace('.txt', '')]
                        ['Number_of_Acts'].values[0])
    comedy = file.read()
    number_acts = expected_acts

        # add logic for detecting if the text is in old orthography
    if comedy.count('Ѣ') >0:
        old_ortho_flag=True
    else:
        old_ortho_flag=False
    # split the text into the part with the cast names and the play itself
    cast_text, play_text = split_text(comedy, old_ortho_flag)
    print('Character cast text', cast_text)
    character_cast_dictionary = identify_character_cast(cast_text)
    print(character_cast_dictionary)
    reverse_character_cast = make_reverse_dictionary(character_cast_dictionary)
    
    play_data = {}
    play_data['title'] = play_meta[0][0]
    play_data['author'] = play_meta[0][1] + ', ' + play_meta[0][2]
    play_data['creation_date'] = play_meta[0][3]
    print(play_meta)
    play_data['characters'] = character_cast_dictionary.copy()
    
    play_data['play_summary'] = parse_play(play_text, 
                                       '[А-Я+Ѣ+І]+.\w[А-Я+Ѣ+І]+.\w[А-Я+Ѣ+І]+\w[А-Я+Ѣ+І]|[А-Я+Ѣ+І]+.\w[А-Я+Ѣ+І]+ [А-Я+Ѣ+І] |[А-Я+Ѣ+І]+.\w[А-Я+Ѣ+І]+.\w[А-Я+Ѣ+І]+|[А-Я+Ѣ+І]+.\w[А-Я+Ѣ+І]+|[А-Я+Ѣ+І]+.\w[А-Я+Ѣ+І]+ [А-Я+Ѣ+І]', 
                                        number_acts,
                                        old_ortho_flag,
                                        character_cast_dictionary,
                                        reverse_character_cast)
    play_data['metadata'] = additional_metadata(play_text, play_data, old_ortho_flag)
    
    return play_data

In [35]:
def process_all_plays(input_directory, output_path):
    """
    The function allows to process all files in a specified directory.
    Params:
        input_directory - the path to the folder containing the txt files
        output_path - directory in which the json summaries will be saved.
    Returns:
        no returns, the files will be saved in output_path directory.
    """
    all_files = [f for f in listdir(input_directory) if f.count('.txt')>0]
    print(all_files)
    metadata_df = pd.read_csv('../Russian_Comedies.txt', sep='\t')
    display(metadata_df.head())
    for file in all_files:
        print(file)
        play_data_dict = process_play(file, metadata_df)
        json_name = output_path +str(file.replace('.txt', '.json')) 
        with open(json_name, 'w') as fp:
            json.dump(play_data_dict, fp, ensure_ascii=False, indent=2)

In [36]:
#process_all_plays('.', '../../Play_Json_Files/')

sc_num = 0
complete_scene_info = {} 
scene_names = []
extra_scene_number = 1
statuses = []
scene = test
name_pattern = '[А-Я]+.\w[А-Я]+.\w[А-Я]+.\w[А-Я]+|[А-Я]+.\w[А-Я]+.\w[А-Я]+|[А-Я]+.\w[А-Я]+'
# add this logic to account for issues caused by extra scenes tracking
sc_num = int(float(sc_num))
scene_status = get_scene_status(scene)
if scene_status == 'extra':
    sc_num = str(sc_num)+ '.'+str(extra_scene_number)
    extra_scene_number+=1
else:
    sc_num +=1
    extra_scene_number = 1
# split a scene string into two substrings, one with cast, the other - without
scene_cast, scene_itself = split_a_scene(scene)
# check if the cast for the new scene is different from the previous scene
if len (scene_names) >0:
    no_change = compare_two_scenes(scene_cast, complete_scene_info[scene_names[-1]])
    if no_change:
        scene_status = no_change            
#check to make sure all character names are in scene cast as they appear in the play cast
quality_check_cast(scene_cast)
scene_names.append(str(sc_num)+'_'+str(scene_status))
utterances =  [name.group() 
              for name in re.finditer(name_pattern, scene_itself)]
print(utterances)
scene_summary = count_utterances(scene_cast, utterances)
scene_summary['num_utterances'] = sum(list(scene_summary.values()))
scene_summary['num_speakers'] = count_speaking_characters(scene_summary, scene_cast)
complete_scene_info[str(sc_num)+'_'+str(scene_status)] =  scene_summary

In [85]:
process_play('R_9.txt', pd.read_csv('../Russian_Comedies.txt', sep='\t'))

Character cast text :

ГРАФИНЯ ГОРДАНА, вдова изъ мѣщанской породы, <alternative_name ГОРДАНА> 
Г-НЯ МИЛАНА <alternative_name МИЛАНА>, 
Г-ФЪ ВСПЫЛЬЧИНЪ, ея ѣшй <alternative_name ВСПЫЛЬЧИНЪ> 
ЗЛОВРЕДЪ, бѣдной дворянинъ живущій въ ' ч ея домѣ.
БОГАТОНЪ, богатой дворянинъ.
КРАСАНА, 
МИЛОНЪ.
РАЗУМА, бѣдная дворянка воспитанная въ его домѣ.
ЧУВСТВИНЪ., нѣжной романической лю-
ПУСТОНЪ.
Э3ЕЛЬСКОПФЪ., Нѣмецъ Лѣкарь.
ПАНДАРЪ, французъ дворецкой , 
ДОГАДА, служанка , } Горданы.
ПРОСТАНА,служанка Красаны.
ТУЗИМЪ, слуга Зловреда.
бовникъ.
МУЗЫКАНТЫ <alternative_name IЙ МУЗЫКАНТЪ> <collective_number 1> , 
СЛУГИ <alternative_name СЛУГА, СЛУГА ГОРДАНЫ> <collective_number 1> и 
СЛУЖАНКИ ГОРДАНЫ <alternative_name СЛУЖАНКИ> <collective_number 1>. 
СЛУГИ БОГАТОНА <alternative_name СЛУГА, СЛУГА БОГАТОНА> <collective_number 1>. 

<stage>Дѣйствіе въ столицѣ. </stage>



{'ГРАФИНЯ ГОРДАНА': {'alternative_names': ['ГОРДАНА'], 'collective_number': None}, 'Г-НЯ МИЛАНА': {'alternative_names': ['МИЛАНА']

{'title': "V sem''e ne bez uroda",
 'author': 'Unknown, Unknown',
 'creation_date': 1813,
 'characters': {'ГРАФИНЯ ГОРДАНА': {'alternative_names': ['ГОРДАНА'],
   'collective_number': None},
  'Г-НЯ МИЛАНА': {'alternative_names': ['МИЛАНА'], 'collective_number': None},
  'Г-ФЪ ВСПЫЛЬЧИНЪ': {'alternative_names': ['ВСПЫЛЬЧИНЪ'],
   'collective_number': None},
  'ЗЛОВРЕДЪ': {'alternative_names': None, 'collective_number': None},
  'БОГАТОНЪ': {'alternative_names': None, 'collective_number': None},
  'КРАСАНА': {'alternative_names': None, 'collective_number': None},
  'МИЛОНЪ': {'alternative_names': None, 'collective_number': None},
  'РАЗУМА': {'alternative_names': None, 'collective_number': None},
  'ЧУВСТВИНЪ': {'alternative_names': None, 'collective_number': None},
  'ПУСТОНЪ': {'alternative_names': None, 'collective_number': None},
  'Э3ЕЛЬСКОПФЪ': {'alternative_names': None, 'collective_number': None},
  'ПАНДАРЪ': {'alternative_names': None, 'collective_number': None},
  'ДОГАДА': {

In [69]:
process_all_plays('.', '../../Test/')

['R_20.txt', 'R_23.txt', 'R_26.txt', 'R_24.txt', 'R_18.txt', 'R_19.txt', 'R_25.txt', 'R_8.txt', 'R_9.txt', 'R_7.txt', 'R_4.txt', 'R_5.txt', 'R_14.txt', 'R_15.txt', 'R_13.txt', 'R_11.txt']


Unnamed: 0,Index,Title,Last_Name,First_Name,Date,Translation,Number_of_Acts,URL,original_file_format,derived_format
0,R_1,Samoliubivyi stikhotvorets,Nikolev,Nikolai,1775,0,5.0,https://dracor.org/api/corpora/rus/play/nikole...,TEI,
1,R_2,Khvastun,Kniazhnin,Iakov,1785,0,5.0,https://dracor.org/api/corpora/rus/play/knyazh...,TEI,
2,R_3,Chudaki,Kniazhnin,Iakov,1790,0,5.0,https://dracor.org/api/corpora/rus/play/knyazh...,TEI,
3,R_4,Prestupnik ot igry ili bratom prodannaia sestra,Efim’ev,Dmitrii,1788,0,5.0,,pdf,txt
4,R_5,Smekh i gore,Klushin,Aleksandr,1792,0,5.0,http://az.lib.ru/k/klushin_a_i/text_0030.shtml,,txt


R_20.txt
Character cast text >:

ПЛАМЕНОВЪ
ТОНСКОЙ	<	Авторы:
ТРУТНЕВЪ	'	...........
ЛЮБИМЪ , другъ Пламенова.
ЛЕЗВИНСКІЙ, журналистъ.
ПІОНИНА
КНЯГИНЯ ТИРСКАЯ., молодая вдова, <alternative_name КНЯГИНЯ> 
ея племянница............Г-
ПАРАША, горничная Княгини. .
ЧЛЕНЫ ЛИТЕРАТУРНАГО ОБЩЕСТВА <alternative_name ОДИН ИЗЪ ЧЛЕНОВЪ>  <collective_number 1> 
ПОСѢТИТЕЛИ
Г. Шогаловъ.
Г. Женскіп.
Г. Щспкпнъ.
Г. Кодловскій.
Г.
Г-жа Лисицына
жа Львова-Сннецкая.
Г-жа Нагаева.
<stage>Дѣйствіе въ МосквЬ, вь домЬ Княгини Тирской</stage>.

<stage>Богато убранная зала; на авансценѣ кЪ сшоронЪ большой сіполЪ, накрытый краснымЪ сукномЪ ; кругомЪ онаго креслы, на столѣ журналы, бумаги и проч..., двое дверей, на право вЪ гостиную, и вЪ задиемЪ занавЪсЪ для входа.</stage>



{'ПЛАМЕНОВЪ': {'alternative_names': None, 'collective_number': None}, 'ТОНСКОЙ': {'alternative_names': None, 'collective_number': None}, 'ТРУТНЕВЪ': {'alternative_names': None, 'collective_number': None}, 'ЛЮБИМЪ': {'alternative_names': None,

R_24.txt
Character cast text 
ГРАФЪ ВЕЛЛИНСКОЙ <alternative_name ГРАФЪ>
ГРАФИНЯ
ЛИЗАВЕТА ИВАНОВНА <alternative_name ЛИЗА>
БЕЛЬСКОЙ.
КНЯЗЬ БАБИКОВЪ. <alternative_name КНЯЗЬ>
КНЯГИНЯ ВѢТРИНСКАЯ. <alternative_name КНЯГИНЯ>
АНТИПЪ.
<not_listed_character ЯМЩИКЪ> 
<not_listed_character СЛУГА <alternative_name ЛАКЕЙ>> 
<stage>w w w w, w w w w w w-w</stage>


{'ГРАФЪ ВЕЛЛИНСКОЙ': {'alternative_names': ['ГРАФЪ'], 'collective_number': None}, 'ГРАФИНЯ': {'alternative_names': None, 'collective_number': None}, 'ЛИЗАВЕТА ИВАНОВНА': {'alternative_names': ['ЛИЗА'], 'collective_number': None}, 'БЕЛЬСКОЙ': {'alternative_names': None, 'collective_number': None}, 'КНЯЗЬ БАБИКОВЪ': {'alternative_names': ['КНЯЗЬ'], 'collective_number': None}, 'КНЯГИНЯ ВѢТРИНСКАЯ': {'alternative_names': ['КНЯГИНЯ'], 'collective_number': None}, 'АНТИПЪ': {'alternative_names': None, 'collective_number': None}, 'ЯМЩИКЪ': {'alternative_names': None, 'collective_number': None}, 'СЛУГА': {'alternative_names': ['ЛАКЕЙ'], 'collectiv

R_8.txt
Character cast text :
ДОБРОНЪ.
ЛЮБИМА, его дочъ.
ЧЕСТИНА,
ЧЕСТИНЪ.
СВОЕНРАВЪ......,
И3ЛѢТЪ.
ПЕРЕСУДИНЪ.
КРЮЧКОТВОРЪ, судъя уволенный отъ должности.
ВИРШИНЪ, поэтъ.
ВѢТРАНА, служанка Любимы.
СКОРОХВАТЪ, слуга Своенрава.
ФАЛИНЪ, слуга Пересудина.
СЛУГИ Доброна <collective_number 1>

<stage>Дѣйствіе въ деревнѣ, въ дом Добропа</stage>
 

{'ДОБРОНЪ': {'alternative_names': None, 'collective_number': None}, 'ЛЮБИМА': {'alternative_names': None, 'collective_number': None}, 'ЧЕСТИНА': {'alternative_names': None, 'collective_number': None}, 'ЧЕСТИНЪ': {'alternative_names': None, 'collective_number': None}, 'СВОЕНРАВЪ': {'alternative_names': None, 'collective_number': None}, 'И3ЛѢТЪ': {'alternative_names': None, 'collective_number': None}, 'ПЕРЕСУДИНЪ': {'alternative_names': None, 'collective_number': None}, 'КРЮЧКОТВОРЪ': {'alternative_names': None, 'collective_number': None}, 'ВИРШИНЪ': {'alternative_names': None, 'collective_number': None}, 'ВѢТРАНА': {'alternative_names': None, 'colle

IndexError: list index out of range

In [None]:
pattern =r'[А-Я+Ѣ+І]+.\w[А-Я+Ѣ+І]+ [А-Я+Ѣ+І] '
test_str = 'КАМЕРДИНЕРЪ К. ВРОНСКОГО.'
[i.group() for i in re.finditer(pattern, test_str)]