This notebook contains the pipeline for extracting features from the Russian comedies in the .txt format.

In [1]:
import os
import pandas as pd
from os import listdir
from os import path
from os.path import isfile, join
import re
import numpy as np
import string 
from collections import Counter
import json

In [2]:
# Load the tab-separated metadata table describing the plays in the corpus.
metadata_file = pd.read_csv('Russian_Comedies.txt', sep='\t')

In [3]:
# Preview the first three metadata rows.
metadata_file.head(3)

Unnamed: 0,index,title,first_name,last_name,creation_date,translation,num_acts,url,original_file_format,derived_format,additional_info,Unnamed: 11
0,R_1,Samoliubivyi stikhotvorets,Nikolev,Nikolai,1775,0,5,https://dracor.org/api/corpora/rus/play/nikole...,TEI,TEI,"iambic hexameter, paired rhyme",
1,R_2,Khvastun,Kniazhnin,Iakov,1785,0,5,https://dracor.org/api/corpora/rus/play/knyazh...,TEI,TEI,"iambic hexameter, paired rhyme",
2,R_3,Chudaki,Kniazhnin,Iakov,1790,0,5,https://dracor.org/api/corpora/rus/play/knyazh...,TEI,TEI,"iambic hexameter, varying rhyming patterns",


In [4]:
def split_text(play_file, old_ortho_flag=True):
    """
    The function splits the text into two parts: the first with the dramatic character cast
    and the second one with the text of the play.
    Params:
        play_file - string with the text of the entire play.
        old_ortho_flag - bool, indicating whether the text is in the old Russian orthography.
    Returns:
        cast_text - string, everything between the cast heading and the first act heading.
        play_text - string, the play text starting at the first act heading.
    Raises:
        ValueError - if the cast heading or the first act heading cannot be found.
    """
    if old_ortho_flag:
        acting_characters = 'ДѢЙСТВУЮЩІЕ'
        act = 'ДѢЙСТВІЕ'
    else:
        acting_characters = 'ДЕЙСТВУЮЩИЕ'
        act = 'ДЕЙСТВИЕ'

    # Known spellings of the cast heading, tried in order of preference.
    cast_headings = [
        '{} ЛИЦА'.format(acting_characters),
        '{} <ЛИЦА>'.format(acting_characters),
        '<{}> ЛИЦА'.format(acting_characters),
        'ДѢЙСТВУЮЩIЯ ЛИЦА',
    ]
    cast_start_index = -1
    for heading in cast_headings:
        cast_start_index = play_file.find(heading)
        if cast_start_index != -1:
            break
    if cast_start_index == -1:
        # Without this check the -1 would silently be used as a slice index.
        raise ValueError('The heading of the dramatic character cast was not found.')

    # Search for the first act only after the cast heading, so that an earlier
    # occurrence of the act word cannot produce an empty cast.
    cast_end_index = play_file.find(act, cast_start_index)
    if cast_end_index == -1:
        raise ValueError('The heading of the first act ({}) was not found.'.format(act))

    # Split once only: the cast list itself may contain the word used in the heading.
    cast_text = play_file[cast_start_index:cast_end_index].split('ЛИЦА', 1)[1]
    play_text = play_file[cast_end_index:]

    return cast_text, play_text

In [5]:
def identify_character_names(line):
    """
    Find the upper-case character names that occur in a single line of text.
    Params:
        line - a string, one line of the text with characters (the text is split at '\n').
    Returns:
        a list with every name matched in the line, in order of appearance,
        or the integer 0 when the line contains no name.
    """
    pattern = r'[А-Я+Ѣ+І]+.\w[А-Я+Ѣ+І]+.\w[А-Я+Ѣ+І]+.\w[А-Я+Ѣ+І]+.\w[А-Я+Ѣ+І]+|[А-Я+Ѣ+І]+.\w[А-Я+Ѣ+І]+ [А-Я+Ѣ+І] |[А-Я+Ѣ+І]+.\w[А-Я+Ѣ+І]+.\w[А-Я+Ѣ+І]+.\w[А-Я+Ѣ+І]+|[А-Я+Ѣ+І]+.\w[А-Я+Ѣ+І]+.\w[А-Я+Ѣ+І]+|[А-Я+Ѣ+І]+.\w[А-Я+Ѣ+І]+'
    found_names = []
    for match in re.finditer(pattern, line):
        found_names.append(match.group())
    # Callers test the result against 0, so an empty result is reported as 0, not [].
    return found_names if found_names else 0

In [6]:
def handle_alternative_names(line):
    """
    The function extracts the alternative names of a dramatic character from a line of the character list.
    Params:
        line - a string corresponding to a line in cast_text; expected to contain
               an '<alternative_name ...>' tag with names separated by ', '.
    Returns:
        character_name - string, the first name identified in the line, stripped of whitespace.
        alternative_names - a list of strings with the alternative names listed inside the tag.
    """
    character_name = identify_character_names(line)[0].strip()
    tag = '<alternative_name'
    names_start = line.find(tag) + len(tag)
    # Look for the bracket that closes this tag; the first '>' of the line may
    # belong to another tag (e.g. a collective number) placed before it.
    names_end = line.find('>', names_start)
    alternative_names = line[names_start:names_end].strip().split(', ')

    return character_name, alternative_names

In [7]:
def make_reverse_dictionary(cast_dictionary):
    """
    Build a lookup from every alternative name back to the name used in the cast list,
    e.g. from 'COUNT' to 'COUNT VIAZEMSKII'.
    Params:
        cast_dictionary - dictionary where keys are character names as they appear in the cast list and
                          values are dictionaries holding an 'alternative_names' entry (a list or None).
    Returns:
        a dictionary mapping each alternative name to its cast-list name; characters
        without alternative names do not appear in it.
    """
    lookup = {}
    for full_name, details in cast_dictionary.items():
        aliases = details['alternative_names']
        if not aliases:
            continue
        lookup.update({alias: full_name for alias in aliases})

    return lookup

In [8]:
def get_collective_number(line):
    """
    The function extracts collective number, i.e., whether a character like guards should be counted as 1 or 2
    based on the text of the play, from a text line of the character cast.
    Params:
        line - a string corresponding to a line in dramatic character cast; expected to
               contain a '<collective_number N>' tag.
    Returns:
        name - dramatic character name (the first name identified in the line).
        collective_number - string with the digits of the last collective number tag, e.g. '2'.
    Raises:
        ValueError - if the line has no well-formed '<collective_number N>' tag.
    """
    name = identify_character_names(line)[0]
    # \d+ rather than \d, so that numbers with more than one digit are accepted too.
    numbers = re.findall(r'<collective_number (\d+)>', line)
    if not numbers:
        raise ValueError('No collective number found in line: {}'.format(line))

    return name, numbers[-1]

In [9]:
def identify_character_cast(cast_string):
    """
    Parse the dramatic character cast line by line and record, for every character,
    its alternative names and collective number (None when the line does not give them).
    Params:
        cast_string - a string containing a list of dramatic characters of the play.
    Returns:
        cast - a dictionary where keys are dramatic characters and values are dictionaries
        with the keys 'alternative_names' and 'collective_number'.
    """
    cast = {}
    for line in cast_string.split('\n'):
        characters = identify_character_names(line)
        # lines without any character name are skipped
        if characters == 0:
            continue
        has_alternative_names = line.find('<alternative_name') != -1
        has_collective_number = line.find('<collective_number ') != -1
        character_name = characters[0]
        alternative_names = None
        collective_number = None
        if has_alternative_names:
            character_name, alternative_names = handle_alternative_names(line)
        if has_collective_number:
            collective_name, collective_number = get_collective_number(line)
            # the name reported with the alternative names takes precedence
            if not has_alternative_names:
                character_name = collective_name
        cast[character_name] = {'alternative_names': alternative_names,
                                'collective_number': collective_number}

    return cast

In [10]:
def split_a_scene(scene_string):
    """
    Separate the '<cast ...>' tag of a scene from the rest of the scene text.
    Params:
        scene_string - string with the text of the scene.
    Returns:
        scene_cast - a list of dramatic character names listed in the cast tag (split at ', ').
        scene_itself - a string with the scene text that follows, with the cast tag removed.
    """
    from_cast_tag = scene_string[scene_string.find('<cast '):]
    closing_bracket = from_cast_tag.find('>')
    cast_tag_body = from_cast_tag[:closing_bracket]
    scene_cast = cast_tag_body.replace('<cast ', '').strip().split(', ')
    scene_itself = from_cast_tag.replace(cast_tag_body + '>', '')

    return scene_cast, scene_itself

In [11]:
def quality_check_cast(cast_list, character_cast_dictionary, reverse_character_cast):
    """
    The function checks if all dramatic characters which are listed for a particular scene can be found
    either in the character_cast_dictionary or reverse_character_cast. This allows us to check for potential errors
    in the text.
    Params:
        cast_list - a list with dramatic characters which are expected in the scene.
        character_cast_dictionary - a dictionary whose keys are dramatic character names as they appear in the list.
        reverse_character_cast - a dictionary whose keys are the alternative names of the characters.
    Returns:
        None.
    Raises:
        Exception - for the first name that is found in neither dictionary; the message includes the name.
    """
    for name in cast_list:
        if name not in character_cast_dictionary and name not in reverse_character_cast:
            # Include the offending name so the error in the text can be located.
            raise Exception('Error. Name not found: {}'.format(name))

In [12]:
def get_scene_status(scene):
    """
    Classify a scene by the markup tags found in its text:
    'no_change' - the text contains '<no_change_SCENE>' (the scene is marked in the publication,
                  but the character cast does not change);
    'extra'     - the text contains another tag ending in 'SCENE>' (an entrance or exit that is
                  not marked as a scene in the publication);
    'regular'   - neither of the above, the scene simply corresponds to the publication.
    Params:
        scene - a string with the text of a play scene.
    Returns:
        scene_status - either 'regular', 'no_change' or 'extra'.
    """
    # '<no_change_SCENE>' itself ends in 'SCENE>', so it has to be tested first
    if scene.count('<no_change_SCENE>') > 0:
        return 'no_change'
    if scene.count('SCENE>') > 0:
        return 'extra'
    return 'regular'

In [13]:
def check_alternative_names(name, character_cast_dictionary, reverse_character_cast):
    """
    The function looks up the alternative names recorded for a dramatic character.
    Params:
        name - string, dramatic character name (either as in the cast list or an alternative name).
        character_cast_dictionary - dictionary keyed by cast-list names; values hold 'alternative_names'.
        reverse_character_cast - dictionary mapping alternative names to cast-list names.
    Returns:
        alt_names - the 'alternative_names' entry of the character (a list of names or None).
    Raises:
        KeyError - if the name is found in neither dictionary.
    """
    try:
        alt_names = character_cast_dictionary[name]['alternative_names']
    except KeyError:
        # The name is itself an alternative name: resolve it to the cast-list name first.
        alt_names = character_cast_dictionary[reverse_character_cast[name]]['alternative_names']

    # Return on both paths; previously the result was lost when the first lookup succeeded.
    return alt_names

In [14]:
def check_utternaces_by_alternative_names(alt_names, reverse_character_cast, utterances):
    """
    Count utterances that appear in the text under alternative dramatic character names.
    Params:
        alt_names - a list of alternative names for a dramatic character.
        reverse_character_cast - a dictionary where keys are names as they appear in the text, values - names as they
                                appear in the list at the beginning of the play. Not used by the count;
                                kept so that existing calls keep working.
        utterances - a list of dramatic character names extracted from the text of the scene.
    Returns:
        speaker_total - int, the number of utterances by a speaker in the scene.
    """
    # list.count never raises KeyError, so no fallback lookup is needed here.
    speaker_total = sum(utterances.count(alt) for alt in alt_names)

    return speaker_total

In [15]:
def count_utterances(scene_cast, utterances, character_cast_dictionary, reverse_character_cast):
    """
    Count how many utterances each dramatic character listed for the scene has.
    Params:
        scene_cast - a list of dramatic characters which are present in the scene.
        utterances - a list of dramatic character names extracted from the text of the scene.
        character_cast_dictionary, reverse_character_cast - dictionaries for looking up alternative character names.
    Returns:
        scene_info - a dictionary where keys are dramatic character names and values are numbers of utterances.
    """
    # a character who is alone in a scene is credited with exactly one utterance
    if len(scene_cast) == 1:
        return {scene_cast[0]: 1}

    scene_info = {}
    for name in scene_cast:
        direct_count = utterances.count(name)
        if direct_count != 0:
            scene_info[name] = direct_count
            continue
        # nothing under this name: the character may speak under an alternative name
        alt_names = check_alternative_names(name, character_cast_dictionary, reverse_character_cast)
        if alt_names:
            # several alternative names may belong to the same character
            scene_info[name] = check_utternaces_by_alternative_names(alt_names,
                                                                     reverse_character_cast,
                                                                     utterances)
        else:
            scene_info[name] = utterances.count(name)

    return scene_info

In [16]:
def compare_two_scenes(cast_one, cast_two):
    """
    Tell whether two scenes have the same dramatic characters, ignoring order and repetitions.
    Params:
        cast_one - a list of characters in scene one.
        cast_two - a list of characters in scene two.
    Returns:
        the string 'no_change_scene' if both casts contain the same characters, None otherwise.
    """
    return 'no_change_scene' if set(cast_one) == set(cast_two) else None

In [17]:
def count_speaking_characters(scene_summary_dict, scene_cast):
    """
    Count the characters of the scene cast that have at least one utterance recorded.
    Params:
        scene_summary_dict: a dictionary where keys are dramatic characters and values are number of utterances.
        scene_cast - a list of characters present in the scene.
    Returns:
        num_speakers - a number of speaking dramatic characters in the scene.
    """
    num_speakers = 0
    for character, utterance_count in scene_summary_dict.items():
        # entries that are not in the scene cast are ignored
        if utterance_count != 0 and character in scene_cast:
            num_speakers += 1

    return num_speakers

In [18]:
def handle_scene_name_and_count(scene, sc_num, extra_scene_number):
    """
    Determine the status of a scene and the number it should be given. Extra scenes are numbered
    within their main scene, e.g. '1.1' is the first extra scene of the main scene 1.
    Params:
        scene - text of a scene.
        sc_num - number given to the previous scene (an int, or a string such as '1.1').
        extra_scene_number - the number the next extra scene of the current main scene would receive.
    Returns:
        scene_status - whether a scene is regular, no_change, or extra.
        sc_num - for an extra scene a string '<main scene>.<extra number>', otherwise the
                 next main scene number as an int.
        extra_scene_number - incremented after an extra scene, reset to 1 otherwise.
    """
    # drop the extra-scene part of the previous number, e.g. '1.2' -> 1
    main_scene = int(float(sc_num))
    scene_status = get_scene_status(scene)
    if scene_status != 'extra':
        return scene_status, main_scene + 1, 1

    extra_label = '{}.{}'.format(main_scene, extra_scene_number)
    return scene_status, extra_label, extra_scene_number + 1

In [19]:
def parse_scenes(scenes, name_pattern, character_cast_dictionary, reverse_character_cast):
    """
    The function summarises every scene of an act: utterances per character, the total number of
    utterances, the number of speakers and the percentage of non-speaking characters.
    Params:
        scenes - a list of strings, the texts of the scenes of one act.
        name_pattern - regular expression used to extract speaker names from the scene text.
        character_cast_dictionary, reverse_character_cast - dictionaries for looking up character names.
    Returns:
        complete_scene_info - a dictionary where keys are '<scene number>_<scene status>' and values are
                              the scene summaries.
    Raises:
        Exception - from quality_check_cast, if a scene lists a character unknown to the cast.
    """
    sc_num = 0
    complete_scene_info = {}
    extra_scene_number = 1
    previous_cast = None
    for scene in scenes:
        scene_status, sc_num, extra_scene_number = handle_scene_name_and_count(scene, sc_num, extra_scene_number)
        # split a scene string into two substrings, one with cast, the other - without
        scene_cast, scene_itself = split_a_scene(scene)
        # check if the cast for the new scene is different from the previous scene;
        # the comparison uses the previous cast list, not the previous summary, because
        # the summary also holds derived keys and could never equal a cast
        if previous_cast is not None:
            no_change = compare_two_scenes(scene_cast, previous_cast)
            if no_change:
                scene_status = no_change
        previous_cast = scene_cast
        # check to make sure all character names are in scene cast as they appear in the play cast
        quality_check_cast(scene_cast, character_cast_dictionary, reverse_character_cast)
        scene_name = str(sc_num) + '_' + str(scene_status)
        utterances = [name.group().strip() for name in re.finditer(name_pattern, scene_itself)]
        scene_summary = count_utterances(scene_cast, utterances, character_cast_dictionary, reverse_character_cast)
        scene_summary['num_utterances'] = sum(list(scene_summary.values()))
        scene_summary['num_speakers'] = count_speaking_characters(scene_summary, scene_cast)
        scene_summary['perc_non_speakers'] = ((len(scene_cast) - scene_summary['num_speakers']) /
                                              len(scene_cast)) * 100
        complete_scene_info[scene_name] = scene_summary

    return complete_scene_info

In [20]:
def parse_play(play_text, name_pattern, number_acts, old_ortho_flag, 
               character_cast_dictionary, reverse_character_cast):
    """
    Split the play into acts and scenes and summarise the scenes of every act.
    Params:
        play_text - string with the text of the play, starting at the first act.
        name_pattern - regular expression used to extract speaker names from the scene text.
        number_acts - the number of acts the play is expected to have.
        old_ortho_flag - bool, indicating whether the text is in the old Russian orthography.
        character_cast_dictionary, reverse_character_cast - dictionaries for looking up character names.
    Returns:
        act_info - a dictionary where keys are 'act_<number>' and values are the outputs of parse_scenes.
        If the number of acts found differs from number_acts, a message is printed and None is returned.
    """
    if old_ortho_flag == True:
        rus_act, rus_scene = 'ДѢЙСТВІЕ', 'ЯВЛЕНІЕ'
    else:
        rus_act, rus_scene = 'ДЕЙСТВИЕ', 'ЯВЛЕНИЕ'

    # everything before the first act heading is discarded
    acts = play_text.split(rus_act)[1:]
    if len(acts) != number_acts:
        print('The number of acts is not {}.'.format(number_acts))
        return None

    scene_separator = '{}|<extra'.format(rus_scene)
    act_info = {}
    for act_num, act_text in enumerate(acts, 1):
        # the text before the first scene heading of the act is discarded
        scenes = re.split(scene_separator, act_text)[1:]
        act_info['act_' + str(act_num)] = parse_scenes(scenes,
                                                       name_pattern,
                                                       character_cast_dictionary,
                                                       reverse_character_cast)

    return act_info

In [21]:
def splitting_verse_line(scene):
    """
    Split a text at the tags that mark the end of a verse line.
    Params:
        scene - a string with the text to split.
    Returns:
        a list of strings, the pieces between '<end_verse_line>' and
        '<end_verse_line_interscene_rhyme>' tags.
    """
    return re.split('<end_verse_line>|<end_verse_line_interscene_rhyme>', scene)

In [22]:
def remove_numbers(input_string):
    """
    Remove every digit from a string.
    Params:
        input_string - a string that may contain numbers.
    Returns:
        the string without any digits; all other characters are left untouched.
    """
    # A single regex pass removes all digits. Replacing the found numbers one by one
    # could leave digits behind, e.g. '1 21' became ' 2' once every '1' was removed.
    return re.sub(r'\d+', '', input_string)

In [23]:
def replace_tags(line):
    """
    Replace every '<stage>...</stage>' span of the line with the placeholder ' STAGE '.
    Params:
        line - a string that may contain stage directions wrapped in stage tags.
    Returns:
        the string with the stage directions replaced.
    """
    opening, closing = '<stage>', '</stage>'
    # the number of passes is fixed up front by the number of opening tags
    passes_left = line.count(opening)
    while passes_left > 0:
        text_start = line.find(opening) + len(opening)
        text_end = line.find(closing)
        stage_direction_text = line[text_start:text_end]
        line = line.replace(opening + stage_direction_text + closing, ' STAGE ')
        passes_left -= 1

    return line

In [24]:
def clean_stage_direction(line):
    """
    Blank out markup tag names, punctuation and upper-case entities (scene headings,
    character names) in a line, replacing each occurrence with a single space.
    Params:
        line - a string with a piece of the play text.
    Returns:
        the cleaned string.
    """
    # the entities are collected before anything is replaced in the line
    entities = re.findall(r'ЯВЛЕНІЕ +\w+|ЯВЛЕНИЕ +\w+|[А-Я+Ѣ+І]+.\w[А-Я+Ѣ+І]+.\w[А-Я+Ѣ+І]+|[А-Я+Ѣ+І]+.\w[А-Я+Ѣ+І]+', line)
    tags = ['extra_SCENE', 'cast', 'no_change_SCENE', 'intermedia', 'stage separator',
            'speaker_clarification', 'speaking_character_no_utterance']
    punctuation = list(string.punctuation + '—' + '\n' + '\t')
    # order matters: tag names first, then punctuation, then entities
    for fragment in tags + punctuation + entities:
        line = line.replace(fragment, ' ')

    return line

In [25]:
def remove_start_end_stage_directions(verse_line):
    """
    Strip 'STAGE' placeholders (and surrounding whitespace) from both ends of a verse line,
    so that only placeholders inside the line remain.
    Params:
        verse_line - a string with a verse line in which stage directions were replaced by 'STAGE'.
    Returns:
        the trimmed string.
    """
    placeholder = 'STAGE'
    trimmed = verse_line.strip()
    while True:
        if trimmed[-5:] == placeholder:
            trimmed = trimmed[:-5].strip()
        elif trimmed[:5] == placeholder:
            trimmed = trimmed[5:].strip()
        else:
            return trimmed

In [26]:
def estimate_verse_line_splitting_stage_directions(text_string):
    """
    Count the stage directions that stand inside a verse line, i.e. that still remain
    after the stage directions at the start and at the end of every verse line are removed.
    Params:
        text_string - a string with the text of the play.
    Returns:
        number_splitting_stage_directions - int, the number of such stage directions.
    """
    number_splitting_stage_directions = 0
    for verse_line in splitting_verse_line(text_string):
        # remove any numbers that could appear in the string
        without_numbers = remove_numbers(verse_line)
        with_placeholders = replace_tags(without_numbers)
        cleaned = clean_stage_direction(with_placeholders)
        inner_part = remove_start_end_stage_directions(cleaned)
        number_splitting_stage_directions += inner_part.count('STAGE')

    return number_splitting_stage_directions

In [27]:
def count_number_word_tokens(play_text):
    """
    Count the word tokens inside all stage directions of a play.
    Params:
        play_text - a string with the text of the play; stage directions are wrapped in
                    '<stage>' and '</stage>' tags.
    Returns:
        total_number_word_tokens - int, the number of whitespace-separated tokens left in the
                                   stage directions after punctuation is removed.
    """
    # Opening and closing tags are paired by their order of appearance in the text.
    direction_starts = [match.end() for match in re.finditer(r'<stage>', play_text)]
    direction_ends = [match.start() for match in re.finditer(r'</stage>', play_text)]
    punctuation_removal = str.maketrans('', '', string.punctuation + '—')

    total_number_word_tokens = 0
    for start, end in zip(direction_starts, direction_ends):
        stage_direction = play_text[start:end].translate(punctuation_removal)
        # split() without arguments ignores repeated whitespace and treats newlines and tabs
        # as separators, so a removed dash or an empty direction adds no phantom token.
        total_number_word_tokens += len(stage_direction.split())

    return total_number_word_tokens

In [28]:
def check_end_of_scene(scene_string):
    """
    Check whether a scene stops in the middle of a verse line.
    Params:
        scene_string - a string with the text of a scene.
    Returns:
        True if the string does not end with '<end_verse_line>' or
        '<end_verse_line_interscene_rhyme>', False if it ends with one of them.
    """
    closing_tags = ('<end_verse_line>', '<end_verse_line_interscene_rhyme>')
    return not scene_string.endswith(closing_tags)

In [29]:
def tackle_alternative_scene(play_text):
    """
    Turn scene headings written as a tag ('<ЯВЛЕНІЕ>' or '<ЯВЛЕНИЕ>') into plain headings
    by dropping the angle brackets.
    Params:
        play_text - a string with the text of the play.
    Returns:
        the text with the tagged scene headings replaced.
    """
    for tagged_heading, plain_heading in (('<ЯВЛЕНІЕ>', 'ЯВЛЕНІЕ'), ('<ЯВЛЕНИЕ>', 'ЯВЛЕНИЕ')):
        play_text = play_text.replace(tagged_heading, plain_heading)

    return play_text

In [30]:
def verse_split_between_scenes(play_text, old_ortho_flag=True):
    """
    Count the scenes whose text ends in the middle of a verse line and the scenes that
    contain an interscene rhyme tag.
    Params:
        play_text - a string with the text of the play.
        old_ortho_flag - bool, selects the spelling of the scene heading used for splitting.
    Returns:
        scenes_split_verses - number of scenes that do not end with an end-of-verse-line tag,
                              both before and after punctuation, 'STAGE' placeholders and
                              upper-case entities are removed.
        scenes_rhymes - number of scenes containing '<end_verse_line_interscene_rhyme>'.
        both - number of scenes counted in scenes_split_verses whose cleaned text also
               contains '<end_verse_line_interscene_rhyme>'.
    """
    if old_ortho_flag:
        rus_scene = 'ЯВЛЕНІЕ'
    else:
        rus_scene = 'ЯВЛЕНИЕ'
    # normalise scene headings written as tags, so they are split like ordinary headings
    play_text = tackle_alternative_scene(play_text)
    scenes_split_verses = 0
    scenes_rhymes = 0
    both = 0
    scenes = re.split('{}|<extra'.format(rus_scene),play_text)
    # the text before the first scene heading is skipped
    for scene in scenes[1:]:
        if scene.count('<end_verse_line_interscene_rhyme>') > 0:
            scenes_rhymes +=1
        scene_cleaned = replace_tags(remove_numbers(scene)).strip()
        # first check: cheap test on the text with stage directions replaced by placeholders
        if check_end_of_scene(scene_cleaned):
                # entities are taken from the untouched scene text
                entities = re.findall(r'ЯВЛЕНІЕ +\w+|ЯВЛЕНИЕ +\w+|[А-Я+Ѣ+І]+.\w[А-Я+Ѣ+І]+.\w[А-Я+Ѣ+І]+|[А-Я+Ѣ+І]+.\w[А-Я+Ѣ+І]+', scene)
                # '_', '<' and '>' are kept because the end-of-verse-line tags are made of them
                symbols = [symb for symb in string.punctuation + '—' + '\n' + '\t' 
                           if symb not in ['_', '<', '>']] + ['STAGE'] + entities
                for symbol in symbols:
                    scene_cleaned = scene_cleaned.replace(symbol, '').strip()
                # second check: the scene still does not end with an end-of-verse-line tag
                if check_end_of_scene(scene_cleaned):
                    scenes_split_verses+=1
                    if scene_cleaned.count('<end_verse_line_interscene_rhyme>') > 0:
                        both+=1
    return scenes_split_verses, scenes_rhymes, both

In [31]:
def estimate_number_scenes(scene_summary):
    """
    Count the scenes of a play in two ways, based on the status stored in the scene names.
    Params:
        scene_summary - a dictionary output of the parse_play function.
    Returns:
        total_number_scenes_per_text - number of scenes whose name contains 'regular'.
        total_number_scenes_iarkho - number of scenes whose name does not contain 'no_change', i.e. scenes
                                     per Iarkho, marked by actual entrances and exits (mobility coefficient, MC).
    """
    total_number_scenes_per_text = 0
    total_number_scenes_iarkho = 0
    for act_scenes in scene_summary.values():
        for scene_name in act_scenes.keys():
            if scene_name.count('regular') > 0:
                total_number_scenes_per_text += 1
            if scene_name.count('no_change') == 0:
                total_number_scenes_iarkho += 1

    return total_number_scenes_per_text, total_number_scenes_iarkho

In [32]:
def sigma_iarkho(variants, weights):
    """
    Calculate the standard range following Iarkho's procedure: the square root of the
    weighted mean of the squared deviations from the weighted mean.
    Parameters:
        variants - a list with distinct variants in the ascending order, e.g. [1, 2, 3, 4, 5]
        weights - a list of weights corresponding to these variants, e.g. [20, 32, 18, 9, 1]
    Returns:
        sigma - standard range per Iarkho
    """
    weighted_mean = np.average(variants, weights=weights)
    squared_deviations = []
    for variant in variants:
        squared_deviations.append((variant - weighted_mean) ** 2)
    weighted_variance = np.average(squared_deviations, weights=weights)

    return weighted_variance ** 0.5


In [33]:
def parse_play_summary(play_summary):
    """
    Add up the utterances of all scenes of all acts.
    Params:
        play_summary - a dictionary output by parse_play function.

    Returns:
        total_utterances_in_play - total number of utterances in a play.
    """
    return sum(scene_info['num_utterances']
               for act_scenes in play_summary.values()
               for scene_info in act_scenes.values())

In [34]:
def number_present_characters(play_dictionary):
    """
    Count the dramatic characters of the cast that appear in at least one scene, under their
    cast-list name or under one of their alternative names.
    Params:
        play_dictionary - a dictionary with the keys 'play_summary' (output of parse_play) and
                          'characters' (output of identify_character_cast).
    Returns:
        total_number_present_characters - int; a character with a collective number contributes
                                          that number, any other character contributes 1.
    """
    derived_keys = ('num_utterances', 'num_speakers', 'perc_non_speakers')
    # every name that occurs in any scene summary, without the derived statistics
    names_in_scenes = set()
    for act_scenes in play_dictionary['play_summary'].values():
        for scene_info in act_scenes.values():
            names_in_scenes.update(entry for entry in scene_info.keys() if entry not in derived_keys)

    total_number_present_characters = 0
    for character, details in play_dictionary['characters'].items():
        alt_names = details['alternative_names']
        possible_names = [character] + alt_names if alt_names else [character]
        if not names_in_scenes.intersection(possible_names):
            continue
        coll_number = details['collective_number']
        total_number_present_characters += int(coll_number) if coll_number else 1

    return total_number_present_characters

In [35]:
def percentage_of_each_speech_type(speech_distribution):
    """
    Calculate which percentage of scenes belongs to each speech type.
    Params:
        speech_distribution - a list of pairs, where the 0 element is the number of speaking characters
                              and the 1 element is the number of scenes with that number of speakers.
    Returns:
        a tuple with the percentages, rounded to two digits, of scenes with
        exactly one speaker, exactly two speakers, any number but two speakers, and more than two speakers.
    """
    total_scenes = np.sum([pair[1] for pair in speech_distribution])

    def percentage_where(speakers_match):
        matching_scenes = [pair[1] for pair in speech_distribution if speakers_match(pair[0])]
        return np.round((np.sum(matching_scenes) / total_scenes) * 100, 2)

    perc_monologue = percentage_where(lambda speakers: speakers == 1)
    perc_duologue = percentage_where(lambda speakers: speakers == 2)
    perc_non_duologue = percentage_where(lambda speakers: speakers != 2)
    perc_over_two_speakers = percentage_where(lambda speakers: speakers > 2)

    return (perc_monologue, perc_duologue, perc_non_duologue, perc_over_two_speakers)

In [36]:
def speech_distribution_iarkho(play_summary):
    """
    Build the speech distribution per Iarkho, i.e., the number of scenes for each number of
    speaking characters, together with the statistics derived from it.
    Params:
        play_summary - a dictionary output by parse_play function.
    Returns:
        speech_distribution - a list of tuples, sorted by the 0 element, where the 0 element is the number
                              of speaking characters and the 1 element is the number of scenes with
                              such number of speaking characters.
        speech_types - the output of percentage_of_each_speech_type for this distribution.
        av_perc_non_speakers - mean percentage of non-speaking characters per scene, rounded to 3 digits.
    """
    speakers_per_scene = []
    non_speakers_per_scene = []
    for act_scenes in play_summary.values():
        for scene_info in act_scenes.values():
            speakers_per_scene.append(scene_info['num_speakers'])
            non_speakers_per_scene.append(round(scene_info['perc_non_speakers'], 3))

    scenes_by_speakers = Counter(speakers_per_scene)
    speech_distribution = sorted(scenes_by_speakers.items(), key=lambda pair: pair[0])
    speech_types = percentage_of_each_speech_type(speech_distribution)
    av_perc_non_speakers = round(np.mean(non_speakers_per_scene), 3)

    return speech_distribution, speech_types, av_perc_non_speakers

In [37]:
def process_speakers_features(play_string, play_data, metadata_dict, old_ortho_flag):
    """
    The function adds the features related to dramatic characters and speakers to metadata_dict.
    Params:
        play_string - string with the text of the play (not used here; kept so that all
                      feature functions share the same signature).
        play_data - a dictionary with the keys 'characters' and 'play_summary'.
        metadata_dict - the dictionary the features are written to.
        old_ortho_flag - bool (not used here; kept for the shared signature).
    Returns:
        metadata_dict - the same dictionary with the speaker-related features added.
    """
    play_summary = play_data['play_summary']
    metadata_dict['num_present_characters'] = number_present_characters(play_data)
    # Each summary is computed once and then unpacked, instead of once per feature.
    num_scenes_text, num_scenes_iarkho = estimate_number_scenes(play_summary)
    speech_distribution, speech_types, av_perc_non_speakers = speech_distribution_iarkho(play_summary)

    metadata_dict['num_scenes_text'] = num_scenes_text
    metadata_dict['num_scenes_iarkho'] = num_scenes_iarkho
    metadata_dict['speech_distribution'] = speech_distribution
    metadata_dict['percentage_monologues'] = speech_types[0]
    metadata_dict['percentage_duologues'] = speech_types[1]
    metadata_dict['percentage_non_duologues'] = speech_types[2]
    metadata_dict['percentage_above_two_speakers'] = speech_types[3]
    metadata_dict['av_percentage_non_speakers'] = av_perc_non_speakers
    metadata_dict['sigma_iarkho'] = round(sigma_iarkho(
                                        [item[0] for item in speech_distribution],
                                        [item[1] for item in speech_distribution]), 3)

    return metadata_dict

In [38]:
def process_features_verse(play_string, play_data, metadata_dict, old_ortho_flag):
    """
    The function adds the verse-related features to metadata_dict.
    Params:
        play_string - string with the text of the play.
        play_data - a dictionary with the key 'play_summary'.
        metadata_dict - the dictionary the features are written to; it must already contain
                        'num_scenes_iarkho', which is used as the denominator of the percentages.
        old_ortho_flag - bool, indicating whether the text is in the old Russian orthography.
    Returns:
        metadata_dict - the same dictionary with the verse-related features added.
    """
    # The play is scanned once; the three counts come from the same pass.
    split_verse_scenes, split_rhyme_scenes, split_both_scenes = verse_split_between_scenes(play_string,
                                                                                           old_ortho_flag)
    num_scenes_iarkho = metadata_dict['num_scenes_iarkho']

    metadata_dict['total_utterances'] = parse_play_summary(play_data['play_summary'])
    metadata_dict['num_verse_lines'] = (play_string.count('<end_verse_line>') +
                                        play_string.count('<end_verse_line_interscene_rhyme>'))
    metadata_dict['dialogue_vivacity'] = round(metadata_dict['total_utterances'] /
                                               metadata_dict['num_verse_lines'], 3)
    metadata_dict['num_scenes_with_split_verse_lines'] = split_verse_scenes
    metadata_dict['num_scenes_with_split_rhymes'] = split_rhyme_scenes
    metadata_dict['percentage_scene_split_verse'] = round((split_verse_scenes / num_scenes_iarkho) * 100, 3)
    metadata_dict['num_scenes_with_split_rhymes_verses'] = split_both_scenes
    # scenes that have both a split verse line and a split rhyme are subtracted once,
    # so that they are not counted twice
    metadata_dict['num_open_scenes'] = split_verse_scenes + split_rhyme_scenes - split_both_scenes
    metadata_dict['percentage_open_scenes'] = round((metadata_dict['num_open_scenes'] /
                                                     num_scenes_iarkho) * 100, 3)
    metadata_dict['percentage_scenes_rhymes_split_verse'] = round((split_both_scenes /
                                                                   num_scenes_iarkho) * 100, 3)

    return metadata_dict

In [39]:
def process_stage_directions_features(play_string, play_data, metadata_dict, old_ortho_flag):
    """
    The function adds the features related to stage directions to metadata_dict.
    Params:
        play_string - string with the text of the play.
        play_data - not used here; kept so that all feature functions share the same signature.
        metadata_dict - the dictionary the features are written to; it must already contain
                        'num_verse_lines', which is used as the denominator of the frequencies.
        old_ortho_flag - bool (not used here; kept for the shared signature).
    Returns:
        metadata_dict - the same dictionary with the stage-direction features added.
    """
    num_stage_directions = play_string.count('<stage>')
    num_verse_lines = metadata_dict['num_verse_lines']

    # stage-directions related features
    metadata_dict['num_stage_directions'] = num_stage_directions
    metadata_dict['stage_directions_frequency'] = round((num_stage_directions / num_verse_lines) * 100, 3)
    metadata_dict['num_word_tokens_in_stage_directions'] = count_number_word_tokens(play_string)
    if num_stage_directions:
        average_length = round(metadata_dict['num_word_tokens_in_stage_directions'] /
                               num_stage_directions, 3)
    else:
        # A play without stage directions has no average length; report 0 instead of
        # failing with a division by zero.
        average_length = 0.0
    metadata_dict['average_length_of_stage_direction'] = average_length
    metadata_dict['num_verse_splitting_stage_directions'] = estimate_verse_line_splitting_stage_directions(play_string)
    metadata_dict['degree_of_verse_prose_interaction'] = round((metadata_dict['num_verse_splitting_stage_directions'] /
                                                                num_verse_lines) * 100, 3)

    return metadata_dict

In [40]:
def additional_metadata(play_string, play_data, old_ortho_flag):
    """
    The function runs every feature-extraction step over the play and collects the results.
    Params:
        play_string - string with the text of the play.
        play_data - dictionary with the play summary, handed to every step.
        old_ortho_flag - bool, indicating whether the text is in the old Russian orthography.
    Returns:
        features - a dictionary with the features added by all steps.
    """
    feature_steps = (process_speakers_features,
                     process_features_verse,
                     process_stage_directions_features)
    features = {}
    # each step receives the dictionary produced by the previous one
    for step in feature_steps:
        features = step(play_string, play_data, features, old_ortho_flag)

    return features

In [41]:
def add_play_info(metadata):
    """
    The function starts the play summary with the basic information about the play.
    Params:
        metadata - a two-dimensional sequence, its first row holds the title, two name
                   parts of the author, and the creation date, in this order.
    Returns:
        a dictionary with the keys 'title', 'author', and 'creation_date'.
    """
    first_row = metadata[0]
    return {
        'title': first_row[0],
        # the two name parts are joined with a comma in the order they are stored
        'author': first_row[1] + ', ' + first_row[2],
        'creation_date': first_row[3],
    }

In [42]:
def process_play(file_name, metadata_df, regex_pattern):
    """
    The function parses a txt file and creates a summary with features and metadata for the play.
    Params:
        file_name - a string, path to the file with the play text. The file name without
                    the directory and the extension has to match a value of the 'index'
                    column in metadata_df.
        metadata_df - a dataframe containing the info about the play.
        regex_pattern - a string with the regular expression that is handed to parse_play.
    Returns:
        play_data - a dictionary with detailed play summary by scenes, metadata, and features
    """
    print(file_name)
    # take the index from the file name itself so that any input directory works
    play_index = os.path.splitext(os.path.basename(file_name))[0]
    play_rows = metadata_df[metadata_df['index'] == play_index]
    play_meta = play_rows[['title', 'last_name', 'first_name', 'creation_date']].values
    # explicit encoding: the texts are Cyrillic and must not depend on the platform default;
    # the context manager closes the file handle
    with open(file_name, 'r', encoding='utf-8') as play_file:
        comedy = play_file.read()
    number_acts = int(play_rows['num_acts'].values[0])

    # the capital letter yat is used to decide whether the text is in the old orthography
    old_ortho_flag = 'Ѣ' in comedy
    # split the text into the part with the cast names and the play itself
    cast_text, play_text = split_text(comedy, old_ortho_flag)

    character_cast_dictionary = identify_character_cast(cast_text)
    reverse_character_cast = make_reverse_dictionary(character_cast_dictionary)
    play_data = add_play_info(play_meta)
    play_data['characters'] = character_cast_dictionary.copy()
    play_data['play_summary'] = parse_play(play_text, regex_pattern,
                                          number_acts, old_ortho_flag,
                                          character_cast_dictionary,
                                          reverse_character_cast)
    play_data['metadata'] = additional_metadata(play_text, play_data, old_ortho_flag)

    return play_data

In [43]:
def process_all_plays(input_directory, output_path, regex_pattern, metadata_path='Russian_Comedies.txt'):
    """
    The function allows to process all files in a specified directory.
    Params:
        input_directory - the path to the folder containing the txt files
        output_path - directory in which the json summaries will be saved.
        regex_pattern - a string with the regular expression that is handed to process_play.
        metadata_path - the path to the tab-separated file with the info about the plays.
    Returns:
        no returns, the files will be saved in output_path directory.
    """
    # only names ending in .txt are plays; sorting keeps the processing order stable between runs
    all_files = sorted(f for f in listdir(input_directory) if f.endswith('.txt'))
    metadata_df = pd.read_csv(metadata_path, sep='\t')
    for file in all_files:
        # os.path.join works whether or not the directory is given with a trailing separator
        play_data_dict = process_play(os.path.join(input_directory, file), metadata_df, regex_pattern)
        json_name = os.path.join(output_path, os.path.splitext(file)[0] + '.json')
        # ensure_ascii=False writes the Cyrillic text as is, so the encoding has to be explicit
        with open(json_name, 'w', encoding='utf-8') as fp:
            json.dump(play_data_dict, fp, ensure_ascii=False, indent=2)

In [44]:
# Raw string: `\w` has to reach the regex engine as written, in a plain string literal it is
# an invalid escape sequence. The value of the pattern is unchanged.
# NOTE(review): inside a character class `+` is a literal plus sign, so `[А-Я+Ѣ+І]` also
# matches '+'; confirm whether that is intended.
regex_pattern = r'[А-Я+Ѣ+І]+.\w[А-Я+Ѣ+І]+.\w[А-Я+Ѣ+І]+\w[А-Я+Ѣ+І]|[А-Я+Ѣ+І]+.\w[А-Я+Ѣ+І]+ [А-Я+Ѣ+І] |[А-Я+Ѣ+І]+.\w[А-Я+Ѣ+І]+.\w[А-Я+Ѣ+І]+|[А-Я+Ѣ+І]+.\w[А-Я+Ѣ+І]+|[А-Я+Ѣ+І]+.\w[А-Я+Ѣ+І]+ [А-Я+Ѣ+І]'
# process every play in Txt_files/ and save the json summaries to Play_Jsons/
process_all_plays('Txt_files/', 'Play_Jsons/', regex_pattern)

Txt_files/R_21.txt
Txt_files/R_20.txt
Txt_files/R_23.txt
Txt_files/R_26.txt
Txt_files/R_24.txt
Txt_files/R_19.txt
Txt_files/R_25.txt
Txt_files/R_8.txt
Txt_files/R_9.txt
Txt_files/R_7.txt
Txt_files/R_4.txt
Txt_files/R_5.txt
Txt_files/R_14.txt
Txt_files/R_15.txt
Txt_files/R_12.txt
Txt_files/R_11.txt


# Scratch check of one alternative of the name pattern against a sample cast line.
pattern = r'[А-Я+Ѣ+І]+.\w[А-Я+Ѣ+І]+ [А-Я+Ѣ+І] '
test_str = 'КАМЕРДИНЕРЪ К. ВРОНСКОГО.'
# the pattern has no groups, so findall returns the full text of every match
re.findall(pattern, test_str)

# Scratch cell: write a copy of R_7 in which the lower-case yat in one name is replaced
# by the capital one.
# NOTE(review): the copy is saved into Txt_files/, the directory that process_all_plays
# is run on above, so it will be picked up as a play on the next full run.
with open('Txt_files/R_7.txt', 'r', encoding='utf-8') as source_file:
    comedy = source_file.read()
comedy = comedy.replace('СОВѣСТДРАЛЪ', 'СОВѢСТДРАЛЪ')
# context managers close both handles even if reading or writing fails
with open("Txt_files/Test.txt", "w", encoding='utf-8') as text_file:
    text_file.write(comedy)

# Scratch cell: load R_11, split off the cast list, and run the estimate of
# verse-splitting stage directions on the full text of the file.
with open('Txt_files/R_11.txt', 'r', encoding='utf-8') as source_file:
    comedy = source_file.read()
cast_text, play_text = split_text(comedy, True)
estimate_verse_line_splitting_stage_directions(comedy)

# Scratch cell: walks a single scene through status detection, cast splitting,
# and utterance counting.
# NOTE(review): `test` is assigned in a cell further down, that cell has to be run first.
sc_num = 0
complete_scene_info = {}
scene_names = []
extra_scene_number = 1
statuses = []
scene = test
# raw string: `\w` is a regex token, in a plain string literal it is an invalid escape sequence
name_pattern = r'[А-Я]+.\w[А-Я]+.\w[А-Я]+.\w[А-Я]+|[А-Я]+.\w[А-Я]+.\w[А-Я]+|[А-Я]+.\w[А-Я]+'
# add this logic to account for issues caused by extra scenes tracking
sc_num = int(float(sc_num))
scene_status = get_scene_status(scene)
if scene_status == 'extra':
    # an extra scene keeps the number of the current scene and gets a sub-number
    sc_num = str(sc_num) + '.' + str(extra_scene_number)
    extra_scene_number += 1
else:
    sc_num += 1
    extra_scene_number = 1
# split a scene string into two substrings, one with cast, the other - without
scene_cast, scene_itself = split_a_scene(scene)
# check if the cast for the new scene is different from the previous scene
if scene_names:
    no_change = compare_two_scenes(scene_cast, complete_scene_info[scene_names[-1]])
    if no_change:
        scene_status = no_change
# check to make sure all character names are in scene cast as they appear in the play cast
quality_check_cast(scene_cast)
scene_names.append(str(sc_num) + '_' + str(scene_status))
utterances = [name.group()
              for name in re.finditer(name_pattern, scene_itself)]
print(utterances)
scene_summary = count_utterances(scene_cast, utterances)
# the total is taken before the two summary keys are added to the dictionary
scene_summary['num_utterances'] = sum(scene_summary.values())
scene_summary['num_speakers'] = count_speaking_characters(scene_summary, scene_cast)
complete_scene_info[str(sc_num) + '_' + str(scene_status)] = scene_summary

# Hand-made scene used as input by the scratch cells: a scene heading, a cast line
# with <stage> and <cast ...> markup, and a few utterances with stage directions.
test = """
ЯВЛЕНІЕ ПОСЛѢДНЕЕ.
<stage>ДОБРОНЪ, ЛЮБИМА, СВОЕНРАВЪ, ЧЕСТИНА, ЧЕСТИНЪ,И3ЛѢТЪ, ВИРШИНЪ, ВѢТРАНА, СКОРОХВАТЪ, СВОЕНРАВЪ</stage>. <cast ЛЮБИМА, СВОЕНРАВЪ, ЧЕСТИНА, ЧЕСТИНЪ, И3ЛѢТЪ, ВИРШИНЪ, ВѢТРАНА, СКОРОХВАТЪ>
ВотЪ истинной чудакЪ!
ЛЮБИМА <stage>(Своенраву)</stage>
Не ужели ты могЪ . . . . <end_verse_line>  <stage>(СвоенравЪ цЪлуетЪ ее руку)</stage>, 
ЧЕСТИНЪ <stage>(Своенраву)</stage>. 
Мой другЪ, не будъ кѣ такимЪ повѢсамЪ
"""

# Look for the name spelled with the Cyrillic letter З in the fixture; str.find gives -1
# when the spelling is absent.
# NOTE(review): the fixture appears to spell this name with the digit 3 - confirm.
test.find('ИЗЛѢТЪ')

# The two spellings differ in the case of the letter yat (ѣ vs Ѣ); the cell shows
# whether Python treats them as the same string.
'СОВѣСТДРАЛЪ' == 'СОВѢСТДРАЛЪ'

# Scratch cell: run the pipeline on a single play (R_8).
# Raw string: `\w` has to reach the regex engine as written, in a plain string literal it is
# an invalid escape sequence. The value of the pattern is unchanged.
regex_pattern = r'[А-Я+Ѣ+І]+.\w[А-Я+Ѣ+І]+.\w[А-Я+Ѣ+І]+\w[А-Я+Ѣ+І]|[А-Я+Ѣ+І]+.\w[А-Я+Ѣ+І]+ [А-Я+Ѣ+І] |[А-Я+Ѣ+І]+.\w[А-Я+Ѣ+І]+.\w[А-Я+Ѣ+І]+|[А-Я+Ѣ+І]+.\w[А-Я+Ѣ+І]+|[А-Я+Ѣ+І]+.\w[А-Я+Ѣ+І]+ [А-Я+Ѣ+І]'
process_play('Txt_files/R_8.txt', pd.read_csv('Russian_Comedies.txt', sep='\t'), regex_pattern)