## Data Generation For Contrastive Sources

In this notebook, we generate a CSV file with the data for all contrastive material, including plays by William Shakespeare, Carlo Goldoni, and August von Kotzebue.

Our steps include:
1. Load all JSON files;
2. Combine the loaded data with the data from the metadata for these plays;
3. Generate an additional feature, the coefficient of unused dramatic characters, as described in this notebook: https://github.com/innawendell/European_Comedy/blob/master/Analyses/French_Comedy_Other_Features.ipynb;
4. Save the generated data in a CSV file.

In [1]:
import json
from os import listdir
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
def get_data(input_directory):
    """
    Load the play-level features stored in every JSON file of a directory.

    Each file must contain a top-level 'metadata' mapping that includes a
    'speech_distribution' entry. A single-row DataFrame is built per file and
    the rows are stacked in file-name order.

    Params:
        input_directory - str, path to the directory holding the play JSON files.

    Returns:
        features_df - pd.DataFrame with one row per JSON file, one column per
                      'metadata' key, and an 'index' column holding the file
                      name without its '.json' extension.

    Raises:
        ValueError - raised by pd.concat when the directory has no '.json' files.
    """
    suffix = '.json'
    # Select by extension only: a substring test would also pick up names such
    # as 'play.json.bak'. Sort because listdir returns entries in arbitrary order.
    json_files = sorted(f for f in listdir(input_directory) if f.endswith(suffix))
    dfs = []
    for file in json_files:
        with open(input_directory + '/' + file, encoding='utf-8') as json_file:
            data = json.load(json_file)
        metadata = data['metadata']
        metadata['speech_distribution'] = np.array(metadata['speech_distribution'])
        # One value per row, then transpose, so every metadata entry becomes a
        # single cell (the array is stored as one object, not expanded).
        df = pd.DataFrame(list(metadata.values())).T
        df.columns = list(metadata.keys())
        # Strip only the trailing extension to obtain the play identifier.
        df['index'] = file[:-len(suffix)]
        dfs.append(df)

    features_df = pd.concat(dfs, axis=0, sort=False)

    return features_df

In [3]:
def combine_with_meta(input_directory, features_df):
    """
    Attach the computed play features to the play metadata table.

    Params:
        input_directory - str, path to the tab-separated metadata file
                          (despite the name, this is a file path).
        features_df - pd.DataFrame with an 'index' column to join on.

    Returns:
        pd.DataFrame - every metadata row (left join on 'index') with the
                       matching feature columns, ordered by the 'date' column.
    """
    plays_meta = pd.read_csv(input_directory, sep='\t')
    # Left join keeps plays that have no feature row; their features are NaN.
    return (
        plays_meta
        .merge(features_df, how='left', on='index')
        .sort_values(by='date')
    )

In [4]:
def coefficient_unused_dramatic_characters(data):
    """
    Compute the percentage of on-stage character appearances that are silent.

    For every scene in data['play_summary'], a character counts as a
    non-speaker when its entry equals 0 or 'non_speaking'. The scene's
    characters present are its 'num_speakers' plus those non-speakers.
    Totals are accumulated over all acts and scenes.

    Params:
        data - dict with a 'play_summary' mapping of act -> scene -> entries,
               where each scene holds per-character values plus the summary
               keys 'num_utterances', 'num_speakers' and 'perc_non_speakers'.

    Returns:
        float - (total non-speakers / total characters present) * 100, or NaN
                when no characters are present at all (the ratio is undefined).
    """
    # Scene-level summary fields live next to the characters and must not be
    # mistaken for characters (e.g. 'perc_non_speakers' may legitimately be 0).
    summary_keys = ('num_utterances', 'num_speakers', 'perc_non_speakers')
    total_present = 0
    total_non_speakers = 0
    for act in data['play_summary'].values():
        for scene in act.values():
            # identify the raw number of non-speaking dramatic characters
            num_non_speakers = sum(
                1
                for name, status in scene.items()
                if name not in summary_keys
                and (status == 0 or status == 'non_speaking')
            )
            total_non_speakers += num_non_speakers
            # calculate the total number of dramatic characters
            total_present += scene['num_speakers'] + num_non_speakers

    # An empty summary (or scenes with nobody on stage) would divide by zero.
    if total_present == 0:
        return float('nan')

    coefficient_unused = (total_non_speakers / total_present) * 100

    return coefficient_unused

In [5]:
def load_jsons(input_directory):
    """
    Compute the coefficient of unused dramatic characters for every play JSON.

    Params:
        input_directory - str, path to the directory holding the play JSON files.

    Returns:
        features_df - pd.DataFrame with a single 'coefficient_unused' column
                      (rounded to two decimals), one row per JSON file, labelled
                      by the file name without its '.json' extension.

    Raises:
        ValueError - raised by pd.concat when the directory has no '.json' files.
    """
    suffix = '.json'
    # Select by extension only: a substring test would also pick up names such
    # as 'play.json.bak'. Sort because listdir returns entries in arbitrary order.
    json_files = sorted(f for f in listdir(input_directory) if f.endswith(suffix))
    dfs = []
    for file in json_files:
        with open(input_directory + '/' + file, encoding='utf-8') as json_file:
            data = json.load(json_file)
        not_used = coefficient_unused_dramatic_characters(data)
        # Strip only the trailing extension to obtain the play identifier.
        df = pd.DataFrame([not_used], columns=['coefficient_unused'], index=[file[:-len(suffix)]])
        dfs.append(df)

    features_df = pd.concat(dfs, axis=0, sort=False).round(2)

    return features_df

In [6]:
# Read the per-play features from the JSON files, then attach them to the
# metadata table of the contrastive plays.
contrastive_data_df = combine_with_meta(
    input_directory='../Contrastive_Material/Contrastive_material.tsv',
    features_df=get_data(input_directory='../Contrastive_Material/Play_Jsons'),
)

In [7]:
# Preview the first rows of the combined metadata + features table.
contrastive_data_df.head()

Unnamed: 0,index,title,last_name,first_name,date,num_acts,url,num_present_characters,num_scenes_text,num_scenes_iarkho,speech_distribution,percentage_monologues,percentage_duologues,percentage_non_duologues,percentage_above_two_speakers,av_percentage_non_speakers,sigma_iarkho,number_scenes_with_discontinuous_change_characters,percentage_scenes_with_discontinuous_change_characters
0,C_1,The Comedy of Errors,Shakespeare,William,1592,5,https://dracor.org/api/corpora/shake/play/the-...,19,11,53,"[[1, 11], [2, 17], [3, 13], [4, 6], [5, 2], [6...",20.75,32.08,67.92,47.17,22.182,1.823,9,16.981
1,C_3,The Two Gentlemen of Verona,Shakespeare,William,1593,5,https://dracor.org/api/corpora/shake/play/two-...,17,20,71,"[[1, 24], [2, 32], [3, 9], [4, 4], [5, 1], [6,...",33.8,45.07,54.93,21.13,14.272,1.021,19,26.761
2,C_4,Love’s Labor’s Lost,Shakespeare,William,1595,5,https://dracor.org/api/corpora/shake/play/love...,23,9,61,"[[1, 9], [2, 17], [3, 13], [4, 8], [5, 7], [6,...",14.75,27.87,72.13,57.38,32.54,1.89,8,13.115
3,C_5,The Merchant of Venice,Shakespeare,William,1596,5,https://dracor.org/api/corpora/shake/play/the-...,24,20,75,"[[1, 14], [2, 38], [3, 14], [4, 3], [5, 3], [6...",18.67,50.67,49.33,30.67,32.173,1.23,16,21.333
4,C_6,A Midsummer Night's Dream,Shakespeare,William,1596,5,https://dracor.org/api/corpora/shake/play/a-mi...,28,9,87,"[[1, 28], [2, 31], [3, 13], [4, 9], [5, 2], [6...",32.18,35.63,64.37,32.18,42.766,1.321,10,11.494


In [8]:
# Compute the coefficient of unused dramatic characters per play, expose the
# play id (the frame's row label) as an 'index' column, and join it onto the
# combined table.
unused_coefficient = load_jsons(
    input_directory='../Contrastive_Material/Play_Jsons/'
).assign(index=lambda frame: frame.index.tolist())
contrastive_all_data_df = contrastive_data_df.merge(right=unused_coefficient, on='index')

#### Change the Data Type of Selected Columns to Numeric

In [9]:
# Columns that must keep their original (non-numeric) type.
columns_not_integers = [
    'speech_distribution',
    'index',
    'title',
    'last_name',
    'first_name',
    'url',
]
# Every remaining column, in table order, is treated as numeric.
numeric_columns = [
    name
    for name in contrastive_all_data_df.columns
    if name not in columns_not_integers
]

In [10]:
# Cast every numeric column to float in a single astype call.
contrastive_all_data_df = contrastive_all_data_df.astype(
    {name: float for name in numeric_columns}
)

In [11]:
# Inspect the dtype of every column to check the float conversion.
contrastive_all_data_df.dtypes

index                                                      object
title                                                      object
last_name                                                  object
first_name                                                 object
date                                                      float64
num_acts                                                  float64
url                                                        object
num_present_characters                                    float64
num_scenes_text                                           float64
num_scenes_iarkho                                         float64
speech_distribution                                        object
percentage_monologues                                     float64
percentage_duologues                                      float64
percentage_non_duologues                                  float64
percentage_above_two_speakers                             float64
av_percent

#### Check the correlation coefficient between the average percentage of non-speakers and the coefficient of unused dramatic characters.

In [12]:
# Pairwise correlation matrix of the two non-speaker measures
# (DataFrame.corr computes Pearson's r unless `method` is given).
contrastive_all_data_df[['av_percentage_non_speakers', 'coefficient_unused']].corr()

Unnamed: 0,av_percentage_non_speakers,coefficient_unused
av_percentage_non_speakers,1.0,0.98278
coefficient_unused,0.98278,1.0


### Save the Data

In [13]:
# Write the combined table to disk; index=False leaves the row labels out of the file.
# NOTE(review): 'speech_distribution' cells are array objects (built in get_data), so
# they are stored as text in the CSV — confirm downstream readers parse them as intended.
contrastive_all_data_df.to_csv('../Contrastive_Material/Contrastive_Material_Data.csv', index=False)