## Evolution of the French Five-Act Comedy in Verse
Here, we re-run the notebook to get rounded summary statistics from a previous analysis.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from scipy.stats import shapiro

In [2]:
# set the boundaries as we determined based on our analysis of a 10% sample
def determine_period(row):
    """
    Map a year to one of the four hypothesized periods.
    Params:
        row - a single year value (the function is applied to the 'date' column).
    Returns:
        int - 1 for years up to 1650, 2 for 1651-1695, 3 for 1696-1795, 4 otherwise.
    """
    if row <= 1650:
        return 1
    if 1651 <= row <= 1695:
        return 2
    if 1696 <= row <= 1795:
        return 3
    # everything that matched none of the ranges above lands in the last period
    return 4

In [3]:
def run_tests(test, feature):
    """
    Run a statistical test of our choice on each pair of adjacent periods.
    Params:
        test - callable taking two samples, e.g., mannwhitneyu.
        feature - str, name of the feature we want to test on.
    Returns:
        no return, prints the result of each test
    """
    def compare(label, earlier, later):
        # apply the test to the chosen feature of two neighbouring periods and report it
        outcome = test(earlier[feature], later[feature])
        print(label, '\n', feature, outcome)

    # period_one ... period_four are not created in this function; they must
    # already exist in the notebook namespace when run_tests is called
    compare('Period one and two:', period_one, period_two)
    compare('Period two and three:', period_two, period_three)
    compare('Period three and four:', period_three, period_four)

In [4]:
def summary(feature):
    """
    Compute the basic descriptive statistics of a feature.
    Params:
        feature - object exposing mean(), std() and median(), e.g., a pandas Series.
    Returns:
        tuple - (mean, standard deviation, median), in that order.
    """
    return feature.mean(), feature.std(), feature.median()

In [5]:
def make_plot(feature, title):
    """
    Plot a histogram of a feature and mark its mean, median and
    mean +/- one standard deviation with vertical lines.
    Params:
        feature - the values to plot; passed to summary() and to sns.distplot.
        title - str, the title of the figure.
    Returns:
        no return, shows the figure
    """
    mean, std, median = summary(feature)

    plt.figure(figsize=(10, 7))
    plt.title(title, fontsize=17)
    # NOTE(review): distplot is deprecated in recent seaborn releases; histplot is the
    # suggested replacement, but its default binning may differ, so it is left as is
    sns.distplot(feature, kde=False)

    mean_line = plt.axvline(mean, color='black', linestyle='solid', linewidth=2)
    median_line = plt.axvline(median, color='green', linestyle='dashdot', linewidth=2)

    # both dashed lines share one style; only the upper one is kept as the legend handle
    dashed = dict(color='black', linestyle='dashed', linewidth=2)
    std_line = plt.axvline(mean + std, **dashed)
    plt.axvline(mean - std, **dashed)

    plt.legend([mean_line, median_line, std_line],
               ['Mean', 'Median', 'Standard deviation'])
    plt.show()

In [6]:
# read the full data set (the path is relative to the notebook's working directory)
data = pd.read_csv('../French_Comedies/Data/French_Comedies_Data.csv')

In [7]:
# check the dimensions (rows, columns) of the full data set
data.shape

(277, 25)

In [8]:
# read the sample data (the comedies that were set aside for the sample analysis)
sample_df = pd.read_csv('../French_Comedies/Data/French_Comedies_Data_Sample.csv')

In [9]:
# exclude the comedies used for the sample analysis:
# a comedy is dropped when its 'index' value also occurs in the sample
in_sample = data['index'].isin(sample_df['index'])
not_sample = data[~in_sample].copy()

In [10]:
# check how many comedies remain after the sample has been removed
not_sample.shape

(248, 25)

In [11]:
# list the available columns
not_sample.columns

Index(['index', 'title', 'last_name', 'first_name', 'date',
       'translation/adaptation/contrastive', 'num_acts', 'url',
       'num_present_characters', 'num_scenes_text', 'num_scenes_iarkho',
       'speech_distribution', 'percentage_monologues', 'percentage_duologues',
       'percentage_non_duologues', 'percentage_above_two_speakers',
       'av_percentage_non_speakers', 'sigma_iarkho',
       'number_scenes_with_discontinuous_change_characters',
       'percentage_scenes_with_discontinuous_change_characters',
       'total_utterances', 'num_verse_lines', 'dialogue_vivacity',
       'five_year_intervals', 'decades'],
      dtype='object')

In [12]:
# include only five act comedies and only the comedies that are not translations/adaptations
has_five_acts = not_sample['num_acts'] == 5
is_original = not_sample['translation/adaptation/contrastive'] == 0
original_comedies = not_sample[has_five_acts & is_original].copy()

In [13]:
# preview the first rows of the filtered comedies
original_comedies.head()

Unnamed: 0,index,title,last_name,first_name,date,translation/adaptation/contrastive,num_acts,url,num_present_characters,num_scenes_text,...,percentage_above_two_speakers,av_percentage_non_speakers,sigma_iarkho,number_scenes_with_discontinuous_change_characters,percentage_scenes_with_discontinuous_change_characters,total_utterances,num_verse_lines,dialogue_vivacity,five_year_intervals,decades
0,F_3,Mélite ou Les fausses lettres,Corneille,Pierre,1629,0,5,http://www.theatre-classique.fr/pages/document...,8,35,...,23.08,0.513,0.906,12,30.769,483.0,1822.0,0.265,1630,1630
1,F_5,La Veuve ou Le Traître trahi,Corneille,Pierre,1633,0,5,http://www.theatre-classique.fr/pages/document...,12,40,...,20.0,3.519,1.062,12,26.667,521.0,2010.0,0.259,1635,1640
4,F_11,La Galerie du Palais,Corneille,Pierre,1634,0,5,http://www.theatre-classique.fr/pages/document...,12,53,...,24.53,0.0,1.148,10,18.868,517.0,1794.0,0.288,1635,1640
5,F_12,La Suivante,Corneille,Pierre,1634,0,5,http://www.theatre-classique.fr/pages/document...,10,47,...,10.64,0.0,0.753,11,23.404,410.0,1700.0,0.241,1635,1640
6,F_7,Les Vendanges de suresnes,du Ryer,Pierre,1635,0,5,http://www.theatre-classique.fr/pages/document...,11,36,...,43.59,1.923,1.476,13,33.333,589.0,1837.0,0.321,1635,1640


In [14]:
# check how many comedies are left after filtering
original_comedies.shape

(228, 25)

In [15]:
# sort by date (ascending, the sort_values default); the result is a new DataFrame
sorted_comedies = original_comedies.sort_values(by='date')

In [16]:
# create time periods based on our hypothesized periodization:
# determine_period turns each year in 'date' into a period number
# NOTE(review): this column is overwritten further down, after determine_period is redefined
sorted_comedies['period'] = sorted_comedies['date'].apply(determine_period)

In [17]:
# rename column names for clarity (old name -> new name)
clearer_names = {
    'num_scenes_iarkho': 'mobility_coefficient',
    'percentage_non_duologues': 'percentage_non_dialogues',
    'percentage_above_two_speakers': 'percentage_polylogues',
}
sorted_comedies = sorted_comedies.rename(columns=clearer_names)

In [18]:
# define the features we want to analyze
# (the three renamed columns are listed under their new names)
features = ['num_present_characters', 
            'mobility_coefficient',
            'sigma_iarkho',
            'percentage_monologues', 
            'percentage_non_dialogues', 
            'percentage_polylogues']

## Updated Periodization: Three Periods

- Period one: from 1629 to 1695
- Period two: from 1696 to 1795
- Period three: from 1796 to 1849

In [19]:
# update the boundaries as we determined based on our hypothesis testing
def determine_period(row):
    """
    Map a year to one of the three periods of the updated periodization.
    Params:
        row - a single year value (the function is applied to the 'date' column).
    Returns:
        int - 1 for years up to 1695, 2 for 1696-1795, 3 otherwise.
    """
    if row <= 1695:
        return 1
    if 1696 <= row <= 1795:
        return 2
    # everything that matched neither range above lands in the last period
    return 3

In [20]:
# update our periodization accordingly: overwrite 'period' using the
# three-period version of determine_period defined just above
sorted_comedies['period'] = sorted_comedies['date'].apply(determine_period)

## Descriptive Statistics for Each Period

### Number of Dramatic Characters

In [21]:
# per-period mean, std, median (50%), min and max of the number of characters, rounded
(
    sorted_comedies
    .groupby('period')['num_present_characters']
    .describe()[['mean', 'std', '50%', 'min', 'max']]
    .round(2)
)

Unnamed: 0_level_0,mean,std,50%,min,max
period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,11.81,3.62,11.0,7.0,22.0
2,10.64,3.0,10.0,6.0,24.0
3,11.96,4.26,11.0,6.0,28.0


### Mobility Coefficient

In [22]:
# per-period mean, std, median (50%), min and max of the mobility coefficient, rounded
(
    sorted_comedies
    .groupby('period')['mobility_coefficient']
    .describe()[['mean', 'std', '50%', 'min', 'max']]
    .round(2)
)

Unnamed: 0_level_0,mean,std,50%,min,max
period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,41.82,10.75,40.0,19.0,85.0
2,49.56,11.65,49.0,29.0,91.0
3,54.31,9.96,53.5,34.0,86.0


### Standard Range of the Number of Speaking Characters (Sigma)

In [23]:
# per-period mean, std, median (50%), min and max of sigma, rounded
(
    sorted_comedies
    .groupby('period')['sigma_iarkho']
    .describe()[['mean', 'std', '50%', 'min', 'max']]
    .round(2)
)

Unnamed: 0_level_0,mean,std,50%,min,max
period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1.23,0.33,1.19,0.6,2.15
2,1.12,0.3,1.1,0.63,1.98
3,1.33,0.33,1.26,0.84,2.54


### The Percentage of Non-Dialogues

In [24]:
# per-period mean, std, median (50%), min and max of the percentage of non-dialogues, rounded
(
    sorted_comedies
    .groupby('period')['percentage_non_dialogues']
    .describe()[['mean', 'std', '50%', 'min', 'max']]
    .round(2)
)

Unnamed: 0_level_0,mean,std,50%,min,max
period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,56.11,10.0,55.77,33.33,89.66
2,53.5,6.76,53.49,34.0,67.69
3,58.12,5.89,57.92,47.06,80.0


### The Percentage of Polylogues

In [25]:
# per-period mean, std, median (50%), min and max of the percentage of polylogues, rounded
(
    sorted_comedies
    .groupby('period')['percentage_polylogues']
    .describe()[['mean', 'std', '50%', 'min', 'max']]
    .round(2)
)

Unnamed: 0_level_0,mean,std,50%,min,max
period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,42.81,14.36,41.46,10.64,86.21
2,31.37,10.79,31.58,8.7,58.06
3,37.69,8.49,36.83,18.46,65.0


### The Percentage of Monologues

In [26]:
# per-period mean, std, median (50%), min and max of the percentage of monologues, rounded
(
    sorted_comedies
    .groupby('period')['percentage_monologues']
    .describe()[['mean', 'std', '50%', 'min', 'max']]
    .round(2)
)

Unnamed: 0_level_0,mean,std,50%,min,max
period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,13.18,9.39,12.07,0.0,32.43
2,22.08,7.62,22.03,3.23,40.48
3,20.35,5.97,19.58,11.36,43.08
