# Description

In [2]:
# -*- coding: utf-8 -*-
"""This code computes the coefficient of variation (CV) and some other stats for small samples (indicated by the * added to CV) 
for a given set of measurements which are assumed to be for the same or similar object, using the same measurand. 
Stats are adjusted for small sample size. Paper ref: Belz, Popovic & Mille (2022) Quantified Reproducibility Assessment of NLP Results,
ACL'22.

In this self-contained version, the set of measurements on which CV is computed is assigned to the variable set_of_set_of_measurements
(see examples in code below).

The reproducibility stats reported in the output are: 
* the unbiased coefficient of variation
* the sample mean
* the unbiased sample standard deviation with 95% confidence intervals, estimated on the basis of the standard error of the unbiased sample variance
* the sample size
* the percentage of measured values within two standard deviations
* the percentage of measured values within one standard deviation

Example narrative output:

The unbiased coefficient of variation is 1.5616560359100269 \
for a mean of 85.58285714285714 , \
unbiased sample standard deviation of 1.2904233075765223 with 95\% CI (0.4514829817654973, 2.1293636333875474) ,\
and a sample size of 7 . \
100.0 % of measured values fall within two standard deviations. \
71.429 % of measured values fall within one standard deviation. 

NOTE:
* CV assumes all measurements are positive; if they're not, shift measurement scale to start at 0
* for fair comparison across studies, measurements on a scale that doesn't start at 0 need to be shifted to a scale that does start at 0 

KNOWN ISSUES:

none
"""

"This code computes the coefficient of variation (CV) and some other stats for small samples (indicated by the * added to CV) \nfor a given set of measurements which are assumed to be for the same or similar object, using the same measurand. \nStats are adjusted for small sample size. Paper ref: Belz, Popovic & Mille (2022) Quantified Reproducibility Assessment of NLP Results,\nACL'22.\n\nIn this self-contained version, the set of measurements on which CV is computed is assigned to the variable set_of_set_of_measurements\n(see examples in code below).\n\nThe reproducibility stats reported in the output are: \n* the unbiased coefficient of variation\n* the sample mean\n* the unbiased sample standard deviation with 95% confidence intervals, estimated on the basis of the standard error of the unbiassed sample variance\n* the sample size\n* the percentage of measured valued within two standard deviations\n* the percentage of measured valued within one standard deviation\n\nExample narrativ

## Import Statements

In [3]:
import math
import numpy as np
from scipy.stats import t

In [19]:

def calculateType3Results(set_of_set_of_measurements):
    """Print small-sample reproducibility statistics for each set of measurements.

    For every inner collection the function prints the small-sample corrected
    coefficient of variation (CV*), the sample mean, the c_4-corrected
    ("unbiased") sample standard deviation with a 95% confidence interval,
    the sample size, and the percentage of values lying strictly within two
    and within one standard deviation of the mean.

    Parameters
    ----------
    set_of_set_of_measurements : iterable of sized sequences of numbers
        Each inner sequence holds the measurements that are compared with
        each other.

    Returns
    -------
    None
        All results are written to standard output.

    Notes
    -----
    A set with fewer than two measurements, or with a mean that is zero or
    negative, is reported and skipped; the remaining sets are still processed.
    """
    for set_of_measurements in set_of_set_of_measurements:
        sample_size = len(set_of_measurements)
        if sample_size < 2:
            print(set_of_measurements, ": set of measurements is smaller than 2")
            # skip only this set -- the following sets may be perfectly valid
            continue

        sample_mean = np.mean(set_of_measurements)
        if sample_mean <= 0:
            print(set_of_measurements, ": mean is 0 or negative")
            continue

        degrees_of_freedom = sample_size - 1
        deviations = np.asarray(set_of_measurements, dtype=float) - sample_mean
        sum_of_squared_differences = np.sum(np.square(deviations))

        # unbiased sample variance s^2
        unbiassed_sample_variance = sum_of_squared_differences / degrees_of_freedom
        # corrected sample standard deviation s
        corrected_sample_standard_deviation = np.sqrt(unbiassed_sample_variance)
        # c_4(N) = sqrt(2/(N-1)) * Gamma(N/2) / Gamma((N-1)/2)
        c_4_N = (
            math.sqrt(2 / degrees_of_freedom)
            * math.gamma(sample_size / 2)
            / math.gamma(degrees_of_freedom / 2)
        )
        # unbiased sample std dev s/c_4
        unbiassed_sample_std_dev_s_c_4 = corrected_sample_standard_deviation / c_4_N
        # standard error of the unbiased sample variance (assumes normally distributed population)
        standard_error_of_unbiassed_sample_variance = (
            unbiassed_sample_variance * np.sqrt(2 / degrees_of_freedom)
        )

        if unbiassed_sample_std_dev_s_c_4 > 0:
            # estimated std err of std dev based on std err of unbiased sample variance
            est_SE_of_SD_based_on_SE_of_unbiassed_sample_variance = (
                standard_error_of_unbiassed_sample_variance
                / (2 * unbiassed_sample_std_dev_s_c_4)
            )
            ci_lower, ci_upper = t.interval(
                0.95,
                degrees_of_freedom,
                loc=unbiassed_sample_std_dev_s_c_4,
                scale=est_SE_of_SD_based_on_SE_of_unbiassed_sample_variance,
            )
        else:
            # All measurements are identical: the ratio above would be 0/0.
            # The estimated standard error is 0 here, so the interval
            # collapses onto the standard deviation itself.
            ci_lower = ci_upper = unbiassed_sample_std_dev_s_c_4
        # built-in floats keep the printed tuple free of numpy scalar wrappers
        confidence_interval = (float(ci_lower), float(ci_upper))

        # COEFFICIENT OF VARIATION CV
        coefficient_of_variation = (unbiassed_sample_std_dev_s_c_4 / sample_mean) * 100
        # SMALL SAMPLE CORRECTED COEFFICIENT OF VARIATION CV*
        small_sample_coefficient_of_variation = (
            (1 + (1 / (4 * sample_size))) * coefficient_of_variation
        )

        # number of measured values strictly within 2 and 1 standard deviations of the mean
        absolute_deviations = np.abs(deviations)
        count_within_2_sd = int(
            np.count_nonzero(absolute_deviations < 2 * unbiassed_sample_std_dev_s_c_4)
        )
        count_within_1_sd = int(
            np.count_nonzero(absolute_deviations < unbiassed_sample_std_dev_s_c_4)
        )

        print("*************************************************************************")
        print("measurement used -> ", set_of_measurements)
        # report results as described in the notebook description
        print("The unbiased coefficient of variation is", small_sample_coefficient_of_variation)
        print("for a mean of", sample_mean, ", ")
        # "\\%" prints a literal backslash followed by '%' without relying on an invalid escape
        print("unbiased sample standard deviation of", unbiassed_sample_std_dev_s_c_4, ", with 95\\% CI", confidence_interval, ",")
        print("and a sample size of", sample_size, ".")
        print(round(count_within_2_sd / sample_size * 100, 3), "% of measured values fall within two standard deviations.")
        print(round(count_within_1_sd / sample_size * 100, 3), "% of measured values fall within one standard deviation.")



In [25]:

# --- Table 4: number of labels for 5 different labels ---
# One pair of counts per label; the trailing comment on each row names the
# label and gives the CV* recorded for that pair.
set_of_set_of_measurements = [
    [565, 626],  # Positive:       10.2128
    [138, 162],  # Negative:       15.9520
    [70, 144],   # Mixed feelings: 68.9517
    [398, 327],  # Neutral:        19.5275
    [177, 89],   # Non-Malayalam:  65.9672
]

calculateType3Results(set_of_set_of_measurements)


*************************************************************************
measurement used ->  [565, 626]
The unbiased coefficient of variation is 10.212816584247777
for a mean of 595.5 , 
unbiased sample standard deviation of 54.05984245261823 , with 95\% CI (-255.15173740816385, 363.2714223134003) ,
and a sample size of 2 .
100.0 % of measured values fall within two standard deviations.
100.0 % of measured values fall within one standard deviation.
*************************************************************************
measurement used ->  [138, 162]
The unbiased coefficient of variation is 15.952084658149639
for a mean of 150.0 , 
unbiased sample standard deviation of 21.269446210866185 , with 95\% CI (-100.38756881632679, 142.92646123805918) ,
and a sample size of 2 .
100.0 % of measured values fall within two standard deviations.
100.0 % of measured values fall within one standard deviation.
*************************************************************************
measurement us

  est_SE_of_SD_based_on_SE_of_unbiassed_sample_variance = standard_error_of_unbiassed_sample_variance/(2*unbiassed_sample_std_dev_s_c_4)


In [21]:
# --- Table 8: Reimplemented LR on Original Corpus VS Reannotated Corpus as measured by F1-Score
# One pair of F1 scores per row; the trailing comment on each row names the
# row and gives the CV* recorded for that pair.
set_of_set_of_measurements = [
    [0.28, 0.14],  # Mixed feelings: 66.4670
    [0.49, 0.45],  # Negative:       8.4851
    [0.63, 0.61],  # Neutral:        3.2161
    [0.61, 0.45],  # Non-Malayalam:  30.0982
    [0.72, 0.76],  # Positive:       5.3892
    [0.54, 0.48],  # macro avg:      11.7294
    [0.63, 0.60],  # weighted avg:   4.8634
]

calculateType3Results(set_of_set_of_measurements)


*************************************************************************
measurement used ->  [0.28, 0.14]
The unbiased coefficient of variation is 66.46701940895684
for a mean of 0.21000000000000002 , 
unbiased sample standard deviation of 0.12407176956338611 , with 95\% CI (-0.5855941514285727, 0.8337376905553451) ,
and a sample size of 2 .
100.0 % of measured values fall within two standard deviations.
100.0 % of measured values fall within one standard deviation.
*************************************************************************
measurement used ->  [0.49, 0.45]
The unbiased coefficient of variation is 8.48515141390938
for a mean of 0.47 , 
unbiased sample standard deviation of 0.035449077018110293 , with 95\% CI (-0.1673126146938779, 0.23821076873009847) ,
and a sample size of 2 .
100.0 % of measured values fall within two standard deviations.
100.0 % of measured values fall within one standard deviation.
********************************************************************

In [22]:
# --- Table 8: Reimplemented BERT on Original Corpus VS Reannotated Corpus as measured by F1-Score
# One pair of F1 scores per row; the trailing comment on each row names the
# row and gives the CV* recorded for that pair.
set_of_set_of_measurements = [
    [0.43, 0.29],  # Mixed feelings: 38.7724
    [0.59, 0.54],  # Negative:       8.8230
    [0.68, 0.57],  # Neutral:        17.5472
    [0.78, 0.46],  # Non-Malayalam:  51.4583
    [0.78, 0.75],  # Positive:       3.9098
    [0.65, 0.52],  # macro avg:      22.1556
    [0.71, 0.61],  # weighted avg:   15.1061
]

calculateType3Results(set_of_set_of_measurements)


*************************************************************************
measurement used ->  [0.43, 0.29]
The unbiased coefficient of variation is 38.772427988558164
for a mean of 0.36 , 
unbiased sample standard deviation of 0.12407176956338611 , with 95\% CI (-0.5855941514285727, 0.8337376905553451) ,
and a sample size of 2 .
100.0 % of measured values fall within two standard deviations.
100.0 % of measured values fall within one standard deviation.
*************************************************************************
measurement used ->  [0.59, 0.54]
The unbiased coefficient of variation is 8.82305567375532
for a mean of 0.565 , 
unbiased sample standard deviation of 0.04431134627263783 , with 95\% CI (-0.20914076836734713, 0.2977634609126228) ,
and a sample size of 2 .
100.0 % of measured values fall within two standard deviations.
100.0 % of measured values fall within one standard deviation.
*************************************************************************
measurem

In [23]:
# --- Table 8: Original vs Reimplemented LR on Original Corpus as measured by F1-Score
# One pair of F1 scores per row; the trailing comment on each row names the
# row and gives the CV* recorded for that pair.
set_of_set_of_measurements = [
    [0.33, 0.28],  # Mixed feelings: 16.3443
    [0.55, 0.49],  # Negative:       11.5039
    [0.65, 0.63],  # Neutral:        3.1156
    [0.63, 0.61],  # Non-Malayalam:  3.2161
    [0.75, 0.72],  # Positive:       4.0694
    [0.58, 0.54],  # macro avg:      7.1214
    [0.66, 0.63],  # weighted avg:   4.6372
]

calculateType3Results(set_of_set_of_measurements)



*************************************************************************
measurement used ->  [0.33, 0.28]
The unbiased coefficient of variation is 16.34434903498938
for a mean of 0.30500000000000005 , 
unbiased sample standard deviation of 0.044311346272637886 , with 95\% CI (-0.2091407683673473, 0.29776346091262307) ,
and a sample size of 2 .
100.0 % of measured values fall within two standard deviations.
100.0 % of measured values fall within one standard deviation.
*************************************************************************
measurement used ->  [0.55, 0.49]
The unbiased coefficient of variation is 11.503907205396384
for a mean of 0.52 , 
unbiased sample standard deviation of 0.05317361552716551 , with 95\% CI (-0.2509689220408172, 0.3573161530951482) ,
and a sample size of 2 .
100.0 % of measured values fall within two standard deviations.
100.0 % of measured values fall within one standard deviation.
******************************************************************

In [24]:
# --- Table 8: Original vs Reimplemented BERT on Original Corpus as measured by F1-Score
# One pair of F1 scores per row; the trailing comment on each row names the
# row and gives the CV* recorded for that pair.
set_of_set_of_measurements = [
    [0.00, 0.43],  # Mixed feelings: 199.4010
    [0.56, 0.59],  # Negative:       5.2017
    [0.76, 0.68],  # Neutral:        11.0778
    [0.90, 0.78],  # Non-Malayalam:  14.2429
    [0.85, 0.78],  # Positive:       8.5632
    [0.61, 0.65],  # macro avg:      6.3301
    [0.75, 0.71],  # weighted avg:   5.4630
]

calculateType3Results(set_of_set_of_measurements)


*************************************************************************
measurement used ->  [0.0, 0.43]
The unbiased coefficient of variation is 199.40105822687048
for a mean of 0.215 , 
unbiased sample standard deviation of 0.38107757794468583 , with 95\% CI (-1.798610607959188, 2.5607657638485595) ,
and a sample size of 2 .
100.0 % of measured values fall within two standard deviations.
100.0 % of measured values fall within one standard deviation.
*************************************************************************
measurement used ->  [0.56, 0.59]
The unbiased coefficient of variation is 5.201766736353131
for a mean of 0.575 , 
unbiased sample standard deviation of 0.026586807763582663 , with 95\% CI (-0.1254844610204081, 0.17865807654757343) ,
and a sample size of 2 .
100.0 % of measured values fall within two standard deviations.
100.0 % of measured values fall within one standard deviation.
*************************************************************************
measure