## Appendix A: Validation Python Notebook

Input:
* /data/validation.csv
* /data/rater-icc-data.csv

Depends on:
* /data/validation-final.rds
* data_context @ targets

In [1]:
%%javascript
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

<IPython.core.display.Javascript object>

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix, accuracy_score, cohen_kappa_score
from sklearn.metrics import precision_recall_fscore_support, f1_score
from pingouin import intraclass_corr
import random
import itertools
import datetime

### Read in data 

In [3]:
df = pd.read_csv("data/validation.csv", encoding="ISO-8859-1")
print("Sample Size: " + str(df.shape[0]))

Sample Size: 300


### Base rates in human-coded tweets

In [4]:
print(df.binary.value_counts())
print(df.trinary.value_counts())

1    265
0     35
Name: binary, dtype: int64
 0    150
 1    115
-1     35
Name: trinary, dtype: int64


In [5]:
raters = pd.read_csv("data/rater-icc-data.csv", encoding="ISO-8859-1")

In [6]:
raters


Unnamed: 0.1,Unnamed: 0,r1_pos,r2_pos,r1_neg,r2_neg
0,1,1,1,1,1
1,2,1,1,1,2
2,3,1,1,1,1
3,4,2,2,1,1
4,5,3,3,1,1
...,...,...,...,...,...
65,66,1,1,2,2
66,67,2,2,1,1
67,68,3,2,1,1
68,69,4,3,1,1


### SentiStrength sentiment strength scales to human coder

In [7]:
# Addition: Evaluate overall scale
df["ss_scale"] = df.ss_pos - df.ss_neg
df["scale"] = df.pos - df.neg

In [8]:
df.pos

0      2
1      1
2      1
3      2
4      2
      ..
295    1
296    2
297    1
298    1
299    1
Name: pos, Length: 300, dtype: int64

In [9]:
def print_kappa(name, x, y): print(name+": "+str(round(cohen_kappa_score(x,y), 3)))
print_kappa("SentiStrength Pos", df.pos, df.ss_pos)
print_kappa("SentiStrength Neg", df.neg, df.ss_neg)
print_kappa("SentiStrength Overall", df.scale, df.ss_scale)

SentiStrength Pos: 0.301
SentiStrength Neg: 0.183
SentiStrength Overall: 0.27


In [10]:
np.corrcoef(df.pos, df.scale)

array([[1.        , 0.87315036],
       [0.87315036, 1.        ]])

In [11]:
def print_icc(a, b):
    df["row"]=list(range(1, df.shape[0]+1))
    df_icc_pos=df[["row", a, b]]
    df_icc_pos=pd.melt(df_icc_pos, id_vars=["row"], value_vars=[a, b], 
                       var_name="rater", value_name="res")
    icc=intraclass_corr(data=df_icc_pos, targets='row', raters='rater', 
                        ratings='res').round(3)
    print(icc, end="\n\n")

print_icc("pos", "ss_pos")
print_icc("neg", "ss_neg")
print_icc("scale", "ss_scale")

    Type              Description    ICC      F  df1  df2  pval         CI95%
0   ICC1   Single raters absolute  0.513  3.108  299  300   0.0  [0.42, 0.59]
1   ICC2     Single random raters  0.527  3.519  299  299   0.0   [0.4, 0.63]
2   ICC3      Single fixed raters  0.557  3.519  299  299   0.0  [0.47, 0.63]
3  ICC1k  Average raters absolute  0.678  3.108  299  300   0.0   [0.6, 0.74]
4  ICC2k    Average random raters  0.690  3.519  299  299   0.0  [0.57, 0.77]
5  ICC3k     Average fixed raters  0.716  3.519  299  299   0.0  [0.64, 0.77]

    Type              Description    ICC      F  df1  df2  pval         CI95%
0   ICC1   Single raters absolute  0.287  1.806  299  300   0.0  [0.18, 0.39]
1   ICC2     Single random raters  0.288  1.814  299  299   0.0  [0.18, 0.39]
2   ICC3      Single fixed raters  0.289  1.814  299  299   0.0  [0.18, 0.39]
3  ICC1k  Average raters absolute  0.446  1.806  299  300   0.0  [0.31, 0.56]
4  ICC2k    Average random raters  0.448  1.814  299  299   0.0

### Binary and trinary classfications

In [12]:
def print_eval_binary(name, x, y):
        
    print("*** Classification Evaluation for: " + name + " ***\n")
    cm = pd.DataFrame(
        confusion_matrix(x, y),
        index=['human:neg', 'human:pos'], 
        columns=['pred:neg', 'pred:pos']
    )
    print(cm, "\n")
    
    print('accuracy:\t{}'.format(round(accuracy_score(x, y) * 100, 2)), "\n")    

    precision, recall, fscore, support = precision_recall_fscore_support(x, y, 
                                                                 labels=[1,0])
    
    print('\t\tpos\tneg')
    
    print('precision:\t{}\t{}'.format(*[round(i, 2) for i in precision]))
    print('recall:\t\t{}\t{}'.format(*[round(i, 2) for i in recall]))
    print('fscore:\t\t{}\t{}'.format(*[round(i, 2) for i in fscore]))
    print('support:\t{}\t{}'.format(*[round(i, 2) for i in support]))
    print('')
    
print_eval_binary("SentiStrength", df.binary, df.ss_binary)
print_eval_binary("LIWC", df.binary, df.liwc_binary)
print_eval_binary("Tidytext", df.binary, df.tidytext_binary)
print_eval_binary("VADER", df.binary, df.vader_binary)

*** Classification Evaluation for: SentiStrength ***

           pred:neg  pred:pos
human:neg        14        21
human:pos        24       241 

accuracy:	85.0 

		pos	neg
precision:	0.92	0.37
recall:		0.91	0.4
fscore:		0.91	0.38
support:	265	35

*** Classification Evaluation for: LIWC ***

           pred:neg  pred:pos
human:neg         7        28
human:pos         7       258 

accuracy:	88.33 

		pos	neg
precision:	0.9	0.5
recall:		0.97	0.2
fscore:		0.94	0.29
support:	265	35

*** Classification Evaluation for: Tidytext ***

           pred:neg  pred:pos
human:neg        16        19
human:pos        20       245 

accuracy:	87.0 

		pos	neg
precision:	0.93	0.44
recall:		0.92	0.46
fscore:		0.93	0.45
support:	265	35

*** Classification Evaluation for: VADER ***

           pred:neg  pred:pos
human:neg        14        21
human:pos        14       251 

accuracy:	88.33 

		pos	neg
precision:	0.92	0.5
recall:		0.95	0.4
fscore:		0.93	0.44
support:	265	35



In [13]:
def print_eval_trinary(name, x, y):
        
    print("*** Classification Evaluation for: " + name + " ***\n")
    cm = pd.DataFrame(
        confusion_matrix(x, y),
        index=['human:neg', 'human:neutral', 'human:pos'],
        columns=['pred:neg', 'pred:neutral', 'pred:pos']
    )
    print(cm, "\n")
    
    print('accuracy:\t{}'.format(round(accuracy_score(x, y) * 100, 2)), "\n")    

    precision, recall, fscore, support = precision_recall_fscore_support(x, y, 
                                                              labels=[1,0,-1])

    print('\t\tpos\tneut\tneg')
    print('precision:\t{}\t{}\t{}'.format(*[round(i, 2) for i in precision]))
    print('recall:\t\t{}\t{}\t{}'.format(*[round(i, 2) for i in recall]))
    print('fscore:\t\t{}\t{}\t{}'.format(*[round(i, 2) for i in fscore]))
    print('support:\t{}\t{}\t{}'.format(*[round(i, 2) for i in support]), "\n")

print_eval_trinary("SentiStrength", df.trinary, df.ss_trinary)
print_eval_trinary("LIWC", df.trinary, df.liwc_trinary)
print_eval_trinary("Tidytext", df.trinary, df.tidytext_trinary)
print_eval_trinary("VADER", df.trinary, df.vader_trinary)

*** Classification Evaluation for: SentiStrength ***

               pred:neg  pred:neutral  pred:pos
human:neg            14            15         6
human:neutral        15            95        40
human:pos            10            16        89 

accuracy:	66.0 

		pos	neut	neg
precision:	0.66	0.75	0.36
recall:		0.77	0.63	0.4
fscore:		0.71	0.69	0.38
support:	115	150	35 

*** Classification Evaluation for: LIWC ***

               pred:neg  pred:neutral  pred:pos
human:neg             7            21         7
human:neutral         4           104        42
human:pos             3            22        90 

accuracy:	67.0 

		pos	neut	neg
precision:	0.65	0.71	0.5
recall:		0.78	0.69	0.2
fscore:		0.71	0.7	0.29
support:	115	150	35 

*** Classification Evaluation for: Tidytext ***

               pred:neg  pred:neutral  pred:pos
human:neg            16             2        17
human:neutral        13            50        87
human:pos             7             5       103 

accuracy:	56.33 



### Adding statistical tests to validation results

Reference: 
> Yeh, A. (2000). More accurate tests for the statistical significance of result differences. arXiv preprint cs/0008005.

Null Hypothesis: There is no difference in F-score between two methods

1. Take predictions of the methods
2. Randomly assign them to any of the two measures and observe the difference in F-score
3. Count how often that random difference is larger than the observed difference in F-score

In [14]:
def simulate_fs(df, reference, col1, col2, n, two_sided=True):
    print('Time:', datetime.datetime.now())
    if reference == 'binary':
        labels = [1, 0]
    elif reference == 'trinary':
        labels = [1, 0, -1]
    else:
        print('Please provice binary or trinary as reference.')
        return False
    if not two_sided:
        print('\nOne-sided randomization test of two F-scores, n =', n, '\n')
        print('H0: F-score of', col1, 'is equal or smaller than F-score of', col2)
        print('H1: F-score of', col1, 'is larger than F-score of', col2, end='\n\np: ')
    else:
        print('\nTwo-sided randomization test of two F-scores, n =', n, '\n')
        print('H0: F-score of', col1, 'is equal to F-score of', col2)
        print('H1: F-score of', col1, 'is not equal to F-score of', col2, end='\n\np: ')
    _, _, fscores1, _ = precision_recall_fscore_support(df[reference], df[col1], 
                                                        labels=labels)
    _, _, fscores2, _ = precision_recall_fscore_support(df[reference], df[col2], 
                                                        labels=labels)
    diffs = [a-b for a, b in zip(fscores1, fscores2)]
    diffsdict = dict(zip(labels, diffs))
    simulated = {}
    for label in labels: 
        simulated[label] = []
    for _ in range(n):
        shuffle1 = []
        shuffle2 = []
        for a, b in zip(df[col1], df[col2]):
            if random.randint(0, 1) == 1:
                shuffle1.append(a)
                shuffle2.append(b)
            else:
                shuffle1.append(b)
                shuffle2.append(a) 
        _, _, fscores1, _ = precision_recall_fscore_support(df[reference], shuffle1, 
                                                           labels=labels)
        _, _, fscores2, _ = precision_recall_fscore_support(df[reference], shuffle2, 
                                                           labels=labels)
        diffs = [a-b for a, b in zip(fscores1, fscores2)]
        for label, diff in zip(labels, diffs):
            simulated[label].append(diff)
    p = {}
    for label in labels:
        if not two_sided:
            overdiff = [d for d in simulated[label] if d >= diffsdict[label]]
        else:
            overdiff = [d for d in simulated[label] if abs(d) >= abs(diffsdict[label])]
        p[label] = len(overdiff)/n
    return p


for a, b in itertools.combinations(['ss_binary', 'liwc_binary', 'tidytext_binary', 
                        'vader_binary'], 2):
    print(simulate_fs(df, 'binary', a, b, 250000, two_sided=True), end='\n\n')
    
for a, b in itertools.combinations(['ss_trinary', 'liwc_trinary', 'tidytext_trinary', 
                        'vader_trinary'], 2):
    print(simulate_fs(df, 'trinary', a, b, 250000, two_sided=True), end='\n\n')

Time: 2021-04-22 12:52:57.666772

Two-sided randomization test of two F-scores, n = 250000 

H0: F-score of ss_binary is equal to F-score of liwc_binary
H1: F-score of ss_binary is not equal to F-score of liwc_binary

p: {1: 0.035936, 0: 0.225316}

Time: 2021-04-22 12:59:16.226497

Two-sided randomization test of two F-scores, n = 250000 

H0: F-score of ss_binary is equal to F-score of tidytext_binary
H1: F-score of ss_binary is not equal to F-score of tidytext_binary

p: {1: 0.3903, 0: 0.366376}

Time: 2021-04-22 13:05:50.533669

Two-sided randomization test of two F-scores, n = 250000 

H0: F-score of ss_binary is equal to F-score of vader_binary
H1: F-score of ss_binary is not equal to F-score of vader_binary

p: {1: 0.069604, 0: 0.340444}

Time: 2021-04-22 13:12:31.594536

Two-sided randomization test of two F-scores, n = 250000 

H0: F-score of liwc_binary is equal to F-score of tidytext_binary
H1: F-score of liwc_binary is not equal to F-score of tidytext_binary

p: {1: 0.31972,

### System info

In [15]:
from sinfo import sinfo
sinfo()

-----
numpy       1.20.2
pandas      1.2.4
pingouin    0.3.11
sinfo       0.3.1
sklearn     0.24.1
-----
IPython             7.22.0
jupyter_client      6.1.12
jupyter_core        4.7.1
notebook            6.3.0
-----
Python 3.8.5 (default, Jan 27 2021, 15:41:15) [GCC 9.3.0]
Linux-5.8.0-50-generic-x86_64-with-glibc2.29
16 logical CPU cores, x86_64
-----
Session information updated at 2021-04-22 14:12
