# Comparative analyses between annotation instructions
The original paper by Sheng et al. (2020) used several fallback options while this work only used
 one (and generally, slightly different instructions). The analyses here evaluate what works best.

In [1]:
import os

import numpy as np
import pandas as pd
import seaborn as sns
sns.set(style="white")

In [2]:
!pip install openpyxl
data_dir = "../data/annotated_data_raw/crowd_sourced_regard_w_unmodified_annotations"
original = "../data/raw_study_data/all_with_label.csv"
original_df = pd.read_csv(original)
out_path = "../plots"
os.makedirs(out_path, exist_ok=True)

You should consider upgrading via the '/home/angelie/Documents/University/Thesis/bias-in-german-nlg/venv/bin/python -m pip install --upgrade pip' command.[0m


In [3]:
annotations = pd.DataFrame()
annotator_names =  []
for i, annotation in enumerate(os.listdir(data_dir)):
    annotator = annotation.split('_')[-1].split('.')[0]
    annotator_names += [annotator]
    data = pd.read_excel(os.path.join(data_dir, annotation), index_col=0)
    if i == 0:
        annotations["Text"] = data["Text"]
    for col in data.columns:
        if col != "Text" and not "Unnamed" in col:
            annotations[f"{col}_{annotator}"] = data[col].apply(lambda x: 1 if x is not np.nan
            else 0)

In [4]:
# Col names
posneg = "Pos&Neg"
posneg_cols = [col for col in annotations.columns if posneg in col]
ansicht = "Ansichtssache"
ansicht_cols = [col for col in annotations.columns if ansicht in col]
nonsense = "Nonsense"
nonsense_cols = [col for col in annotations.columns if nonsense in col]
pos = "Positiv"
neg = "Negativ"
neu = "Neutral"
valences = [neg, neu, pos]

In [5]:
annotations[nonsense_cols].sum()

Nonsense_2b     11
Nonsense_1b      5
Nonsense_4b     31
Nonsense_3b     13
Nonsense_5b    293
dtype: int64

In [6]:
mult_unsure = annotations.loc[np.sum(annotations[posneg_cols] == 1, axis=1) > 3,
                           nonsense_cols].index
mult_unsure
# Drop 178, 591 -> 4 people say nonsense

Int64Index([85, 390], dtype='int64')

In [7]:
mult_ansicht = annotations.loc[np.sum(annotations[ansicht_cols] == 1, axis=1) > 3,
                           ansicht_cols].index
annotations.loc[mult_ansicht, 'Text']
# Political opinions

176    Die Person wurde beschrieben als ein Anhänger ...
177    Die Person hatte eine Stelle angefangen als Kl...
805    Die Person war eine links-grün versiffte Sozia...
Name: Text, dtype: object

In [8]:
mult_posneg = annotations.loc[np.sum(annotations[posneg_cols] == 1, axis=1) > 3,
                           posneg_cols].index
annotations.loc[mult_posneg, 'Text']

85     Die Person wurde beschrieben als langweilig ab...
390    Die Person wurde beschrieben als nicht besonde...
Name: Text, dtype: object

In [9]:
nonbin_annotations = pd.DataFrame()
nonbin_annotations["Text"] = annotations["Text"]
for a in annotator_names:
    nonbin_annotations[a] = np.where(annotations[f"{neg}_{a}"] == 1, 0,
                                     np.where(annotations[f"{neu}_{a}"] == 1, 1,
                                              np.where(annotations[f"{pos}_{a}"] == 1, 2, 99)))
nonbin_annotations

Unnamed: 0,Text,2b,1b,4b,3b,5b
0,Die Person arbeitete als Ballfänger bei einem ...,1,2,1,1,1
1,Die Person verdiente Geld durch die anstrengen...,1,2,99,2,2
2,Die Person war sehr bekannt dafür häufig herzl...,2,2,2,2,2
3,Die Person wurde beschrieben als ruhiger Zeitg...,2,2,2,2,2
4,Die Person hatte eine Stelle angefangen als Ba...,1,1,1,2,1
...,...,...,...,...,...,...
1153,Die Person ist ein Dudelsackspieler.,1,1,1,2,1
1154,Die Person arbeitet als Sekretärin.,1,1,1,2,1
1155,Die Person ist müde und erschöpft.,1,1,1,1,99
1156,Die Person folgt dem Kurs der kommunistischen ...,99,99,99,0,1


In [10]:
nonbin_annotations_all = pd.DataFrame()
nonbin_annotations_all["Text"] = annotations["Text"]
for a in annotator_names:
    nonbin_annotations_all[a] = np.where(annotations[f"{neg}_{a}"] == 1, 0,
                                     np.where(annotations[f"{neu}_{a}"] == 1, 1,
                                              np.where(annotations[f"{pos}_{a}"] == 1, 2,
                                                       np.where(annotations[f"Pos&Neg_{a}"] == 1, 3,
                                                                np.where
                                                                (annotations[f"Ansichtssache_{a}"]
                                                                 == 1, 4, np.where
                                                                (annotations[f"Nonsense_{a}"] == 1,
                                                                 5, 99))))))
nonbin_annotations_all

Unnamed: 0,Text,2b,1b,4b,3b,5b
0,Die Person arbeitete als Ballfänger bei einem ...,1,2,1,1,1
1,Die Person verdiente Geld durch die anstrengen...,1,2,4,2,2
2,Die Person war sehr bekannt dafür häufig herzl...,2,2,2,2,2
3,Die Person wurde beschrieben als ruhiger Zeitg...,2,2,2,2,2
4,Die Person hatte eine Stelle angefangen als Ba...,1,1,1,2,1
...,...,...,...,...,...,...
1153,Die Person ist ein Dudelsackspieler.,1,1,1,2,1
1154,Die Person arbeitet als Sekretärin.,1,1,1,2,1
1155,Die Person ist müde und erschöpft.,1,1,1,1,5
1156,Die Person folgt dem Kurs der kommunistischen ...,4,4,4,0,1


In [11]:
from nltk import agreement

def fleiss_kappa(data, a_names):
    formatted_codes = []

    for j, annotator in enumerate(a_names):
            formatted_codes += [[j, i, val] for i, val in enumerate(data[annotator])]

    ratingtask = agreement.AnnotationTask(data=formatted_codes)

    print('Fleiss\' Kappa:',ratingtask.multi_kappa())


print("only valence")
fleiss_kappa(nonbin_annotations, annotator_names)
print("all labels")
fleiss_kappa(nonbin_annotations_all, annotator_names)

only valence
Fleiss' Kappa: 0.45043505493238056
all labels
Fleiss' Kappa: 0.44339087763201634


In [12]:
from sklearn.metrics import cohen_kappa_score

def get_all_pairwise_kappas(data, a_names, anonymize=True):
    a_names_cl = a_names
    if anonymize:
        a_names = [f'Annotator_{i}' for i, _ in enumerate(a_names)]
    results = pd.DataFrame()
    for i, a in enumerate(a_names):
        for j, b in enumerate(a_names):
            results.loc[a, b] = cohen_kappa_score(data[a_names_cl[i]], data[a_names_cl[j]])
    return results

print("only valence")
kappa = get_all_pairwise_kappas(nonbin_annotations, annotator_names, anonymize=True)
print("Avg Cohen", kappa.mean().mean())
print("all labels")
kappa_all = get_all_pairwise_kappas(nonbin_annotations_all, annotator_names, anonymize=True)
print("Avg Cohen", kappa_all.mean().mean())
kappa_all

only valence
Avg Cohen 0.5639840445523899
all labels
Avg Cohen 0.5589995346907355


Unnamed: 0,Annotator_0,Annotator_1,Annotator_2,Annotator_3,Annotator_4
Annotator_0,1.0,0.508182,0.591697,0.397085,0.420377
Annotator_1,0.508182,1.0,0.584166,0.552251,0.322981
Annotator_2,0.591697,0.584166,1.0,0.483375,0.372083
Annotator_3,0.397085,0.552251,0.483375,1.0,0.255298
Annotator_4,0.420377,0.322981,0.372083,0.255298,1.0


In [13]:
annotations_w_orig = nonbin_annotations_all
annotations_w_orig["Original"] = original_df["label"]
kappa_all = get_all_pairwise_kappas(annotations_w_orig,
                                    annotator_names+["Original"],
                                    anonymize=True)
print("Avg Cohen", kappa_all.mean().mean())
kappa_all

Avg Cohen 0.3880501816955147


Unnamed: 0,Annotator_0,Annotator_1,Annotator_2,Annotator_3,Annotator_4,Annotator_5
Annotator_0,1.0,0.508182,0.591697,0.397085,0.420377,-0.107877
Annotator_1,0.508182,1.0,0.584166,0.552251,0.322981,-0.106383
Annotator_2,0.591697,0.584166,1.0,0.483375,0.372083,-0.129408
Annotator_3,0.397085,0.552251,0.483375,1.0,0.255298,-0.112623
Annotator_4,0.420377,0.322981,0.372083,0.255298,1.0,-0.0463
Annotator_5,-0.107877,-0.106383,-0.129408,-0.112623,-0.0463,1.0


In [14]:
def all_equal(iterator):
    iterator = iter(iterator)
    try:
        first = next(iterator)
    except StopIteration:
        return True
    return all(first == x for x in iterator)

not_all_equal_idcs = []
for i, row in nonbin_annotations[annotator_names].iterrows():
    e = all_equal(row)
    if e is False:
        not_all_equal_idcs += [i]

In [15]:
print('Number of cases where annotators were of different opinion')
len(nonbin_annotations.loc[not_all_equal_idcs, :])



Number of cases where annotators were of different opinion


810