# Annotator Agreement

In [91]:
import pandas as pd

anno_file = '../data/review_classification_annotations.tsv.gz'
annos = pd.read_csv(anno_file, sep='\t', compression='gzip')

## Agreement




| A | Yes |	No |
| Yes | a | b |
| No | c | d |

Observed agreement:

![](https://wikimedia.org/api/rest_v1/media/math/render/svg/1757fb7677fea46fca2bf2d0924f91219dfe563a)

Chance of both saying yes:

![](https://wikimedia.org/api/rest_v1/media/math/render/svg/14d9bf3d72f58a76d78189d508fd80ac83e5f94e)

Chance of both saying No:

![](https://wikimedia.org/api/rest_v1/media/math/render/svg/d8fb5669567629ba2010e67b2a16bb1873cf91e4)

Chance of random agreement:

$p_e = p_Y + p_N$

Kappa:

![](https://wikimedia.org/api/rest_v1/media/math/render/svg/1a7a8f738ea187a60443bac93cd53b4d9d6b2231)

In [92]:
annos[['review_id', 'sent_num', 'annotator', 'CAT__Content']].sort_values(['review_id', 'sent_num'])


Unnamed: 0,review_id,sent_num,annotator,CAT__Content
0,impfic-review-10274,1,anno1,0.0
1,impfic-review-10274,1,anno2,0.0
2,impfic-review-10274,1,anno3,0.0
3,impfic-review-10274,2,anno1,1.0
4,impfic-review-10274,2,anno2,1.0
...,...,...,...,...
34897,impfic-review-84086,7,anno2,0.0
34898,impfic-review-84086,7,anno3,0.0
34899,impfic-review-84086,8,anno1,0.0
34900,impfic-review-84086,8,anno2,0.0


### Frequency of occurrence of categories

In [93]:
annos.columns


Index(['review_id', 'sent_id', 'sent_offset', 'sent_end', 'sent_text',
       'annotator', 'sent_num', 'CAT__Author', 'CAT__Content',
       'CAT__Recommendations', 'CAT__Other_works', 'CAT__Classification',
       'CAT__Reader_response', 'CAT__Style', 'CAT__Content--Narrative',
       'CAT__Content--Other', 'CAT__Content--Quote', 'CAT__Content--Theme',
       'CAT__Reader_response--Evaluation_of_quality',
       'CAT__Reader_response--Feelings',
       'CAT__Reader_response--Identification_and_immersion',
       'CAT__Reader_response--Reading_Context',
       'CAT__Reader_response--Reception', 'CAT__Reader_response--Reflection',
       'CAT__Style--Context', 'CAT__Style--Structure',
       'CAT__Style--Stylistic_features', 'CAT__None', 'num_cats'],
      dtype='object')

In [94]:
maincat_fields = [col for col in annos.columns if col.startswith('CAT__') and '--' not in col]
subcat_fields = [col for col in annos.columns if col.startswith('CAT__') and '--' in col]
allcat_fields = maincat_fields + subcat_fields


In [95]:
from mapping import map_cat_string, cat_tuples


In [96]:
annotators = list(annos.annotator.unique())

occur_freq = annos.groupby('annotator')[allcat_fields].sum().T

occur_freq = occur_freq.reset_index().rename(columns={'index': 'category'})

# separate main category and sub-category
occur_freq['main_cat'], occur_freq['sub_cat'] = zip(*occur_freq.category.apply(map_cat_string))

# remove invalid main and sub category combinations
occur_freq = occur_freq[occur_freq.apply(lambda row: (row['main_cat'], row['sub_cat']) in cat_tuples, axis=1)]


occur_freq[occur_freq.sub_cat.isna()][['main_cat'] + annotators]

annotator,main_cat,anno1,anno2,anno3
0,Author,1299.0,964.0,1462.0
1,Content,6067.0,5793.0,6714.0
2,Recommendations,298.0,274.0,164.0
3,Other_works,697.0,564.0,538.0
4,Classification,440.0,324.0,444.0
5,Reader_response,5757.0,5673.0,5545.0
6,Style,1255.0,746.0,1619.0


In [97]:
occur_freq[occur_freq.sub_cat.notna()][['main_cat', 'sub_cat'] + annotators]

annotator,main_cat,sub_cat,anno1,anno2,anno3
8,Content,Narrative,5255.0,4707.0,6042.0
9,Content,Other,285.0,427.0,223.0
10,Content,Quote,328.0,334.0,351.0
11,Content,Theme,234.0,381.0,131.0
12,Reader_response,Evaluation_of_quality,2976.0,2819.0,3850.0
13,Reader_response,Feelings,1162.0,1374.0,463.0
14,Reader_response,Identification_and_immersion,379.0,427.0,303.0
15,Reader_response,Reading_Context,897.0,618.0,542.0
16,Reader_response,Reception,95.0,80.0,90.0
17,Reader_response,Reflection,1185.0,1214.0,1265.0


### Computing Cohen's Kappa

In [98]:
s = annos[['review_id', 'sent_num', 'annotator']].value_counts()
s[s > 1]
annos.groupby(['review_id', 'sent_num', 'annotator']).max().reset_index()

Unnamed: 0,review_id,sent_num,annotator,sent_id,sent_offset,sent_end,sent_text,CAT__Author,CAT__Content,CAT__Recommendations,...,CAT__Reader_response--Feelings,CAT__Reader_response--Identification_and_immersion,CAT__Reader_response--Reading_Context,CAT__Reader_response--Reception,CAT__Reader_response--Reflection,CAT__Style--Context,CAT__Style--Structure,CAT__Style--Stylistic_features,CAT__None,num_cats
0,impfic-review-10274,1,anno1,impfic-review-10274-sent-001,0,78,Jan van Mersbergen (1971) publiceerde sinds 20...,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,impfic-review-10274,1,anno2,impfic-review-10274-sent-001,0,78,Jan van Mersbergen (1971) publiceerde sinds 20...,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,3.0
2,impfic-review-10274,1,anno3,impfic-review-10274-sent-001,0,78,Jan van Mersbergen (1971) publiceerde sinds 20...,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,impfic-review-10274,2,anno1,impfic-review-10274-sent-002,79,218,op 11 november 2011 verscheen 'Naar de overkan...,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
4,impfic-review-10274,2,anno2,impfic-review-10274-sent-002,79,218,op 11 november 2011 verscheen 'Naar de overkan...,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34897,impfic-review-84086,7,anno2,impfic-review-84086-sent-007,545,572,Ik ga nu verder met deel 2!,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
34898,impfic-review-84086,7,anno3,impfic-review-84086-sent-007,545,572,Ik ga nu verder met deel 2!,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
34899,impfic-review-84086,8,anno1,impfic-review-84086-sent-008,573,622,Lees meer op http://readingjournal.princesssam.nl,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34900,impfic-review-84086,8,anno2,impfic-review-84086-sent-008,573,622,Lees meer op http://readingjournal.princesssam.nl,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [99]:
from itertools import combinations

import numpy as np


cat_cols = [col for col in annos.columns if col.startswith('CAT__')]

cat = 'CAT__Author--None'


rows = []
for cat in cat_cols:
    cross_tabs = []
    cat_annotator = annos.pivot(index=('review_id', 'sent_num'), columns='annotator', values=cat)
    for anno1, anno2 in combinations(annotators, 2):
        cross = pd.crosstab(index=cat_annotator[anno1], columns=cat_annotator[anno2])
        cross_tabs.append(cross)
    cross_tabs = merge_cross_tabs(cross_tabs)

cat_annotator

#pd.concat({'Foo': cross_list[0]}, names=['Firstlevel'])
cross_list[0]

anno2,0,1
anno1,Unnamed: 1_level_1,Unnamed: 2_level_1
0,11114,103
1,461,535


In [100]:
def merge_cross_tabs(cross_tabs):
    cross_merge = {0: {0: 0, 1: 0}, 1: {0: 0, 1: 0}}
    for ct in cross_tabs:
        ct_dict = ct.to_dict()
        for row in ct_dict:
            for col in ct_dict[row]:
                cross_merge[row][col] += ct_dict[row][col]
    return pd.DataFrame(cross_merge)

merge_cross_tabs(cross_list)

Unnamed: 0,0,1
0,32180,1829
1,671,1959


## Agreement across all reviewers

In [101]:
cat_cols

['CAT__Author',
 'CAT__Content',
 'CAT__Recommendations',
 'CAT__Other_works',
 'CAT__Classification',
 'CAT__Reader_response',
 'CAT__Style',
 'CAT__Content--Narrative',
 'CAT__Content--Other',
 'CAT__Content--Quote',
 'CAT__Content--Theme',
 'CAT__Reader_response--Evaluation_of_quality',
 'CAT__Reader_response--Feelings',
 'CAT__Reader_response--Identification_and_immersion',
 'CAT__Reader_response--Reading_Context',
 'CAT__Reader_response--Reception',
 'CAT__Reader_response--Reflection',
 'CAT__Style--Context',
 'CAT__Style--Structure',
 'CAT__Style--Stylistic_features',
 'CAT__None']

In [102]:
annos

Unnamed: 0,review_id,sent_id,sent_offset,sent_end,sent_text,annotator,sent_num,CAT__Author,CAT__Content,CAT__Recommendations,...,CAT__Reader_response--Feelings,CAT__Reader_response--Identification_and_immersion,CAT__Reader_response--Reading_Context,CAT__Reader_response--Reception,CAT__Reader_response--Reflection,CAT__Style--Context,CAT__Style--Structure,CAT__Style--Stylistic_features,CAT__None,num_cats
0,impfic-review-10274,impfic-review-10274-sent-001,0,78,Jan van Mersbergen (1971) publiceerde sinds 20...,anno1,1,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,impfic-review-10274,impfic-review-10274-sent-001,0,78,Jan van Mersbergen (1971) publiceerde sinds 20...,anno2,1,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,3.0
2,impfic-review-10274,impfic-review-10274-sent-001,0,78,Jan van Mersbergen (1971) publiceerde sinds 20...,anno3,1,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,impfic-review-10274,impfic-review-10274-sent-002,79,218,op 11 november 2011 verscheen 'Naar de overkan...,anno1,2,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
4,impfic-review-10274,impfic-review-10274-sent-002,79,218,op 11 november 2011 verscheen 'Naar de overkan...,anno2,2,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34897,impfic-review-84086,impfic-review-84086-sent-007,545,572,Ik ga nu verder met deel 2!,anno2,7,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
34898,impfic-review-84086,impfic-review-84086-sent-007,545,572,Ik ga nu verder met deel 2!,anno3,7,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
34899,impfic-review-84086,impfic-review-84086-sent-008,573,622,Lees meer op http://readingjournal.princesssam.nl,anno1,8,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34900,impfic-review-84086,impfic-review-84086-sent-008,573,622,Lees meer op http://readingjournal.princesssam.nl,anno2,8,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [103]:
from itertools import combinations

import numpy as np


cat_cols = [col for col in annos.columns if col.startswith('CAT__')]

cat = 'CAT__Author--None'


rows = []
for cat in cat_cols:
    cat_annotator = annos.pivot(index=('review_id', 'sent_num'), columns='annotator', values=cat)
    cross_tabs = []
    for anno1, anno2 in combinations(annotators, 2):
        cross = pd.crosstab(index=cat_annotator[anno1], columns=cat_annotator[anno2])
        if 1 not in cross.columns:
            cross[1] = 0
            #cross.loc[1] = [0, 0]
        if 1 not in cross.index:
            cross.loc[1] = [0, 0]
        cross_tabs.append(cross)
    cross = merge_cross_tabs(cross_tabs)
    
    a = cross.loc[0][0]
    b = cross.loc[0][1]
    c = cross.loc[1][0]
    d = cross.loc[1][1]
    p_o = (a + d) / (a + b + c + d)
    p_Y = ((a + b) / (a + b + c + d)) * ((a + c) / (a + b + c + d))
    p_N = ((c + d) / (a + b + c + d)) * ((b + d) / (a + b + c + d))
    p_e = p_Y + p_N
    if p_e == 1.0:
        kappa = np.nan
    else:
        kappa = (p_o - p_e) / (1 - p_e)
    rows.append([cat, anno1, anno2, a, b, c, d, kappa, p_o, p_e, p_Y, p_N])

agreement = pd.DataFrame(rows, columns=['cat', 'anno1', 'anno2', 'a', 'b', 'c', 'd', 'kappa', 'p_o', 'p_e', 'p_Y', 'p_N'])
agreement

Unnamed: 0,cat,anno1,anno2,a,b,c,d,kappa,p_o,p_e,p_Y,p_N
0,CAT__Author,anno2,anno3,30364,976,650,2912,0.755724,0.953412,0.809283,0.797914,0.01136893
1,CAT__Content,anno2,anno3,13737,3238,1944,15983,0.70223,0.851527,0.501383,0.218516,0.2828674
2,CAT__Recommendations,anno2,anno3,33973,59,327,543,0.732314,0.98894,0.958685,0.958255,0.0004299472
3,CAT__Other_works,anno2,anno3,32430,514,832,1126,0.60574,0.961435,0.902183,0.899547,0.002636064
4,CAT__Classification,anno2,anno3,33239,459,451,753,0.609841,0.973927,0.933173,0.931976,0.001197922
5,CAT__Reader_response,anno2,anno3,15530,2185,2609,14578,0.725124,0.862644,0.500298,0.263787,0.2365109
6,CAT__Style,anno2,anno3,29934,1712,984,2272,0.585018,0.922755,0.81386,0.803211,0.01064886
7,CAT__Content--Narrative,anno2,anno3,16171,3514,1940,13277,0.685947,0.843734,0.502421,0.292669,0.2097515
8,CAT__Content--Other,anno2,anno3,33557,348,472,525,0.549481,0.976506,0.94785,0.947136,0.0007145108
9,CAT__Content--Quote,anno2,anno3,33770,142,96,894,0.879018,0.993181,0.943636,0.942794,0.0008419655


In [104]:
import re

temp_df = agreement[['cat', 'kappa']]
temp_df['Category'] = temp_df.cat.apply(lambda x: x.replace('CAT__', '').replace('_', ' '))
temp_df = temp_df.sort_values('Category')
temp_df['Category'] = temp_df.Category.apply(lambda x: re.sub(r'.*--', '~~~~', x))
temp_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['Category'] = temp_df.cat.apply(lambda x: x.replace('CAT__', '').replace('_', ' '))


Unnamed: 0,cat,kappa,Category
0,CAT__Author,0.755724,Author
4,CAT__Classification,0.609841,Classification
1,CAT__Content,0.70223,Content
7,CAT__Content--Narrative,0.685947,~~~~Narrative
8,CAT__Content--Other,0.549481,~~~~Other
9,CAT__Content--Quote,0.879018,~~~~Quote
10,CAT__Content--Theme,0.39208,~~~~Theme
20,CAT__None,-0.000213,
3,CAT__Other_works,0.60574,Other works
5,CAT__Reader_response,0.725124,Reader response


In [105]:
table = temp_df[['Category', 'kappa']].to_latex(index=False,
                  #formatters={"name": str.upper},
                  float_format="{:.2f}".format,
)
print(table) 

\begin{tabular}{lr}
\toprule
Category & kappa \\
\midrule
Author & 0.76 \\
Classification & 0.61 \\
Content & 0.70 \\
~~~~Narrative & 0.69 \\
~~~~Other & 0.55 \\
~~~~Quote & 0.88 \\
~~~~Theme & 0.39 \\
None & -0.00 \\
Other works & 0.61 \\
Reader response & 0.73 \\
~~~~Evaluation of quality & 0.64 \\
~~~~Feelings & 0.27 \\
~~~~Identification and immersion & 0.53 \\
~~~~Reading Context & 0.56 \\
~~~~Reception & 0.66 \\
~~~~Reflection & 0.42 \\
Recommendations & 0.73 \\
Style & 0.59 \\
~~~~Context & 0.08 \\
~~~~Structure & 0.33 \\
~~~~Stylistic features & 0.58 \\
\bottomrule
\end{tabular}



In [106]:
table = temp_df[['Category', 'kappa']].to_latex(index=False,
                  #formatters={"name": str.upper},
                  float_format="{:.2f}".format,
)
print(table) 

\begin{tabular}{lr}
\toprule
Category & kappa \\
\midrule
Author & 0.76 \\
Classification & 0.61 \\
Content & 0.70 \\
~~~~Narrative & 0.69 \\
~~~~Other & 0.55 \\
~~~~Quote & 0.88 \\
~~~~Theme & 0.39 \\
None & -0.00 \\
Other works & 0.61 \\
Reader response & 0.73 \\
~~~~Evaluation of quality & 0.64 \\
~~~~Feelings & 0.27 \\
~~~~Identification and immersion & 0.53 \\
~~~~Reading Context & 0.56 \\
~~~~Reception & 0.66 \\
~~~~Reflection & 0.42 \\
Recommendations & 0.73 \\
Style & 0.59 \\
~~~~Context & 0.08 \\
~~~~Structure & 0.33 \\
~~~~Stylistic features & 0.58 \\
\bottomrule
\end{tabular}



## Agreement between pair of reviewers

In [107]:
from itertools import combinations

import numpy as np


cat_cols = [col for col in annos.columns if col.startswith('CAT__')]

cat = 'CAT__Author--None'


rows = []
for cat in cat_cols:
    cat_annotator = annos.pivot(index=('review_id', 'sent_num'), columns='annotator', values=cat)
    for anno1, anno2 in combinations(annotators, 2):
        cross = pd.crosstab(index=cat_annotator[anno1], columns=cat_annotator[anno2])
        if 1 not in cross.columns:
            cross[1] = 0
            #cross.loc[1] = [0, 0]
        if 1 not in cross.index:
            cross.loc[1] = [0, 0]

        a = cross.loc[0][0]
        b = cross.loc[0][1]
        c = cross.loc[1][0]
        d = cross.loc[1][1]
        p_o = (a + d) / (a + b + c + d)
        p_Y = ((a + b) / (a + b + c + d)) * ((a + c) / (a + b + c + d))
        p_N = ((c + d) / (a + b + c + d)) * ((b + d) / (a + b + c + d))
        p_e = p_Y + p_N
        if p_e == 1.0:
            kappa = np.nan
        else:
            kappa = (p_o - p_e) / (1 - p_e)
        rows.append([cat, anno1, anno2, a, b, c, d, kappa, p_o, p_e, p_Y, p_N])

agreement = pd.DataFrame(rows, columns=['cat', 'anno1', 'anno2', 'a', 'b', 'c', 'd', 'kappa', 'p_o', 'p_e', 'p_Y', 'p_N'])
agreement

Unnamed: 0,cat,anno1,anno2,a,b,c,d,kappa,p_o,p_e,p_Y,p_N
0,CAT__Author,anno1,anno2,10204,131,466,833,0.708457,0.948685,0.823988,0.814736,9.251839e-03
1,CAT__Author,anno1,anno3,10064,271,108,1191,0.844322,0.967423,0.790741,0.776710,1.403132e-02
2,CAT__Author,anno2,anno3,10096,574,76,888,0.702342,0.944129,0.812299,0.801886,1.041277e-02
3,CAT__Content,anno1,anno2,4825,742,1016,5051,0.697836,0.848891,0.499911,0.240243,2.596686e-01
4,CAT__Content,anno1,anno3,4470,1097,450,5617,0.732281,0.867028,0.503314,0.202362,3.009520e-01
...,...,...,...,...,...,...,...,...,...,...,...,...
58,CAT__Style--Stylistic_features,anno1,anno3,9973,668,133,860,0.645600,0.931150,0.805728,0.794518,1.121023e-02
59,CAT__Style--Stylistic_features,anno2,anno3,10030,973,76,555,0.473724,0.909833,0.828670,0.821547,7.123519e-03
60,CAT__None,anno1,anno2,11607,0,27,0,0.000000,0.997679,0.997679,0.997679,0.000000e+00
61,CAT__None,anno1,anno3,11605,2,27,0,-0.000320,0.997507,0.997508,0.997508,3.989658e-07


In [108]:
agreement.kappa

0     0.708457
1     0.844322
2     0.702342
3     0.697836
4     0.732281
        ...   
58    0.645600
59    0.473724
60    0.000000
61   -0.000320
62    0.000000
Name: kappa, Length: 63, dtype: float64

In [109]:
# separate main category and sub-category
agreement['main_cat'], agreement['sub_cat'] = zip(*agreement.cat.apply(map_cat_string))

# remove invalid main and sub category combinations
agreement = agreement[agreement.apply(lambda row: (row['main_cat'], row['sub_cat']) in cat_tuples, axis=1)]


In [110]:
agreement

Unnamed: 0,cat,anno1,anno2,a,b,c,d,kappa,p_o,p_e,p_Y,p_N,main_cat,sub_cat
0,CAT__Author,anno1,anno2,10204,131,466,833,0.708457,0.948685,0.823988,0.814736,0.009252,Author,
1,CAT__Author,anno1,anno3,10064,271,108,1191,0.844322,0.967423,0.790741,0.77671,0.014031,Author,
2,CAT__Author,anno2,anno3,10096,574,76,888,0.702342,0.944129,0.812299,0.801886,0.010413,Author,
3,CAT__Content,anno1,anno2,4825,742,1016,5051,0.697836,0.848891,0.499911,0.240243,0.259669,Content,
4,CAT__Content,anno1,anno3,4470,1097,450,5617,0.732281,0.867028,0.503314,0.202362,0.300952,Content,
5,CAT__Content,anno2,anno3,4442,1399,478,5315,0.67753,0.838663,0.499682,0.212322,0.28736,Content,
6,CAT__Recommendations,anno1,anno2,11291,45,69,229,0.795685,0.990201,0.95204,0.951437,0.000603,Recommendations,
7,CAT__Recommendations,anno1,anno3,11332,4,138,160,0.686948,0.987794,0.961011,0.96065,0.000361,Recommendations,
8,CAT__Recommendations,anno2,anno3,11350,10,120,154,0.697868,0.988826,0.963016,0.962684,0.000332,Recommendations,
9,CAT__Other_works,anno1,anno2,10781,156,289,408,0.627122,0.96175,0.89742,0.894515,0.002904,Other_works,


In [111]:
agreement[agreement.sub_cat.isna()].groupby('main_cat').kappa.describe(percentiles=[0.5])

Unnamed: 0_level_0,count,mean,std,min,50%,max
main_cat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Author,3.0,0.751707,0.080265,0.702342,0.708457,0.844322
Classification,3.0,0.611413,0.034051,0.581381,0.60445,0.648408
Content,3.0,0.702549,0.027678,0.67753,0.697836,0.732281
Other_works,3.0,0.602891,0.066621,0.527546,0.627122,0.654005
Reader_response,3.0,0.725112,0.008669,0.715676,0.726937,0.732723
Recommendations,3.0,0.726834,0.059877,0.686948,0.697868,0.795685
Style,3.0,0.58191,0.081224,0.501708,0.579903,0.664119


In [112]:
agreement[agreement.sub_cat.isna()].groupby('main_cat').kappa.describe(percentiles=[0.5])

Unnamed: 0_level_0,count,mean,std,min,50%,max
main_cat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Author,3.0,0.751707,0.080265,0.702342,0.708457,0.844322
Classification,3.0,0.611413,0.034051,0.581381,0.60445,0.648408
Content,3.0,0.702549,0.027678,0.67753,0.697836,0.732281
Other_works,3.0,0.602891,0.066621,0.527546,0.627122,0.654005
Reader_response,3.0,0.725112,0.008669,0.715676,0.726937,0.732723
Recommendations,3.0,0.726834,0.059877,0.686948,0.697868,0.795685
Style,3.0,0.58191,0.081224,0.501708,0.579903,0.664119


In [113]:
temp_df = agreement[agreement.sub_cat.isna()].groupby('main_cat').kappa.describe(percentiles=[0.5])['mean'].reset_index().rename(columns={'main_cat': 'Category', 'mean': 'Kappa'})

table = temp_df[['Category', 'Kappa']].to_latex(index=False,
                  #formatters={"name": str.upper},
                  float_format="{:.2f}".format,
)
print(table) 

\begin{tabular}{lr}
\toprule
Category & Kappa \\
\midrule
Author & 0.75 \\
Classification & 0.61 \\
Content & 0.70 \\
Other_works & 0.60 \\
Reader_response & 0.73 \\
Recommendations & 0.73 \\
Style & 0.58 \\
\bottomrule
\end{tabular}



In [114]:
agreement['Sub-category'] = agreement.sub_cat.apply(lambda x: '~' if x is None else x)#.groupby(['main_cat', 'sub_cat']).kappa.describe(percentiles=[0.5])['mean']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  agreement['Sub-category'] = agreement.sub_cat.apply(lambda x: '~' if x is None else x)#.groupby(['main_cat', 'sub_cat']).kappa.describe(percentiles=[0.5])['mean']


In [115]:
temp_df = agreement.groupby(['main_cat', 'Sub-category']).kappa.describe(percentiles=[0.5])['mean'].reset_index().rename(columns={'main_cat': 'Category', 'mean': 'Kappa'})

table = temp_df[['Category', 'Sub-category', 'Kappa']].to_latex(index=False,
                  #formatters={"name": str.upper},
                  float_format="{:.2f}".format,
)
print(table) 

\begin{tabular}{llr}
\toprule
Category & Sub-category & Kappa \\
\midrule
Author & ~ & 0.75 \\
Classification & ~ & 0.61 \\
Content & Narrative & 0.69 \\
Content & Other & 0.56 \\
Content & Quote & 0.88 \\
Content & Theme & 0.40 \\
Content & ~ & 0.70 \\
Other_works & ~ & 0.60 \\
Reader_response & Evaluation_of_quality & 0.64 \\
Reader_response & Feelings & 0.25 \\
Reader_response & Identification_and_immersion & 0.53 \\
Reader_response & Reading_Context & 0.56 \\
Reader_response & Reception & 0.66 \\
Reader_response & Reflection & 0.42 \\
Reader_response & ~ & 0.73 \\
Recommendations & ~ & 0.73 \\
Style & Context & 0.08 \\
Style & Structure & 0.35 \\
Style & Stylistic_features & 0.58 \\
Style & ~ & 0.58 \\
\bottomrule
\end{tabular}



In [116]:
agreement[agreement.sub_cat.isna()].groupby('main_cat').p_e.describe(percentiles=[0.5])

Unnamed: 0_level_0,count,mean,std,min,50%,max
main_cat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Author,3.0,0.809009,0.016866,0.790741,0.812299,0.823988
Classification,3.0,0.933151,0.005413,0.926903,0.936112,0.936437
Content,3.0,0.500969,0.002034,0.499682,0.499911,0.503314
Other_works,3.0,0.902189,0.006631,0.89742,0.899387,0.909761
Reader_response,3.0,0.500316,0.000235,0.500128,0.500241,0.500579
Recommendations,3.0,0.958689,0.005845,0.95204,0.961011,0.963016
Style,3.0,0.81313,0.029451,0.782989,0.814563,0.841838


In [26]:
agreement[agreement.sub_cat.notna()].groupby(['main_cat', 'sub_cat']).kappa.describe(percentiles=[0.5])

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,50%,max
main_cat,sub_cat,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Content,Narrative,3.0,0.667113,0.02589,0.637246,0.68091,0.683182
Content,Other,3.0,0.554215,0.102233,0.457098,0.544654,0.660893
Content,Quote,3.0,0.861204,0.05425,0.823273,0.836996,0.923343
Content,Theme,3.0,0.394403,0.07108,0.31811,0.406341,0.458759
Reader_response,Evaluation_of_quality,3.0,0.632382,0.009416,0.62458,0.629724,0.64284
Reader_response,Feelings,3.0,0.253741,0.133732,0.145539,0.212429,0.403254
Reader_response,Identification_and_immersion,3.0,0.524273,0.052122,0.482809,0.507226,0.582784
Reader_response,Reading_Context,3.0,0.557816,0.024573,0.538892,0.548967,0.585588
Reader_response,Reception,3.0,0.653089,0.017593,0.637421,0.649725,0.672121
Reader_response,Reflection,3.0,0.408849,0.041183,0.37415,0.398038,0.454359


In [27]:
agreement[agreement.main_cat == 'Author']

Unnamed: 0,cat,anno1,anno2,a,b,c,d,kappa,p_o,p_e,p_Y,p_N,main_cat,sub_cat
0,CAT__Author,anno1,anno2,10767,140,472,834,0.704589,0.949889,0.83037,0.821842,0.008528,Author,
1,CAT__Author,anno1,anno3,10596,311,108,1198,0.83188,0.965692,0.795933,0.78272,0.013213,Author,
2,CAT__Author,anno2,anno3,10628,611,76,898,0.69362,0.943748,0.8164,0.806546,0.009854,Author,


In [28]:
agreement[agreement.main_cat == 'Classification']

Unnamed: 0,cat,anno1,anno2,a,b,c,d,kappa,p_o,p_e,p_Y,p_N,main_cat,sub_cat
12,CAT__Classification,anno1,anno2,11690,82,189,252,0.63909,0.977811,0.938518,0.937531,0.000988,Classification,
13,CAT__Classification,anno1,anno3,11568,204,177,264,0.564672,0.968804,0.928338,0.926955,0.001384,Classification,
14,CAT__Classification,anno2,anno3,11655,224,90,244,0.595571,0.97429,0.936428,0.93538,0.001048,Classification,


In [29]:
agreement[agreement.main_cat == 'Content']

Unnamed: 0,cat,anno1,anno2,a,b,c,d,kappa,p_o,p_e,p_Y,p_N,main_cat,sub_cat
3,CAT__Content,anno1,anno2,5248,861,1046,5058,0.687706,0.843855,0.500006,0.257782,0.242225,Content,
4,CAT__Content,anno1,anno3,4704,1405,451,5653,0.696081,0.848031,0.499968,0.211132,0.288836,Content,
5,CAT__Content,anno2,anno3,4676,1618,479,5440,0.658231,0.828298,0.497608,0.217526,0.280082,Content,
30,CAT__Content--Narrative,anno1,anno2,6230,700,1179,4104,0.683182,0.846148,0.514382,0.34423,0.170153,Content,Narrative
31,CAT__Content--Narrative,anno1,anno3,5417,1513,446,4837,0.68091,0.839597,0.497311,0.272401,0.22491,Content,Narrative
32,CAT__Content--Narrative,anno2,anno3,5519,1890,344,4460,0.637246,0.81708,0.495747,0.291229,0.204518,Content,Narrative
33,CAT__Content--Other,anno1,anno2,11698,230,86,199,0.544654,0.974126,0.943177,0.942357,0.00082,Content,Other
34,CAT__Content--Other,anno1,anno3,11870,58,113,172,0.660893,0.985999,0.958711,0.958271,0.000439,Content,Other
35,CAT__Content--Other,anno2,anno3,11709,75,274,155,0.457098,0.971424,0.947364,0.946703,0.000662,Content,Other
36,CAT__Content--Quote,anno1,anno2,11814,71,46,282,0.823273,0.99042,0.945792,0.945016,0.000776,Content,Quote


## Confusion Matrix

There may be an inverse relationship between `Evaluation of quality` and `feelings`. One annotator is much more likely than the other two to use `Evaluation of Quality`, whereas the other two are much more likely to use `feelings`. But it's not clear that this happens for the same sentences, i.e. whether there are many cases where one annotator selects `Evaluation of Quality` for sentences where the other two select `feelings`.

In [30]:
annos.sent_id

0        impfic-review-10274-sent-001
1        impfic-review-10274-sent-001
2        impfic-review-10274-sent-001
3        impfic-review-10274-sent-002
4        impfic-review-10274-sent-002
                     ...             
36634    impfic-review-85460-sent-023
36635    impfic-review-85460-sent-023
36636    impfic-review-85460-sent-024
36637    impfic-review-85460-sent-024
36638    impfic-review-85460-sent-024
Name: sent_id, Length: 36639, dtype: object

In [31]:
cat1 = 'CAT__Reader_response--Evaluation_of_quality'
cat2 = 'CAT__Reader_response--Feelings'
selected_sents = annos[(annos.annotator == 'anno3') & (annos[cat1] > 0)].sent_id
annos[(annos.annotator != 'anno3') & (annos.sent_id.isin(selected_sents))][cat_cols].sum()
#anno_df.columns

annos[['sent_id', 'annotator', cat1, cat2]].set_index(['sent_id', 'annotator'])#.unstack()
annos.pivot(index='sent_id', columns='annotator', values=[cat1, cat2])

part1 = annos[['sent_id', 'annotator', cat1]].rename(columns={cat1: 'eval'})
part1['annotator'] = part1.annotator.apply(lambda x: f"eval_{x.split('-')[0]}")

part2 = annos[['sent_id', 'annotator', cat2]].rename(columns={cat2: 'feel'})
part2['annotator'] = part2.annotator.apply(lambda x: f"feel_{x.split('-')[0]}")

temp = pd.concat([
part1.set_index(['sent_id', 'annotator']).unstack(),
part2.set_index(['sent_id', 'annotator']).unstack()
], axis=1)

temp = temp.droplevel(0, axis=1)

col_order = [
    'eval_anno1', 'eval_anno2', 
    'feel_anno1', 'feel_anno2',
    'eval_anno3', 
    'feel_anno3'
]
temp[col_order].value_counts().sort_index()

eval_anno1  eval_anno2  feel_anno1  feel_anno2  eval_anno3  feel_anno3
0           0           0           0           0           0             6940
                                                            1               13
                                                1           0              496
                                                            1               18
                                    1           0           0              248
                                                            1               10
                                                1           0              110
                                                            1                7
                        1           0           0           0              174
                                                            1               18
                                                1           0               79
                                                            