In [33]:
import os
import json
import pandas as pd
from glob import glob
import xml.etree.ElementTree as ET
import math
import numpy as np
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk import word_tokenize
from rouge_score import rouge_scorer
from sklearn.metrics import cohen_kappa_score

In [2]:
def clean_sentence(text):
    text = text.replace("&quot;",'"')
    text = text.replace("-",' ')
    return text

In [3]:
def remove_stop_words(sentence):
    tokens = word_tokenize(sentence)
    cleaned_tokens = []
    for token in tokens:
        if token not in stops:
            cleaned_tokens.append(token)
    return " ".join(cleaned_tokens)

In [4]:
stops = set(stopwords.words('english'))

In [5]:
scorer = rouge_scorer.RougeScorer(['rouge1','rouge2'], use_stemmer=True)

In [6]:
base_dir = "/home/xxl190027/scisumm-corpus/data/Test-Set-2018/"
files = glob(base_dir+"*")

In [7]:
full_texts = {}
for file in files:
    paper_name = os.path.split(file)[-1]
    xml_path = file + "/Reference_XML/"+paper_name+".xml"
    tree = ET.parse(xml_path)
    root = tree.getroot()
    sentences = {}
    for node in root.findall("S"):
        sentence = node.text.strip().replace("  "," ")
        sentence = clean_sentence(sentence)
        sid = int(node.attrib["sid"])
        sentences[sid] = {
            "text": sentence,
            "no_stop": remove_stop_words(sentence).lower(),
        }
    for abstract_node in root.findall("ABSTRACT"):
        for node in abstract_node.findall("S"):
            sentence = node.text.strip().replace("  "," ")
            sentence = clean_sentence(sentence)
            sid = int(node.attrib["sid"])
            sentences[sid] = {
                "text": sentence,
                "no_stop": remove_stop_words(sentence).lower(),
            }
    for section_node in root.findall("SECTION"):
        for node in section_node.findall("S"):
            sentence = node.text.strip().replace("  "," ")
            sentence = clean_sentence(sentence)
            sid = int(node.attrib["sid"])
            sentences[sid] = {
                "text": sentence,
                "no_stop": remove_stop_words(sentence).lower(),
            }
    full_texts[paper_name] = sentences

In [22]:
len(full_texts['A00-2018'])

191

In [8]:
base_dir_gold = "/home/xxl190027/scisumm-corpus/data/Test-Set-2018-Gold/Task1/"
gold_files = glob(base_dir_gold+"*")

In [25]:
all_annotations = {}
annotators = set([])
for gold_file in gold_files:
    gold_file_name = os.path.split(gold_file)[-1]
    paper_name, annotator = gold_file_name.split(".")[0].split("_")
    annotators.add(annotator)
    df = pd.read_csv(gold_file)
    this_paper = all_annotations.get(paper_name, {})
    for i in range(len(df)):
        this_citance = this_paper.get(df["Citance Number"][i], {
            "Citing Article": df["Citing Article"][i],
            "Citation Text": df["Citation Text Clean"][i],
            "CTS": {},
        })
        
        #print(this_citance["Citation Text"] , df["Citation Text Clean"][i])
        if this_citance["Citing Article"] != df["Citing Article"][i]:
            continue
            #print(paper_name, annotator, df["Citance Number"][i], this_citance["Citing Article"] , df["Citing Article"][i])
        #assert(this_citance["Citing Article"] == df["Citing Article"][i])
        #assert(this_citance["Citation Text"] == df["Citation Text Clean"][i])
        raw_offsets = df["Reference Offset"][i]
        offsets = []
        if type(raw_offsets)==str and raw_offsets not in {"0","???"}:
            raw_offsets = raw_offsets.replace(";",",")
            for num_str in raw_offsets.split(","):
                cleaned = num_str.strip().replace("'","")
                if cleaned:
                    offsets.append(int(cleaned))
        elif type(raw_offsets) == np.float64 and not np.isnan(raw_offsets):
            offsets.append(int(raw_offsets))
        elif type(raw_offsets) == np.int64:
            offsets.append(int(raw_offsets))
        this_citance["CTS"][annotator] = offsets
        this_paper[df["Citance Number"][i]] = this_citance
    all_annotations[paper_name] = this_paper

In [27]:
all_annotations

{'A00-2018': {2: {'Citing Article': 'N10-1002',
   'Citation Text': 'As a benchmark VPC extraction system, we use the Charniak parser (Charniak, 2000)',
   'CTS': {'akanksha': [90, 91], 'sweta': [17], 'vardha': [5]}},
  3: {'Citing Article': 'W11-0610',
   'Citation Text': 'Each of these scores can be calculated from a provided syntactic parse tree, and to generate these we made use of the Charniak parser (Charniak, 2000), also trained on the Switch board tree bank',
   'CTS': {'akanksha': [5], 'sweta': [5], 'vardha': [5]}},
  4: {'Citing Article': 'W06-3119',
   'Citation Text': "We then use Charniak's parser (Charniak, 2000) to generate the most likely parse tree for each English target sentence in the training corpus",
   'CTS': {'akanksha': [90], 'sweta': [17], 'vardha': [91]}},
  5: {'Citing Article': 'N03-2024',
   'Citation Text': "We were interested in the occurrence of features such as type and number of premodifiers, presence and type of post modifiers, and form of name refer

In [28]:
def cts2feature(cts,length):
    feature = [0] * length
    for idx in cts:
        feature[idx] = 1
    return feature

In [38]:
annotators = list(annotators)

In [40]:
agreements = []
for i in range(len(annotators)):
    for j in range(i+1, len(annotators)):
        annotator1 = annotators[i]
        annotator2 = annotators[j]
        
        annotator1_feature = []
        annotator2_feature = []
        for paper_name, citations in all_annotations.items():
            length = len(full_texts[paper_name])
            for citation_id, citation in citations.items():
                if annotator1 in citation["CTS"] and annotator2 in citation["CTS"]:
                    annotator1_feature.extend(cts2feature(citation["CTS"][annotator1],length))
                    annotator2_feature.extend(cts2feature(citation["CTS"][annotator2],length))
        if len(annotator1_feature) > 0:
            kappa = cohen_kappa_score(annotator1_feature, annotator2_feature)
            agreements.append(kappa)
            print(annotator1, annotator2, kappa)

aakansha vardha 0.15304907018520342
aakansha sweta 0.10517203569560496
aakansha swastika 0.19300604363524587
vardha akanksha 0.17632965979875415
vardha sweta 0.15167721193908912
vardha swastika 0.17373271035953186
akanksha sweta 0.1719976567076743
sweta swastika 0.15432271735922465


In [41]:
np.mean(agreements)

0.15991088821004104

In [11]:
annotations = {}
for paper, this_paper in tqdm(all_annotations.items()):
    reference_full_text = full_texts[paper]
    for citance_id, this_citance in this_paper.items():
        example_id = paper + "_" + str(citance_id)
        offsets = []
        for annotator, offset in this_citance["CTS"].items():
            offsets.extend(offset)
        offsets = set(offsets)
        query = remove_stop_words(this_citance["Citation Text"]).lower()
        similarities = []
        for si, sent in reference_full_text.items():
            scores = scorer.score(query, sent["no_stop"])
            performance = (scores["rouge1"].recall + scores["rouge2"].recall) / 2
            similarities.append((si, performance))
        sorted_similarity = sorted(similarities, key=lambda x: x[1], reverse=True)
        ranked_si = [t[0] for t in sorted_similarity]
        annotations[example_id] = {
            "annotation": offsets,
            "rankings": ranked_si,
        }

100%|███████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:49<00:00,  2.47s/it]


In [12]:
for paper, this_paper in tqdm(all_annotations.items()):
    reference_full_text = full_texts[paper]
    print(len(reference_full_text))

100%|████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 33825.03it/s]

191
113
176
200
154
147
415
194
114
203
198
142
175
165
233
111
179
288
257
149





In [16]:
hit_all = 0
total_all = 0
total_proposed = 0
recall_all = []
f1_all = []
precision_all = []
for k in range(1,41):
    for example, annotation in annotations.items():
        prediction = annotation["rankings"][:k]
        hit = annotation["annotation"].intersection(set(prediction))
        hit_all += len(hit)
        total_all += len(annotation["annotation"])
        total_proposed += k
    recall = hit_all / total_all
    precision = hit_all / total_proposed
    f1 = 2 * precision * recall / (recall + precision)
    print(k, precision, recall, f1)
    recall_all.append(recall)
    precision_all.append(precision)
    f1_all.append(f1)

1 0.260989010989011 0.10474090407938258 0.14948859166011014
2 0.2261904761904762 0.13616317530319735 0.16999311768754302
3 0.20467032967032966 0.16427783902976847 0.18226299694189602
4 0.18626373626373627 0.1868798235942668 0.18657127132636211
5 0.17435897435897435 0.20992282249173097 0.19049524762381193
6 0.16457352171637885 0.23116501286291805 0.1922665443985939
7 0.15600470957613816 0.25043313907702003 0.19224956169518168
8 0.14888583638583638 0.26888092613009923 0.19165029469548134
9 0.14236874236874236 0.28567928457674874 0.19003381819663445
10 0.13621378621378621 0.30066152149944875 0.18748710897215537
11 0.13066100566100566 0.31462363435902574 0.18464162818740623
12 0.12584530853761622 0.3282800441014333 0.1819431714023831
13 0.1216036710542205 0.341616487151217 0.17936101525102974
14 0.11768707482993197 0.3542290124429044 0.17667622451785225
15 0.11414835164835165 0.3664829106945976 0.17407698350353498
16 0.11085972850678733 0.37816979051819183 0.17145713571607096
17 0.10777131

In [13]:
hit_any = 0
total_any = 0
hit_all = 0
total_all = 0
recall_any = []
recall_all = []
for k in range(1,41):
    for example, annotation in annotations.items():
        prediction = annotation["rankings"][:k]
        hit = annotation["annotation"].intersection(set(prediction))
        if len(hit) > 0:
            hit_any += 1
        hit_all += len(hit)
        total_any += 1
        total_all += len(annotation["annotation"])
    print(k, hit_any / total_any, hit_all / total_all)
    recall_any.append(hit_any / total_any)
    recall_all.append(hit_all / total_all)

1 0.260989010989011 0.10474090407938258
2 0.32005494505494503 0.13616317530319735
3 0.3708791208791209 0.16427783902976847
4 0.4100274725274725 0.1868798235942668
5 0.4472527472527473 0.20992282249173097
6 0.47893772893772896 0.23116501286291805
7 0.5070643642072213 0.25043313907702003
8 0.5336538461538461 0.26888092613009923
9 0.5567765567765568 0.28567928457674874
10 0.5766483516483516 0.30066152149944875
11 0.5951548451548452 0.31462363435902574
12 0.6130952380952381 0.3282800441014333
13 0.6297548605240912 0.341616487151217
14 0.6452119309262166 0.3542290124429044
15 0.6593406593406593 0.3664829106945976
16 0.6723901098901099 0.37816979051819183
17 0.6840659340659341 0.38926000389130294
18 0.6949023199023199 0.40003675119441384
19 0.7050318102949682 0.4106075552718621
20 0.7142857142857143 0.421058434399118
21 0.7227891156462585 0.4310915104740904
22 0.7307692307692307 0.4406134108449434
23 0.7386526516961299 0.45002636498729687
24 0.7461080586080586 0.45916023520764426
25 0.753076

In [None]:
recall_any

In [None]:
recall_all