In [1]:
import re
import pke
import math
import nltk
import random
import numpy as np
import pandas as pd
from nltk import stem
from nltk.corpus import stopwords
from collections import defaultdict

# Load the English stop-word list, downloading the NLTK corpus only when it is
# not installed yet: an unconditional nltk.download() needs network access on
# every run and merely logs an error when the machine is offline.
try:
    stops = stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    stops = stopwords.words('english')

# Porter stemmer shared by the normalisation helpers defined below.
stemmer = stem.PorterStemmer()

[nltk_data] Error loading stopwords: <urlopen error [Errno 8] nodename
[nltk_data]     nor servname provided, or not known>


In [2]:
def clean_string(string):
    """Return ``string`` lower-cased with all non-word characters removed.

    Only word characters (letters, digits and underscores) survive, so
    whitespace and punctuation are deleted rather than replaced.
    """
    only_word_chars = re.sub(r'\W+','', string)
    return only_word_chars.lower()
    
def stem_string(string):
    """Return the stem of ``string`` from the shared stemmer, lower-cased."""
    stemmed = stemmer.stem(string, to_lowercase=True)
    return stemmed
    
def list_word_process(lst):
    """Lazily yield the stemmed words of every keyphrase in ``lst``.

    Each phrase is split on whitespace only and parentheses are removed from
    every word before stemming; other punctuation (hyphens, for instance) is
    left in place.  A word is skipped when its *stem* appears in ``stops``.
    """
    raw_words = (word for phrase in lst for word in phrase.split())
    for raw in raw_words:
        token = stem_string(re.sub('[()]', '', raw))
        if token in stops:
            continue
        yield token

def word_exact_match(lst1, lst2):
    """Score two keyphrase lists by the overlap of their individual words.

    Both lists are turned into streams of stemmed, stop-word-filtered words
    by ``list_word_process`` and the two streams are passed to ``exact_match``.
    """
    words1 = list_word_process(lst1)
    words2 = list_word_process(lst2)
    return exact_match(words1, words2)
    
def exact_match(lst1, lst2):
    """Return the Jaccard overlap of the stemmed items of two iterables.

    Parameters
    ----------
    lst1, lst2 : iterable of str
        Words or phrases to compare.  Lists and one-shot iterators (such as
        generators) are both accepted.

    Returns
    -------
    int or float
        ``1`` when both inputs hold the same items in the same order, which
        includes two empty inputs; otherwise ``len(A & B) / len(A | B)``
        where ``A`` and ``B`` are the sets of stems given by ``stem_string``.
    """
    # Materialise first: two distinct generators never compare equal, so
    # without this the shortcut below is skipped for iterator inputs and two
    # empty iterators would score 0 while two empty lists score 1.
    lst1 = list(lst1)
    lst2 = list(lst2)
    if lst1 == lst2:
        return 1
    stems1 = set(map(stem_string, lst1))
    stems2 = set(map(stem_string, lst2))
    # The inputs differ, so at least one of them is non-empty and the union
    # cannot be empty: the division is safe.
    return len(stems1 & stems2) / len(stems1 | stems2)

def score(annotator1_keyphrases, annotator2_keyphrases, func=exact_match):
    """Average pairwise agreement between two aligned lists of annotations.

    Parameters
    ----------
    annotator1_keyphrases, annotator2_keyphrases : list
        Sequences whose entries are compared position by position.
    func : callable
        ``func(entry1, entry2)`` returns the agreement of one pair.

    Returns
    -------
    int or float
        Sum of the pairwise scores divided by the length of the longer list.
        ``zip`` stops at the shorter list, so entries without a counterpart
        add nothing to the sum but still count in the denominator.
        ``0`` when both lists are empty.
    """
    n_entries = max(len(annotator1_keyphrases), len(annotator2_keyphrases))
    if n_entries == 0:
        # Nothing to compare; guard against dividing by zero.
        return 0
    total = sum(
        func(anot1, anot2)
        for anot1, anot2 in zip(annotator1_keyphrases, annotator2_keyphrases)
    )
    return total / n_entries

def matrix_annotations(annotations_dict, func=word_exact_match, cols=[]):
    """Tabulate ``score`` for every (annotation set, reference column) pair.

    Rows follow the keys of ``annotations_dict`` and columns follow ``cols``;
    every name in ``cols`` must itself be a key of ``annotations_dict``.  The
    cell at (row, col) holds
    ``score(annotations_dict[row], annotations_dict[col], func=func)``.
    """
    table = defaultdict(lambda : defaultdict(list))
    pairs = ((row, col) for row in annotations_dict for col in cols)
    for row, col in pairs:
        table[row][col] = score(annotations_dict[row], annotations_dict[col], func=func)
    # pandas reads the outer keys as columns, hence the transpose to obtain
    # one row per annotation set.
    return pd.DataFrame(table).T

def split_queries_keyphrases(queries):
    """Split comma-separated keyphrase cells into lists of keyphrases.

    Parameters
    ----------
    queries : pandas.Series
        One string of comma-separated keyphrases per entry; missing values
        (NaN) are treated as empty strings.

    Returns
    -------
    list of list of str
        For each entry, its keyphrases stripped of surrounding whitespace.
        Empty fragments (blank cells, doubled or trailing commas) are dropped
        so they are not counted as keyphrases; a blank cell yields ``[]``.
    """
    queries = queries.replace(np.nan, "")
    return [
        [phrase for phrase in (part.strip() for part in query.split(",")) if phrase]
        for query in queries
    ]

In [3]:
reading_lists = pd.read_csv("reading_lists.csv").replace(np.nan, None)

# Keyphrase queries per source: manual annotators 1-3 plus the "bart" file.
queries_kps = {}
for annotator_i in [1,2,3, "bart"]:
    annotation_table = pd.read_csv(f"annotations/annotation_{annotator_i}.csv")
    queries_kps[annotator_i] = split_queries_keyphrases(annotation_table["query_keywords"])

# Keyphrase queries

In [4]:
# Average number of keyphrases per query for each annotation source.
kp_sources = {"A1": 1, "A2": 2, "A3": 3, "Bart-Large-KP20K": "bart"}
pd.DataFrame(
    {label: np.mean([len(kws) for kws in queries_kps[key]]) for label, key in kp_sources.items()},
    index=["Mean # of KP"],
).transpose()

Unnamed: 0,Mean # of KP
A1,2.705882
A2,3.835294
A3,2.341176
Bart-Large-KP20K,4.447059


## Agreement + comparison to baselines

In [5]:
def _firstphrases_top3(text):
    """Return up to three best FirstPhrases candidates for ``text`` (phrases only)."""
    extractor = pke.unsupervised.FirstPhrases()
    extractor.load_document(input=text, language='en')
    extractor.candidate_selection()
    extractor.candidate_weighting()
    # get_n_best items are indexed at 0 to keep the phrase and drop the rest.
    return [kp_tuple[0] for kp_tuple in extractor.get_n_best(n=3)]

# Baseline computed from the title alone.
firstphrase_title_keyphrases = [_firstphrases_top3(title) for title in reading_lists["title"]]

In [6]:
def _topicrank_top3(text):
    """Return up to three best TopicRank candidates for ``text`` (phrases only)."""
    extractor = pke.unsupervised.TopicRank()
    extractor.load_document(input=text, language='en')
    extractor.candidate_selection()
    extractor.candidate_weighting()
    # get_n_best items are indexed at 0 to keep the phrase and drop the rest.
    return [kp_tuple[0] for kp_tuple in extractor.get_n_best(n=3)]

# Baseline computed from title + abstract; a missing abstract (None) becomes "".
topicrank_keyphrases = [
    _topicrank_top3(title+"\n "+(abstract or ""))
    for title, abstract in zip(reading_lists["title"], reading_lists["abstract"])
]

In [7]:
# Mean number of keyphrases FirstPhrases returned per title.
np.mean(list(map(len, firstphrase_title_keyphrases)))

2.3529411764705883

In [8]:
# Mean number of keyphrases TopicRank returned per document.
np.mean(list(map(len, topicrank_keyphrases)))

3.0

In [9]:
columns = ["A1", "A2", "A3"]

# Manual annotators first, then the automatic systems; this insertion order
# becomes the row order of the agreement table.
annotations = dict(zip(columns, (queries_kps[1], queries_kps[2], queries_kps[3])))
annotations.update({
    "FirstPhrase Title": firstphrase_title_keyphrases,
    "TopicRank": topicrank_keyphrases,
    "Bart-Large-KP20K": queries_kps["bart"],
})

In [10]:
# Agreement of every annotation source (rows) with each manual annotator (columns).
df = matrix_annotations(annotations, func=word_exact_match, cols=columns)
df
#print(df.to_latex(float_format="%.2f"))

Unnamed: 0,A1,A2,A3
A1,1.0,0.305374,0.537675
A2,0.305374,1.0,0.313214
A3,0.537675,0.313214,1.0
FirstPhrase Title,0.40328,0.269857,0.404577
TopicRank,0.368888,0.230942,0.40787
Bart-Large-KP20K,0.412099,0.325121,0.380824


# Differences between annotators

In [11]:
def check_differences(preds, manual_annotations, threshold=0.05):
    """Print every pair whose word overlap with ``preds`` is below ``threshold``.

    Each annotation list in ``manual_annotations`` is walked in parallel with
    ``preds``.  A pair scoring under ``threshold`` with ``word_exact_match``
    is printed as ``<annotator key> <manual entry> <preds entry>``.
    """
    for annotator, entries in manual_annotations.items():
        for manual_entry, pred_entry in zip(entries, preds):
            overlap = word_exact_match(manual_entry, pred_entry)
            if not overlap < threshold:
                continue
            print(annotator, manual_entry, pred_entry)

# Keyphrase queries of the three manual annotators, keyed "A1" to "A3".
manual_annotations = {f"A{annotator_id}": queries_kps[annotator_id] for annotator_id in (1, 2, 3)}

In [12]:
# Pairs where a manual annotator's keyphrases barely overlap with A1's (default threshold 0.05).
check_differences(preds=queries_kps[1], manual_annotations=manual_annotations)

A2 ['annotation bias', 'natural language processing (nlp)', 'subjectivity', 'socio cultural annotation'] ['geo-cultural representation', 'socio-cultural identity', 'reinforcement learning from human feedback (RLHF)']
A3 ['annotation', 'cross-cultural differences', 'language technologies'] ['geo-cultural representation', 'socio-cultural identity', 'reinforcement learning from human feedback (RLHF)']


In [13]:
# Pairs where a manual annotator's keyphrases barely overlap with A2's (default threshold 0.05).
check_differences(preds=queries_kps[2], manual_annotations=manual_annotations)

A1 ['geo-cultural representation', 'socio-cultural identity', 'reinforcement learning from human feedback (RLHF)'] ['annotation bias', 'natural language processing (nlp)', 'subjectivity', 'socio cultural annotation']
A3 ['DBpedia Databus'] ['knowledge graph', 'accessibility', 'structuration']
A3 ['text representation', 'text embedding', 'control', 'analysis'] ['language model', 'formal semantics', 'distributional models']


In [14]:
# Pairs where a manual annotator's keyphrases barely overlap with A3's (default threshold 0.05).
check_differences(preds=queries_kps[3], manual_annotations=manual_annotations)

A1 ['geo-cultural representation', 'socio-cultural identity', 'reinforcement learning from human feedback (RLHF)'] ['annotation', 'cross-cultural differences', 'language technologies']
A2 ['knowledge graph', 'accessibility', 'structuration'] ['DBpedia Databus']
A2 ['language model', 'formal semantics', 'distributional models'] ['text representation', 'text embedding', 'control', 'analysis']


## Differences between automatic and manual

In [15]:
# Pairs where a manual annotator's keyphrases barely overlap with the "bart" keyphrases.
check_differences(preds=queries_kps['bart'], manual_annotations=manual_annotations)

A1 ['human-nlp model interactions'] ['natural language processing', 'human-in-the-loop usability evaluation', 'model-based user interface design']
A2 ['zero shot learning', 'few shot learning', 'review'] ['few-shot', 'language models', 'zero-shot']
A3 ['human-NLP model interactions'] ['natural language processing', 'human-in-the-loop usability evaluation', 'model-based user interface design']


# Sentence queries

In [16]:
queries_sentences = {}
for annotator_i in [1,2,3]:
    sentence_column = pd.read_csv(f"annotations/annotation_{annotator_i}.csv")["query_sentence"].replace(np.nan, "")
    annotator_sentences = [query.strip() for query in sentence_column]
    # Split on whitespace, drop words found verbatim in `stops`, then stem
    # each remaining word and strip its non-word characters.
    queries_sentences[annotator_i] = [
        [clean_string(stem_string(w)) for w in s.split() if w not in stops]
        for s in annotator_sentences
    ]

In [17]:
columns = ["A1", "A2", "A3"]
# Tokenised sentence queries of the three annotators, keyed by the column labels.
annotations_sentences = dict(
    zip(columns, (queries_sentences[1], queries_sentences[2], queries_sentences[3]))
)

In [18]:
# Pairwise agreement between the annotators' sentence queries.
df = matrix_annotations(annotations_sentences, func=word_exact_match, cols=columns)
df
#print(df.to_latex(float_format="%.2f"))

Unnamed: 0,A1,A2,A3
A1,1.0,0.411043,0.603139
A2,0.411043,1.0,0.406821
A3,0.603139,0.406821,1.0
