In [247]:
import re
import pke
import math
import nltk
import random
import numpy as np
import pandas as pd
from nltk import stem
from nltk.corpus import stopwords
from collections import defaultdict

nltk.download('stopwords')
stops = stopwords.words('english')

stemmer = stem.PorterStemmer()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/e154817e/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [248]:
def clean_string(string):
    return re.sub(r'\W+','', string).lower() 
    
def stem_string(string):
    return stemmer.stem(string, to_lowercase=True)
    
def list_word_process(lst):
    return [stem_string(word) for kp in lst for word in kp.split() if stem_string(word) not in stops]

def anotation_distance(annot_a, annot_b):
    if annot_a == annot_b:
        return 1
    return sum([abs(len(kp_a) - len(kp_b)) for kp_a, kp_b in zip(annot_a, annot_b)]) / len(reading_lists)

def word_exact_match(lst1, lst2):
    return exact_match(list_word_process(lst1), list_word_process(lst2))
    
def exact_match(lst1, lst2):
    if lst1 == lst2:
        return 1
    lst1 = list(map(stem_string, lst1))
    lst2 = list(map(stem_string, lst2))
    inter = list(set(lst1) & set(lst2))
    union = list(set(lst1) | set(lst2))
    return len(inter) / len(union) if len(union) > 0 else 0

def score(annotator1_keyphrases, annotator2_keyphrases, func=exact_match):
    return sum([func(anot1, anot2) for anot1, anot2 in zip(annotator1_keyphrases, annotator2_keyphrases)])/ max(len(annotator1_keyphrases), len(annotator2_keyphrases))

def matrix_annotations(annotations_dict, func=word_exact_match, cols=[]):
    data = defaultdict(lambda : defaultdict(list))
    for k, annotations in annotations_dict.items():
        for col in cols:
            data[k][col]= score(annotations, annotations_dict[col], func=func)
    return pd.DataFrame(data).transpose()

def split_queries_keyphrases(queries):
    queries = queries.replace(np.nan, "")
    return [[v.strip() for v in query.strip().split(",")] if query.strip().split(",") != [''] else [] for query in queries]

In [249]:
reading_lists = pd.read_csv("../reading_lists.csv")
reading_lists = reading_lists.replace(np.nan, None)

queries_kps = {}
for annotator_i in [1,2,3, "bart"]:
    queries_kps[annotator_i] = [split_queries_keyphrases(pd.read_csv(f"annotations/annotation_{annotator_i}.csv")["query_keywords"])]

# Keyphrases queries

## KP distance difference

In [250]:
columns = ["PhD Student", "Expert-Level", "PhD-Level"]
annotations_ks = {
    "PhD Student":queries_kps[1],
    "Expert-Level":queries_kps[2], 
    "PhD-Level":queries_kps[3], 
    "Bart-Large-KP20K":queries_kps["bart"], 
}

df = matrix_annotations(annotations_ks, func=anotation_distance, cols=columns); df
#print(df.to_latex(float_format="%.2f"))

Unnamed: 0,PhD Student,Expert-Level,PhD-Level
PhD Student,1.0,0.976471,1.505882
Expert-Level,0.976471,1.0,1.705882
PhD-Level,1.505882,1.705882,1.0
Bart-Large-KP20K,2.023529,2.247059,1.458824


In [251]:
pd.DataFrame({
    "Annotator 1": np.mean([len(kws) for kws in queries_kps[1][0]]),
    "Annotator 2": np.mean([len(kws) for kws in queries_kps[2][0]]), 
    "Annotator 3": np.mean([len(kws) for kws in queries_kps[3][0]]), 
    "Bart-Large-KP20K": np.mean([len(kws) for kws in queries_kps["bart"][0]])
}, index=["Average number of KP"]).transpose()

Unnamed: 0,Average number of KP
Annotator 1,2.705882
Annotator 2,2.341176
Annotator 3,3.835294
Bart-Large-KP20K,4.447059


## Agreement + comparison to baselines

In [252]:
nb_kps_to_generate = [math.floor(np.mean([len(annot1),len(annot2), len(annot3)])) for annot1, annot2, annot3 in zip(annotator1_queries, annotator2_queries, annotator3_queries)]

In [253]:
firstphrase_title_keyphrases = []

for i, title in enumerate(reading_lists["title"]):
    extractor = pke.unsupervised.FirstPhrases()
    extractor.load_document(input=title, language='en')
    extractor.candidate_selection()
    extractor.candidate_weighting()
    keyphrases = extractor.get_n_best(n=nb_kps_to_generate[i])
    firstphrase_title_keyphrases.append([kp[0] for kp in keyphrases])

In [254]:
firstphrase_keyphrases = []

for i, (title, abstract) in enumerate(zip(reading_lists["title"], reading_lists["abstract"])):
    content =  " ".join([title, (abstract or "")])
    extractor = pke.unsupervised.FirstPhrases()
    extractor.load_document(input=content, language='en')
    extractor.candidate_selection()
    extractor.candidate_weighting()
    keyphrases = extractor.get_n_best(n=nb_kps_to_generate[i])
    firstphrase_keyphrases.append([kp[0] for kp in keyphrases])

In [255]:
tfidf_keyphrases = []

for i, (title, abstract) in enumerate(zip(reading_lists["title"], reading_lists["abstract"])):
    content =  " ".join([title, (abstract or "")])
    extractor = pke.unsupervised.TfIdf()
    extractor.load_document(input=content, language='en')
    extractor.candidate_selection()
    extractor.candidate_weighting(df={"--NB_DOC--":1})
    keyphrases = extractor.get_n_best(n=nb_kps_to_generate[i])
    tfidf_keyphrases.append([kp[0] for kp in keyphrases])

In [256]:
random_keyphrases = []

for i, (title, abstract) in enumerate(zip(reading_lists["title"], reading_lists["abstract"])):
    content =  " ".join([title, (abstract or "")])
    extractor = pke.unsupervised.TfIdf()
    extractor.load_document(input=content, language='en')
    extractor.candidate_selection()
    
    random_kps = random.sample(list(extractor.candidates.items()), nb_kps_to_generate[i])
    random_keyphrases.append([kp[0] for kp in random_kps])

In [257]:
columns = ["PhD Student", "Expert-Level", "PhD-Level"]
annotations = {
    "PhD Student":annotator1_queries,
    "Expert-Level":annotator2_queries, 
    "PhD-Level":annotator3_queries, 

    "FirstPhrase Title":firstphrase_title_keyphrases, 
    "FirstPhrase":firstphrase_keyphrases, 
    "Bart-Large-KP20K":annotator_bart_queries,
    "TFIDF":tfidf_keyphrases, 
    "Random":random_keyphrases
}

In [258]:
df = matrix_annotations(annotations, func=word_exact_match, cols=columns); df
#print(df.to_latex(float_format="%.2f"))

Unnamed: 0,PhD Student,Expert-Level,PhD-Level
PhD Student,1.0,0.524606,0.287603
Expert-Level,0.524606,1.0,0.278393
PhD-Level,0.287603,0.278393,1.0
FirstPhrase Title,0.372172,0.383803,0.228551
FirstPhrase,0.349599,0.359895,0.220304
Bart-Large-KP20K,0.407363,0.378023,0.318741
TFIDF,0.294271,0.409892,0.186841
Random,0.107244,0.105109,0.060595


# Differences between anotators

In [259]:
def check_differences(preds, manual_annotations, threshold=0.05):
    for k, annotations in manual_annotations.items():
        for annotation_manual, annotation_preds in zip(annotations, preds):
            if word_exact_match(annotation_manual, annotation_preds)<threshold:
                print(k, annotation_manual, annotation_preds)

manual_annotations = {
    "PhD Student":annotator1_queries,
    "Expert-Level":annotator2_queries, 
    "PhD-Level":annotator3_queries, 
}

In [260]:
check_differences(annotator1_queries, manual_annotations)

Expert-Level ['reproducibility', 'machine learning (ML)', 'nlp'] ['research reproductibility', 'natural language processing (NLP)', 'computational linguistics']
Expert-Level ['annotation', 'cross-cultural differences', 'language technologies'] ['geo-cultural representation', 'socio-cultural identity', 'reinforcement learning from human feedback (RLHF)']
PhD-Level ['annotation bias', 'natural language processing (nlp)', 'subjectivity', 'socio cultural annotation'] ['geo-cultural representation', 'socio-cultural identity', 'reinforcement learning from human feedback (RLHF)']


In [261]:
check_differences(annotator2_queries, manual_annotations)

PhD Student ['research reproductibility', 'natural language processing (NLP)', 'computational linguistics'] ['reproducibility', 'machine learning (ML)', 'nlp']
PhD Student ['geo-cultural representation', 'socio-cultural identity', 'reinforcement learning from human feedback (RLHF)'] ['annotation', 'cross-cultural differences', 'language technologies']
PhD-Level ['meta learning', 'survey', 'natural language processing (nlp)'] ['meta-learning', 'nlp']
PhD-Level ['knowledge graph', 'accessibility', 'structuration'] ['DBpedia Databus']
PhD-Level ['language model', 'formal semantics', 'distributional models'] ['text representation', 'text embedding', 'control', 'analysis']


In [262]:
check_differences(annotator3_queries, manual_annotations)

PhD Student ['geo-cultural representation', 'socio-cultural identity', 'reinforcement learning from human feedback (RLHF)'] ['annotation bias', 'natural language processing (nlp)', 'subjectivity', 'socio cultural annotation']
Expert-Level ['meta-learning', 'nlp'] ['meta learning', 'survey', 'natural language processing (nlp)']
Expert-Level ['DBpedia Databus'] ['knowledge graph', 'accessibility', 'structuration']
Expert-Level ['text representation', 'text embedding', 'control', 'analysis'] ['language model', 'formal semantics', 'distributional models']


## Differences between automatic and manual

In [263]:
check_differences(annotator_bart_queries, manual_annotations)

PhD Student ['human-nlp model interactions'] ['natural language processing', 'human-in-the-loop usability evaluation', 'model-based user interface design']
Expert-Level ['human-NLP model interactions'] ['natural language processing', 'human-in-the-loop usability evaluation', 'model-based user interface design']
PhD-Level ['zero shot learning', 'few shot learning', 'review'] ['few-shot', 'language models', 'zero-shot']
PhD-Level ['socially aware', 'natural language processing (nlp)', 'human-level aspects'] ['contextualization', 'human-centered nlp', 'societal nlp']


# Sentence queries

In [264]:
queries_sentences = {}
for annotator_i in [1,2,3]:
    annotator_sentences = [query.strip() for query in pd.read_csv(f"annotations/annotation_{annotator_i}.csv")["query_sentence"].replace(np.nan, "")]
    queries_sentences[annotator_i] = [[clean_string(stem_string(w)) for w in s.split() if w not in stops] for s in annotator_sentences]    

In [265]:
columns = ["PhD Student", "Expert-Level", "PhD-Level"]
annotations_sentences = {
    "PhD Student":queries_sentences[1],
    "Expert-Level":queries_sentences[2], 
    "PhD-Level":queries_sentences[3], 
}