In [41]:
import re
import pke
import math
import nltk
import random
import numpy as np
import pandas as pd
from nltk import stem
from nltk.corpus import stopwords
 
nltk.download('stopwords')
stops = stopwords.words('english')

stemmer = stem.PorterStemmer()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/e154817e/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [42]:
reading_lists = pd.read_csv("../reading_lists.csv")
reading_lists = reading_lists.replace(np.nan, None)

def split_queries_keyphrases(queries):
    queries = queries.replace(np.nan, "")
    return [query.strip().split(", ") if query.strip().split(", ") != [''] else [] for query in queries]

annotator1_queries = split_queries_keyphrases(pd.read_csv("annotations/annotation_1.csv")["query_keywords"])
annotator2_queries = split_queries_keyphrases(pd.read_csv("annotations/annotation_2.csv")["query_keywords"])
annotator3_queries = split_queries_keyphrases(pd.read_csv("annotations/annotation_3.csv")["query_keywords"])
annotator_scholar_queries = split_queries_keyphrases(pd.read_csv("annotations/annotation_scholar.csv")["query_keywords"])

# Keyphrases query

## KP number difference

In [43]:
def anotation_distance(annot_a, annot_b):
    return sum([abs(len(kp_a) - len(kp_b)) for kp_a, kp_b in zip(annot_a, annot_b)]) / len(reading_lists)

In [44]:
pd.DataFrame({
    "Annotator 1 vs 2":anotation_distance(annotator1_queries, annotator2_queries),
    "Annotator 1 vs 3":anotation_distance(annotator1_queries, annotator3_queries), 
    "Annotator 2 vs 3":anotation_distance(annotator2_queries, annotator3_queries), 
}, index=["Average difference"]).transpose()

Unnamed: 0,Average difference
Annotator 1 vs 2,0.976471
Annotator 1 vs 3,1.505882
Annotator 2 vs 3,1.705882


In [45]:
pd.DataFrame({
    "Annotator 1 vs Semantic Scholar":anotation_distance(annotator1_queries, annotator_scholar_queries),
    "Annotator 2 vs Semantic Scholar":anotation_distance(annotator2_queries, annotator_scholar_queries), 
    "Annotator 3 vs Semantic Scholar":anotation_distance(annotator3_queries, annotator_scholar_queries), 
}, index=["Average difference"]).transpose()

Unnamed: 0,Average difference
Annotator 1 vs Semantic Scholar,3.423529
Annotator 2 vs Semantic Scholar,3.741176
Annotator 3 vs Semantic Scholar,2.929412


In [46]:
pd.DataFrame({
    "Annotator 1": np.mean([len(kws) for kws in annotator1_queries]),
    "Annotator 2": np.mean([len(kws) for kws in annotator2_queries]), 
    "Annotator 3": np.mean([len(kws) for kws in annotator3_queries]), 
    "Semantic Scholar": np.mean([len(kws) for kws in annotator_scholar_queries])
}, index=["Average number of KP"]).transpose()

Unnamed: 0,Average number of KP
Annotator 1,2.705882
Annotator 2,2.341176
Annotator 3,3.835294
Semantic Scholar,4.976471


## Statistical extraction baselines

In [47]:
nb_kps_to_generate = [math.floor(np.mean([len(annot1),len(annot2), len(annot3)])) for annot1, annot2, annot3 in zip(annotator1_queries, annotator2_queries, annotator3_queries)]

In [48]:
firstphrase_keyphrases = []

for i, (title, abstract) in enumerate(zip(reading_lists["title"], reading_lists["abstract"])):
    content =  " ".join([title, (abstract or "")])
    extractor = pke.unsupervised.FirstPhrases()
    extractor.load_document(input=content, language='en')
    extractor.candidate_selection()
    extractor.candidate_weighting()
    keyphrases = extractor.get_n_best(n=nb_kps_to_generate[i])
    firstphrase_keyphrases.append([kp[0] for kp in keyphrases])

In [49]:
tfidf_keyphrases = []

for i, (title, abstract) in enumerate(zip(reading_lists["title"], reading_lists["abstract"])):
    content =  " ".join([title, (abstract or "")])
    extractor = pke.unsupervised.TfIdf()
    extractor.load_document(input=content, language='en')
    extractor.candidate_selection()
    extractor.candidate_weighting(df={"--NB_DOC--":1})
    keyphrases = extractor.get_n_best(n=nb_kps_to_generate[i])
    tfidf_keyphrases.append([kp[0] for kp in keyphrases])

In [50]:
random_keyphrases = []

for i, (title, abstract) in enumerate(zip(reading_lists["title"], reading_lists["abstract"])):
    content =  " ".join([title, (abstract or "")])
    extractor = pke.unsupervised.TfIdf()
    extractor.load_document(input=content, language='en')
    extractor.candidate_selection()
    
    random_kps = random.sample(list(extractor.candidates.items()), nb_kps_to_generate[i])
    random_keyphrases.append([kp[0] for kp in random_kps])

## Exact match

In [51]:
def clean_string(string):
    return re.sub(r'\W+','', string).lower() 
    
def stem_string(string):
    return stemmer.stem(string, to_lowercase=True)
    
def list_word_process(lst):
    return [stem_string(word) for kp in lst for word in kp.split() if stem_string(word) not in stops]

def word_exact_match(lst1, lst2):
    return exact_match(list_word_process(lst1), list_word_process(lst2))
    
def exact_match(lst1, lst2):
    if lst1 == lst2:
        return 1
    lst1 = list(map(stem_string, lst1))
    lst2 = list(map(stem_string, lst2))
    inter = list(set(lst1) & set(lst2))
    union = list(set(lst1) | set(lst2))
    return len(inter) / len(union) if len(union) > 0 else 0

def score(annotator1_keyphrases, annotator2_keyphrases, func=exact_match):
    return sum([func(anot1, anot2) for anot1, anot2 in zip(annotator1_keyphrases, annotator2_keyphrases)])/ max(len(annotator1_keyphrases), len(annotator2_keyphrases))

def matrix_annotations(annotations, names, func = exact_match):
    matrix = []
    for annot_a in annotations:
        row = []
        for annot_b in annotations:
            row.append(score(annot_a, annot_b, func=func))
        matrix.append(row)
    return pd.DataFrame(matrix, index=names, columns=names)

In [62]:
annotations = {
    "PhD Student":annotator1_queries,
    "Expert-Level":annotator2_queries, 
    "PhD-Level":annotator3_queries, 
    "Semantic Scholar KP":annotator_scholar_queries, 
    "FirstPhrase KP":firstphrase_keyphrases, 
    "TFIDF KP":tfidf_keyphrases, 
    "Random KP":random_keyphrases
}

In [63]:
df = matrix_annotations(list(annotations.values()), list(annotations.keys()), func=exact_match).drop(["Semantic Scholar KP", "FirstPhrase KP", "TFIDF KP", "Random KP"], axis=1)
df
#print(df.to_latex(float_format="%.2f"))

Unnamed: 0,PhD Student,Expert-Level,PhD-Level
PhD Student,1.0,0.316709,0.073842
Expert-Level,0.316709,1.0,0.111134
PhD-Level,0.073842,0.111134,1.0
Semantic Scholar KP,0.083906,0.078013,0.061596
FirstPhrase KP,0.109524,0.123016,0.04077
TFIDF KP,0.053492,0.123142,0.054916
Random KP,0.0,0.0,0.001471


In [71]:
df = matrix_annotations(list(annotations.values()), list(annotations.keys()), func=word_exact_match).drop(["Semantic Scholar KP", "FirstPhrase KP", "TFIDF KP", "Random KP"], axis=1)
df
#print(df.to_latex(float_format="%.2f"))

Unnamed: 0,PhD Student,Expert-Level,PhD-Level
PhD Student,1.0,0.524606,0.287603
Expert-Level,0.524606,1.0,0.278393
PhD-Level,0.287603,0.278393,1.0
Semantic Scholar KP,0.21069,0.172885,0.206158
FirstPhrase KP,0.349599,0.359895,0.220304
TFIDF KP,0.294271,0.409892,0.186841
Random KP,0.078902,0.07501,0.065405


# Sentence query

In [65]:
def clean_string(string):
    return re.sub(r'\W+','', string).lower() 
    
def stem_string(string):
    return stemmer.stem(string, to_lowercase=True)

In [66]:
annotator1_sentences = [query.strip() for query in pd.read_csv("annotations/annotation_1.csv")["query_sentence"].replace(np.nan, "")]
annotator2_sentences = [query.strip() for query in pd.read_csv("annotations/annotation_2.csv")["query_sentence"].replace(np.nan, "")]
annotator3_sentences = [query.strip() for query in pd.read_csv("annotations/annotation_3.csv")["query_sentence"].replace(np.nan, "")]

In [67]:
annotator1_sentences_kw = [[clean_string(stem_string(w)) for w in s.split() if w not in stops] for s in annotator1_sentences]
annotator2_sentences_kw = [[clean_string(stem_string(w)) for w in s.split() if w not in stops] for s in annotator2_sentences]
annotator3_sentences_kw = [[clean_string(stem_string(w)) for w in s.split() if w not in stops] for s in annotator3_sentences]

In [72]:
annotations_sentences = {
    "PhD Student":annotator1_sentences_kw,
    "Expert-Level":annotator2_sentences_kw, 
    "PhD-Level":annotator3_sentences_kw, 
}

In [76]:
df = matrix_annotations(list(annotations_sentences.values()), list(annotations_sentences.keys()), func=exact_match)
df
#print(df.to_latex(float_format="%.2f"))

Unnamed: 0,PhD Student,Expert-Level,PhD-Level
PhD Student,1.0,0.597351,0.402873
Expert-Level,0.597351,1.0,0.398665
PhD-Level,0.402873,0.398665,1.0
