In [1]:
!pip install git+https://github.com/boudinfl/pke.git
!pip install datasets
!pip install ipywidgets
!pip install nltk
!python -m spacy download en_core_web_sm

Collecting git+https://github.com/boudinfl/pke.git
  Cloning https://github.com/boudinfl/pke.git to /private/var/folders/gf/q45ftd6d5z7c8tcytwj5czh80000gp/T/pip-req-build-hxjf8dv0
  Running command git clone --filter=blob:none --quiet https://github.com/boudinfl/pke.git /private/var/folders/gf/q45ftd6d5z7c8tcytwj5czh80000gp/T/pip-req-build-hxjf8dv0
  Resolved https://github.com/boudinfl/pke.git to commit dfe5dec971389802247f2549963e92b898e88c69
  Preparing metadata (setup.py) ... [?25ldone
You should consider upgrading via the '/Users/boudin-f/Documents/GitHub/hands-on-with-pke/venv/bin/python -m pip install --upgrade pip' command.[0m[33m


You should consider upgrading via the '/Users/boudin-f/Documents/GitHub/hands-on-with-pke/venv/bin/python -m pip install --upgrade pip' command.[0m[33m


You should consider upgrading via the '/Users/boudin-f/Documents/GitHub/hands-on-with-pke/venv/bin/python -m pip install --upgrade pip' command.[0m[33m
You should consider upgrading via the '/Users/boudin-f/Documents/GitHub/hands-on-with-pke/venv/bin/python -m pip install --upgrade pip' command.[0m[33m
[0mCollecting en-core-web-sm==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl (13.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.9/13.9 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
You should consider upgrading via the '/Users/boudin-f/Documents/GitHub/hands-on-with-pke/venv/bin/python -m pip install --upgrade pip' command.[0m[33m
[0m[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


# Hands-on session with pke - part 3

This notebook provides an end-to-end example of model benchmarking on Inspec, a commonly-used dataset for keyphrase extraction that contains bibliographic records (i.e. title/abstract from scientific papers).

## Preamble on keyphrase extraction datasets using 🤗 datasets

For simplicity and ease of use, we rely on the `datasets` module from 🤗 Hugging Face to load and access sample documents from the Inspec dataset.



In [2]:
from datasets import load_dataset

# load the Inspec dataset through the `datasets` module
dataset = load_dataset('taln-ls2n/inspec')

# take the first document of the validation split as an example
sample = dataset["validation"][0]

# (template, value) pairs to display: title and abstract are cut to their
# first 50 characters, and only the first three keyphrases are listed
preview = (
    ("id: {}", sample["id"]),
    ("title: {}...", sample["title"][:50]),
    ("abstract: {}...", sample["abstract"][:50]),
    ("gold-standard keyphrases: {}; ...", "; ".join(sample["keyphrases"][:3])),
)
for template, value in preview:
    print(template.format(value))

No config specified, defaulting to: inspec/raw
Reusing dataset inspec (/Users/boudin-f/.cache/huggingface/datasets/taln-ls2n___inspec/raw/1.1.0/0ae146cabe770846946b3279b4c751efe0aca2dd68b3f24427d4624cd22bb20d)


  0%|          | 0/3 [00:00<?, ?it/s]

id: 1833
title: British Standard 7666 as a framework for geocoding...
abstract: The article examines the role of British Standard ...
gold-standard keyphrases: British Standard 7666; geocoding; property information; ...


## Benchmarking models

### step-1: let's start by preprocessing the dataset using spacy and nltk

In [3]:
import re
import spacy
from tqdm.notebook import tqdm
from nltk.stem.snowball import SnowballStemmer as Stemmer
from spacy.tokenizer import _get_regex_pattern

# Load the English spaCy pipeline; it is used below both to process the
# documents and to tokenize the gold-standard keyphrases.
nlp = spacy.load("en_core_web_sm")

# Tokenization fix for in-word hyphens: with the infix patterns defined below,
# 'non-linear' is kept as one token instead of spaCy's default split into
# 'non', '-', 'linear'.
# https://spacy.io/usage/linguistic-features#native-tokenizer-additions

from spacy.lang.char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER
from spacy.lang.char_classes import CONCAT_QUOTES, LIST_ELLIPSES, LIST_ICONS
from spacy.util import compile_infix_regex

# Modify tokenizer infix patterns
# Rebuild the tokenizer's infix patterns, leaving out the rule that splits on
# hyphens between letters so that hyphenated words stay single tokens.
custom_infix_rules = [
    # operator characters (+ - * ^) after a digit and before a digit or '-'
    r"(?<=[0-9])[+\-\*^](?=[0-9-])",
    # period between a lowercase letter/quote and an uppercase letter/quote
    r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
        al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
    ),
    # comma between two letters
    r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
    # Intentionally disabled: the rule that splits on hyphens between letters.
    # r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
    # one of : < > = / after a letter or digit and before a letter
    r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
]
infixes = LIST_ELLIPSES + LIST_ICONS + custom_infix_rules

# Compile the patterns and install them on the pipeline's tokenizer.
infix_re = compile_infix_regex(infixes)
nlp.tokenizer.infix_finditer = infix_re.finditer

# build one spaCy doc per test document from its title and abstract,
# joined as "<title>. <abstract>"
docs = []
for sample in tqdm(dataset['test']):
    text = ". ".join((sample["title"], sample["abstract"]))
    docs.append(nlp(text))

# populates the references list with stemmed keyphrases: one list of
# normalized gold-standard keyphrases per test document, in dataset order

# build the stemmer once and reuse it, instead of re-creating it for every token
stemmer = Stemmer('porter')

references = []
for sample in tqdm(dataset['test']):
    sample_keyphrases = []
    for keyphrase in sample["keyphrases"]:
        # Only token texts are needed, so run the (customized) tokenizer alone
        # via make_doc rather than the full pipeline.
        tokens = [token.text for token in nlp.make_doc(keyphrase)]
        # normalize tokens: lowercase, then Porter stemming
        stems = [stemmer.stem(tok.lower()) for tok in tokens]
        sample_keyphrases.append(" ".join(stems))
    references.append(sample_keyphrases)

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

### step-2: run the desired models on the dataset and store extracted keyphrases

In [4]:
from pke.unsupervised import *

# Maps each model's class name to a list holding, for every document in `docs`
# (same order), the keyphrases returned by get_n_best(n=5, stemming=True).
outputs = {}
for model in [FirstPhrases, TopicRank, PositionRank]:
    model_outputs = outputs[model.__name__] = []

    # NOTE(review): a single extractor instance is reused for all documents;
    # this assumes load_document() resets per-document state - confirm against pke.
    extractor = model()
    for doc in tqdm(docs):
        extractor.load_document(input=doc, language='en')
        # select candidates matching: zero or more adjectives followed by
        # one or more (proper) nouns
        extractor.grammar_selection(grammar="NP: {<ADJ>*<NOUN|PROPN>+}")
        extractor.candidate_weighting()
        # keep the keyphrase strings only and drop the second tuple element
        model_outputs.append(
            [keyphrase for keyphrase, _ in extractor.get_n_best(n=5, stemming=True)]
        )

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

### step-3: evaluate the performance of each model

In [5]:
import numpy as np

def evaluate(top_N_keyphrases, references):
    """Score one document's extracted keyphrases against its references.

    Matching is exact string matching on the distinct keyphrases of each list.

    Args:
        top_N_keyphrases: list of extracted keyphrases (strings).
        references: list of gold-standard keyphrases (strings).

    Returns:
        A tuple (P, R, F) of precision, recall and F-measure. P is 0.0 when
        no keyphrase was extracted, R is 0.0 when there are no references,
        and F is 0.0 when P + R is 0.
    """
    # number of distinct keyphrases present in both lists (computed once)
    n_matches = len(set(top_N_keyphrases) & set(references))
    # guard the denominators: an empty list would otherwise raise
    # ZeroDivisionError
    P = n_matches / len(top_N_keyphrases) if len(top_N_keyphrases) > 0 else 0.0
    R = n_matches / len(references) if len(references) > 0 else 0.0
    F = (2*P*R)/(P+R) if (P+R) > 0 else 0.0
    return (P, R, F)

# score every model and report its scores averaged over all documents
for model, model_outputs in outputs.items():

    # one (P, R, F) tuple per document; outputs and references share the same order
    scores = [
        evaluate(output, references[i])
        for i, output in enumerate(tqdm(model_outputs))
    ]

    # column-wise mean gives the average P, R and F
    avg_p, avg_r, avg_f = np.mean(scores, axis=0)

    # report the averaged scores of the model
    print("Model: {} P@5: {:.3f} R@5: {:.3f} F@5: {:.3f}".format(model, avg_p, avg_r, avg_f))

  0%|          | 0/500 [00:00<?, ?it/s]

Model: FirstPhrases P@5: 0.339 R@5: 0.207 F@5: 0.242


  0%|          | 0/500 [00:00<?, ?it/s]

Model: TopicRank P@5: 0.347 R@5: 0.209 F@5: 0.246


  0%|          | 0/500 [00:00<?, ?it/s]

Model: PositionRank P@5: 0.388 R@5: 0.241 F@5: 0.279


## Conclusion

Benchmarking keyphrase extraction models with pke is quite easy 😀. You are now ready to apply keyphrase extraction models to other datasets.