1. Download `Krapivin2009` dataset with `git clone https://github.com/INESCTEC/KeywordExtractor-Datasets.git`.

2. Load the dataset.

In [10]:
from pathlib import Path

DATASET_DIR = Path("Krapivin2009")

def load_krapivin(dataset_dir: Path, max_docs=None):
    """
    Load Krapivin2009 documents.
    Returns a list of dicts {id, text, gold}.
    max_docs allows loading only a subset (for testing).
    """
    docs = []
    text_files = sorted(dataset_dir.rglob("*.txt"))

    for i, text_path in enumerate(text_files):
        if max_docs is not None and i >= max_docs:
            break

        doc_id = text_path.stem

        # Look for the keywords file with the same name
        key_path = None
        for ext in (".key", ".uncontr", ".keywords"):
            candidate = text_path.with_suffix(ext)
            if candidate.exists():
                key_path = candidate
                break

        # If not found nearby, search elsewhere in the folder
        if key_path is None:
            matches = list(dataset_dir.rglob(doc_id + ".key"))
            if matches:
                key_path = matches[0]

        if key_path is None:
            # No gold for this doc, we ignore it
            continue

        text = text_path.read_text(encoding="utf-8", errors="ignore")
        gold = [
            line.strip()
            for line in key_path.read_text(encoding="utf-8", errors="ignore").splitlines()
            if line.strip()
        ]

        docs.append({"id": doc_id, "text": text, "gold": gold})

    return docs

# Load the dataset (adjust max_docs to go faster)
# By default, there are 2304 documents
docs = load_krapivin(DATASET_DIR, max_docs=30)
len(docs)
docs[0]

{'id': '1005058',
 'text': '--T\nEnhancing Product Recommender Systems on Sparse Binary Data.\n--A\nCommercial recommender systems use various data mining techniques to make appropriate recommendations to users during online, real-time sessions. Published algorithms focus more on the discrete user ratings instead of binary results, which hampers their predictive capabilities when usage data is sparse. The system proposed in this paper, e-VZpro, is an association mining-based recommender tool designed to overcome these problems through a two-phase approach. In the first phase, batches of customer historical data are analyzed through association mining in order to determine the association rules for the second phase. During the second phase, a scoring algorithm is used to rank the recommendations online for the customer. The second phase differs from the traditional approach and an empirical comparison between the methods used in e-VZpro and other collaborative filtering methods includin

3. Apply 3 keyphrase extraction algorithms for each document in the dataset.
We chose the following algorithms:
- TfIdf (statistical method)
- TextRank (graph-based + PageRank)
- KeyBERT

In [11]:
import time
import string

import nltk
from nltk.corpus import stopwords
import pke

# Make sure English stopwords are available
try:
    stopwords.words("english")
except LookupError:
    nltk.download("stopwords")

STOPLIST = stopwords.words("english") + list(string.punctuation)
POS_TAGS = {"NOUN", "PROPN", "ADJ"}


def extract_kp(model_cls, text, n=10):
    """
    Apply a pke model (TfIdf, TextRank...)
    on a text and return the n best keyphrases.
    """
    extractor = model_cls()

    # Special case for TfIdf
    if model_cls is pke.unsupervised.TfIdf:
        extractor.load_document(
            input=text,
            language="en",
            stoplist=STOPLIST,
            normalization=None,
        )
        extractor.candidate_selection(n=3)
        extractor.candidate_weighting()

    # TextRank (word graph)
    elif model_cls is pke.unsupervised.TextRank:
        extractor.load_document(
            input=text,
            language="en",
            normalization=None,
        )
        extractor.candidate_selection(pos=POS_TAGS)
        extractor.candidate_weighting()

    # Generic fallback for other pke models
    else:
        extractor.load_document(
            input=text,
            language="en",
            normalization=None,
        )
        extractor.candidate_selection()
        extractor.candidate_weighting()

    return [kp for kp, score in extractor.get_n_best(n=n)]


def run_model_on_corpus(model_cls, field_name, n=10):
    start = time.time()
    for doc in docs:
        doc[field_name] = extract_kp(model_cls, doc["text"], n=n)
    duration = time.time() - start
    print(
        f"{field_name}: {len(docs)} docs processed, "
        f"{duration/len(docs):.3f} s/doc on average"
    )


# Run the 2 algorithms (top-10 keyphrases)
# run_model_on_corpus(pke.unsupervised.TfIdf,    "pred_tfidf",    n=10)
# run_model_on_corpus(pke.unsupervised.TextRank, "pred_textrank", n=10)

In [12]:
from keybert import KeyBERT

kw_model = KeyBERT(model="all-MiniLM-L6-v2")  # lightweight model

def extract_kp_keybert(text, n=10):
    kps = kw_model.extract_keywords(
        text,
        keyphrase_ngram_range=(1, 3),
        stop_words="english",
        top_n=n
    )
    return [kp for kp, score in kps]

# for doc in docs:
#     doc["pred_keybert"] = extract_kp_keybert(doc["text"], n=10)

In [13]:
from concurrent.futures import ThreadPoolExecutor, as_completed
import os
import time
import numpy as np

def process_doc_all_methods(doc, n=10):
    """
    Applique les trois méthodes (TfIdf, TextRank, KeyBERT)
    sur un document et renvoie les listes de keyphrases.
    """
    text = doc["text"]
    timings = {}

    # TF-IDF
    start_tfidf = time.time()
    pred_tfidf = extract_kp(pke.unsupervised.TfIdf, text, n=n)
    timings["tfidf"] = time.time() - start_tfidf

    # TextRank
    start_textrank = time.time()
    pred_textrank = extract_kp(pke.unsupervised.TextRank, text, n=n)
    timings["textrank"] = time.time() - start_textrank

    # KeyBERT
    start_keybert = time.time()
    pred_keybert = extract_kp_keybert(text, n=n)
    timings["keybert"] = time.time() - start_keybert

    return doc["id"], pred_tfidf, pred_textrank, pred_keybert, timings


def run_all_methods_in_parallel(docs, n=10, max_workers=None):
    """
    Lance le traitement de tous les documents en parallèle,
    en utilisant jusqu'à max_workers threads CPU.
    Pour chaque doc, on calcule pred_tfidf, pred_textrank, pred_keybert.
    """
    if max_workers is None:
        max_workers = os.cpu_count() or 4

    start = time.time()

    # Pour retrouver les docs facilement par id
    id2doc = {doc["id"]: doc for doc in docs}

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(process_doc_all_methods, doc, n)
            for doc in docs
        ]

        all_timings = []

        for fut in as_completed(futures):
            doc_id, pred_tfidf, pred_textrank, pred_keybert, timings = fut.result()
            d = id2doc[doc_id]
            d["pred_tfidf"]    = pred_tfidf
            d["pred_textrank"] = pred_textrank
            d["pred_keybert"]  = pred_keybert
            all_timings.append(timings)

    duration = time.time() - start
    print(
        f"{len(docs)} docs processed in parallel "
        f"({duration/len(docs):.3f} s/doc on average, "
        f"{max_workers} workers)"
    )
    tfidf_times = [t["tfidf"] for t in all_timings]
    textrank_times = [t["textrank"] for t in all_timings]
    keybert_times = [t["keybert"] for t in all_timings]

    print(f"TF-IDF: {np.mean(tfidf_times):.4f} s/doc")
    print(f"TextRank: {np.mean(textrank_times):.4f} s/doc")
    print(f"KeyBERT: {np.mean(keybert_times):.4f} s/doc")


# Lancer le traitement parallèle (top-10 keyphrases)
run_all_methods_in_parallel(docs, n=10, max_workers=28)




29 docs processed in parallel (8.492 s/doc on average, 28 workers)
TF-IDF: 101.4966 s/doc
TextRank: 30.2904 s/doc
KeyBERT: 93.5499 s/doc


In [14]:
import json
from pathlib import Path

# Dossier et nom du fichier de sortie
OUTPUT_PATH = Path("krapivin_kpe_results.json")

export_data = []

for doc in docs:
    export_data.append(
        {
            "id": doc["id"],
            # mots-clés de référence (auteur)
            "gold": doc.get("gold", []),
            # keyphrases prédites par chaque méthode
            "pred_tfidf": doc.get("pred_tfidf", []),
            "pred_textrank": doc.get("pred_textrank", []),
            "pred_keybert": doc.get("pred_keybert", []),
        }
    )

with OUTPUT_PATH.open("w", encoding="utf-8") as f:
    json.dump(export_data, f, ensure_ascii=False, indent=2)

print(f"Exporté {len(export_data)} documents dans {OUTPUT_PATH.resolve()}")

# Afficher un exemple pour vérifier
export_data[0]


Exporté 29 documents dans /home/nathan/Documents/GitHub/KeyPhraseExtraction/krapivin_kpe_results.json


{'id': '1005058',
 'gold': ['collaborative filtering',
  'customer relationship management',
  'e-commerce',
  'recommender systems',
  'dependency networks',
  'association mining'],
 'pred_tfidf': ['recommender',
  'customer',
  'systems',
  'association',
  'used',
  'e-vzpro',
  'recommender systems',
  'mining',
  'rules',
  'dependency'],
 'pred_textrank': ['enhancing product recommender systems',
  'historical customer data',
  'customer historical data',
  'customer purchases product a',
  'customer profile data',
  'sample customer data',
  'customer purchase data',
  'other item-based recommender systems',
  'method accuracy time accuracy time accuracy time',
  'other model-based recommender systems'],
 'pred_keybert': ['association mining recommender',
  'mining based recommender',
  'mining recommender',
  'recommender frequent items',
  'collaborative filtering technique',
  'items products recommender',
  'collaborative filtering algorithm',
  'recommender tool designed',

4. Evaluate with ROUGE and BERTScore

In [15]:
refs = []
cands_tfidf = []
cands_textrank = []
cands_keybert = []

for doc in docs:
    refs.append(", ".join(doc["gold"]))
    cands_tfidf.append(", ".join(doc["pred_tfidf"]))
    cands_textrank.append(", ".join(doc["pred_textrank"]))
    cands_keybert.append(", ".join(doc["pred_keybert"]))

In [16]:
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(["rouge1", "rougeL"], use_stemmer=True)

def eval_model(cands):
    scores = []
    for ref, cand in zip(refs, cands):
        scores.append(scorer.score(ref, cand)["rougeL"].fmeasure)
    return sum(scores) / len(scores)

rouge_tfidf    = eval_model(cands_tfidf)
rouge_textrank = eval_model(cands_textrank)
rouge_keybert  = eval_model(cands_keybert)

print("Average ROUGE-L")
print(f"  TfIdf   : {rouge_tfidf:.4f}")
print(f"  TextRank: {rouge_textrank:.4f}")
print(f"  KeyBERT : {rouge_keybert:.4f}")

Average ROUGE-L
  TfIdf   : 0.2131
  TextRank: 0.1144
  KeyBERT : 0.1681


In [17]:
from bert_score import score as bert_score

def avg_bertscore(cands, refs):
    P, R, F1 = bert_score(cands, refs, lang="en")
    return float(F1.mean())

bert_tfidf    = avg_bertscore(cands_tfidf, refs)
bert_textrank = avg_bertscore(cands_textrank, refs)
bert_keybert  = avg_bertscore(cands_keybert, refs)

print("Average F1 BERTScore")
print(f"  TfIdf   : {bert_tfidf:.4f}")
print(f"  TextRank: {bert_textrank:.4f}")
print(f"  KeyBERT : {bert_keybert:.4f}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Average F1 BERTScore
  TfIdf   : 0.8643
  TextRank: 0.8293
  KeyBERT : 0.8430
