In [1]:
# Install necessary packages (run these cells if not already installed)
!pip install stanfordcorenlp==3.9.1.1
!pip install torchvision

# Imports
import re
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel, T5Tokenizer
import nltk
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords
from torch.utils.data import Dataset
from stanfordcorenlp import StanfordCoreNLP
from tqdm import tqdm
import json
import os
import sys
import codecs

Collecting stanfordcorenlp==3.9.1.1
  Downloading stanfordcorenlp-3.9.1.1-py2.py3-none-any.whl.metadata (1.3 kB)
Downloading stanfordcorenlp-3.9.1.1-py2.py3-none-any.whl (5.7 kB)
Installing collected packages: stanfordcorenlp
Successfully installed stanfordcorenlp-3.9.1.1


In [2]:
# Check working directory (optional)
!pwd

# Download required NLTK data
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

class MDERank:
    def __init__(self, model_name="bert-base-uncased", pooling="max"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.pooling = pooling

    def compute_embedding(self, text):
        # Tokenize and get model output
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
        with torch.no_grad():
            outputs = self.model(**inputs)
        # Get last hidden states: [batch_size, sequence_length, hidden_size]
        hidden_states = outputs.last_hidden_state[0]  # (seq_len, hidden_size)
        if self.pooling == "max":
            embedding, _ = torch.max(hidden_states, dim=0)
        elif self.pooling == "avg":
            embedding = torch.mean(hidden_states, dim=0)
        else:
            embedding = torch.mean(hidden_states, dim=0)
        return embedding.numpy()

    def extract_candidates(self, text):
        """
        Uses NLTK to tokenize text, assign POS tags, and extract candidate phrases
        based on sequences of adjectives (JJ) or nouns (NN).
        """
        tokens = word_tokenize(text)
        tagged = pos_tag(tokens)
        candidates = []
        candidate = []
        for word, tag in tagged:
            if tag.startswith("JJ") or tag.startswith("NN"):
                candidate.append(word)
            else:
                if candidate:
                    phrase = " ".join(candidate)
                    candidates.append(phrase)
                    candidate = []
        if candidate:
            phrase = " ".join(candidate)
            candidates.append(phrase)
        # Remove duplicates and keep phrases with at least one word
        candidates = list(set([c for c in candidates if len(c.split()) >= 1]))
        return candidates

    def mask_text(self, text, candidate):
        """
        Replaces occurrences of candidate in text with [MASK] tokens.
        """
        candidate_tokens = candidate.split()
        mask_token = " ".join(["[MASK]"] * len(candidate_tokens))
        # Use regex for case-insensitive replacement
        pattern = re.compile(re.escape(candidate), re.IGNORECASE)
        masked_text = pattern.sub(mask_token, text)
        return masked_text

    def cosine_similarity(self, vec1, vec2):
        return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2) + 1e-8)

    def rank_keyphrases(self, text):
        """
        Computes the embedding for the original text and, for each candidate keyphrase,
        computes the embedding for the masked text. The candidate whose masking causes a
        larger drop in cosine similarity (i.e. lower similarity) is considered more important.
        """
        original_embedding = self.compute_embedding(text)
        candidates = self.extract_candidates(text)
        scores = {}
        for candidate in candidates:
            masked_text = self.mask_text(text, candidate)
            masked_embedding = self.compute_embedding(masked_text)
            sim = self.cosine_similarity(original_embedding, masked_embedding)
            scores[candidate] = sim
        # Sort candidates by increasing similarity
        ranked = sorted(scores.items(), key=lambda x: x[1])
        return ranked

/kaggle/working
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
def extract_keyphrases(text, top_k=10):
    """
    Extract keyphrases from the input text. Returns the top_k keyphrases with the lowest similarity scores.
    """
    mde = MDERank()
    ranked = mde.rank_keyphrases(text.lower())
    top_candidates = [phrase for phrase, score in ranked[:top_k]]
    return top_candidates

In [4]:
def clean_labels(labels):
    clean_labels = {}
    for id in labels:
        label = labels[id]
        clean_label = []
        for kp in label:
            if ";" in kp:
                left, right = kp.split(";")
                clean_label.append(left)
                clean_label.append(right)
            else:
                clean_label.append(kp)
        clean_labels[id] = clean_label        
    return clean_labels

def get_long_data(file_path="/kaggle/input/keypharses-extraction-dataset/data/nus/nus_test.json"):
    """Load file.jsonl for long documents."""
    data = {}
    labels = {}
    with codecs.open(file_path, 'r', 'utf-8') as f:
        json_text = f.readlines()
        for i, line in tqdm(enumerate(json_text), desc="Loading Doc ..."):
            try:
                jsonl = json.loads(line)
                keywords = jsonl['keywords'].lower().split(";")
                abstract = jsonl['abstract']
                fulltxt = jsonl['fulltext']
                doc = ' '.join([abstract, fulltxt])
                doc = re.sub('\. ', ' . ', doc)
                doc = re.sub(', ', ' , ', doc)
                doc = doc.replace('\n', ' ')
                data[jsonl['name']] = doc
                labels[jsonl['name']] = keywords
            except Exception as e:
                raise ValueError(f"Error processing line {i}: {e}")
    labels = clean_labels(labels)
    return data, labels

def get_duc2001_data(file_path="/kaggle/input/keypharses-extraction-dataset/data/DUC2001"):
    pattern = re.compile(r'<TEXT>(.*?)</TEXT>', re.S)
    data = {}
    labels = {}
    for dirname, dirnames, filenames in os.walk(file_path):
        for fname in filenames:
            if fname == "annotations.txt":
                infile = os.path.join(dirname, fname)
                with open(infile, 'rb') as f:
                    text = f.read().decode('utf8')
                    lines = text.splitlines()
                    for line in lines:
                        left, right = line.split("@")
                        d = right.split(";")[:-1]
                        labels[left] = d
            else:
                infile = os.path.join(dirname, fname)
                with open(infile, 'rb') as f:
                    text = f.read().decode('utf8')
                    text = re.findall(pattern, text)[0]
                    data[fname] = text
    labels = clean_labels(labels)
    return data, labels

def get_inspec_data(file_path="/kaggle/input/keypharses-extraction-dataset/data/Inspec"):
    data = {}
    labels = {}
    for dirname, dirnames, filenames in os.walk(file_path):
        for fname in filenames:
            left, right = fname.split('.')
            if right == "abstr":
                infile = os.path.join(dirname, fname)
                with open(infile, 'r', encoding='utf8') as f:
                    text = f.read()
                    text = text.replace("%", '')
                    data[left] = text
            if right == "uncontr":
                infile = os.path.join(dirname, fname)
                with open(infile, 'r', encoding='utf8') as f:
                    text = f.read()
                    text = text.replace("\n\t", ' ').replace("\n", ' ')
                    label = text.split("; ")
                    labels[left] = label
    labels = clean_labels(labels)
    return data, labels

def get_semeval2017_data(data_path="/kaggle/input/keypharses-extraction-dataset/data/SemEval2017/docsutf8",
                         labels_path="/kaggle/input/keypharses-extraction-dataset/data/SemEval2017/keys"):
    data = {}
    labels = {}
    for dirname, dirnames, filenames in os.walk(data_path):
        for fname in filenames:
            left, right = fname.split('.')
            infile = os.path.join(dirname, fname)
            with codecs.open(infile, "r", "utf-8") as fi:
                text = fi.read()
                text = text.replace("%", '')
            data[left] = text.lower()
    for dirname, dirnames, filenames in os.walk(labels_path):
        for fname in filenames:
            left, right = fname.split('.')
            infile = os.path.join(dirname, fname)
            with open(infile, 'rb') as f:
                text = f.read().decode('utf8').strip()
                ls = text.splitlines()
                labels[left] = ls
    labels = clean_labels(labels)
    return data, labels

def get_short_data(file_path="/kaggle/input/keypharses-extraction-dataset/data/krapivin/kravipin_test.json"):
    """Load file.jsonl for short documents."""
    data = {}
    labels = {}
    with codecs.open(file_path, 'r', 'utf-8') as f:
        json_text = f.readlines()
        for i, line in tqdm(enumerate(json_text), desc="Loading Doc ..."):
            try:
                jsonl = json.loads(line)
                keywords = jsonl['keywords'].lower().split(";")
                abstract = jsonl['abstract']
                doc = abstract
                doc = re.sub('\. ', ' . ', doc)
                doc = re.sub(', ', ' , ', doc)
                doc = doc.replace('\n', ' ').replace('\t', ' ')
                data[i] = doc
                labels[i] = keywords
            except Exception as e:
                raise ValueError(f"Error processing line {i}: {e}")
    labels = clean_labels(labels)
    return data, labels

def get_krapivin_data(file_path="/kaggle/input/keypharses-extraction-dataset/data/krapivin/krapivin_test.json"):
    return get_short_data(file_path)

def get_nus_data(file_path="/kaggle/input/keypharses-extraction-dataset/data/nus/nus_test.json"):
    return get_long_data(file_path)

def get_semeval2010_data(file_path="/kaggle/input/keypharses-extraction-dataset/data/SemEval2010/semeval_test.json"):
    return get_short_data(file_path)

def get_dataset_data(dataset_name):
    if dataset_name == "duc2001":
        return get_duc2001_data()
    elif dataset_name == "inspec":
        return get_inspec_data()
    elif dataset_name == "krapivin":
        return get_krapivin_data()
    elif dataset_name == "nus":
        return get_nus_data()
    elif dataset_name == "semeval2010":
        return get_semeval2010_data()
    elif dataset_name == "sameval2017":
        return get_semeval2017_data()
    else:
        raise ValueError("Dataset name not recognized.")

In [5]:

def calculate_f1(predicted, ground_truth, k) -> float:
    """
    Calculate F1@K.
    
    Parameters:
      predicted (list): List of predicted keyphrases.
      ground_truth (list): List of ground truth keyphrases.
      k (int): The cutoff for evaluation.
    
    Returns:
      float: F1 score (scaled to percentage if desired).
    """
    predicted_top_k = predicted[:k]
    common = set(predicted_top_k) & set(ground_truth)
    precision = len(common) * 1.0 / k if k > 0 else 0
    recall = len(common) * 1.0 / len(ground_truth) if ground_truth else 0
    f1 = 0
    if precision + recall > 0:
       f1 = 200.0 * precision * recall / (precision + recall)  # Multiply by 100*2 to get percentage
    return f1

def print_to_json(data_name, k, score):
    """
    Print the evaluation results to a JSON file.
    """
    average_score = sum(score) / len(score) if score else 0
    result = {
        "dataset": data_name,
        "top_k": k,
        "average_score": average_score,
    }
    # Ensure the output directory exists; here, we create a folder "working" inside the current directory.
    os.makedirs("/kaggle/working/mderank", exist_ok=True)
    print("---> f1@k_score: ", result)
    with open(f"/kaggle/working/mderank/{data_name}_{k}.json", "w") as outfile:
        json.dump(result, outfile)

In [6]:
# Main execution block
if __name__ == "__main__":
    # List the datasets you want to process. 
    dataset = [ 'semeval2010']
    
    for data_name in dataset:
        print(data_name)
        data, labels = get_dataset_data(data_name) 
        score5 = []
        score10 = []
        score15 = []
        for id in data:
            keyphrases = extract_keyphrases(data[id], top_k=15)
            
            f1_5 = calculate_f1(keyphrases, labels[id], 5)
            f1_10 = calculate_f1(keyphrases, labels[id], 10)
            f1_15 = calculate_f1(keyphrases, labels[id], 15)
            score5.append(f1_5)
            score10.append(f1_10)
            score15.append(f1_15)
            
            print("**** ", id, " ---> ", f1_5, ' ', f1_10, ' ', f1_15)
            if (f1_5 == 0): 
                print(data[id])
                print("++++++ label: ", labels[id])
                print("++++++ model:", keyphrases)
        print_to_json(data_name, 5, score5)
        print_to_json(data_name, 10, score10)
        print_to_json(data_name, 15, score15)

semeval2010


Loading Doc ...: 100it [00:00, 8102.59it/s]


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

****  0  --->  8.333333333333332   20.689655172413794   17.647058823529413
****  1  --->  21.052631578947363   25.0   34.48275862068965
****  2  --->  28.571428571428573   23.076923076923077   19.35483870967742
****  3  --->  10.526315789473681   16.666666666666664   13.793103448275861
****  4  --->  0   0   0
In this paper we compare two approaches to the design of protocol frameworks - tools for implementing modular network protocols . The most common approach uses events as the main abstraction for a local interaction between protocol modules . We argue that an alternative approach , that is based on service abstraction , is more suitable for expressing modular protocols . It also facilitates advanced features in the design of protocols , such as dynamic update of distributed protocols . We then describe an experimental implementation of a service-based protocol framework in Java.
++++++ label:  ['communication', 'request', 'modularity', 'dynamic protocol replacement', 'service inte