In [48]:
import torch
print(f'CUDA is available? {torch.cuda.is_available()}')

import random
random.seed(42)
import string
from pathlib import Path
project_dir = Path('/home/jovyan/active-projects/keyword-extraction')

import spacy
from spacy.tokens import Doc, DocBin
nlp = spacy.blank('en')

for extension in ['section_url', 'subsection']:
    if not Doc.has_extension(extension):
        Doc.set_extension(extension, default=None)

import numpy as np


db = DocBin().from_disk(project_dir / 'data' / 'openstax-subsections.spacy')

docs = list(db.get_docs(nlp.vocab))

CUDA is available? True


In [72]:
from transformers import (
    AutoModelForTokenClassification,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Text2TextGenerationPipeline,
    TokenClassificationPipeline,
)

from transformers.pipelines import AggregationStrategy

class KeyphraseGenerationPipeline(Text2TextGenerationPipeline):
    def __init__(self, model, keyphrase_sep_token=';', *args, **kwargs):
        super().__init__(
            model=AutoModelForSeq2SeqLM.from_pretrained(model),
            tokenizer=AutoTokenizer.from_pretrained(model,
                                                    truncation=True,
                                                    max_length=256,
                                                    ),
            *args,
            **kwargs
        )
        self.keyphrase_sep_token = keyphrase_sep_token

    def postprocess(self, model_outputs):
        results = super().postprocess(model_outputs=model_outputs)
        return [
            [
                keyphrase.strip().translate(str.maketrans('', '', string.punctuation))
                for keyphrase in result.get('generated_text').split(
                    self.keyphrase_sep_token
                )
                if keyphrase.translate(str.maketrans('', '', string.punctuation)) != ''
            ]
            for result in results
        ][0]


class KeyphraseExtractionPipeline(TokenClassificationPipeline):
    def __init__(self, model, *args, **kwargs):
        super().__init__(
            model=AutoModelForTokenClassification.from_pretrained(model),
            tokenizer=AutoTokenizer.from_pretrained(model,
                                                    truncation=True,
                                                    ),
            *args,
            **{'model_max_length': 510}
        )

    def postprocess(self, model_outputs):
        results = super().postprocess(
            model_outputs=model_outputs,
            aggregation_strategy=AggregationStrategy.SIMPLE
            if self.model.config.model_type == 'roberta'
            else AggregationStrategy.FIRST,
        )
        return np.unique([result.get('word').strip() for result in results])

SyntaxError: positional argument follows keyword argument (3723264391.py, line 45)

In [70]:
model_dict = {
  'extraction': [
    'ml6team/keyphrase-extraction-kbir-inspec',
    'ml6team/keyphrase-extraction-distilbert-inspec',
    # 'ml6team/keyphrase-extraction-kbir-openkp',
    'ml6team/keyphrase-extraction-distilbert-openkp',
    'ml6team/keyphrase-extraction-kbir-kptimes',
    'ml6team/keyphrase-extraction-distilbert-kptimes',
    'ml6team/keyphrase-extraction-kbir-semeval2017',
    'ml6team/keyphrase-extraction-kbir-kpcrowd',
    ],
  'generation': [
    'ml6team/keyphrase-generation-keybart-inspec',
    'ml6team/keyphrase-generation-t5-small-inspec',
    'ml6team/keyphrase-generation-t5-small-openkp',
    'bloomberg/KeyBART',
  ]
}

samples = random.sample(docs, 10)

In [71]:
for type, models in model_dict.items():
    for model_name in models:
        if type == 'extraction':
            pipe = KeyphraseExtractionPipeline(model=model_name)
        elif type == 'generation':
            pipe = KeyphraseGenerationPipeline(model=model_name,
                                               truncation=True)
        for sample in samples:
            print(f'{model_name} - {sample._.section_url} - {sample._.subsection}')
            print(pipe(sample.text))
            print()

ml6team/keyphrase-extraction-kbir-inspec - https://openstax.org/books/college-algebra-corequisite-support-2e/pages/7-7-solving-systems-with-inverses - Finding the Multiplicative Inverse of 3×3 Matrices
['Multiplicative Inverse' 'Solution Augment' 'elementary row operations'
 'identity matrix' 'inverse' 'inverse matrix' 'matrix multiplication'
 'row operations']

ml6team/keyphrase-extraction-kbir-inspec - https://openstax.org/books/college-algebra-corequisite-support-2e/pages/5-5-zeros-of-polynomial-functions - Solving Real-World Applications 
['Polynomial Equations' 'Rational Zero Theorem' 'bakery problem' 'cake'
 'polynomial equations' 'sheet cake' 'sheet cakes' 'wedding celebrations']

ml6team/keyphrase-extraction-kbir-inspec - https://openstax.org/books/chemistry-2e/pages/12-5-collision-theory - Learning Objectives
['Arrhenius equation' 'Carbon monoxide' 'Collision theory'
 'activation energy' 'automobiles' 'carbon monoxide'
 'catalytic converters' 'chemical bonds' 'chemical kinetic

Downloading (…)lve/main/config.json:   0%|          | 0.00/697 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/266M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

ml6team/keyphrase-extraction-distilbert-openkp - https://openstax.org/books/college-algebra-corequisite-support-2e/pages/7-7-solving-systems-with-inverses - Finding the Multiplicative Inverse of 3×3 Matrices
['inverse']

ml6team/keyphrase-extraction-distilbert-openkp - https://openstax.org/books/college-algebra-corequisite-support-2e/pages/5-5-zeros-of-polynomial-functions - Solving Real-World Applications 
[]

ml6team/keyphrase-extraction-distilbert-openkp - https://openstax.org/books/chemistry-2e/pages/12-5-collision-theory - Learning Objectives
['collision theory']

ml6team/keyphrase-extraction-distilbert-openkp - https://openstax.org/books/college-algebra-corequisite-support-2e/pages/6-4-graphs-of-logarithmic-functions - Graphing a Horizontal Shift of 
['graphing' 'horizontal shift']

ml6team/keyphrase-extraction-distilbert-openkp - https://openstax.org/books/biology-ap-courses/pages/35-4-aquatic-biomes - Estuaries: Where the Ocean Meets Fresh Water
['estuaries']

ml6team/keyphrase

Downloading (…)lve/main/config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


ml6team/keyphrase-extraction-kbir-kptimes - https://openstax.org/books/college-algebra-corequisite-support-2e/pages/7-7-solving-systems-with-inverses - Finding the Multiplicative Inverse of 3×3 Matrices


RuntimeError: The expanded size of the tensor (1778) must match the existing size (514) at non-singleton dimension 1.  Target sizes: [1, 1778].  Tensor sizes: [1, 514]