In [10]:
# --- standard library -------------------------------------------------------
import random
import string
from collections import defaultdict
from pathlib import Path

# --- third-party (torch is imported first, as in the original cell) ----------
import torch
import spacy
from spacy.tokens import Doc, DocBin
import pandas as pd
import numpy as np

# Report whether a GPU is visible to torch.
print(f'CUDA is available? {torch.cuda.is_available()}')

# Fixed seed so the `random.sample` call further down is repeatable.
random.seed(42)

# Root folder of the project; the serialized docs are read from below it.
project_dir = Path('/home/jovyan/active-projects/keyword-extraction')

# Blank English pipeline: only its vocab is needed to deserialize the docs.
nlp = spacy.blank('en')

# Register the custom Doc attributes used further down (e.g. `doc._.section_url`).
# The guard keeps the cell re-runnable without re-registering an extension.
for extension in ('section_url', 'subsection'):
    if Doc.has_extension(extension):
        continue
    Doc.set_extension(extension, default=None)

# Load every serialized subsection into memory as spaCy Doc objects.
docs = [
    *DocBin()
    .from_disk(project_dir.joinpath('data', 'openstax-subsections.spacy'))
    .get_docs(nlp.vocab)
]

CUDA is available? True


In [96]:
from transformers import (
    AutoModelForTokenClassification,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Text2TextGenerationPipeline,
    TokenClassificationPipeline,
)

from transformers.pipelines import AggregationStrategy

class KeyphraseGenerationPipeline(Text2TextGenerationPipeline):
    """Text-to-text pipeline that returns the generated keyphrases as a list.

    The underlying seq2seq model emits a single string in which keyphrases are
    separated by ``keyphrase_sep_token``; ``postprocess`` splits that string
    and cleans every fragment.

    Args:
        model: Name or path handed to ``from_pretrained`` for both the
            seq2seq model and its tokenizer.
        keyphrase_sep_token: Separator between keyphrases in the generated
            text (default ``';'``).
        *args, **kwargs: Forwarded unchanged to ``Text2TextGenerationPipeline``.
    """

    # Deletes every character in ``string.punctuation``. Built once instead of
    # once per fragment. Note that this also removes hyphens inside a phrase.
    _PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self, model, keyphrase_sep_token=';', *args, **kwargs):
        super().__init__(
            model=AutoModelForSeq2SeqLM.from_pretrained(model),
            tokenizer=AutoTokenizer.from_pretrained(model,
                                                    truncation=True,
                                                    max_length=512,
                                                    model_max_length=512,
                                                    ),
            *args,
            **kwargs
        )
        self.keyphrase_sep_token = keyphrase_sep_token

    def postprocess(self, model_outputs):
        """Turn the raw generation output into a list of clean keyphrases.

        Only the first generated sequence is used. Each fragment has its
        punctuation removed and is then stripped, so fragments that consist
        solely of punctuation and/or whitespace (for example the remainder
        after a trailing separator) are dropped instead of being returned as
        empty strings.

        Returns:
            list[str]: Non-empty keyphrases in generation order; an empty list
            when the parent pipeline produced no result.
        """
        results = super().postprocess(model_outputs=model_outputs)
        if not results:
            # Nothing was generated; previously this raised IndexError.
            return []

        generated_text = results[0].get('generated_text') or ''

        keyphrases = []
        for fragment in generated_text.split(self.keyphrase_sep_token):
            # Remove punctuation first, then strip: stripping first would
            # leave stray whitespace behind leading/trailing punctuation.
            cleaned = fragment.translate(self._PUNCTUATION_TABLE).strip()
            if cleaned:
                keyphrases.append(cleaned)
        return keyphrases


class KeyphraseExtractionPipeline(TokenClassificationPipeline):
    """Token-classification pipeline that returns the distinct extracted phrases.

    Args:
        model: Name or path handed to ``from_pretrained`` for both the
            token-classification model and its tokenizer.
        *args, **kwargs: Forwarded unchanged to ``TokenClassificationPipeline``.
    """

    def __init__(self, model, *args, **kwargs):
        # Load the model first and the tokenizer second, both from `model`.
        classifier = AutoModelForTokenClassification.from_pretrained(model)
        tokenizer = AutoTokenizer.from_pretrained(
            model,
            truncation=True,
            max_length=512,
            model_max_length=512,
        )
        super().__init__(*args, model=classifier, tokenizer=tokenizer, **kwargs)

    def postprocess(self, model_outputs):
        """Aggregate token predictions into phrases and de-duplicate them.

        RoBERTa-type models use the SIMPLE aggregation strategy, every other
        model type uses FIRST.

        Returns:
            numpy.ndarray: The unique, whitespace-stripped phrases as returned
            by ``np.unique`` (i.e. sorted).
        """
        if self.model.config.model_type == 'roberta':
            strategy = AggregationStrategy.SIMPLE
        else:
            strategy = AggregationStrategy.FIRST

        entities = super().postprocess(
            model_outputs=model_outputs,
            aggregation_strategy=strategy,
        )
        phrases = [entity.get('word').strip() for entity in entities]
        return np.unique(phrases)

In [97]:
# Models to compare, grouped by pipeline type. Each key selects the matching
# branch in `keyphrase_pipe`; each value lists the model names/paths loaded for it.
# Every entry also becomes a column name in the results table, so the strings
# must stay exactly as written.
model_dict = {
    # Locally trained spaCy pipeline (loaded with `spacy.load`).
    'spacy': [
        '/home/jovyan/active-projects/keyword-extraction/training/model-best',
    ],
    # Token-classification models (KeyphraseExtractionPipeline).
    # The distilbert variants are currently disabled.
    'extraction': [
        'ml6team/keyphrase-extraction-kbir-inspec',
        # 'ml6team/keyphrase-extraction-distilbert-inspec',
        'ml6team/keyphrase-extraction-kbir-openkp',
        # 'ml6team/keyphrase-extraction-distilbert-openkp',
        'ml6team/keyphrase-extraction-kbir-kptimes',
        # 'ml6team/keyphrase-extraction-distilbert-kptimes',
        'ml6team/keyphrase-extraction-kbir-semeval2017',
        'ml6team/keyphrase-extraction-kbir-kpcrowd',
    ],
    # Seq2seq models (KeyphraseGenerationPipeline).
    'generation': [
        'ml6team/keyphrase-generation-keybart-inspec',
        'ml6team/keyphrase-generation-t5-small-inspec',
        'ml6team/keyphrase-generation-t5-small-openkp',
        'bloomberg/KeyBART',
    ]
}

# Draw 100 distinct docs to evaluate. The draw is repeatable only if this is
# the first use of `random` after `random.seed(42)` in the setup cell.
samples = random.sample(docs, 100)

In [98]:
# Spot-check: show the `section_url` attribute stored on the first sampled doc.
samples[0]._.section_url

'https://openstax.org/books/introduction-anthropology/pages/5-3-the-emergence-of-us-the-archaic-homo'

In [101]:
def keyphrase_pipe(samples, pipe_type, model_name):
    """Run one keyphrase model over every sample and return joined keyphrases.

    Args:
        samples: Iterable of objects exposing a ``.text`` attribute
            (spaCy ``Doc`` objects in this notebook).
        pipe_type: ``'extraction'``, ``'generation'`` or ``'spacy'``; selects
            how ``model_name`` is loaded.
        model_name: Model name or path. Passed to the matching keyphrase
            pipeline class, or to ``spacy.load`` for ``'spacy'``.

    Returns:
        list[str]: One entry per sample, the model's keyphrases joined with
        ``'; '`` (an empty string when the model found none).

    Raises:
        ValueError: If ``pipe_type`` is not one of the supported values.
    """
    if pipe_type == 'extraction':
        pipe = KeyphraseExtractionPipeline(model=model_name,
                                           truncation=True)
    elif pipe_type == 'generation':
        pipe = KeyphraseGenerationPipeline(model=model_name,
                                           truncation=True)
    elif pipe_type == 'spacy':
        # Use a distinct local name so the notebook-level `nlp` is not shadowed.
        spacy_model = spacy.load(model_name)

        def pipe(text):
            # The spaCy model reports keyphrases as entity spans.
            return [entity.text for entity in spacy_model(text).ents]
    else:
        # Fail early with a clear message; previously `pipe` stayed unbound and
        # the error surfaced only on the first sample (or never, for no samples).
        raise ValueError(
            f"Unknown pipe_type {pipe_type!r}; "
            "expected 'extraction', 'generation' or 'spacy'"
        )

    return ['; '.join(pipe(sample.text)) for sample in samples]
    


def compare_models(samples, model_dict):
    """Collect the keyphrases of every configured model for every sample.

    Args:
        samples: Sequence of docs exposing ``.text`` and ``._.section_url``.
        model_dict: Mapping of pipeline type to a list of model names.

    Returns:
        defaultdict(list): Column-oriented results with a ``'text'`` and a
        ``'url'`` column followed by one column per model name, each holding
        the output of ``keyphrase_pipe`` for that model.
    """
    columns = defaultdict(list)
    columns['text'] = [doc.text for doc in samples]
    columns['url'] = [doc._.section_url for doc in samples]

    # Flatten {pipe_type: [model, ...]} into (pipe_type, model) pairs,
    # keeping the dictionary's iteration order.
    model_runs = (
        (pipe_type, model_name)
        for pipe_type, model_names in model_dict.items()
        for model_name in model_names
    )
    for pipe_type, model_name in model_runs:
        # Progress marker: name the model before the (slow) run starts.
        print(f'{model_name}')
        columns[model_name] = keyphrase_pipe(samples, pipe_type, model_name)

    return columns


In [102]:
# Run every model in `model_dict` over the sampled docs. Each model is loaded
# inside `keyphrase_pipe`, so this cell loads all of them one after another.
results = compare_models(samples, model_dict)

/home/jovyan/active-projects/keyword-extraction/training/model-best


Token indices sequence length is longer than the specified maximum sequence length for this model (4235 > 4096). Running this sequence through the model will result in indexing errors


ml6team/keyphrase-extraction-kbir-inspec
ml6team/keyphrase-extraction-kbir-openkp
ml6team/keyphrase-extraction-kbir-kptimes
ml6team/keyphrase-extraction-kbir-semeval2017
ml6team/keyphrase-extraction-kbir-kpcrowd
ml6team/keyphrase-generation-keybart-inspec
ml6team/keyphrase-generation-t5-small-inspec
ml6team/keyphrase-generation-t5-small-openkp
bloomberg/KeyBART


In [104]:
# Persist the comparison table (columns are the keys of `results`) without the
# index. The path is relative to the kernel's working directory.
pd.DataFrame(results).to_csv('../results/10-models-100-subsections.csv', index=False)

In [112]:
# Reshape `results` into one row per model: the dict keys become the
# 'Model Name' column and the values the single 'Keyphrases' column.
# NOTE(review): the output saved with this cell shows one string per model and
# lists distilbert models that are commented out in `model_dict`, so `results`
# here looks like it came from a cell that is no longer in the notebook.
# With the dict returned by `compare_models` (a list per key) a single
# 'Keyphrases' column presumably would not fit - confirm before re-running.
results_df = pd.DataFrame.from_dict(results, columns=['Keyphrases'], orient='index').reset_index(names='Model Name')
results_df

Unnamed: 0,Model Name,Keyphrases
0,/home/jovyan/active-projects/keyword-extractio...,Ethics; ethical issue
1,ml6team/keyphrase-extraction-kbir-inspec,HIV patients; Hurricane Katrina; Latin America...
2,ml6team/keyphrase-extraction-distilbert-inspec,business ethics; drug; ethical behavior; ethic...
3,ml6team/keyphrase-extraction-kbir-openkp,ethical standards
4,ml6team/keyphrase-extraction-distilbert-openkp,personal ethical standards; shkreli
5,ml6team/keyphrase-extraction-kbir-kptimes,Ethics; ethics
6,ml6team/keyphrase-extraction-distilbert-kptimes,ethics
7,ml6team/keyphrase-extraction-kbir-semeval2017,Bribes; Ethics; a set of; bribes.; broke into ...
8,ml6team/keyphrase-extraction-kbir-kpcrowd,CEO; Ethics; HIV patients; Hurricane; New Orle...
9,ml6team/keyphrase-generation-keybart-inspec,personal ethical standards; moral standards; b...


In [113]:
# Save the per-model keyphrase table without the index. The path is relative
# to the kernel's working directory.
results_df.to_csv('../results/keyterms_by_model.csv', index=False)