In [10]:
# --- Imports: standard library first, then third-party ---------------------
import random
import string
from collections import defaultdict
from pathlib import Path

import torch
import spacy
from spacy.tokens import Doc, DocBin
import pandas as pd
import numpy as np

# --- Environment check and reproducibility ---------------------------------
print(f'CUDA is available? {torch.cuda.is_available()}')
random.seed(42)

# --- Project location and the blank English pipeline -----------------------
# The blank pipeline's vocab is what the serialized docs are loaded against.
project_dir = Path('/home/jovyan/active-projects/keyword-extraction')
nlp = spacy.blank('en')

# Register the custom Doc extension attributes, skipping any name that is
# already registered so this cell can be executed more than once.
for extension in ('section_url', 'subsection'):
    if Doc.has_extension(extension):
        continue
    Doc.set_extension(extension, default=None)

# --- Load the serialized subsections into a list of Doc objects ------------
docs = list(
    DocBin()
    .from_disk(project_dir / 'data' / 'openstax-subsections.spacy')
    .get_docs(nlp.vocab)
)

CUDA is available? True


In [21]:
from transformers import (
    AutoModelForTokenClassification,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Text2TextGenerationPipeline,
    TokenClassificationPipeline,
)

from transformers.pipelines import AggregationStrategy

class KeyphraseGenerationPipeline(Text2TextGenerationPipeline):
    """Text-to-text pipeline whose output is a flat list of keyphrase strings.

    The generated text is split on ``keyphrase_sep_token``; every fragment has
    ASCII punctuation removed and surrounding whitespace trimmed, and
    fragments that end up empty are discarded.
    """

    # Deletes every character in ``string.punctuation``. Built once here
    # instead of twice per keyphrase inside ``postprocess``.
    _PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self, model, keyphrase_sep_token=';', *args, **kwargs):
        """Load the seq2seq model and tokenizer identified by ``model``.

        Args:
            model: Name or path handed to both ``from_pretrained`` calls.
            keyphrase_sep_token: Separator used to split the generated text
                into individual keyphrases.
            *args, **kwargs: Forwarded unchanged to the parent pipeline.
        """
        super().__init__(
            model=AutoModelForSeq2SeqLM.from_pretrained(model),
            # NOTE(review): `truncation` / `max_length` are given to
            # `from_pretrained` rather than to the tokenizer call. A later
            # cell's output logs "Asking to truncate to max_length but no
            # maximum length is provided", so confirm that long inputs are
            # really being truncated.
            tokenizer=AutoTokenizer.from_pretrained(model,
                                                    truncation=True,
                                                    max_length=510,
                                                    ),
            *args,
            **kwargs
        )
        self.keyphrase_sep_token = keyphrase_sep_token

    def postprocess(self, model_outputs):
        """Return the cleaned keyphrases of the first generated sequence.

        Returns:
            A list of non-empty keyphrase strings; an empty list when the
            parent pipeline produced no result.
        """
        results = super().postprocess(model_outputs=model_outputs)
        if not results:
            # Previously `[...][0]` raised IndexError here.
            return []

        generated_text = results[0].get('generated_text') or ''
        keyphrases = []
        for fragment in generated_text.split(self.keyphrase_sep_token):
            # Remove punctuation first and strip afterwards: this trims the
            # whitespace that punctuation removal exposes (e.g. "- foo") and
            # lets whitespace-only fragments be recognised as empty.
            cleaned = fragment.translate(self._PUNCTUATION_TABLE).strip()
            if cleaned:
                keyphrases.append(cleaned)
        return keyphrases


class KeyphraseExtractionPipeline(TokenClassificationPipeline):
    """Token-classification pipeline that yields the unique extracted phrases."""

    def __init__(self, model, *args, **kwargs):
        """Load the model and tokenizer named by ``model`` and build the pipeline.

        Extra positional and keyword arguments are forwarded to the parent
        pipeline unchanged.
        """
        classifier = AutoModelForTokenClassification.from_pretrained(model)
        tokenizer = AutoTokenizer.from_pretrained(
            model,
            truncation=True,
            max_length=510,
            model_max_length=512,
        )
        super().__init__(model=classifier, tokenizer=tokenizer, *args, **kwargs)

    def postprocess(self, model_outputs):
        """Aggregate token predictions and return the distinct phrases.

        The aggregation strategy is SIMPLE when the model's config reports
        model type 'roberta' and FIRST for every other model type. The result
        is the array returned by ``np.unique`` over the stripped ``word``
        field of each aggregated entity.
        """
        if self.model.config.model_type == 'roberta':
            strategy = AggregationStrategy.SIMPLE
        else:
            strategy = AggregationStrategy.FIRST

        entities = super().postprocess(
            model_outputs=model_outputs,
            aggregation_strategy=strategy,
        )
        words = [entity.get('word').strip() for entity in entities]
        return np.unique(words)

In [22]:
# Candidate models grouped by the kind of pipeline needed to run them. The
# keys are the `pipe_type` values that `keyphrase_pipe` dispatches on.
model_dict = {
    'spacy': [
        # Locally trained spaCy pipeline; built from `project_dir` so the
        # project root is spelled out in one place only.
        str(project_dir / 'training' / 'model-best'),
    ],
    'extraction': [
        'ml6team/keyphrase-extraction-kbir-inspec',
        'ml6team/keyphrase-extraction-distilbert-inspec',
        'ml6team/keyphrase-extraction-kbir-openkp',
        'ml6team/keyphrase-extraction-distilbert-openkp',
        'ml6team/keyphrase-extraction-kbir-kptimes',
        'ml6team/keyphrase-extraction-distilbert-kptimes',
        'ml6team/keyphrase-extraction-kbir-semeval2017',
        'ml6team/keyphrase-extraction-kbir-kpcrowd',
    ],
    'generation': [
        'ml6team/keyphrase-generation-keybart-inspec',
        'ml6team/keyphrase-generation-t5-small-inspec',
        'ml6team/keyphrase-generation-t5-small-openkp',
        'bloomberg/KeyBART',
    ]
}

# Number of docs to evaluate; capped at the number of loaded docs so that
# `sample` does not raise ValueError on a small corpus.
SAMPLE_SIZE = 100

# Draw from a dedicated generator seeded right here, so re-running this cell
# always selects the same docs instead of depending on how much of the global
# `random` state has been consumed since it was seeded in the setup cell.
# With the same seed this should match the draw a fresh kernel makes from the
# global generator, provided nothing else used it in between.
samples = random.Random(42).sample(docs, min(SAMPLE_SIZE, len(docs)))

In [23]:
# Display the `section_url` extension value of the first sampled doc.
samples[0]._.section_url

'https://openstax.org/books/organizational-behavior/pages/2-3-personality-an-introduction'

In [28]:
def keyphrase_pipe(samples, pipe_type, model_name):
    """Run one keyphrase model over every sample.

    Args:
        samples: Iterable of objects exposing a ``.text`` string.
        pipe_type: One of ``'extraction'``, ``'generation'`` or ``'spacy'``;
            selects how ``model_name`` is loaded and applied.
        model_name: Model identifier or path passed to the selected loader.

    Returns:
        A list with one string per sample, holding that sample's keyphrases
        joined by ``'; '``.

    Raises:
        ValueError: If ``pipe_type`` is not one of the supported values.
    """
    if pipe_type == 'extraction':
        pipe = KeyphraseExtractionPipeline(model=model_name)
    elif pipe_type == 'generation':
        pipe = KeyphraseGenerationPipeline(model=model_name)
    elif pipe_type == 'spacy':
        spacy_model = spacy.load(model_name)

        def pipe(text):
            # Entity spans of the loaded spaCy pipeline serve as keyphrases.
            return [ent.text for ent in spacy_model(text).ents]
    else:
        # Without this branch `pipe` stayed unbound and the failure showed up
        # later as an unrelated-looking error.
        raise ValueError(
            f"Unknown pipe_type {pipe_type!r}; "
            "expected 'extraction', 'generation' or 'spacy'"
        )

    # Imported here because no cell imports `tqdm`; the function otherwise
    # only works when the name happens to be left over in the kernel.
    from tqdm.auto import tqdm

    return ['; '.join(pipe(sample.text)) for sample in tqdm(samples)]
    


def compare_models(samples, model_dict):
    """Build a column-oriented mapping comparing every model on ``samples``.

    Args:
        samples: Sequence of docs exposing ``.text`` and ``._.section_url``.
        model_dict: Mapping of pipe type to a list of model names.

    Returns:
        A ``defaultdict(list)`` holding the ``'text'`` and ``'url'`` columns
        followed by one column per model name, each filled with the list
        returned by ``keyphrase_pipe`` for that model.
    """
    columns = defaultdict(list)
    columns.update(
        text=[doc.text for doc in samples],
        url=[doc._.section_url for doc in samples],
    )

    # Flatten {pipe_type: [names]} into (pipe_type, name) pairs, keeping the
    # mapping's iteration order.
    jobs = [
        (kind, name)
        for kind, names in model_dict.items()
        for name in names
    ]
    for kind, name in jobs:
        print(f'{name}')
        columns[name] = keyphrase_pipe(samples, kind, name)

    return columns


In [29]:
# Run every model listed in `model_dict` over the sampled docs. `results`
# maps 'text', 'url' and each model name to a list with one entry per sample.
results = compare_models(samples, model_dict)

/home/jovyan/active-projects/keyword-extraction/training/model-best
ml6team/keyphrase-extraction-kbir-inspec
ml6team/keyphrase-extraction-distilbert-inspec


TypeError: 'NoneType' object is not iterable

In [26]:
# Tabulate the comparison: the `text` and `url` columns plus one column per model.
pd.DataFrame(results)

Unnamed: 0,text,url,/home/jovyan/active-projects/keyword-extraction/training/model-best,ml6team/keyphrase-extraction-kbir-inspec,ml6team/keyphrase-extraction-distilbert-inspec,ml6team/keyphrase-extraction-kbir-openkp,ml6team/keyphrase-extraction-distilbert-openkp,ml6team/keyphrase-extraction-kbir-kptimes,ml6team/keyphrase-extraction-distilbert-kptimes,ml6team/keyphrase-extraction-kbir-semeval2017,ml6team/keyphrase-extraction-kbir-kpcrowd,ml6team/keyphrase-generation-keybart-inspec,ml6team/keyphrase-generation-t5-small-inspec,ml6team/keyphrase-generation-t5-small-openkp,bloomberg/KeyBART
0,Definition of Personality Personality can be d...,https://openstax.org/books/organizational-beha...,,communalities; organizational analysis; psycho...,interacting characteristics; organizational an...,Definition; Personality; definition; personality,definition; personality,,,",; Personality; actions); constellation of int...",Definition; Personality; Salvatore Maddi; atte...,personality personality; organizational analys...,personality personality; organizational analys...,personality,organizational behavior; organizational analys...
1,Learning Objectives 6.9.1 Apply the formulas f...,https://openstax.org/books/calculus-volume-1/p...,,catenary curve; derivatives; differentiation; ...,catenary curve; derivatives; differentiation; ...,hyperbolic functions,hyperbolic functions,,,"Functions and Graphs ,; catenary curve.; diffe...",Apply; Functions; Introduction; Learning Objec...,learning objectives; hyperbolic functions; int...,learning objectives; differentiation; integrat...,learning objectives,hyperbolic function; catenary curve; inverse h...
2,Final Comparison of the Four Capital Budgeting...,https://openstax.org/books/principles-manageri...,,Barclays; Capital Budgeting Options; LIBOR; LI...,accounting rate; accounting rate of return; ba...,Capital Budgeting Options,capital budgeting options,Barclays; LIBOR,"barclays; barclays,; libor; volkswagen","Analyzing these opportunities,; Capital Budget...",Analyzing; Barclays; Budgeting; CNN; Capital; ...,capital budgeting options; payback method; acc...,capital budgeting options; barclays; payback m...,capital budgeting options; libor scandal; lond...,profitability; financial services; profitabili...
3,Contraception and Birth Control The prevention...,https://openstax.org/books/biology-ap-courses/...,contraception,Barrier methods; Birth Control; Contraceptive ...,birth control; bulbourethral; cervical cap; ce...,Birth Control; Contraception; birth control; c...,birth control; contraception,,birth control; birth control.; pregnancy; sper...,"Barrier methods,; Birth Control; Contraception...",Birth Control; Combinations; Contraception; Co...,contraception; birth control; natural family p...,contraception and birth control; sperm and egg...,contraception; birth control; sperm,family planning; contraceptive methodsside eff...
4,Collect the Data Go to your local supermarket....,https://openstax.org/books/statistics/pages/11...,,cash; express lane; grocery receipts; supermarket,3 cashiers; 30 people; grocery receipts; local...,,,,,.; 30; Calculate the following; Collect the Da...,Ask; Collect; Data; amounts; ask; cashiers; ca...,grocery receipts; cashiers; express lane; expe...,data collection; local supermarket; ask 30 peo...,collect the data,expected value; grocery receipt; express lane;...


In [111]:
# Maps each model name to a single '; '-joined keyphrase string.
results = {}

for pipe_type, models in model_dict.items():
    for model_name in models:
        if pipe_type == 'extraction':
            pipe = KeyphraseExtractionPipeline(model=model_name)
        elif pipe_type == 'generation':
            pipe = KeyphraseGenerationPipeline(model=model_name)
        elif pipe_type == 'spacy':
            # Kept under its own name so the blank `nlp` pipeline created in
            # the setup cell is not overwritten by the trained model.
            spacy_model = spacy.load(model_name)

            def pipe(text, _model=spacy_model):
                # Entity spans of the loaded spaCy pipeline serve as keyphrases.
                return [ent.text for ent in _model(text).ents]
        else:
            # Without this branch an unknown type silently reused the `pipe`
            # left over from the previous model.
            raise ValueError(
                f"Unknown pipe_type {pipe_type!r}; "
                "expected 'extraction', 'generation' or 'spacy'"
            )

        for sample in samples:
            keywords = pipe(sample.text)
            print(f'{model_name}')
            print(keywords)
            print()
            # NOTE(review): this assignment is overwritten on every
            # iteration, so only the last sample's keyphrases are kept per
            # model. That is fine for a single sample; confirm it is intended
            # when `samples` holds more than one doc.
            results[model_name] = '; '.join(keywords)

/home/jovyan/active-projects/keyword-extraction/training/model-best
['Ethics', 'ethical issue']

ml6team/keyphrase-extraction-kbir-inspec
['HIV patients' 'Hurricane Katrina' 'Latin American market' 'New Orleans'
 'Turing Pharmaceuticals' 'bribes' 'business ethics' 'moral standards'
 'personal ethical standards' 'plastics manufacturer']

ml6team/keyphrase-extraction-distilbert-inspec
['business ethics' 'drug' 'ethical behavior' 'ethical issue'
 'ethical standards' 'flooded stores' 'hiv patients' 'hurricane katrina'
 'martin' 'moral standards' 'plastics manufacturer' 'shkreli'
 'turing pharmaceuticals']



Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


ml6team/keyphrase-extraction-kbir-openkp
['ethical standards']

ml6team/keyphrase-extraction-distilbert-openkp
['personal ethical standards' 'shkreli']



Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


ml6team/keyphrase-extraction-kbir-kptimes
['Ethics' 'ethics']

ml6team/keyphrase-extraction-distilbert-kptimes
['ethics']



Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


ml6team/keyphrase-extraction-kbir-semeval2017
['Bribes' 'Ethics' 'a set of' 'bribes.' 'broke into flooded stores,'
 'business ethics' 'ethical behavior.' 'ethical issue' 'ethical issue .'
 'ethical or unethical' 'ethical standards?' 'food and bottled water'
 'increase' 'moral standards' 'paying bribes' 'recognize an'
 'unethical behavior?' 'unethical situations.']

ml6team/keyphrase-extraction-kbir-kpcrowd
['CEO' 'Ethics' 'HIV patients' 'Hurricane' 'New Orleans' 'Pharmaceuticals'
 'Shkreli' 'Turing' 'behavior' 'bottled' 'bribes' 'business' 'choose'
 'consider' 'culture' 'defending' 'drug' 'employed' 'ethical' 'ethics'
 'firm' 'flooded' 'food' 'guidance' 'judging' 'manufacturer'
 'moral standards' 'newborns' 'paying' 'philosophies' 'president' 'price'
 'raised' 'receive' 'recognize' 'recognizing' 'refused' 'situation'
 'specialized' 'standards' 'stranded' 'understanding' 'unethical']

ml6team/keyphrase-generation-keybart-inspec
['personal ethical standards', 'moral standards', 'business

In [112]:
# One row per model: the dict keys become the 'Model Name' column and the
# dict values the 'Keyphrases' column.
results_df = (
    pd.DataFrame
    .from_dict(results, orient='index', columns=['Keyphrases'])
    .reset_index(names='Model Name')
)
results_df

Unnamed: 0,Model Name,Keyphrases
0,/home/jovyan/active-projects/keyword-extractio...,Ethics; ethical issue
1,ml6team/keyphrase-extraction-kbir-inspec,HIV patients; Hurricane Katrina; Latin America...
2,ml6team/keyphrase-extraction-distilbert-inspec,business ethics; drug; ethical behavior; ethic...
3,ml6team/keyphrase-extraction-kbir-openkp,ethical standards
4,ml6team/keyphrase-extraction-distilbert-openkp,personal ethical standards; shkreli
5,ml6team/keyphrase-extraction-kbir-kptimes,Ethics; ethics
6,ml6team/keyphrase-extraction-distilbert-kptimes,ethics
7,ml6team/keyphrase-extraction-kbir-semeval2017,Bribes; Ethics; a set of; bribes.; broke into ...
8,ml6team/keyphrase-extraction-kbir-kpcrowd,CEO; Ethics; HIV patients; Hurricane; New Orle...
9,ml6team/keyphrase-generation-keybart-inspec,personal ethical standards; moral standards; b...


In [113]:
# Write the per-model keyphrase table to CSV. The target directory is created
# first because `to_csv` does not create missing parent directories.
output_path = Path('../results/keyterms_by_model.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
results_df.to_csv(output_path, index=False)