In [1]:
import torch
print(torch.cuda.is_available())

import random
random.seed(42)
import string
from collections import defaultdict
from pathlib import Path
project_dir = Path('/home/jovyan/active-projects/keyword-extraction')

import spacy
from spacy.tokens import Doc, DocBin
nlp = spacy.blank('en')

# Register the custom Doc attributes used below; names that are already
# registered are skipped, so this cell can be executed more than once.
for extension in ('section_url', 'subsection'):
    if Doc.has_extension(extension):
        continue
    Doc.set_extension(extension, default=None)

import pandas as pd
import numpy as np

# Read the serialized Docs from disk and materialize them with the blank
# pipeline's vocabulary.
corpus_path = project_dir / 'data' / 'openstax-subsections.spacy'
doc_bin = DocBin().from_disk(corpus_path)
docs = list(doc_bin.get_docs(nlp.vocab))

True


In [96]:
from transformers import (
    AutoModelForTokenClassification,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Text2TextGenerationPipeline,
    TokenClassificationPipeline,
)

from transformers.pipelines import AggregationStrategy

class KeyphraseGenerationPipeline(Text2TextGenerationPipeline):
    """Text-to-text pipeline whose output is a list of cleaned keyphrases.

    The generated text of the first result is split on ``keyphrase_sep_token``;
    every piece has the characters in ``string.punctuation`` removed and its
    surrounding whitespace trimmed. Pieces that end up empty are dropped.
    """

    # Deletes every character in string.punctuation. Built once for the class
    # instead of once per keyphrase.
    _PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self, model, keyphrase_sep_token=';', *args, **kwargs):
        """Load the seq2seq model and tokenizer identified by ``model``.

        Args:
            model: Name or path handed to ``from_pretrained`` for both the
                model and its tokenizer.
            keyphrase_sep_token: Separator on which generated text is split.
            *args: Extra positional arguments forwarded to the parent pipeline.
            **kwargs: Extra keyword arguments forwarded to the parent pipeline.
        """
        super().__init__(
            model=AutoModelForSeq2SeqLM.from_pretrained(model),
            # NOTE(review): truncation/max_length are presumably only stored by
            # from_pretrained; model_max_length looks like the setting that
            # actually caps the tokenizer -- confirm against the tokenizer docs.
            tokenizer=AutoTokenizer.from_pretrained(model,
                                                    truncation=True,
                                                    max_length=512,
                                                    model_max_length=512,
                                                    ),
            *args,
            **kwargs
        )
        self.keyphrase_sep_token = keyphrase_sep_token

    def postprocess(self, model_outputs):
        """Return the cleaned keyphrases of the first generated result.

        Args:
            model_outputs: Raw outputs passed through to the parent
                ``postprocess``.

        Returns:
            A list of non-empty keyphrase strings, in generation order. An
            empty list is returned when the parent produces no results.
        """
        results = super().postprocess(model_outputs=model_outputs)
        if not results:
            return []

        generated_text = results[0].get('generated_text')
        keyphrases = []
        for piece in generated_text.split(self.keyphrase_sep_token):
            # Remove punctuation first and trim afterwards, so whitespace that
            # sat between a word and a stripped punctuation mark is removed too.
            cleaned = piece.translate(self._PUNCTUATION_TABLE).strip()
            if cleaned:
                keyphrases.append(cleaned)
        return keyphrases


class KeyphraseExtractionPipeline(TokenClassificationPipeline):
    """Token-classification pipeline that returns the unique extracted keyphrases."""

    def __init__(self, model, *args, **kwargs):
        """Load the token-classification model and tokenizer named by ``model``.

        Any extra positional or keyword arguments are forwarded to the parent
        pipeline unchanged.
        """
        token_classifier = AutoModelForTokenClassification.from_pretrained(model)
        tokenizer = AutoTokenizer.from_pretrained(
            model,
            truncation=True,
            max_length=512,
            model_max_length=512,
        )
        super().__init__(
            model=token_classifier,
            tokenizer=tokenizer,
            *args,
            **kwargs
        )

    def postprocess(self, model_outputs):
        """Aggregate token predictions and return the distinct keyphrases.

        The aggregation strategy is SIMPLE when the model config reports the
        'roberta' model type and FIRST otherwise. The result is whatever
        ``np.unique`` returns for the stripped 'word' values.
        """
        if self.model.config.model_type == 'roberta':
            strategy = AggregationStrategy.SIMPLE
        else:
            strategy = AggregationStrategy.FIRST

        entities = super().postprocess(
            model_outputs=model_outputs,
            aggregation_strategy=strategy,
        )
        words = [entity.get('word').strip() for entity in entities]
        return np.unique(words)

In [97]:
# Models to compare, grouped by the pipeline type that keyphrase_pipe
# dispatches on. Each model name is also used as a column key in the results.
model_dict = {
    'spacy': [
        # Built from project_dir instead of repeating the absolute path; the
        # resulting string is the same as the previously hard-coded one.
        str(project_dir / 'training' / 'model-best'),
    ],
    'extraction': [
        'ml6team/keyphrase-extraction-kbir-inspec',
        # 'ml6team/keyphrase-extraction-distilbert-inspec',
        'ml6team/keyphrase-extraction-kbir-openkp',
        # 'ml6team/keyphrase-extraction-distilbert-openkp',
        'ml6team/keyphrase-extraction-kbir-kptimes',
        # 'ml6team/keyphrase-extraction-distilbert-kptimes',
        'ml6team/keyphrase-extraction-kbir-semeval2017',
        'ml6team/keyphrase-extraction-kbir-kpcrowd',
    ],
    'generation': [
        'ml6team/keyphrase-generation-keybart-inspec',
        'ml6team/keyphrase-generation-t5-small-inspec',
        'ml6team/keyphrase-generation-t5-small-openkp',
        'bloomberg/KeyBART',
    ]
}

# Number of Docs drawn (without replacement) for the comparison.
N_SAMPLES = 100
samples = random.sample(docs, N_SAMPLES)

In [98]:
# Spot-check that the custom section_url attribute is populated on a sampled Doc.
samples[0]._.section_url

'https://openstax.org/books/introduction-anthropology/pages/5-3-the-emergence-of-us-the-archaic-homo'

In [101]:
def keyphrase_pipe(samples, pipe_type, model_name):
    """Run one keyphrase model over every sample.

    Args:
        samples: Iterable of objects exposing a ``.text`` attribute.
        pipe_type: Which loader to use: 'extraction', 'generation' or 'spacy'.
        model_name: Model name or path handed to the selected loader.

    Returns:
        A list with one string per sample: that sample's keyphrases joined
        with '; '.

    Raises:
        ValueError: If ``pipe_type`` is not one of the supported values.
    """
    if pipe_type == 'extraction':
        pipe = KeyphraseExtractionPipeline(model=model_name,
                                           truncation=True)
    elif pipe_type == 'generation':
        pipe = KeyphraseGenerationPipeline(model=model_name,
                                           truncation=True)
    elif pipe_type == 'spacy':
        # Use a distinct local name so the notebook-level `nlp` is not shadowed.
        spacy_nlp = spacy.load(model_name)

        def pipe(text):
            # The spaCy model exposes its predictions as entity spans.
            return [ent.text for ent in spacy_nlp(text).ents]
    else:
        # Fail before doing any work rather than hitting an unbound `pipe`.
        raise ValueError(
            f"Unknown pipe_type {pipe_type!r}; "
            "expected 'extraction', 'generation' or 'spacy'"
        )

    return ['; '.join(pipe(sample.text)) for sample in samples]
    


def compare_models(samples, model_dict):
    """Gather every configured model's keyphrases for the same samples.

    Args:
        samples: Sequence of Docs providing ``.text`` and ``._.section_url``.
        model_dict: Mapping of pipe type to a list of model names.

    Returns:
        A defaultdict with 'text' and 'url' entries plus one entry per model
        name, each holding one value per sample.
    """
    columns = defaultdict(list)
    columns['text'] = [doc.text for doc in samples]
    columns['url'] = [doc._.section_url for doc in samples]

    # Flatten {pipe_type: [model, ...]} into (pipe_type, model) pairs.
    model_jobs = (
        (pipe_type, model_name)
        for pipe_type, model_names in model_dict.items()
        for model_name in model_names
    )
    for pipe_type, model_name in model_jobs:
        # Progress marker: one line per model as it starts.
        print(f'{model_name}')
        columns[model_name] = keyphrase_pipe(samples, pipe_type, model_name)

    return columns


In [102]:
# Run every model in model_dict over the sampled Docs; keyphrase_pipe loads
# each model in turn, and compare_models prints its name as it starts.
results = compare_models(samples, model_dict)

/home/jovyan/active-projects/keyword-extraction/training/model-best


Token indices sequence length is longer than the specified maximum sequence length for this model (4235 > 4096). Running this sequence through the model will result in indexing errors


ml6team/keyphrase-extraction-kbir-inspec
ml6team/keyphrase-extraction-kbir-openkp
ml6team/keyphrase-extraction-kbir-kptimes
ml6team/keyphrase-extraction-kbir-semeval2017
ml6team/keyphrase-extraction-kbir-kpcrowd
ml6team/keyphrase-generation-keybart-inspec
ml6team/keyphrase-generation-t5-small-inspec
ml6team/keyphrase-generation-t5-small-openkp
bloomberg/KeyBART


In [104]:
# Persist the comparison table (text, url, and one column per model) without the index.
pd.DataFrame(results).to_csv('../results/10-models-100-subsections.csv', index=False)

## Prepare Dataframe

In [9]:
# Load the saved per-model keyphrase table.
df = pd.read_csv('../results/10-models-100-subsections.csv')
# Keep the column labels as they are before the helper column is added.
original_columns = df.columns
# For each row, count the non-null cells from the third column onward.
df['num_lists'] = df.iloc[:, 2:].count(axis=1)
df.head(2)

Unnamed: 0,text,url,/home/jovyan/active-projects/keyword-extraction/training/model-best,ml6team/keyphrase-extraction-kbir-inspec,ml6team/keyphrase-extraction-kbir-openkp,ml6team/keyphrase-extraction-kbir-kptimes,ml6team/keyphrase-extraction-kbir-semeval2017,ml6team/keyphrase-extraction-kbir-kpcrowd,ml6team/keyphrase-generation-keybart-inspec,ml6team/keyphrase-generation-t5-small-inspec,ml6team/keyphrase-generation-t5-small-openkp,bloomberg/KeyBART,num_lists
0,Late Archaic Homo Homo naledi,https://openstax.org/books/introduction-anthro...,,,Homo,,Homo Homo naledi,Archaic; Homo naledi; Late,late archaic; Homo Homo Homo naledi; late arch...,late archaic homo naledi,archaic homo,late Archaic Homo Homo naledi Late Archaic H...,7
1,Requirements for Momentum Conservation There i...,https://openstax.org/books/university-physics-...,closed system; Law of Conservation of Momentum,Law of Conservation of Momentum; Momentum Cons...,Momentum Conservation,physics,(; (gravity; Law of Conservation of Momentum; ...,Conservation; Momentum; complication; conserve...,momentum conservation; external force; interna...,law of conservation of momentum; system; inter...,requirements; momentum conservation,galactic cluster; closed system; momentum cons...,10


### Shuffle

In [193]:
def permute_row(row, rng=None):
    """Shuffle the model-output cells of one row and record the permutation.

    The row is read positionally as ``[text, url, <model outputs...>,
    num_lists]``: the first two cells and the last cell stay in place and
    everything in between is shuffled. The number of model outputs is derived
    from the row length rather than fixed.

    Args:
        row: pandas Series holding one row in the layout described above.
        rng: Optional ``numpy.random.Generator`` for a reproducible shuffle.
            When omitted, the global ``np.random`` state is used.

    Returns:
        A Series indexed ``'text', 'url', '0', '1', ..., 'num_lists', 'key'``.
        ``key`` is an integer array whose i-th entry is the position in ``row``
        that the i-th output cell was taken from.

    Raises:
        ValueError: If the row has fewer than the three fixed cells.
    """
    n_cells = len(row)
    n_models = n_cells - 3
    if n_models < 0:
        raise ValueError(
            f'expected at least 3 cells (text, url, num_lists), got {n_cells}'
        )

    model_positions = np.arange(2, n_cells - 1)
    if rng is None:
        permutation = np.random.permutation(model_positions)
    else:
        permutation = rng.permutation(model_positions)

    # Frozen positions (0, 1 and the last) surround the shuffled block.
    shuffled_cols = np.concatenate(([0, 1], permutation, [n_cells - 1]))
    new_index = ['text',
                 'url',
                 *(str(i) for i in range(n_models)),
                 'num_lists',
                 'key',
                 ]
    # Wrapping the array in a list stores it as a single cell.
    key_col = pd.Series([shuffled_cols], name='key')
    # .iloc makes the positional lookup explicit; the row's labels are strings.
    new_row = pd.concat([row.iloc[shuffled_cols], key_col])
    return new_row.set_axis(new_index)

# Shuffle the model columns independently for every row; each row keeps its own key.
shuffled_df = df.apply(permute_row, axis=1)
shuffled_df.head(2)

Unnamed: 0,text,url,0,1,2,3,4,5,6,7,8,9,num_lists,key
0,Late Archaic Homo Homo naledi,https://openstax.org/books/introduction-anthro...,late archaic; Homo Homo Homo naledi; late arch...,Homo Homo naledi,late Archaic Homo Homo naledi Late Archaic H...,,archaic homo,,Homo,Archaic; Homo naledi; Late,late archaic homo naledi,,7,"[0, 1, 8, 6, 11, 3, 10, 2, 4, 7, 9, 5, 12]"
1,Requirements for Momentum Conservation There i...,https://openstax.org/books/university-physics-...,requirements; momentum conservation,closed system; Law of Conservation of Momentum,Momentum Conservation,(; (gravity; Law of Conservation of Momentum; ...,Conservation; Momentum; complication; conserve...,Law of Conservation of Momentum; Momentum Cons...,law of conservation of momentum; system; inter...,galactic cluster; closed system; momentum cons...,momentum conservation; external force; interna...,physics,10,"[0, 1, 10, 2, 4, 6, 7, 3, 9, 11, 8, 5, 12]"


## Function to Unshuffle with Keys

In [192]:
def unshuffle_row(row):
    """Put a shuffled row back into its original positional order.

    Args:
        row: pandas Series whose last cell is labelled ``key`` and holds, for
            each preceding cell, the original position it was taken from.

    Returns:
        The cells before ``key``, relabelled with their original positions and
        sorted by them.
    """
    values = row.iloc[:-1]
    # Use only as many key entries as there are cells left, so a key that still
    # lists positions for trailing columns that were dropped keeps working.
    positions = list(row['key'])[:len(values)]
    return values.set_axis(positions).sort_index()
    
# Sanity check: applying the stored keys should give back the pre-shuffle column order.
shuffled_df.apply(unshuffle_row, axis=1).head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,Late Archaic Homo Homo naledi,https://openstax.org/books/introduction-anthro...,,,Homo,,Homo Homo naledi,Archaic; Homo naledi; Late,late archaic; Homo Homo Homo naledi; late arch...,late archaic homo naledi,archaic homo,late Archaic Homo Homo naledi Late Archaic H...,7
1,Requirements for Momentum Conservation There i...,https://openstax.org/books/university-physics-...,closed system; Law of Conservation of Momentum,Law of Conservation of Momentum; Momentum Cons...,Momentum Conservation,physics,(; (gravity; Law of Conservation of Momentum; ...,Conservation; Momentum; complication; conserve...,momentum conservation; external force; interna...,law of conservation of momentum; system; inter...,requirements; momentum conservation,galactic cluster; closed system; momentum cons...,10


### Add rows for scoring
A `Ranks:` row is inserted after each sample row; the rank given to each keyword list in the sample row is recorded there.

In [194]:
# Start from a copy of the shuffled table in which every string cell is
# replaced by 0; the displayed result shows that empty cells stay empty.
blank_ranks = shuffled_df.replace(to_replace=r'.*', value=int(0), regex=True)
labelled_ranks = blank_ranks.assign(text='Ranks:', url='')
# Offset the row labels by one half so that sorting places each rank row
# directly after the sample row it belongs to.
rank_rows = labelled_ranks.rename(lambda x: x + .5)

stacked = pd.concat([shuffled_df, rank_rows], sort=False)
new_df = stacked.sort_index().reset_index(drop=True)
new_df.head(2)

Unnamed: 0,text,url,0,1,2,3,4,5,6,7,8,9,num_lists,key
0,Late Archaic Homo Homo naledi,https://openstax.org/books/introduction-anthro...,late archaic; Homo Homo Homo naledi; late arch...,Homo Homo naledi,late Archaic Homo Homo naledi Late Archaic H...,,archaic homo,,Homo,Archaic; Homo naledi; Late,late archaic homo naledi,,7,"[0, 1, 8, 6, 11, 3, 10, 2, 4, 7, 9, 5, 12]"
1,Ranks:,,0.0,0.0,0.0,,0.0,,0.0,0.0,0.0,,7,"[0, 1, 8, 6, 11, 3, 10, 2, 4, 7, 9, 5, 12]"


In [195]:
# Write the interleaved sample/rank table that is handed out for rating.
new_df.to_csv('../data/10-models-100-subsections-shuffled.csv', index=False)

## Analyze ratings

In [31]:
def parse_key(cell):
    """Convert a bracketed, whitespace-separated string such as '[0 1 8]' to a list of ints."""
    return [int(token) for token in cell.strip('[]').split()]


rated = pd.read_csv('../data/10-models-100-subsections-shuffled-lydia.csv',
                    converters={'key': parse_key})
df = rated.drop(columns='validation')

# Show full cell contents for this preview only.
with pd.option_context('display.max_colwidth', None):
    display(df.head(2))

def unshuffle_row(row):
    """Put a shuffled row back into its original positional order.

    Args:
        row: pandas Series whose last cell is labelled ``key`` and holds, for
            each preceding cell, the original position it was taken from. The
            key may list extra trailing positions for columns that are no
            longer present in the row.

    Returns:
        The cells before ``key``, relabelled with their original positions and
        sorted by them.
    """
    values = row.iloc[:-1]
    # Trim the key to the cells that are actually present instead of assuming
    # exactly one surplus entry.
    positions = list(row['key'])[:len(values)]
    return values.set_axis(positions).sort_index()

# Restore every row to its pre-shuffle column order using the row's own key.
df = df.apply(unshuffle_row, axis=1)
# NOTE: original_columns is defined in the earlier cell that reads
# 10-models-100-subsections.csv; that cell must have run in this kernel first.
df.columns = original_columns
df.head(2)

Unnamed: 0,text,url,0,1,2,3,4,5,6,7,8,9,key
0,Late Archaic Homo Homo naledi,https://openstax.org/books/introduction-anthropology/pages/5-3-the-emergence-of-us-the-archaic-homo,late archaic; Homo Homo Homo naledi; late archaic species,Homo Homo naledi,late Archaic Homo Homo naledi Late Archaic Homo Homo naled,,archaic homo,,Homo,Archaic; Homo naledi; Late,late archaic homo naledi,,"[0, 1, 8, 6, 11, 3, 10, 2, 4, 7, 9, 5, 12]"
1,Ranks:,,0,0,0,,0,,0,0,0,,"[0, 1, 8, 6, 11, 3, 10, 2, 4, 7, 9, 5, 12]"


Unnamed: 0,text,url,/home/jovyan/active-projects/keyword-extraction/training/model-best,ml6team/keyphrase-extraction-kbir-inspec,ml6team/keyphrase-extraction-kbir-openkp,ml6team/keyphrase-extraction-kbir-kptimes,ml6team/keyphrase-extraction-kbir-semeval2017,ml6team/keyphrase-extraction-kbir-kpcrowd,ml6team/keyphrase-generation-keybart-inspec,ml6team/keyphrase-generation-t5-small-inspec,ml6team/keyphrase-generation-t5-small-openkp,bloomberg/KeyBART
0,Late Archaic Homo Homo naledi,https://openstax.org/books/introduction-anthropology/pages/5-3-the-emergence-of-us-the-archaic-homo,,,Homo,,Homo Homo naledi,Archaic; Homo naledi; Late,late archaic; Homo Homo Homo naledi; late archaic species,late archaic homo naledi,archaic homo,late Archaic Homo Homo naledi Late Archaic Homo Homo naled
1,Ranks:,,,,0,,0,0,0,0,0,0


In [47]:
# Keep only the 'Ranks:' rows and the columns after text/url, as nullable integers.
ranks = df.iloc[:, 2:].loc[df.text == 'Ranks:'].astype(pd.Int64Dtype())
# Count, per column, how often each rank value occurs. Series.value_counts is
# used because the top-level pd.value_counts helper is deprecated.
ranks.apply(pd.Series.value_counts, axis=0)

Unnamed: 0,/home/jovyan/active-projects/keyword-extraction/training/model-best,ml6team/keyphrase-extraction-kbir-inspec,ml6team/keyphrase-extraction-kbir-openkp,ml6team/keyphrase-extraction-kbir-kptimes,ml6team/keyphrase-extraction-kbir-semeval2017,ml6team/keyphrase-extraction-kbir-kpcrowd,ml6team/keyphrase-generation-keybart-inspec,ml6team/keyphrase-generation-t5-small-inspec,ml6team/keyphrase-generation-t5-small-openkp,bloomberg/KeyBART
0,28,56,51,17.0,58.0,62.0,58.0,62.0,58.0,58
1,9,3,4,2.0,1.0,3.0,7.0,4.0,10.0,5
2,8,4,3,8.0,7.0,2.0,7.0,3.0,2.0,4
3,4,5,7,7.0,1.0,5.0,4.0,4.0,6.0,5
4,3,4,3,6.0,4.0,7.0,4.0,3.0,7.0,8
5,2,4,5,6.0,9.0,3.0,6.0,4.0,1.0,7
6,2,4,8,4.0,8.0,7.0,2.0,7.0,5.0,1
7,4,5,4,4.0,7.0,3.0,5.0,6.0,4.0,5
8,5,5,2,3.0,1.0,4.0,6.0,5.0,4.0,4
9,1,3,1,3.0,4.0,2.0,,2.0,3.0,2
