# Imports

In [1]:
import spacy
import pandas as pd
from tqdm.auto import tqdm

import torch
torch.cuda.is_available()

True

# Create Docs

In [2]:
df = pd.read_csv('../data/smart_source_texts.csv')

text_ids = list(df['Section'].astype(str) + '_' + df['''Assignment ID'''].astype(str) + '_' + df['expertmodelid'].astype(str))
texts = df['''Original Text'''].to_list()
termstrings = df['E_KeyConcepts'].to_list()

In [3]:
gold_terms = [termstring.split(',') for termstring in termstrings]

# Extract Keywords with SpaCy Model

In [4]:
nlp = spacy.load('../training/model-last/')

In [None]:
doc_gen = tqdm(nlp.pipe(texts), total=len(texts))
pred_terms = [doc.ents for doc in doc_gen]

In [None]:
terms_for_eval = []

for gold, pred, doc in zip(gold_terms, pred_terms, texts):
    terms_for_eval.append(([t for t in gold], [t.text for t in pred]))

In [62]:
f = open('results/macroeconomics-2e-results.txt','w', encoding='utf-8')

for name, gold, pred in terms_for_eval:
    f.write(name + '\n\n')
    i = 0
    j = 0
    while i < len(gold) and j < len(pred):
        if gold[i] in pred:
            if gold[i] == pred[j]:
                s = f'Gold: {gold[i]: <35} Pred: {pred[j]}'
                i += 1
                j +=1
            else:
                s = ' '*42 + f'Pred: {pred[j]}'
                j += 1
        elif i < len(gold):
            s = f'Gold: {gold[i]: <35}'
            i += 1
        else:
            s = ' '*42 + f'Pred: {pred[j]}'
            j += 1
        f.write(s + '\n')
    f.write('\n')
f.close()

In [12]:
pd.DataFrame(terms_for_eval, columns=['Section', 'Gold', 'Pred']).to_csv('results/macroeconomics-2e-results.csv', index=False)

# Extract with KeyBART

In [6]:
import string

from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Text2TextGenerationPipeline,
)

class KeyphraseGenerationPipeline(Text2TextGenerationPipeline):
    def __init__(self, model, keyphrase_sep_token=';', *args, **kwargs):
        super().__init__(
            model=AutoModelForSeq2SeqLM.from_pretrained(model),
            tokenizer=AutoTokenizer.from_pretrained(model),
            *args,
            **kwargs
        )
        self.keyphrase_sep_token = keyphrase_sep_token

    def _parse_and_tokenize(self, *args, truncation):
        prefix = self.model.config.prefix if self.model.config.prefix is not None else ""
        if isinstance(args[0], list):
            if self.tokenizer.pad_token_id is None:
                raise ValueError("Please make sure that the tokenizer has a pad_token_id when using a batch input")
            args = ([prefix + arg for arg in args[0]],)
            padding = True

        elif isinstance(args[0], str):
            args = (prefix + args[0],)
            padding = False
        else:
            raise ValueError(
                f" `args[0]`: {args[0]} have the wrong format. The should be either of type `str` or type `list`"
            )
        
        inputs = self.tokenizer(*args,
                                padding=padding, truncation=truncation, return_tensors=self.framework)
        # This is produced by tokenizers but is an invalid generate kwargs
        if "token_type_ids" in inputs:
            del inputs["token_type_ids"]
        return inputs
    
    def postprocess(self, model_outputs):
        results = super().postprocess(model_outputs=model_outputs)
        return [
            [
                keyphrase.strip().translate(str.maketrans('', '', string.punctuation))
                for keyphrase in result.get('generated_text').split(
                    self.keyphrase_sep_token
                )
                if keyphrase.translate(str.maketrans('', '', string.punctuation)) != ''
            ]
            for result in results
        ][0]

In [7]:
pipe = KeyphraseGenerationPipeline(model='bloomberg/KeyBART')

# There are some long texts in here
print([len(text.split()) for text in texts])

preds = []
for text in tqdm(texts):
    pred_terms = set()
    for i in range(0, len(text.split()), 500):
        preds_chunk = pipe(' '.join(text.split()[i: i+500]))
        pred_terms.update(preds_chunk)
    preds.append(pred_terms)

['social network', 'facial recognition', 'public policy', 'public opinion', 'social media', 'social sciences']
['prostate cancer', 'internet', 'public good', 'public health', 'internetpublic healthpublic']
['social media', 'social networking online', 'electronic publishing', 'internet', 'social media online']
['social media', 'social networking online', 'social network', 'internet marketing', 'social media online']
['human rights', 'social media', 'social networking online', 'tourism', 'internet', 'internet']
['social network', 'social institution', 'social media', 'central intelligence agency', 'social invisibility', 'social']
['social network', 'social product', 'global web', 'political system', 'connective tissue', 'motion pictures']
['search engine', 'digital technology', 'social media', 'social graph', 'social network', 'social influence']
['new media', 'business model', 'social media', 'social activity', 'social networking online', 'social']
['social network', 'social media', 'da

In [8]:
df['pred_terms'] = preds

df

Unnamed: 0,Section,Assignment ID,expertmodelid,Original Text,E_KeyConcepts,pred_terms
0,Biology,147,422,Learning Objectives\nBy the end of this sectio...,"passive transport,transport,diffusion,cell,pla...","{membr, potassium chloride, water content, gly..."
1,Biology,149,439,Learning Objectives\nBy the end of this sectio...,"autotroph,calvin cycle,light-dependent reactio...","{biology, engineering, chemical energy, physic..."
2,Biology,151,488,Learning Objectives\nBy the end of this sectio...,"chloroplast,calvin cycle,light-dependent react...","{biology, engineering, chemical energy, physic..."
3,Biology,154,441,Learning Objectives\nBy the end of this sectio...,"dominant allele,heterozygote,allele,law,mendel...","{genetic arrangement, phenotype, expected freq..."
4,Biology,167,473,Learning Objectives\nBy the end of this sectio...,"rna polymerase,gene,elongation,mrna transcript...","{ribosomal protein, genetics, protein synthesi..."
5,Biology,168,474,Learning Objectives\nBy the end of this sectio...,"termination,translation,mrna,amino acid,initia...","{genetics, protein biosynthesis, amino acid se..."
6,Biology,179,491,Learning Objectives\nBy the end of this sectio...,"allele frequency,population,allele,effect even...","{different population, genetics, genetic struc..."
7,English,148,424,Opinion: All kids should take ‘Poverty 101′\nM...,"poverty,value,beegle,teacher,job,middle class","{social psychology, socialization, human poten..."
8,English,156,467,“Get over it”\nBy Jeff Jarvis\nThursday 3 Febr...,"people,government,privacy,technology,jarvis,so...","{facial recognition, public health, social med..."
9,English,157,468,“Spring Awakening”\nHow an Egyptian Revolution...,"ghonim,khaled said,young people,facebook,revol...","{social media, social media online, electronic..."


In [10]:
df.to_csv('../results/smart_source_texts_keybart_terms.csv', index=False)

In [76]:
terms_for_eval = []

for gold, pred in zip(gold_terms, preds):
    terms_for_eval.append((gold, list(pred)))

In [90]:
f = open('../results/SMART-KeyBART.txt','w', encoding='utf-8')

for i, (gold, pred) in enumerate(terms_for_eval):
    f.write(text_ids[i] + '\n\n')
    i = 0
    j = 0
    while i < len(gold) and j < len(pred):
        if gold[i] in pred:
            if gold[i] == pred[j]:
                s = f'Gold: {gold[i]: <35} Pred: {pred[j]}'
                i += 1
                j +=1
            else:
                s = ' '*42 + f'Pred: {pred[j]}'
                j += 1
        elif i < len(gold):
            s = f'Gold: {gold[i]: <35}'
            i += 1
        else:
            s = ' '*42 + f'Pred: {pred[j]}'
            j += 1
        print(s)
        f.write(s + '\n')
    f.write('\n')
f.close()

Gold: passive transport                  
Gold: transport                          
Gold: diffusion                          
Gold: cell                               
Gold: plasma membrane                    
Gold: concentration                      
Gold: substance                          
Gold: osmosis                            
Gold: autotroph                          
Gold: calvin cycle                       
Gold: light-dependent reaction           
Gold: chloroplast                        
                                          Pred: engineering
                                          Pred: oxygen
                                          Pred: chemistry
                                          Pred: photosynthesis  chloroplasts
                                          Pred: cross section
                                          Pred: solar energy
                                          Pred: medicine
                                          Pred: biology
          