In [1]:
from collections import Counter
from datasets import load_dataset

# Fetch the XSum corpus (downloaded or read from the local cache) and keep
# handles to the two splits the rest of the notebook works with.
dataset = load_dataset("xsum")
train_set, dev_set = dataset['train'], dataset['validation']

# Show the split's repr so the reader can see what was loaded.
print(train_set)

Using custom data configuration default
Reusing dataset xsum (/home/jcxu/.cache/huggingface/datasets/xsum/default/1.2.0/f9abaabb5e2b2a1e765c25417264722d31877b34ec34b437c53242f6e5c30d6d)


Dataset({
    features: ['document', 'summary', 'id'],
    num_rows: 204045
})


In [16]:
# Build a unigram and bigram dictionary
# map: unigram -> doc sent,  eg. "Obama" -> { "he is obama", "you are obama", ....} up to 100 sentences for each token

# map: unigram -> summary bigrams. eg. "of" -> "half_of": count=12, "some_of" count=15, .....
# map: bigram prefix -> summary trigrams. eg. "about_half" -> "about_half_of": count=10, "about_half_employee": count=2

# Bigrams and trigrams must show up at least K times (default K=3) to be stored

# Upper bound on how many document sentences are remembered per token.
MAX_SENT_FOR_TOKEN = 100
import re
# A "word" is any maximal run of \w characters (letters, digits, underscore).
WORD = re.compile(r'\w+')
def reg_tokenize(text):
    """Return every WORD match in `text`, in order of appearance."""
    return WORD.findall(text)
from typing import Dict
def trim_counter_in_dict(dict_cnts:dict, K=3):
    """Drop rare entries from a dict of count tables.

    Args:
        dict_cnts: mapping of key -> {item: count}.
        K: an item is kept only when its count is >= K.

    Returns:
        A new dict of plain dicts holding only the surviving items; keys
        whose table ends up empty are left out entirely.
    """
    pruned = {}
    for key, counts in dict_cnts.items():
        frequent = {}
        for item, freq in counts.items():
            if freq >= K:
                frequent[item] = freq
        # Skip keys that have nothing left after filtering.
        if frequent:
            pruned[key] = frequent
    return pruned


def cache_feat_from_corpus(one_set,debug=False,min_cnt=3):
    """Collect n-gram and sentence lookup tables from a summarization corpus.

    Args:
        one_set: iterable of examples; each example is indexed with the keys
            'document' and 'summary' (both strings).
        debug: when True, stop once more than 10000 examples were processed.
        min_cnt: minimum count an n-gram needs to survive trimming (passed
            to trim_counter_in_dict as K).

    Returns:
        A 3-tuple ``(map_tok_bigram_ref, map_tok_trigram_ref, map_tok_sent_doc)``:
        * bigram map: second word of a summary bigram -> {"w1_w2": count}
        * trigram map: "w1_w2" prefix of a summary trigram -> {"w1_w2_w3": count}
        * sentence map: document token -> list of at most MAX_SENT_FOR_TOKEN
          document lines containing that token (only the first five
          newline-separated lines of each document are considered).
    """
    cnt = 0
    map_tok_bigram_ref = {}
    map_tok_sent_doc = {}
    map_tok_trigram_ref = {}
    for example in one_set:
        cnt += 1
        document = example['document']
        summary = example['summary']

        # Backward bigrams: index each bigram by its SECOND word.
        _, sum_bigram = func_ngrams(summary, n=2)
        for bigram_pair in sum_bigram:
            key_word = "_".join(bigram_pair[1:])
            map_tok_bigram_ref.setdefault(key_word, Counter())["_".join(bigram_pair)] += 1

        # Forward trigrams: index each trigram by its first two words.
        _, sum_trigram = func_ngrams(summary, n=3)
        for trigram_pair in sum_trigram:
            key_word = "_".join(trigram_pair[:-1])
            map_tok_trigram_ref.setdefault(key_word, Counter())["_".join(trigram_pair)] += 1

        # Only the first few lines of the document are indexed.
        for doc_sent in document.split('\n')[:5]:
            doc_unigram, _ = func_ngrams(doc_sent, n=2)
            # dict.fromkeys de-duplicates while keeping first-seen order, so a
            # token repeated inside one sentence stores that sentence once.
            for doc_tok in dict.fromkeys(doc_unigram):
                sents_for_tok = map_tok_sent_doc.setdefault(doc_tok, [])
                # Strict '<' keeps the list at MAX_SENT_FOR_TOKEN entries
                # (the previous '>' check let one extra sentence through).
                if len(sents_for_tok) < MAX_SENT_FOR_TOKEN:
                    sents_for_tok.append(doc_sent)

        if debug and cnt > 10000:
            break
    # Honour the caller's threshold instead of the helper's default.
    map_tok_bigram_ref = trim_counter_in_dict(map_tok_bigram_ref, K=min_cnt)
    print(map_tok_bigram_ref)
    map_tok_trigram_ref = trim_counter_in_dict(map_tok_trigram_ref, K=min_cnt)
    return map_tok_bigram_ref, map_tok_trigram_ref, map_tok_sent_doc

def func_ngrams(inp_str, n=2):
    """Lower-case and tokenize `inp_str`, then slide a window of size `n`.

    Returns:
        (tokens, ngrams): the full token list, and a list of n-token lists,
        one per window position. `ngrams` is empty when there are fewer
        than `n` tokens; the token list is always returned.
    """
    tokens = reg_tokenize(inp_str.lower())
    window_starts = range(len(tokens) - n + 1)
    ngrams = [tokens[start:start + n] for start in window_starts]
    return tokens, ngrams

# Build the three lookup tables from the training split. With debug=True the
# scan stops once more than 10000 examples have been processed.
map_tok_bigram_ref,map_tok_trigram_ref, map_tok_sent_doc = cache_feat_from_corpus(train_set,debug=True)
# Dumps the whole trimmed trigram table to the cell output.
print(map_tok_trigram_ref)



In [24]:
# unconditional Bidirectional LM
from transformers import pipeline
# Build a fill-mask pipeline and query it with one masked position.
unmasker = pipeline('fill-mask', model='distilbert-base-cased')
# unmasker = pipeline('fill-mask', model='google/electra-small-discriminator')
output = unmasker("It is a [MASK]")
# Print the raw list of candidate fills returned by the pipeline.
print(output)

[{'sequence': '[CLS] It is a. [SEP]', 'score': 0.04926545172929764, 'token': 119, 'token_str': '.'}, {'sequence': '[CLS] It is a function [SEP]', 'score': 0.01850147731602192, 'token': 3053, 'token_str': 'function'}, {'sequence': '[CLS] It is a constant [SEP]', 'score': 0.012798933312296867, 'token': 4836, 'token_str': 'constant'}, {'sequence': '[CLS] It is a problem [SEP]', 'score': 0.012675427831709385, 'token': 2463, 'token_str': 'problem'}, {'sequence': '[CLS] It is a map [SEP]', 'score': 0.009161712601780891, 'token': 4520, 'token_str': 'map'}]


In [19]:
# configure Bidirectional LM and unidirectional LM
import torch
from transformers import pipeline
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Use the first GPU when one is visible and fall back to CPU otherwise, so the
# cell also runs on CPU-only machines. pipeline() takes device=-1 for CPU.
bi_unmasker = pipeline('fill-mask', model='distilbert-base-uncased',
                       device=0 if torch.cuda.is_available() else -1)

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2', return_dict=True)
# Module.to() moves the parameters and returns the module itself, so
# `uni_unmasker` and `model` refer to the same object.
uni_unmasker = model.to(torch.device('cuda:0' if torch.cuda.is_available() else 'cpu'))

def mask_fill_bi(unmasker, inp_text, str_of_interest):
    """Mask `str_of_interest` in `inp_text` and print the unmasker's output.

    Args:
        unmasker: callable taking the masked text. If it exposes
            ``tokenizer.mask_token`` that token is used as the mask;
            otherwise the literal '[MASK]' is used.
        inp_text: the text to mask.
        str_of_interest: the word or phrase to replace with the mask token.
            It is matched only as a whole unit (not as a prefix or suffix of
            a longer word) and is matched at the start of the text too.

    Returns:
        None. The unmasker's output is printed.
    """
    import re  # local import: keeps the cell independent of earlier cells

    mask_token = getattr(getattr(unmasker, 'tokenizer', None), 'mask_token', None) or '[MASK]'
    # Lookarounds instead of a leading space: " to" used to match inside
    # " town" / " tomorrow" and never matched a sentence-initial word.
    pattern = r'(?<!\w)' + re.escape(str_of_interest) + r'(?!\w)'
    # A function replacement avoids backslash/group expansion in mask_token.
    inp_text = re.sub(pattern, lambda _match: mask_token, inp_text)
    output = unmasker(inp_text)
    print(output)

# Mask the word "to" in a sample sentence and print the bidirectional LM's candidate fills.
mask_fill_bi(bi_unmasker,"Barack Hussein Obama II is an American politician and attorney who served as the 44th president of the United States from 2009 to 2017.","to")

[{'sequence': '[CLS] barack hussein obama ii is an american politician and attorney who served as the 44th president of the united states from 2009 to 2017. [SEP]', 'score': 0.40714916586875916, 'token': 2000, 'token_str': 'to'}, {'sequence': '[CLS] barack hussein obama ii is an american politician and attorney who served as the 44th president of the united states from 2009 until 2017. [SEP]', 'score': 0.3334546685218811, 'token': 2127, 'token_str': 'until'}, {'sequence': '[CLS] barack hussein obama ii is an american politician and attorney who served as the 44th president of the united states from 2009 - 2017. [SEP]', 'score': 0.14666640758514404, 'token': 1011, 'token_str': '-'}, {'sequence': '[CLS] barack hussein obama ii is an american politician and attorney who served as the 44th president of the united states from 2009 – 2017. [SEP]', 'score': 0.05319356918334961, 'token': 1516, 'token_str': '–'}, {'sequence': '[CLS] barack hussein obama ii is an american politician and attorney

In [22]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
# Load the pretrained GPT-2 tokenizer and LM-head model. This rebinds the
# global names `tokenizer` and `model`; the model is not moved to any device here.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2', return_dict=True)

tensor([-77.4425, -80.4463, -88.0498,  ..., -96.2564, -93.6345, -84.0666],
       grad_fn=<SelectBackward>)


In [None]:
txt = "Hello, this is a "

# Encode the prompt and put it on whatever device the model currently lives
# on; an earlier cell may have moved `model` to the GPU.
inputs = tokenizer(txt, return_tensors="pt").to(model.device)
# Inference only: no labels (the loss is never used) and no autograd graph.
with torch.no_grad():
    outputs = model(**inputs)
# Scores over the vocabulary for the token following the last prompt position.
logits = outputs.logits
logits = logits[0,-1]
top = torch.topk(logits,k=5)
indices = top.indices.tolist()
# Print the five highest-scoring next tokens, best first.
for idx in indices:
    print(tokenizer.decode(idx))

In [None]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import torch

# model_name = 'google/pegasus-xsum'
model_name = 'facebook/bart-large-xsum'
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(torch_device)
# tokenizer = PegasusTokenizer.from_pretrained(model_name)
# model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)


from transformers import BartTokenizer, BartForConditionalGeneration
# Load from `model_name` rather than repeating the checkpoint string, and put
# the model on `torch_device` so it matches the device passed on to later cells.
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name).to(torch_device)


cuda





HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1362.0, style=ProgressStyle(description…

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1625270765.0, style=ProgressStyle(descr…

In [None]:
from lib.run_lime import run_model
# NOTE(review): `max_samples` and `batch_size` are not defined in any cell
# visible above; they must be set before this cell runs — confirm.
batch_for_model = []
dec_summaries = []
# Never read past the end of the split.
n_docs = min(max_samples, len(dev_set))
for start in range(0, n_docs, batch_size):
    stop = min(start + batch_size, n_docs)
    # Row-slice first, then pick the column: this reads only the rows of the
    # current batch instead of materialising the whole 'document' column for
    # every single example.
    batch_for_model = dev_set[start:stop]['document']
    model_output = run_model(model, tokenizer, batch_for_model, device=torch_device)
    dec_summaries += model_output
batch_for_model = []
print("\n".join(dec_summaries))

In [None]:
# Take the first `max_samples` rows in one slice instead of re-reading the
# full 'document' / 'summary' columns once per example.
_head = dev_set[:max(max_samples, 0)]
summaries = list(_head['summary'])
docs = list(_head['document'])
output = []
cnt = len(docs)
print("".join(docs))
# print("".join(summaries))
# print("\n".join(output))

In [None]:
import stanza

# English stanza pipeline with the tokenize, mwt, pos and ner processors.
nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,ner')
s = 'Parts of the famous Staffordshire Hoard will be exhibited at Tamworth Castle next year.'
doc = nlp(s)
# One output line per token: its text and its NER tag, tab-separated.
print("\n".join(
    f'token: {token.text}\tner: {token.ner}'
    for sent in doc.sentences
    for token in sent.tokens
))


In [None]:

from allennlp.predictors.predictor import Predictor
# Load an AllenNLP predictor from the public model archive URL
# (the archive name suggests a BERT-based SRL model — confirm).
predictor = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/bert-base-srl-2020.03.24.tar.gz")
# Bare last expression: the notebook displays the prediction for this sentence.
predictor.predict(
  sentence="Did Uriah honestly think he could beat the game in under three hours?"
)

In [28]:
# Load the tokenizers of the checkpoints used in this notebook under distinct names.
from transformers import GPT2Tokenizer
gpt_tok = GPT2Tokenizer.from_pretrained('gpt2')

# Two BART tokenizers: the base 'bart-large' checkpoint and the XSum one.
from transformers import BartTokenizer
bart_tok = BartTokenizer.from_pretrained('facebook/bart-large')
bart_tok_xsum = BartTokenizer.from_pretrained('facebook/bart-large-xsum')

# Pegasus tokenizer for the XSum checkpoint.
from transformers import PegasusTokenizer
peg_tok = PegasusTokenizer.from_pretrained('google/pegasus-xsum')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1912529.0, style=ProgressStyle(descript…




In [29]:
# Print each tokenizer's repr (name, vocab size, max length, special tokens).
print(gpt_tok)
print(bart_tok)
print(bart_tok_xsum)
# NOTE(review): gpt_tok is printed a second time here; possibly peg_tok was
# intended (see the commented-out line below) — confirm.
print(gpt_tok)
# print(peg_tok)

PreTrainedTokenizer(name_or_path='gpt2', vocab_size=50257, model_max_len=1024, is_fast=False, padding_side='right', special_tokens={'bos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True)})
PreTrainedTokenizer(name_or_path='facebook/bart-large', vocab_size=50265, model_max_len=1024, is_fast=False, padding_side='right', special_tokens={'bos_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'sep_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': A