In [1]:
from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-xsum')
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-xsum')
device = 'cuda:0'
device = 'cpu'


Downloading: 100%|██████████| 1.51k/1.51k [00:00<00:00, 1.20MB/s]
Downloading:  65%|██████▍   | 1.05G/1.63G [00:51<00:22, 25.4MB/s]

In [1]:
from datasets import load_dataset
dataset = load_dataset("xsum", split='validation')


Using custom data configuration default
Reusing dataset xsum (/home/jcxu/.cache/huggingface/datasets/xsum/default/1.2.0/4957825a982999fbf80bca0b342793b01b2611e021ef589fb7c6250b3577b499)


In [2]:
from tqdm import tqdm
print(dataset[:5])
dataset = dataset[:30]
for idx, x in enumerate(tqdm(dataset)):
    pass

100%|██████████| 3/3 [00:00<00:00, 5161.16it/s]






In [None]:
# util

import argparse
import logging
import os
import pickle
import random
import statistics
import sys
from datetime import datetime
from typing import Dict, List
import multiprocessing
import torch
from datasets import load_dataset
from transformers import BartForConditionalGeneration, BartModel, BartTokenizer
import numpy as np
import pandas as pd

now = datetime.now()

logger = logging.getLogger('sum')
logger.setLevel(logging.DEBUG)
# create file handler which logs even debug messages
fh = logging.FileHandler(f"{now.strftime('%m')}{now.strftime('%d')}.html")
fh.setLevel(logging.DEBUG)
# create console handler with a higher log level
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
# create formatter and add it to the handlers
formatter = logging.Formatter('<br>%(levelname)s - %(message)s')
ch.setFormatter(formatter)
fh.setFormatter(formatter)
# add the handlers to logger
logger.addHandler(ch)
logger.addHandler(fh)

from util import *

from transformers import BartForConditionalGeneration, BartTokenizer

def write_pkl_to_disk(path: str, fname_prefix: str, data_obj):
    full_fname = os.path.join(path, f"{fname_prefix}.pkl")
    with open(full_fname, 'wb') as fd:
        pickle.dump(data_obj, fd)
    logging.debug(f"Done writing to {full_fname}")


def init_bart_sum_model(mname='sshleifer/distilbart-cnn-6-6', device='cuda:0'):
    model = BartForConditionalGeneration.from_pretrained(mname).to(device)
    tokenizer = BartTokenizer.from_pretrained(mname)
    return model, tokenizer

def bart_decoder_forward_embed(input_ids, embed_tokens, embed_scale):
    input_shape = input_ids.size()
    input_ids = input_ids.view(-1, input_shape[-1])
    inputs_embeds = embed_tokens(input_ids) * embed_scale
    return inputs_embeds


def summarize_attributions(attributions):
    attributions = attributions.mean(dim=-1)
    attributions = attributions / torch.norm(attributions)
    return attributions

def forward_enc_dec_step(model, encoder_outputs, decoder_inputs_embeds):
    # expanded_batch_idxs = (
    #         torch.arange(batch_size)
    #             .view(-1, 1)
    #             .repeat(1, 1)
    #             .view(-1)
    #             .to(device)
    #     )
    # encoder_outputs["last_hidden_state"] = encoder_outputs.last_hidden_state.index_select(
    #         0, expanded_batch_idxs
    #     )
    model_inputs = {"input_ids": None,
                    "past_key_values": None,
                    "encoder_outputs": encoder_outputs,
                    "decoder_inputs_embeds": decoder_inputs_embeds,
                    }
    outputs = model(**model_inputs, use_cache=False,
                    return_dict=True, output_attentions=True)
    return outputs


def init_bart_family(name_lm, name_sum, device, no_lm=False, no_ood=False):
    if not no_lm:
        lm_model, tok = init_bart_lm_model(name_lm, device)
    else:
        lm_model = None
    sum_model, tok = init_bart_sum_model(name_sum, device)
    if not no_ood:
        if name_sum == "facebook/bart-large-cnn": 
            sum_out_of_domain, _ = init_bart_sum_model(
            "facebook/bart-large-xsum", device)
        else:
            sum_out_of_domain, _ = init_bart_sum_model(
            "facebook/bart-large-cnn", device) 
    else:
        sum_out_of_domain = None
    return lm_model, sum_model, sum_out_of_domain, tok

from captum.attr._utils.visualization import format_word_importances


def simple_viz_attribution(tokenizer, input_ids, attribution_scores):
    token_in_list = input_ids.tolist()
    if isinstance(token_in_list[0], list):
        token_in_list = token_in_list[0]
    words = [tokenizer.decode(x) for x in token_in_list]
    attribution_scores_list = attribution_scores.tolist()
    # for w, ascore in zip(words, attribution_scores_list):
    #     logging.info('{:10} {:02.2f}'.format(w, ascore))

    output = format_word_importances(words, attribution_scores_list)
    return output


@torch.no_grad()
def run_full_model_slim(model, input_ids, attention_mask=None, decoder_input_ids=None, targets=None, device='cuda:0', output_dec_hid=False, output_attentions=False, T=1, special_attn=False):
    decoder_input_ids = decoder_input_ids.to(device)
    input_ids = input_ids.to(device)
    if attention_mask is not None:
        attention_mask = attention_mask.to(device)
    assert decoder_input_ids.size()[0] == input_ids.size()[0]

    model_inputs = {"input_ids": input_ids,
                    "attention_mask": attention_mask,
                    "decoder_input_ids": decoder_input_ids,
                    }

    outputs = model(**model_inputs,
                    output_hidden_states=output_dec_hid, output_attentions=output_attentions,
                    use_cache=False, return_dict=True)

    # batch, dec seq, vocab size
    next_token_logits = outputs.logits[:, -1, :]
    if targets is not None:
        targets = targets.to(device)
        loss = torch.nn.functional.cross_entropy(
            input=next_token_logits, target=targets, reduction='none')
    else:
        loss = 0
    if special_attn:
        cross_attn = outputs['cross_attentions']
        attn = cross_attn[-1][:, :, -1, :]
        # batch, nhead, enc_len
        mean_attn = torch.mean(attn, dim=1)
        # block special positions in input
        mask = (input_ids >= 5).float()
        mean_attn = mean_attn * mask
        return mean_attn[0] 
    if output_attentions:
        # use cross attention as the distribution
        # last layer.   batch=1, head, dec len, enc len
        # by default we use the last layer of attention
        output, p = get_cross_attention(
            outputs['cross_attentions'], input_ids, device=device)
        return output, p

    
    prob = torch.nn.functional.softmax(next_token_logits/T, dim=-1)
    # prob = next_token_logits.softmax(dim=-1)
    next_token = torch.argmax(next_token_logits, dim=-1)
    # next_token = next_token.unsqueeze(-1)
    next_token = next_token.tolist()    # confrim nested list?
    # print(f"Gold: {tokenizer.decode(targets[0].item())}")
    output = [tokenizer.decode(tk) for tk in next_token]
    # logging.info(f"Next token: {output}")
    # outputs['output'] = output
    return output, prob, next_token_logits, loss
from scipy.stats import entropy

In [None]:
ARTICLE_TO_SUMMARIZE = "Police in suburban Minneapolis shot and killed a man when what where who was allegedly involved in a carjacking and fired shots at pursuing officers, according to a release from the Burnsville Police Department. When, when."
inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='pt')
# Generate Summary
summary_ids = model.generate(inputs['input_ids'], num_beams=1, max_length=100, early_stopping=True)
print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids])

summary_ids = model.generate(inputs['input_ids'], num_beams=10, max_length=100, early_stopping=True)
print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids])

summary_ids = model.generate(inputs['input_ids'], num_beams=100, max_length=100, early_stopping=True)
print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids])

In [None]:

def yield_fact_examples_from_curated_artifact(path_hall = '/mnt/data1/jcxu/back_to_fact/artifact',file_hall='artifact_hallucination.json'):
    import json
    with open(os.path.join(path_hall,file_hall), 'r') as fd:
        hall_data = json.load(fd)
    yield hall_data
yielder = yield_fact_examples_from_curated_artifact()

In [None]:
device='cpu'

for hall_case in next(yielder):

    document = hall_case['input']
    doc_input_ids = tokenizer(document, return_tensors='pt')['input_ids'][:,:600]
    prefix = hall_case['prefix']
    prefix_token_ids = tokenizer(prefix)['input_ids'][:-1]
    
    logger.info("*"*100)
    logger.info(f"<strong>Prefix: {prefix[1:]}</strong>")
    last_pred = None
    for t in range(10):
        logger.info('-'*10)
        if last_pred:
            prefix_token_ids += last_pred
            
        # target_token = tokens[idx]
        # tgt_token_id = tokenizer(" " +target_token)['input_ids'][1]

        logger.info(f"<<strong>>Prefix: {tokenizer.decode(prefix_token_ids,skip_special_tokens=True)}\t </strong>")
        # prefix_ids = tokenizer(' '.join(prefix_tokens),return_tensors='pt')
        decoder_input_ids = torch.tensor([prefix_token_ids], dtype=torch.long)
        # print(decoder_input_ids)
        # print(doc_input_ids)
        output, prob, next_token_logits, loss = run_full_model_slim(model=model, input_ids=doc_input_ids, decoder_input_ids=decoder_input_ids, device=device)
        # entropy
        squeeze_prob = prob.squeeze()
        show_top_k(squeeze_prob, tokenizer= tokenizer)
        ent_of_pred = entropy(squeeze_prob.cpu().numpy())
        logger.info(f"Entropy: {ent_of_pred: 8.2f}")
        last_pred = [torch.argmax(squeeze_prob).tolist()]

In [None]:

document = "The RoBERTa model was proposed in RoBERTa: A Robustly Optimized BERT Pretraining Approach by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. It is based on Google’s BERT model released in 2018. It builds on BERT and modifies key hyperparameters, removing the next-sentence pretraining objective and training with much larger mini-batches and learning rates."
device = 'cpu'
doc_input_ids = tokenizer(document, return_tensors='pt')['input_ids'][:,:600]
decoder_input_ids = torch.tensor([[0]], dtype=torch.long)
# print(doc_input_ids.size())
# print(decoder_input_ids.size())

output, prob, next_token_logits, loss = run_full_model_slim(model=model, input_ids=doc_input_ids, decoder_input_ids=decoder_input_ids, device=device)


In [None]:
# load spacy
import spacy

nlp = spacy.load("en_core_web_sm")

def get_pos(inp_str):
    inp_str = inp_str.strip()
    doc = nlp(inp_str)
    for token in doc:
        return token.pos_
        # print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,token.shape_, token.is_alpha, token.is_stop)

In [None]:
# search
from scipy.stats import entropy
from typing import List

class SingleStep():
    def __init__(self, parent, t, prob, entropy, token="", token_id=0):
        self.parent = parent
        self.t = t
        self.prob=prob
        self.entropy=entropy
        self.token = token
        self.token_id = token_id
        self.is_compressive = False     # a new and blue job vs a job
        self.is_finished = False
    def classify_fact(self):
        pass

    def understand_word_type(self):
        pass

    def get_future_prediction(self):
        pass

    def get_token_ids(self):
        if self.parent == None:
            return [self.token_id]
        else:
            return self.parent.get_token_ids() + [self.token_id]

    def __repr__(self):
        return f"Generate Path: {tokenizer.decode(self.get_token_ids(),skip_special_tokens=True)} "

    def run(self):
        # expand the node, return a list of new nodes (sorted by the score)
        prefix_token_ids = self.get_token_ids()
        decoder_input_ids = torch.tensor([prefix_token_ids], dtype=torch.long)
        next_step_gen = step_generation(model, doc_input_ids,decoder_input_ids,device)
        is_collide = entity_collide(next_step_gen, prefix_token_ids)
        if is_collide:
            return []

        # predict even further TODO

        # populate
        rt_list = []
        logger.info(f"Populating...")
        logger.info(f"Prefix: {tokenizer.decode(prefix_token_ids,skip_special_tokens=True)}")
        for element in next_step_gen:
            new_ele = SingleStep(parent=self,t=self.t+1, prob=element['prob'], entropy=element['ent'], token=element['token'], token_id=element['token_id'])
            if element['token'].strip() == '.' or element['token'] == '</s>':
                new_ele.is_finished = True
            rt_list.append(new_ele)
            logger.info(f"\t--- Expansion: {element['token']}")
        logger.info(f"------")
        return rt_list[::-1]

ban_list = ['PROPN','NUM']
def entity_collide(generated_group:List, prefix_token_ids):
    # a bunch of collided entities / hallucination
    # could be a more complicated fact classifier
    cand_entropy = generated_group[0]['ent']
    if cand_entropy < 3:
        return False
    
    poss = [x['pos'] for x in generated_group]
    num_ban_pos = sum([ 1 for x in poss if x in ban_list])
    ban_rate = num_ban_pos / len(generated_group)
    if ban_rate > 0.4:
        logger.info(f"Entity collides! Entropy: {cand_entropy} Prefix:{tokenizer.decode(prefix_token_ids,skip_special_tokens=True)}")
        logger.info(f"{[x['token'] for x in generated_group]}")
        return True
    logger.info(f"HIgh entropy (wo Collision) Entropy: {cand_entropy} Prefix: {tokenizer.decode(prefix_token_ids,skip_special_tokens=True)}")
    logger.info(f"{[x['token'] for x in generated_group]}")
    return False

def step_generation(model, doc_input_ids, decoder_input_ids, device, acc_prob_threshold = 0.8, max_beam_size = 5):
    # print(doc_input_ids.size())
    # print(decoder_input_ids.size())
    output, prob, next_token_logits, loss = run_full_model_slim(model=model, input_ids=doc_input_ids, decoder_input_ids=decoder_input_ids, device=device)
    ent = entropy(prob.squeeze().cpu().numpy())
    acc_prob = 0
    top_prob, top_index = torch.topk(input=prob, k=max_beam_size)
    top_prob = top_prob.squeeze().tolist()
    top_index = top_index.squeeze().tolist()
    return_pairs = []
    for idx in range(max_beam_size):
        value = top_prob[idx]
        token_id = top_index[idx]
        token = tokenizer.decode(token_id)
        return_pairs.append(
            {'prob':value,
            'token':token,
            'token_id':token_id,
            'ent':ent,
            'pos':get_pos(token)
            })
        # if the accumulated prob > threshold, stop iteration
        acc_prob += value
        if acc_prob > acc_prob_threshold:
            break
    # return something with token tokenid and prob
    return return_pairs

def future_generation(prefix, future_step=3):

    pass



In [None]:
# init

document = "The RoBERTa model was proposed in RoBERTa: A Robustly Optimized BERT Pretraining Approach by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. It is based on Google’s BERT model released in 2018. It builds on BERT and modifies key hyperparameters, removing the next-sentence pretraining objective and training with much larger mini-batches and learning rates."
device = 'cpu'
doc_input_ids = tokenizer(document, return_tensors='pt')['input_ids'][:,:600]

stack = []
# print(tokenizer.bos_token)
# print(tokenizer.bos_token_id)
init_state = SingleStep(None, 0, prob=1.0, entropy=1e-3, token=tokenizer.bos_token, token_id=tokenizer.bos_token_id)
stack.append(init_state)

In [None]:

finished_output = []
T = 200
t = 0
max_steps = 20
while stack:
    work_node = stack.pop()
    return_output = work_node.run()
    for rt in return_output:
        if rt.is_finished or rt.t >= max_steps:
            finished_output.append(rt)
        else:
            # stack.insert(0,rt)
            stack.append(rt)
    t+=1
    if t >T:
        break

logger.info("Finished output")
[logger.info(x) for x in finished_output]

In [None]:
from datasets import load_dataset
dataset_xsum = load_dataset('xsum',split='validation')

    

In [None]:
cnt = 50
device = 'cuda:0'
model = model.to(device=device)
for data in dataset_xsum:
    document = data['document']
    summary = data['summary']
    logger.info('*'*50)
    logger.info(f"Summary: {summary}")
    # init
    doc_input_ids = tokenizer(document, return_tensors='pt')['input_ids'][:,:600]

    stack = []
    init_state = SingleStep(None, 0, prob=1.0, entropy=1e-3, token=tokenizer.bos_token, token_id=tokenizer.bos_token_id)
    stack.append(init_state)

    finished_output = []
    T = 200
    t = 0
    max_steps = 20
    while stack:
        work_node = stack.pop()
        return_output = work_node.run()
        for rt in return_output:
            if rt.is_finished or rt.t >= max_steps:
                finished_output.append(rt)
            else:
                # stack.insert(0,rt)
                stack.append(rt)
        t+=1
        if t > T:
            break

    logger.info("Finished output")
    [logger.info(x) for x in finished_output]
    cnt -= 1
    if cnt <=0:
        break

In [1]:
from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig
import logging
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
  
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-xsum")

model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-xsum")


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=26.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898822.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355863.0, style=ProgressStyle(descript…




In [8]:
print(tokenizer.eos_token)
print(tokenizer.pad_token)

</s>
<pad>


In [11]:

ARTICLE_TO_SUMMARIZE = "It is one of the most prestigious honors bestowed upon journalists and people in the arts. And today, the Pulitzer prize for journalism went to The Post and Courier newspaper of Charleston, South Carolina  - which has a tiny staff of just 80 and a daily circulation of 85,000. The paper's powerful photo series entitled 'Till Death Do Us Part,' on domestic violence scooped the top award for its exploration into why South Carolina is among the deadliest states for women. Winner: This iconic photo by New York Times photographer Daniel Berehulak, was part of a winning series, and shows James Dorbor, 8, suspected of being infected with Ebola, being carried by medical staff to an Ebola treatment center in Monrovia, Liberia. Death: This photograph released by the Pulitzer Board on 20 April 2015 shows one of the winning photographs by Daniel Berehulak who won the 2015 Pulitzer Prize for Feature Photography. </s> One small newspaper"

# ARTICLE_TO_SUMMARIZE =""

inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='pt')

# Generate Summary
summary_ids = model.generate(inputs['input_ids'], num_beams=10, min_length=60, max_length=60, early_stopping=True)
print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids])

["One of the world's smallest newspapers has won one of the most prestigious awards in the world of journalism - the Pulitzer Prize for Feature Photography, it has been announced on the eve of the awards ceremony in Washington, D.C. and the winners will be announced in New York on Monday."]


In [1]:

ARTICLE_TO_SUMMARIZE = "It is one of the most prestigious honors bestowed upon journalists and people in the arts. And today, the Pulitzer prize for journalism went to The Post and Courier newspaper of Charleston, South Carolina  - which has a tiny staff of just 80 and a daily circulation of 85,000. The paper's powerful photo series entitled 'Till Death Do Us Part,' on domestic violence scooped the top award for its exploration into why South Carolina is among the deadliest states for women. Winner: This iconic photo by New York Times photographer Daniel Berehulak, was part of a winning series, and shows James Dorbor, 8, suspected of being infected with Ebola, being carried by medical staff to an Ebola treatment center in Monrovia, Liberia. Death: This photograph released by the Pulitzer Board on 20 April 2015 shows one of the winning photographs by Daniel Berehulak who won the 2015 Pulitzer Prize for Feature Photography"

# ARTICLE_TO_SUMMARIZE =""

inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='pt')

# Generate Summary
summary_ids = model.generate(inputs['input_ids'], num_beams=10, min_length=5, max_length=5, early_stopping=True,num_return_sequences=100)

NameError: name 'tokenizer' is not defined