In [4]:
import transformers
from transformers import BloomForQuestionAnswering
from transformers import BloomForCausalLM
from transformers import BloomTokenizerFast
import torch

from os.path import exists
from os.path import isdir

RuntimeError: Failed to import transformers.models.bloom.modeling_bloom because of the following error (look up to see its traceback):
module 'torch' has no attribute 'Tensor'

In [2]:
model = BloomForCausalLM.from_pretrained("bigscience/bloom-7b1") #remove -1b7 for whole modelb
tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloom-7b1")

In [4]:
prompt = "It was a dark and stormy night"
result_length = 50
inputs = tokenizer(prompt, return_tensors="pt") #??

In [5]:
# Greedy Search
print(tokenizer.decode(model.generate(inputs["input_ids"], 
                       max_length=result_length
                      )[0]))

It was a dark and stormy night, and the wind howled and howled, and the
storm was so great that the sea was rolling in great waves, and the
ship was tossing and rolling, and the sea was so rough


# Load train and validation data

In [1]:
def error(msg):
    print('  [\033[91mx\033[0m] ' + msg)
    exit(1)

def success(msg):
    print('  [\033[92mo\033[0m] ' + msg)
    
def load_json_lines(f):
    if not exists(f):
        error('The file "' + f + '" does not exist.')

    ret = []
    num = 1
    
    if isdir(f):
        f = glob(f + '/*.json*')
        
        if len(f) != 1:
            error('The input is an directory that contains multiple json files. Please create only a single json file. Got ' + str(f))
        
        f = f[0]
    
    with open(f, 'r') as inp:
        for l in inp:
            try:
                ret += [json.loads(l)]
            except:
                error('Invalid line ' + str(num) + ' in "' + f + '" with content: ' + l.strip())
            num += 1

    success('The file ' + f + ' is in JSONL format.')
    return ret

In [2]:
train = load_json_lines('train.jsonl')
validation = load_json_lines('validation.jsonl')

NameError: name 'exists' is not defined

In [60]:
#Function for creating prompt from entry

def make_prompt(entry):
    context = ' '.join(entry['targetParagraphs'])
    clickbait = entry['postText'][0]
    #prompt = f"Context: {context}\n\nQuestion: {clickbait}\n\nAnswer: "
    if '?' in clickbait:
        prompt = f"Context: {context}\n\Question: {clickbait} \n\nAnswer:"
    else:
        prompt = f"Context: {context}\n\n{clickbait}:"
    return prompt
    
prompt = make_prompt(train[38])
inputs = tokenizer(prompt, return_tensors="pt") #??

In [61]:
prompt

'Context: If Hillary Clinton triumphs over Donald to become this nation’s next president, Bill Clinton will find himself in a position no man has ever been: the spouse of the President of the United States. And people can’t stop wondering what exactly his title will be. Hillary has been asked this question repeatedly during her campaign (which probably gets annoying), and she’s brought up an important point: Now, it’s a little bit more complicated with him because people still call former presidents Mr. President. So I have to really work on this. After all, you can’t be calling someone Mr. President every day right next to Madam President. Hillary mused on the issue with Jimmy Kimmel. First dude, first mate, first gentleman, I’m just not sure. But, more recently, they seem to have settled on a title. I’ve already told my husband that if I’m so fortunate enough to be president and he will be the first gentleman. I kind of love how she phrased it here, she did not ask him, she told him 

In [68]:
predictions_from_prompts = []
prompts = [make_prompt(x) for x in validation]

for i, prompt in enumerate(prompts):
    print(i,"/",len(prompts))
    inputs = tokenizer(prompt, return_tensors="pt")
    prediction = tokenizer.decode(model.generate(inputs["input_ids"], 
                           max_length=len(inputs['input_ids'][0])+20
                          )[0])
    anchor = prompt[-15:]
    pred = prediction[prediction.find(anchor)+15:].strip('</s>').strip()
    if '\n' in pred:
        #pred = pred.replace('\n', ' ')
        pred = [x for x in pred.split('\n') if len(x)!=0][0]
    predictions_from_prompts.append(pred)
    print('title:',validation[i]['postText'][0],'\nprediction:',pred, '\ntarget:', validation[i]['spoiler'][0], 
          '\nhuman target:',validation[i]['provenance']['humanSpoiler'])

0 / 800
title: Five Nights at Freddy’s Sequel Delayed for Weird Reason 
prediction: Five Nights at Freddy’s creator Scott Cawthon takes to Steam to tease a possible delay 
target: some of the plot elements are so disturbing that they are making him feel sick 
human target: They think It's too dark. #StopClickBait
1 / 800
title: Why Arizona Sheriff Joe Arpaio’s fate could hang on a single word 
prediction: intentionally 
target: "intentionally" 
human target: "Intentionally", because it's hard to judge intentionality.
2 / 800
title: Here’s how much you should be tipping your hairdresser 
prediction: Tipping your stylist is a personal choice, but it’s important to remember that tipping i 
target: 20% 
human target: None
3 / 800
title: "Harry Potter" alums reunite for new movie 
prediction: Rickman, Grint, Bartha, Adams 
target: Alan Rickman & Rupert Grint 
human target: Alan Rickman and Rupert Grint in "CBGB"
4 / 800
title: A man swallowed a microSD card and you won't believe what happen

In [70]:
import pickle

with open('prompt_spoilers_statements_7b.dat', 'wb') as f:
    pickle.dump(predictions_from_prompts, f)

In [16]:
i = 38 #37 is good
train[i]['postText'], train[i]['spoiler'], train[i]['provenance']['humanSpoiler']

(['This Is What Bill Clinton Will Actually Be Called If Hillary Becomes President'],
 ['the first gentleman'],
 'The First Gentleman')

In [51]:
train[0].keys()

dict_keys(['uuid', 'postId', 'postText', 'postPlatform', 'targetParagraphs', 'targetTitle', 'targetDescription', 'targetKeywords', 'targetMedia', 'targetUrl', 'provenance', 'spoiler', 'spoilerPositions', 'tags'])

# BLOOM For Question-Answering

In [5]:
from transformers import pipeline

In [6]:
question_answering = pipeline("question-answering", model="bigscience/bloom-7b1", tokenizer=tokenizer)

Some weights of BloomForQuestionAnswering were not initialized from the model checkpoint at bigscience/bloom-7b1 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
spoilers_from_qa = []
for i, entry in enumerate(validation):
    print(i,'/',len(validation))
    context = ' '.join(entry['targetParagraphs'])
    question = entry['postText'][0]
    result = question_answering(question=question, context=context)

    print('title:',entry['postText'][0],'\nprediction:',result['answer'], '\ntarget:', entry['spoiler'][0], 
          '\nhuman target:',entry['provenance']['humanSpoiler'])
    spoilers_from_qa.append(result['answer'])

0 / 800




title: Five Nights at Freddy’s Sequel Delayed for Weird Reason 
prediction:  Cawthon said that some of the plot elements are so 
target: some of the plot elements are so disturbing that they are making him feel sick 
human target: They think It's too dark. #StopClickBait
1 / 800
title: Why Arizona Sheriff Joe Arpaio’s fate could hang on a single word 
prediction:  PHOENIX — A single word — "intentionally" 
target: "intentionally" 
human target: "Intentionally", because it's hard to judge intentionality.
2 / 800
title: Here’s how much you should be tipping your hairdresser 
prediction:  hairdresser has to be one of the more 
target: 20% 
human target: None
3 / 800
title: "Harry Potter" alums reunite for new movie 
prediction:  "chocolate" would 
target: Alan Rickman & Rupert Grint 
human target: Alan Rickman and Rupert Grint in "CBGB"
4 / 800
title: A man swallowed a microSD card and you won't believe what happened next! 
prediction:  swallowed the 
target: a man who swallowed a 64GB mi

In [9]:
import pickle

with open('spoilers_qa_pretrained_7b1.dat', 'wb') as f:
    pickle.dump(spoilers_from_qa, f)

In [22]:
 # Example of fine-tuning BLOOM https://www.kaggle.com/code/julianschelb/finetune-bloom-token-classification

In [42]:
entry['spoiler'][0]
context.find('how about that morning we go throw?')

772

In [44]:
context[772:807]

'how about that morning we go throw?'

In [68]:
len(train)

25600

In [60]:
for i in train:
    train2.append(i)

# Fine Tuning test

In [7]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-7b1")

In [8]:
def preprocess(data):
    questions = [x['postText'][0].strip() for x in data]
    contexts = [x['targetTitle'] + ' - ' + ' '.join(x['targetParagraphs']) for x in data]
    answers = [x['spoiler'][0] for x in data] #Extracted spoilers
    
    inputs = tokenizer(
        questions,
        contexts,
        max_length=384,   # Not sure which is best
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )
    
    offset_mapping = inputs.pop("offset_mapping")
    start_positions = [contexts[i].find(answers[i]) for i in range(len(questions))]
    end_positions = [start_positions[i]+len(answers[i]) for i in range(len(questions))]
    
    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions

    return inputs

In [9]:
tokenized_train = preprocess(train)
tokenized_valid = preprocess(validation)

from transformers import default_data_collator

data_collator = default_data_collator

In [10]:
type(tokenized_train)

transformers.tokenization_utils_base.BatchEncoding

In [11]:
from datasets import Dataset
#tokenized_train['label'] = tokenized_train['input_ids'].copy()
#tokenized_valid['label'] = tokenized_valid['input_ids'].copy()

dataset_train = Dataset.from_dict(tokenized_train)
dataset_valid = Dataset.from_dict(tokenized_valid)


In [12]:
len(dataset_train['input_ids'][0])

1024

In [13]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

model = AutoModelForQuestionAnswering.from_pretrained("bigscience/bloom-7b1")

Some weights of BloomForQuestionAnswering were not initialized from the model checkpoint at bigscience/bloom-7b1 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_valid,
    tokenizer=tokenizer,
    data_collator=data_collator
)



In [None]:
trainer.train()

***** Running training *****
  Num examples = 3200
  Num Epochs = 3
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 9600
  Number of trainable parameters = 7069024258


Epoch,Training Loss,Validation Loss
