In [1]:
import transformers
from transformers import BloomForQuestionAnswering
from transformers import BloomForCausalLM
from transformers import BloomTokenizerFast
import torch

from os.path import exists
from os.path import isdir

In [2]:
model = BloomForCausalLM.from_pretrained("bigscience/bloom-7b1") #remove -1b7 for whole modelb
tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloom-7b1")

Downloading:   0%|          | 0.00/734 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/27.5k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.16G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/222 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

In [4]:
prompt = "It was a dark and stormy night"
result_length = 50
inputs = tokenizer(prompt, return_tensors="pt") #??

In [5]:
# Greedy Search
print(tokenizer.decode(model.generate(inputs["input_ids"], 
                       max_length=result_length
                      )[0]))

It was a dark and stormy night, and the wind howled and howled, and the
storm was so great that the sea was rolling in great waves, and the
ship was tossing and rolling, and the sea was so rough


# Load train and validation data

In [2]:
def error(msg):
    print('  [\033[91mx\033[0m] ' + msg)
    exit(1)

def success(msg):
    print('  [\033[92mo\033[0m] ' + msg)
    
def load_json_lines(f):
    if not exists(f):
        error('The file "' + f + '" does not exist.')

    ret = []
    num = 1
    
    if isdir(f):
        f = glob(f + '/*.json*')
        
        if len(f) != 1:
            error('The input is an directory that contains multiple json files. Please create only a single json file. Got ' + str(f))
        
        f = f[0]
    
    with open(f, 'r') as inp:
        for l in inp:
            try:
                ret += [json.loads(l)]
            except:
                error('Invalid line ' + str(num) + ' in "' + f + '" with content: ' + l.strip())
            num += 1

    success('The file ' + f + ' is in JSONL format.')
    return ret

In [3]:
train = load_json_lines('train.jsonl')
validation = load_json_lines('validation.jsonl')
test = load_json_lines('input.jsonl')

  [[92mo[0m] The file train.jsonl is in JSONL format.
  [[92mo[0m] The file validation.jsonl is in JSONL format.
  [[92mo[0m] The file input.jsonl is in JSONL format.


In [5]:
#Function for creating prompt from entry

def make_prompt(entry):
    context = ' '.join(entry['targetParagraphs'])
    clickbait = entry['postText'][0]
    prompt = f"Context: {context}\n\nQuestion: {clickbait}\n\nAnswer: "
    
    ##
    
    #if '?' in clickbait:
    #    prompt = f"Context: {context}\n\Question: {clickbait} \n\nAnswer:"  ## Statement version
    #else:
    #    prompt = f"Context: {context}\n\n{clickbait}:"
    return prompt
    
prompt = make_prompt(train[38])
inputs = tokenizer(prompt, return_tensors="pt") #??

In [6]:
predictions_from_prompts = []
prompts = [make_prompt(x) for x in validation]

for i, prompt in enumerate(prompts):
    print(i,"/",len(prompts))
    inputs = tokenizer(prompt, return_tensors="pt")
    prediction = tokenizer.decode(model.generate(inputs["input_ids"], 
                           max_length=len(inputs['input_ids'][0])+20
                          )[0])
    anchor = prompt[-15:]
    pred = prediction[prediction.find(anchor)+15:].strip('</s>').strip()
    if '\n' in pred:
        #pred = pred.replace('\n', ' ')
        pred = [x for x in pred.split('\n') if len(x)!=0][0]
    predictions_from_prompts.append(pred)
    print('title:',validation[i]['postText'][0],'\nprediction:',pred, '\ntarget:', validation[i]['spoiler'][0], 
          '\nhuman target:',validation[i]['provenance']['humanSpoiler'])

0 / 800
title: Five Nights at Freddy’s Sequel Delayed for Weird Reason 
prediction: The game is being delayed because it’s too dark. 
target: some of the plot elements are so disturbing that they are making him feel sick 
human target: They think It's too dark. #StopClickBait
1 / 800
title: Why Arizona Sheriff Joe Arpaio’s fate could hang on a single word 
prediction: The judge found that Arpaio and his aides had engaged in multiple acts of misconduct, di 
target: "intentionally" 
human target: "Intentionally", because it's hard to judge intentionality.
2 / 800
title: Here’s how much you should be tipping your hairdresser 
prediction: The amount you should tip your hairdresser depends on the service you received and the length of time 
target: 20% 
human target: None
3 / 800
title: "Harry Potter" alums reunite for new movie 
prediction: The cast of "Harry Potter" reunites for a new movie. 
target: Alan Rickman & Rupert Grint 
human target: Alan Rickman and Rupert Grint in "CBGB"
4 / 80

In [70]:
import pickle

with open('prompt_spoilers_7b1.dat', 'wb') as f:
    pickle.dump(predictions_from_prompts, f)

In [16]:
i = 38 #37 is good
train[i]['postText'], train[i]['spoiler'], train[i]['provenance']['humanSpoiler']

(['This Is What Bill Clinton Will Actually Be Called If Hillary Becomes President'],
 ['the first gentleman'],
 'The First Gentleman')

In [51]:
train[0].keys()

dict_keys(['uuid', 'postId', 'postText', 'postPlatform', 'targetParagraphs', 'targetTitle', 'targetDescription', 'targetKeywords', 'targetMedia', 'targetUrl', 'provenance', 'spoiler', 'spoilerPositions', 'tags'])

# BLOOM For Question-Answering

In [4]:
from transformers import pipeline, AutoTokenizer

In [5]:
tokenizer = AutoTokenizer.from_pretrained('deepset/roberta-base-squad2')
question_answering = pipeline("question-answering", model="deepset/roberta-base-squad2", tokenizer=tokenizer)

Downloading (…)okenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/496M [00:00<?, ?B/s]

In [6]:
spoilers_from_qa = []
for i, entry in enumerate(validation):
    print(i,'/',len(validation))
    context = ' '.join(entry['targetParagraphs'])
    question = entry['postText'][0]
    result = question_answering(question=question, context=context)

    print('title:',entry['postText'][0],'\nprediction:',result['answer'])
    spoilers_from_qa.append(result['answer'])

0 / 800
title: Five Nights at Freddy’s Sequel Delayed for Weird Reason 
prediction: too dark
1 / 800
title: Why Arizona Sheriff Joe Arpaio’s fate could hang on a single word 
prediction: intentionally
2 / 800
title: Here’s how much you should be tipping your hairdresser 
prediction: slightly smaller tip
3 / 800
title: "Harry Potter" alums reunite for new movie 
prediction: Michael Gambon
4 / 800
title: A man swallowed a microSD card and you won't believe what happened next! 
prediction: John Doe
5 / 800
title: This popular soda could cure your hangovers scientists say: 
prediction: Sprite
6 / 800
title: The anytime snack you won't feel guilty about eating 
prediction: midnight
7 / 800
title: You won't believe this stunning "Harry Potter" revelation about Professor McGonagall 
prediction: she had been teaching at Hogwarts for "Thirty-nine years this December".
8 / 800
title: J.J. Abrams has an answer on if there will be a post-credits scene in the new 'Star Wars' 
prediction: No, there’

In [13]:
import pickle
with open('roberta_predictions_test_data.dat', 'wb') as f:
    pickle.dump(spoilers_from_qa, f)

In [22]:
 # Example of fine-tuning BLOOM https://www.kaggle.com/code/julianschelb/finetune-bloom-token-classification

In [42]:
entry['spoiler'][0]
context.find('how about that morning we go throw?')

772

In [44]:
context[772:807]

'how about that morning we go throw?'

In [68]:
len(train)

25600

In [7]:
import pandas as pd
df = pd.read_json('validation.jsonl', lines=True)

In [8]:
spoilers = df[["uuid","postId"]]
spoilers['spoiler'] = spoilers_from_qa

json_output = spoilers.to_json(orient='records', lines=True)

with open(f"roberta_test_predictions.jsonl", 'w') as f:
    f.write(json_output)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spoilers['spoiler'] = spoilers_from_qa


In [19]:
json_output

'{"uuid":"2f3e30d7-972c-4812-b727-6b582de52137","postId":"420585068076101633","spoiler":"chai tea soap"}\n{"uuid":"45425f74-4c0e-420c-aea8-6337ea91e107","postId":"4km87q","spoiler":"Gwyneth Paltrow"}\n{"uuid":"2fb84020-cfdb-4b7f-9cfc-b891c306c946","postId":"4s4f3b","spoiler":"rubbish"}\n{"uuid":"7cc8b816-a3fe-4b3f-9568-b353f87a3161","postId":"429398501613895680","spoiler":"JaVale McGee"}\n{"uuid":"b90a7343-ab27-4750-8e12-43f3f03fc9e5","postId":"828292353706176512","spoiler":"Alex Owens-Sarno"}\n{"uuid":"1b83a7b8-72fa-4b21-8dd3-a5cff484d791","postId":"832816169686069248","spoiler":"They just weren\\u2019t quite as intense"}\n{"uuid":"b63a63ce-38be-498d-8eca-f571d713e05c","postId":"4t7tae","spoiler":"insecurity"}\n{"uuid":"fd5ef1e7-63ce-499c-9fc3-00d5209ccef6","postId":"5195lx","spoiler":"crazy, but after you watch this video from Allrecipes"}\n{"uuid":"3802f457-cee5-4324-80a0-161cc387785b","postId":"812568310223175681","spoiler":""}\n{"uuid":"2be7bbfb-5c97-4b1e-925d-8d76fff2d9da","postI