In [1]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
import pandas as pd
import torch

In [2]:
model_name = "deepset/roberta-base-squad2"

In [3]:
nlp = pipeline('question-answering', model=model_name, tokenizer=model_name)
QA_input = {
    'question': 'Why is model conversion important?',
    'context': 'The option to convert models between FARM and transformers gives freedom to the user and let people easily switch between frameworks.'
}
res = nlp(QA_input)

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/473M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [141]:
QA_input = {
    'question': 'Why is model conversion liked?',
    'context': 'I am as asdf d it is cool'
}
res = nlp(QA_input)
res['answer']

'cool'

In [158]:
question = 'Why is model conversion liked?'
text = 'I am as asdf d it is cool'
input_ids = tokenizer.encode(question,text)


In [159]:
tokens = tokenizer.convert_ids_to_tokens(input_ids)

In [160]:
output = model(torch.tensor([input_ids]))

In [161]:
tokens

['<s>',
 'Why',
 'Ġis',
 'Ġmodel',
 'Ġconversion',
 'Ġliked',
 '?',
 '</s>',
 '</s>',
 'I',
 'Ġam',
 'Ġas',
 'Ġas',
 'df',
 'Ġd',
 'Ġit',
 'Ġis',
 'Ġcool',
 '</s>']

In [156]:
answer_start = torch.argmax(output.start_logits)
answer_end = torch.argmax(output.end_logits)
if answer_end >= answer_start:
    answer = " ".join(tokens[answer_start:answer_end+1])
else:
    print("I am unable to find the answer to this question. Can you please ask another question?")
    
print("\nQuestion:\n{}".format(question.capitalize()))
print("\nAnswer:\n{}.".format(answer.capitalize()))


Question:
Why is model conversion liked?

Answer:
<s>.


In [142]:
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

loading configuration file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/config.json from cache at /Users/oliver/.cache/huggingface/transformers/c40d0abb589629c48763f271020d0b1f602f5208c432c0874d420491ed37e28b.122ed338b3591c07dba452777c59ff52330edb340d3d56d67aa9117ad9905673
Model config RobertaConfig {
  "_name_or_path": "deepset/roberta-base-squad2",
  "architectures": [
    "RobertaForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "language": "english",
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "name": "Roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.16.2",


In [133]:
from transformers import TrainingArguments

training_args = TrainingArguments("test_trainer")

In [134]:
training_args

TrainingArguments(
_n_gpu=0,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
bf16=False,
bf16_full_eval=False,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_steps=None,
evaluation_strategy=IntervalStrategy.NO,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_strategy=HubStrategy.EVERY_SAVE,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=5e-05,
length_column_name=length,
load_best_model_at_end=False,
local_rank=-1,
log_level=-1,
log_level_replica=-1,
log_on_each_node=True,
logging_dir=test_trainer/runs/Feb2

In [132]:
from transformers import Trainer

In [139]:
trainer = Trainer(model=model, args=training_args, train_dataset=imdb_dataset['train'], tokenizer=tokenizer)

In [140]:
trainer.train()

***** Running training *****
  Num examples = 25000
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 9375


ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided ['text', 'label']

In [27]:
data = pd.read_pickle('Documents/CS224N/Saved-You-A-Click-CS224N/data_full_pandas.pkl')

In [79]:
i = 24
QA_input = {
    'question': 'Why are new plant burgers not for vegans?',#data['teaser'][i],
    'context': data['article'][i]
}

In [80]:
res = nlp(QA_input)
res

{'score': 0.3877977728843689,
 'start': 485,
 'end': 512,
 'answer': 'because of how it is cooked'}

In [41]:
context = 'As fears over the coronavirus outbreak spread, thousands of Americans are clamoring to buy face masks in an effort to protect themselves, sending prices soaring and leading manufacturers like 3M to ramp up production. However, experts say stocking up on face masks is actually misguided — and there\'s a much simpler thing you could be doing right now to protect yourself.\n\nThere\'s a lot the general public likely doesn\'t realize about these masks — namely, that they are not the best way to prevent the spread of coronavirus.\n\nWearing a mask is more for people already showing symptoms of coronavirus and their caregivers than for people trying to prevent it\n\nThe Centers for Disease Control and Prevention said it "does not recommend that people who are well wear a facemask to protect themselves from respiratory diseases, including COVID-19," referring to the disease caused by the new coronavirus. Rather, experts caution that putting on a face mask without proper fitting and training could actually increase your risk.\n\n"If it\'s not fitted right, you\'re going to fumble with it," explained Health and Human Services Secretary Alex Azar before a House Appropriations subcommittee on Wednesday. "You\'re going to be touching your face, which is the No. 1 way you\'re going to get disease, is unclean hands touching your face."\n\nOn the other hand, if you are already coughing and showing symptoms of possible coronavirus illness, that\'s when wearing a mask could be helpful for protecting those around you.\n\n"The data on the effectiveness of masks for preventing respiratory virus infections is not very clear, " explains Dr. Andrew Stanley Pekosz of Johns Hopkins\' Bloomberg School of Public Health. "The best data suggests that if you are ill and showing symptoms, wearing a mask can reduce the chances that you spread the virus to others."\n\nCloth surgical masks are not helpful at all\n\nThe common surgical mask you might be picturing in your head will not help you at all, Pekosz said.\n\nA type called an N95 respirator mask, if properly fitted, can block large-particle droplets that may contain germs, but the FDA warns they cannot filter out "very small particles in the air that may be transmitted by coughs [or] sneezes."\n\n"An N95 mask is the one that is most practical," Pekosz tells CBS News. "It stops 95% of particles of a certain size. ... There is a N99 mask, which blocks 99% of particles, but that mask is difficult to wear for long periods of time because it is hard to breathe through it."\n\nRespirator masks are more expensive. The FDA also notes they are not designed to fit children or people with facial hair.\n\nEven a good face mask isn\'t enough\n\n"Masks shouldn\'t be considered to be the sole item that can protect you from infection, but it can be one of several things that can help you stay uninfected," said Pekosz.\n\n"Wash your hands frequently. Practice social distancing — stay 5 feet away from people to avoid being close enough to be exposed to respiratory droplets from that person. More specific guidance will be given by the CDC soon, but those two things should be practiced by people on a daily basis to reduce the spread of respiratory viruses."\n\nAnd he adds, "Get a flu shot — influenza has killed over 16,000 Americans this year and is still causing disease across the U.S."\n\nYou have to change masks every few hours\n\nIf you do go the mask route in spite of expert advice, it\'s important to note that face masks have a very specific lifespan. While there are some with longer lifespans or that have replaceable filters, the most common face masks on the market are disposable and single use. Each one of those is only good for a few hours.\n\n"You want to change masks every few hours to make sure that they are functioning properly and aren\'t getting contaminated with virus particles on the outside," Pekosz tells CBS News. "It\'s not like putting one on protects you. One has to follow specific procedures to ensure you are using them effectively."\n\nBuying face masks for personal use could cause a shortage at hospitals\n\n"There is a limited supply of masks and while companies are increasing their production, demand is increasing at a very high rate," cautions Pekosz. "There will most likely be shortages of personal protective equipment at medical institutions and this may in part be driven by supplies being purchased by the general public. Emergency preparedness efforts will address supply chains, but there really is no reason for the general public to purchase large numbers of N95 masks."\n\nAmerica\'s largest face mask manufacturer, Prestige Ameritech, is a small business based in Texas with only 100 employees. And while they have no problem fulfilling America\'s normal demand for face masks and respirators, they are now struggling to keep up.\n\nMike Bowen, the company\'s executive vice president, told CBS News that they now field orders of up to 100 million face masks and respirators a day. He also noted that while the company does not ship its products internationally, in the last 30 days it has sold between 1 million and 2 million masks to buyers who then sent them to others in China and Hong Kong.\n\nThis huge spike in personal orders is precisely what experts fear will cause a dangerous inventory shortage in American hospitals — a shortage that is entirely avoidable, given that there are no proven benefits to the general public wearing masks.\n\nThe best way to prevent coronavirus: Wash your hands\n\nThe right way to wash your hands\n\nExperts say washing your hands is the best way to prevent the spread of infectious illnesses like coronavirus. That\'s because one of the most common ways infections spread is when people touch a contaminated surface and then touch their mouth or nose.\n\nWash your hands frequently and thoroughly. CBS News chief medical correspondent Dr. Jon LaPook points out that it\'s especially important to make sure that you scrub the soap into your fingertips because they are simultaneously the part of the hand most often neglected and the part of the hand most likely to touch your face and spread disease.\n\nSoap and water is far more effective than hand sanitizer. If you\'re using an alcohol-based hand sanitizer, you should make sure that it contains at least 60% alcohol.\n\nBeyond that, the CDC advises that, whenever possible, you should also avoid touching your eyes, nose and mouth with unwashed hands, avoid contact with sick people, cover your mouth when you cough and sneeze, and disinfect objects and surfaces frequently.'

In [190]:
dataset_dict2 = {
    'question': ['what is your name?'],
    'context': ['hello my name is Janet thank you'],
    'answer': ['Janet'],
    'answer_start': [0]
}
answer = dataset_dict['answer'][0] 
context = '   Janet'
indicies = []



Janet


In [244]:
data = pd.read_pickle('Documents/CS224N/Saved-You-A-Click-CS224N/data_full_pandas.pkl')

In [243]:
data.shape

(2385, 5)

In [247]:
data = data.dropna()


In [249]:
list_of_articles= data['article'].values.tolist()
list_of_answers = data['answer'].values.tolist()


In [304]:
list_of_questions = data['teaser'].values.tolist()
list_of_contexts = data['article'].values.tolist()



In [278]:
list_of_answers2 = []
for a in list_of_answers:
    if '.' in a:
        for i in range(len(a)):
            if a[i] =='.':
                b = a[:i]
                list_of_answers2.append(b)   
                break
    else:
        list_of_answers2.append(a)   

In [318]:
answers_dict = []
i = 0
for a in list_of_answers:
    d = {}
    d['text']=[a]
    d['answer_start'] = [ind2[i]]
    answers_dict.append(d)
    i+=1

In [303]:
answers_dict

[{'answer': 'It would mean the person who gets the most votes wins',
  'answer_start': -1},
 {'answer': 'This is a piece of shit article. The wife’s great aunt put wine glasses, an aromatic bath kit, and a couple of notes saying, “Go to dinner” and “Buy flowers.” Suck a dick with this garbage',
  'answer_start': -1},
 {'answer': 'Makes a viral social media post, wins full custody of daughter, father and girlfriend suspended from their firefighter job, under investigation for abuse, daughter raises $3k for hair, gets wig. (33 clicks saved)',
  'answer_start': -1},
 {'answer': 'Because Mexican government says breweries are among non-essential businesses which must close',
  'answer_start': -1},
 {'answer': 'Their actors used to date and it ended badly.',
  'answer_start': -1},
 {'answer': "They're making changes to their browser which will block ad-blocking extensions from blocking ads to everyone except enterprise users",
  'answer_start': -1},
 {'answer': "Still don't know after 201 cl

In [293]:
ind = []
for i in range(0,len(list_of_articles)):
    ind.append(findIndices(list_of_answers[i],list_of_articles[i]))

In [294]:
ind2 = []
for i in range(0,len(list_of_articles)):
    ind2.append(findIndices(list_of_answers2[i],list_of_articles[i]))

In [297]:
num = 0
j = 0
for i in ind2:
    if i!=-1:
        num +=1
    j+=1
print(num)

258


In [253]:
def findIndices(answer,context):
    indicies = []
    if answer in context:
        for i in range(len(context)):
            if answer == context[i:i+len(answer)]:
                indicies.append(i)
    else:
        indicies.append(-1)
    return indicies[0]

In [None]:
list_of_questions = data['teaser'].values.tolist()
list_of_contexts

In [310]:
dataset_dict = {
    'question': list_of_questions,
    'context': list_of_contexts,
    'answer': answers_dict
}

In [346]:
# make reduced version of dataset_dict:
questions_reduced = []
contexts_reduced = []
answers_reduced = []
id_reduced = []
for i in range(len(list_of_questions)):
    if answers_dict[i]['answer_start'][0]!=-1:
        questions_reduced.append(list_of_questions[i])
        contexts_reduced.append(list_of_contexts[i])
        answers_reduced.append(answers_dict[i])
        id_reduced.append(i)
            
dataset_dict_reduced = {
    'question': questions_reduced,
    'context': contexts_reduced,
    'answer': answers_reduced,
    'id': id_reduced
}


In [17]:
a1 = ['hello this is a fox']
b1 = ['hello this is a rabbit']

In [18]:
a = a1[0].split(' ')
b = b1[0].split(' ')

In [19]:
len(list(set(a) & set(b)))/max(len(a),len(b))

0.8

In [366]:
N = len(questions_reduced)
import numpy as np

In [390]:
inds = np.arange(0,N)
np.random.seed(42)
np.random.shuffle(inds)

In [396]:
# convert to numpy
questions_reduced_np = np.array(questions_reduced)
contexts_reduced_np = np.array(contexts_reduced)
answers_reduced_np = np.array(answers_reduced)
id_reduced_np = np.array(id_reduced)

In [398]:
questions_reduced_np_shuffled = questions_reduced_np[inds]
contexts_reduced_np_shuffled = contexts_reduced_np[inds]
answers_reduced_np_shuffled = answers_reduced_np[inds]
id_reduced_np_shuffled = id_reduced_np[inds]

In [410]:
train_size = 200
val_size = 30
test_size = 28
dataset_dict_train = {
    'question': list(questions_reduced_np_shuffled[0:train_size]),
    'context': list(contexts_reduced_np_shuffled[0:train_size]),
    'answer': list(answers_reduced_np_shuffled[0:train_size]),
    'id': list(id_reduced_np_shuffled[0:train_size])
}
dataset_dict_val = {
    'question': list(questions_reduced_np_shuffled[train_size:train_size+val_size]),
    'context': list(contexts_reduced_np_shuffled[train_size:train_size+val_size]),
    'answer': list(answers_reduced_np_shuffled[train_size:train_size+val_size]),
    'id': list(id_reduced_np_shuffled[train_size:train_size+val_size])
}
dataset_dict_test = {
    'question': list(questions_reduced_np_shuffled[train_size+val_size:]),
    'context': list(contexts_reduced_np_shuffled[train_size+val_size:]),
    'answer': list(answers_reduced_np_shuffled[train_size+val_size:]),
    'id': list(id_reduced_np_shuffled[train_size+val_size:])
}

In [12]:
dataset_test['id'] = [str(i) for i in dataset_test['id']]

In [11]:
dataset_val['id'] = [str(i) for i in dataset_val['id']]

In [10]:
dataset_train['id'] = [str(i) for i in dataset_train['id']]

In [20]:
with open('dataset_dict_train.pickle', 'wb') as handle:
    pickle.dump(dataset_train, handle)
with open('dataset_dict_val.pickle', 'wb') as handle:
    pickle.dump(dataset_val, handle)
with open('dataset_dict_test.pickle', 'wb') as handle:
    pickle.dump(dataset_test, handle)

In [9]:
with open('dataset_dict_train.pickle', 'rb') as handle:
    dataset_train = pickle.load(handle)
with open('dataset_dict_val.pickle', 'rb') as handle:
    dataset_val = pickle.load(handle)
with open('dataset_dict_test.pickle', 'rb') as handle:
    dataset_test = pickle.load(handle)

In [19]:
with open('dataset_dict_train.pickle', 'wb') as handle:
    pickle.dump(dataset_dict_train, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('dataset_dict_val.pickle', 'wb') as handle:
    pickle.dump(dataset_dict_val, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('dataset_dict_test.pickle', 'wb') as handle:
    pickle.dump(dataset_dict_test, handle, protocol=pickle.HIGHEST_PROTOCOL)

NameError: name 'dataset_dict_train' is not defined

In [22]:
with open('dataset_dict_test_reddit.pickle', 'rb') as handle:
    dataset_reddit_test = pickle.load(handle)

In [33]:
answ = []
for i in range(len(dataset_reddit_test['answer'])):
    answ.append(dataset_reddit_test['answer'][i]['text'][0])
len(answ)

250

In [40]:
df_reddit_test = pd.DataFrame()
df_reddit_test['question'] = dataset_reddit_test['question']
df_reddit_test['context'] = dataset_reddit_test['context']
df_reddit_test['answer'] = answ
df_reddit_test['answer_user'] = user_asnwer
df_reddit_test['id'] = dataset_reddit_test['id']

In [38]:
answ

['but guacamole',
 'water.',
 '“I don’t know,” he says',
 'is Natasha',
 '"Sayonara, baby"',
 "We don't have a date yet.",
 'on opening credits.',
 'just having fun',
 'Cuttlefish eyes have well-developed depth perceptionTo execute the study',
 'The RPC and NPC Belarus will participate as neutrals at the Beijing 2022 Paralympic Winter Games under the following',
 'Spirit of the North: Enhanced Edition is coming to Xbox Series X and Xbox Series S sometime in early 2021',
 'the building across from Henry Law Park Adventure Playground in Dover.',
 '78 degrees',
 'The legislation would ensure men were penalised for ejaculating outside a vagina',
 'with diet soda',
 'it lived in the ',
 'recognize that they aren’t enemies, and team up.',
 'turn your body and your face away at a 45-degree angle and smile',
 'likely stems from his inability to admit that his instincts are ever wrong.',
 'American cars don’t sell in Japan',
 'not Starz.',
 'poop',
 'Limitless',
 'drink water',
 'her escort to 

In [39]:
user_asnwer

["It's guacomole.",
 'Water.',
 '“I don’t know,” he says',
 "It's Natasha",
 '"Sayonara, Baby"',
 "They don't know the release date yet.",
 'No opening credits.',
 "They're just having fun",
 'Cuttlefish eyes have well-developed depth perception and will react to their surroundings',
 '"The RPC and NPC Belarus will participate as neutrals at the Beijing 2022 Paralympic Winter Games. They will compete under the Paralympic flag and not be included in the medal table."',
 'Spirit of the North: Enhanced Edition is coming to Xbox Series X and Xbox Series S sometime in early 2021',
 'A whale tail sculpture named “Marti” on a roof across from Henry Law Park Adventure Playground in Dover, NH.',
 '78 degrees',
 'The legislation would ensure men were penalised for ejaculating outside a vagina',
 'it’s diet soda',
 'It lived in the sea',
 'Neither, they would recognize that they aren’t enemies, and team up.',
 'Turn your body and your face away at a 45-degree angle and smile',
 '...likely stems f

In [37]:
user_asnwer = ["It's guacomole.",
 'Water.',
 '“I don’t know,” he says',
 "It's Natasha",
 '"Sayonara, Baby"',
 "They don't know the release date yet.",
 'No opening credits.',
 "They're just having fun",
 'Cuttlefish eyes have well-developed depth perception and will react to their surroundings',
 '"The RPC and NPC Belarus will participate as neutrals at the Beijing 2022 Paralympic Winter Games. They will compete under the Paralympic flag and not be included in the medal table."',
 'Spirit of the North: Enhanced Edition is coming to Xbox Series X and Xbox Series S sometime in early 2021',
 'A whale tail sculpture named “Marti” on a roof across from Henry Law Park Adventure Playground in Dover, NH.',
 '78 degrees',
 'The legislation would ensure men were penalised for ejaculating outside a vagina',
 'it’s diet soda',
 'It lived in the sea',
 'Neither, they would recognize that they aren’t enemies, and team up.',
 'Turn your body and your face away at a 45-degree angle and smile',
 '...likely stems from his inability to admit that his instincts are ever wrong.',
 "American carmakers don't try to sell in Japan",
 "it's Starz.",
 'Poop',
 'Limitless',
 'Drink water',
 'She asked him to escort her to her JROTC ball',
 'Breakups.',
 'They were not that close, she was wrapped up in her own life.',
 "The game was only listed at 19 on Famitsu's most wanted games list",
 'October',
 '"You should probably find a shelter that is made of thick brick and has no windows, kind of like a bomb shelter." It literally tells you to hide in a bomb shelter.',
 'she washes towels after every use.',
 'It was Mr Mime.',
 'Tales From Earthsea',
 'Ground beef, Kroger',
 'Seek out and connect with people who can open doors.',
 'It’s called interest rate.',
 'In 1988 he made a joke that he would like to be reincarnated as "a deadly virus, to contribute something to solving overpopulation”',
 '‘how are you?’',
 'He said “Don’t tell me how to be funny.”',
 "They don't know",
 'His Wife',
 'He has ruled against Trump and his allies in the past.',
 "It's Chicago West",
 "What was really exciting is that it is a new species that has never infected people before. It's a cattle worm that somehow jumped into a human.",
 "You'd have $9,222.50",
 '+60% more than their parents',
 '"The style standards are a result of longstanding requirements that female reporters not only do their jobs, but “fulfill larger audience expectations of what women are supposed to look like”',
 '(Variation of) Thanks.',
 'Tea might affect how DNA is expressed and women drinking tea was "associated with epigenetic changes in 28 different gene regions known to interact with cancer or estrogen metabolism." [TIME]',
 'Disney banned smoking in its films around 2007',
 'The global fight against disease',
 'I Disagree',
 'Send his mother to Mars and bring her back alive.',
 'Auto manufacturers must give the same tools to most third parties as they do with dealerships',
 'Eat at least 2 cups of vegetables every day.',
 'No.',
 'Paddington 2',
 'Any of them',
 '“It’s not a hard no, but it’s not an eager yes either.”',
 'The role is Jack Reacher',
 'He dumpster dives.',
 'Indiana, South Carolina, Tennessee, and Virginia',
 '#1 is John Malone with 2.2 million acres',
 'Mosquitoes',
 'Before the end of the year.',
 'Cargo House',
 '“a bad call from a doctor or something”',
 'Alabama, Georgia, Texas, Florida and Arkansas',
 'eShop music',
 'Recount your "love at first sight" moment.',
 'Nien Nunb, the character who flew the Millennium Falcon with Lando Calrissian during the Death Star attack in Return of the Jedi',
 '51, referring to the percent of voters that said they would prefer a Congress controlled by Democrats in 2021',
 'Tub is only half full',
 "It's Natasha",
 'She had triplets',
 '$2800',
 'ai.type, by "ai.type LTD” ... reported to be “delivering millions of invisible ads and fake clicks [and] real user data about views, clicks and purchases to different ad networks.”',
 'People are calling her a hypocrite for being a feminist and posing braless in "Vanity Fair"',
 'No in-display fingerprint sensor',
 'is giant ovarian cyst',
 "He's retired",
 '"People should know everything about each other before they get married"',
 'Our teeth don’t fit because they evolved instead to match the longer jaw that would develop in a more challenging strain environment. Ours are too short because we don’t give them the workout nature expects us to.',
 '35-44',
 "It's called box breathing or four-square breathing. Here's how it works, 1. Breathe in for four seconds 2. Hold air in your lungs for four seconds 3. Exhale for four seconds 4. Hold your breath, lungs emptied, for four seconds.",
 '"Johanna, 29, suffers from a rare auto-immune disease, which means her body has a life-threatening allergic reaction to almost everything and everyone. Including her husband."',
 'Leslie Grantham, who played Dennis "Dirty Den" Watts',
 'It is real. Cobb is not in a dream, he did make it back home to his family.',
 "We don't know",
 'She was the only competitor in her category.',
 'Vitamins D and C.',
 "If only you're conducting a driving lesson as an instructor.",
 'She was Hannah Montana',
 'The Samsung Galaxy Note 10',
 '"give users a choice"',
 'You can have too much of a good thing',
 'Bonds',
 '"Who are you?"',
 '“Don’t forget the heart"',
 '1st Jan at 9pm on Channel 4',
 '"We’re not prepared to go in hot-zone extraction. That’s just not what we do. It was active fire, active shooting."',
 'Vanderpump Rules',
 '"Eclipse Headache"',
 'Saffir-Simpson Hurricane Wind Scale Only Goes From 1 to 5',
 'Eclipse.',
 "The Octopus crept up to the man's boot. Then, it placed two tentacles on his boot.",
 'Riding a horse, fast',
 'Whopper is being removed from 2 for $5 menu',
 'Use longer passwords',
 'The word is "ouch"',
 '"Marijuana is not medicine" [The Motley Fool]',
 'It was for pocket watches.',
 'Anatomical characteristics of their brains (size, shape)',
 'Phishing',
 'The number 222 bus was going to Tooting, but in real life it goes to Hounslow',
 "It's a rock",
 'Open the curtains',
 'Investing in yourself',
 'A 2020 study found that men who drank at least one cup of coffee per day were 15% less likely to experience hearing loss than men who drank less than a cup a day',
 'Layers',
 "It's Hal Jordan",
 'Because with that move, Apple "is dispensing of the notion that it forces people into buying new models"',
 'Howard The Duck',
 'It is unclear',
 'They have both matured and are on the same page',
 '25,000 professionals signed "We, the undersigned mental health professionals, believe in our professional judgment that Donald Trump manifests a serious mental illness."',
 'Too much variety, too many choices',
 'Her name was Qur\'stylle, pronounced "Crystal".',
 'Only if you exceed the storage limit for 2 whole years.',
 'To break the Guinness World Record for largest underwater mermaid show',
 'Totally normal',
 '"Actually, you are not able to download Fortnite without Epic Games launcher"',
 'Giving parents money',
 'Looking after his family',
 '"What\'s the deal with all these f***ing soap people?" - Caitlyn Jenner, "Oi!" - Jacqueline Jossa',
 '"To get things done, you have to do"',
 'Internet Explorer is a compatibility solution',
 'Hillary Clinton',
 "It's “The Lord of the Rings: The Rings of Power.”",
 'MyFitnessPal',
 'To get paid and get free products',
 'It doesn’t as the difference lies in the type of games in which players earned their winnings.',
 'Quote from Kevin James: “I think if they can use me to get their show made, and it’s a great show, God bless them, good for them.”',
 '"We can reveal it looks exactly the same."',
 'It was a piece of metal',
 'Scots cannot apply - must be US citizen aged between 30 and 55 and fluent in both English and Russian.',
 'No it won’t. It’s only making a “close approach”.',
 'Switzerland.',
 'A port of Skyward Sword',
 '"The whole debacle could have been avoided if only the series had had in place a rule stating that, if a contestant can’t go on, they’ve gotta go. Done. Finito."',
 'no laptops, no cellphones',
 'Elizabeth Olsen',
 'Sony’s Universe of Marvel Characters',
 '"but"',
 'A baby tapeworm in the brain.',
 '"I haven\'t seen it completed. [...] What I\'ve seen of the film I really liked."',
 'Study shows they get depressed and lethargic',
 'water',
 'in 2035',
 '$2,800',
 'first female royal to benefit from succession law change that ensures girls will not be overtaken by any future younger brothers',
 'introspection',
 'Upper atmosphere lightning',
 '"Bondmaid," which means "a slave girl."',
 'Alice in Tim Burton’s Alice in Wonderland',
 'Using multiple exclamation points',
 '$1636',
 '"I have no special talents. I am only passionately curious." The trait is curiosity.',
 'After 65, avoid wearing a scent that is too sweet.',
 "Including real time statistics about a post's popularity, shares, and interest in its news feed algorithm",
 'send his mother to Mars and brings her back alive.',
 '18 Chinese makers of polyurethane foam insulation, which is generally used in construction.',
 'North Dakota',
 'Basically to have the guts to ask for something "Most people never pick up the phone and call. Most people never ask"',
 'The second beverage service',
 '$48 million',
 '"Some people aren’t meant to be here a long time."',
 'she had a "disorder caused by hepatitis C known as Type 2 mixed cryoglobulinemia."',
 "It's not ready yet.",
 'Andrew Yang',
 'He complimented him about never turning in a bad performance.',
 'Bike messenger',
 'Loki is Bi',
 'Wegmans',
 'Go to bed a little smarter each day',
 '[We know nothing about it, more research needs to be done]',
 'Empathy',
 'It mess with your metabolism that cause insulin intolerance, diabetes, and weight gain.',
 "Billie Eilish Pirate Baird O'Connell",
 'Pennsylvania, Georgia, Michigan, Wisconsin and Oregon',
 'Ivanka Trump ‘We don’t want any more inexperienced Trumps in the White House’ [17 clicks]',
 "They ran out of story ideas and didn't want to compromise the quality.",
 'Spread a layer of mayonnaise on one slice of bread and peanut butter on the other. Press the sandwich together to serve.',
 'Maybe Amy Klobuchar',
 'From watching Peppa Pig',
 'Gentrification',
 "He's a Spirit",
 'He still needs an "encroachment permit".',
 "There wasn't enough space.",
 'You must return a third stimulus check if it was mailed to someone who died before 2021.',
 "22.08 $/h. It does not include things like paying off debts, homeownership, saving for your children's education or any other type of emergency fund.",
 'Gravity is "emergent", not always there. Comes into existence from changes in microscopic bits of information in the structure of spacetime',
 'Quote from article “Per the show rep, however, the crutches won’t show up on screen.”',
 'No. The volume of vapour and particles released is far below what is considered harmful to your health.',
 'r/the_donald and /pol/',
 'XCloud.',
 'Jay’s ex-wife DeDe',
 'Warm water that usually stays deep in the ocean is coming closer to the surface, melting the ice from underneath',
 'Increased creative collaboration between PlayStation and Sony Music that could lead to more licensed music in first party games',
 'New Super Mario Bros U Deluxe',
 'It was Brittney Spears [24 Clicks]',
 "You can't eat whatever you want.",
 'He just made friends with people in his class and worked harder. Literally says in the article "If you came to this article looking for a strategy on a near perfect GPA, I’m sorry to disappoint you, but I don’t have one."',
 'No, it will hit the far side',
 'Artificial intelligence, energy, or biosciences',
 '"I am grateful and happy"',
 '"Whisper"',
 'Wailord comes back, in the Workout Eea, even if caught',
 '`"Corona," the fictitious land where Rapunzel is confined in the Disney movie`',
 '9 p.m.',
 'Picture a calm scene and repeat the phrase “Don’t think” for 10 seconds',
 'He didn’t have insurance',
 'No cloud saves',
 "It's a fangtooth snake-eel",
 'A large nuclear exchange would not only kill millions of people and contaminate wast areas with radioactive fallout but potentially also have longer-term climatic effects.',
 'From Mojang to Mojang Studios',
 'Its the Vice President to congratulate NASA',
 'Clickbait is a sensationalized headline that encourages you to click a link to an article, image, or video.',
 'Polaris',
 'New England Patriots or Green Bay Packers',
 'She was filming another Netflix series “cursed” and had no time.',
 'The holy trinity',
 '"I am excited."',
 'She watches Breaking Bad',
 'He hires employees that “wake up every morning terrified.”',
 'a 39-year-old UK web designer named James Linton.',
 'Philip K Dick',
 'Ellie Kemper auditioned, but wasn’t chosen.',
 'the best place to pet a dog is under the chin',
 "Fenty is Rihanna's last name",
 'Crushed up KitKat wafers',
 'in 2035',
 '15 a combination of British security detail and Canadian Mountees.',
 'Danville',
 "It's Bob Dylan",
 'If Obama were to defend his legacy it would only work to give Trump an enemy to attack and rile up his supporters.',
 '"I don\'t knwo. He was probably yelling some shit."',
 "it's the niece of his ex-wife",
 'she asked about salary and benefits',
 'Cleaning. (Irritation from chemicals, including ammonia, on mucous membranes lining airways is the key)']

In [41]:
df_reddit_test.to_csv('test_reddit2.csv')

In [42]:
df_reddit_test

Unnamed: 0,question,context,answer,answer_user,id
0,COVID-19 won't disrupt people from consuming t...,Super Bowl celebrations may look a lot differe...,but guacamole,It's guacomole.,2424
1,"What To Drink First Thing In The Morning, Acco...",You’ve finally stopped hitting the snooze butt...,water.,Water.,1603
2,See What Mark Hoppus Had To Say About Ton Delo...,"Since coronavirus lockdown began, blink-182‘s ...","“I don’t know,” he says","“I don’t know,” he says",160
3,The Internet Can’t Believe That Sasha Obama’s ...,We don’t know what to believe anymore…Barack O...,is Natasha,It's Natasha,1704
4,Here’s What The Spanish Terminator Says Instea...,It's doubtful Arnold Schwarzenegger knew how i...,"""Sayonara, baby""","""Sayonara, Baby""",956
...,...,...,...,...,...
245,Why Obama stays quiet as Trump attacks him,"Washington (CNN) With each passing day, Presid...","way, it would only work to give Trump an enemy...",If Obama were to defend his legacy it would on...,1638
246,Former Champ Cody Garbrandt Opens Up About Wha...,﻿﻿Cody Garbrandt came into Saturday night’s UF...,don’t knwo. He was probably yelling some sh*t.,"""I don't knwo. He was probably yelling some sh...",1095
247,Brazil striker Hulk announces 'niece' is pregn...,Brazil striker Hulk holds hand to his ear afte...,- the niece of his ex-wife,it's the niece of his ex-wife,1130
248,The reason this woman’s job interview was canc...,As if the list of problems with work culture l...,asked about the salary and benefits,she asked about salary and benefits,1294


In [359]:
with open('dataset_dict_train.pickle', 'rb') as handle:
            dataset_dict = pickle.load(handle)with open('dataset.pickle', 'wb') as handle:
    pickle.dump(dataset_dict_reduced, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [21]:
import pickle5 as pickle

In [333]:
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

loading configuration file https://huggingface.co/deepset/roberta-base-squad2/resolve/main/config.json from cache at /Users/oliver/.cache/huggingface/transformers/c40d0abb589629c48763f271020d0b1f602f5208c432c0874d420491ed37e28b.122ed338b3591c07dba452777c59ff52330edb340d3d56d67aa9117ad9905673
Model config RobertaConfig {
  "_name_or_path": "deepset/roberta-base-squad2",
  "architectures": [
    "RobertaForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "language": "english",
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "name": "Roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.16.2",


In [347]:
dataset_dict = dataset_dict_reduced
tokenized_examples = tokenizer(dataset_dict['question'],
                               dataset_dict['context'],
                               truncation="only_second",
                               #stride=128,
                               #max_length=384,
                               return_overflowing_tokens=True,
                               return_offsets_mapping=True,
                               padding='max_length')
sample_mapping = tokenized_examples["overflow_to_sample_mapping"]
offset_mapping = tokenized_examples["offset_mapping"]

# Let's label those examples!
tokenized_examples["start_positions"] = []
tokenized_examples["end_positions"] = []
tokenized_examples["id"] = []
inaccurate = 0
for i, offsets in enumerate(tqdm(offset_mapping)):
    # We will label impossible answers with the index of the CLS token.
    input_ids = tokenized_examples["input_ids"][i]
    cls_index = input_ids.index(tokenizer.cls_token_id)

    # Grab the sequence corresponding to that example (to know what is the context and what is the question).
    sequence_ids = tokenized_examples.sequence_ids(i)

    # One example can give several spans, this is the index of the example containing this span of text.
    sample_index = sample_mapping[i]
    answer = dataset_dict['answer'][sample_index]
    # Start/end character index of the answer in the text.
    print(answer['answer_start'])
    start_char = answer['answer_start'][0]
    end_char = start_char + len(answer['text'][0])
    tokenized_examples['id'].append(dataset_dict['id'][sample_index])
    # Start token index of the current span in the text.
    token_start_index = 0
    while sequence_ids[token_start_index] != 1:
        token_start_index += 1

    # End token index of the current span in the text.
    token_end_index = len(input_ids) - 1
    while sequence_ids[token_end_index] != 1:
        token_end_index -= 1

    # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
    if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
        tokenized_examples["start_positions"].append(cls_index)
        tokenized_examples["end_positions"].append(cls_index)
    else:
        # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
        # Note: we could go after the last offset if the answer is the last word (edge case).
        while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
            token_start_index += 1
        tokenized_examples["start_positions"].append(token_start_index - 1)
        while offsets[token_end_index][1] >= end_char:
            token_end_index -= 1
        tokenized_examples["end_positions"].append(token_end_index + 1)
        # assertion to check if this checks out
        context = dataset_dict['context'][sample_index]
        offset_st = offsets[tokenized_examples['start_positions'][-1]][0]
        offset_en = offsets[tokenized_examples['end_positions'][-1]][1]
        if context[offset_st : offset_en] != answer['text'][0]:
            inaccurate += 1

100%|███████████████████████████████████████| 605/605 [00:00<00:00, 5394.56it/s]

[934]
[934]
[1696]
[1696]
[1696]
[269]
[269]
[76]
[76]
[76]
[76]
[76]
[76]
[1201]
[1201]
[1201]
[230]
[408]
[408]
[1713]
[1713]
[658]
[658]
[658]
[4]
[4]
[1116]
[1116]
[1116]
[1116]
[1553]
[4413]
[4413]
[4413]
[4413]
[1399]
[546]
[546]
[546]
[146]
[1474]
[1474]
[1474]
[1474]
[0]
[0]
[465]
[336]
[336]
[697]
[697]
[697]
[697]
[714]
[714]
[714]
[31]
[31]
[31]
[31]
[31]
[31]
[31]
[31]
[31]
[31]
[31]
[31]
[31]
[31]
[31]
[31]
[31]
[31]
[31]
[31]
[29]
[29]
[1421]
[1421]
[945]
[945]
[439]
[439]
[439]
[1007]
[1007]
[211]
[211]
[211]
[876]
[876]
[876]
[876]
[60]
[60]
[60]
[224]
[224]
[216]
[216]
[356]
[356]
[1084]
[465]
[1396]
[1035]
[1035]
[1035]
[1251]
[1251]
[408]
[799]
[761]
[761]
[397]
[397]
[397]
[214]
[214]
[593]
[593]
[593]
[648]
[648]
[6]
[2822]
[2822]
[2822]
[1013]
[1013]
[1120]
[1120]
[602]
[602]
[124]
[124]
[876]
[876]
[876]
[972]
[972]
[972]
[3005]
[3005]
[3005]
[86]
[86]
[86]
[86]
[86]
[175]
[175]
[175]
[175]
[1569]
[1569]
[1569]
[77]
[77]
[344]
[344]
[1307]
[1307]
[1307]
[542]
[54




In [342]:
from transformers import AutoTokenizer

model_checkpoint = "bert-base-cased"
tokenizer1 = AutoTokenizer.from_pretrained(model_checkpoint)

https://huggingface.co/bert-base-cased/resolve/main/tokenizer_config.json not found in cache or force_download set to True, downloading to /Users/oliver/.cache/huggingface/transformers/tmpxv7ikn31


Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

storing https://huggingface.co/bert-base-cased/resolve/main/tokenizer_config.json in cache at /Users/oliver/.cache/huggingface/transformers/ec84e86ee39bfe112543192cf981deebf7e6cbe8c91b8f7f8f63c9be44366158.ec5c189f89475aac7d8cbd243960a0655cfadc3d0474da8ff2ed0bf1699c2a5f
creating metadata file for /Users/oliver/.cache/huggingface/transformers/ec84e86ee39bfe112543192cf981deebf7e6cbe8c91b8f7f8f63c9be44366158.ec5c189f89475aac7d8cbd243960a0655cfadc3d0474da8ff2ed0bf1699c2a5f
https://huggingface.co/bert-base-cased/resolve/main/config.json not found in cache or force_download set to True, downloading to /Users/oliver/.cache/huggingface/transformers/tmpbkzxwh40


Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

storing https://huggingface.co/bert-base-cased/resolve/main/config.json in cache at /Users/oliver/.cache/huggingface/transformers/a803e0468a8fe090683bdc453f4fac622804f49de86d7cecaee92365d4a0f829.a64a22196690e0e82ead56f388a3ef3a50de93335926ccfa20610217db589307
creating metadata file for /Users/oliver/.cache/huggingface/transformers/a803e0468a8fe090683bdc453f4fac622804f49de86d7cecaee92365d4a0f829.a64a22196690e0e82ead56f388a3ef3a50de93335926ccfa20610217db589307
loading configuration file https://huggingface.co/bert-base-cased/resolve/main/config.json from cache at /Users/oliver/.cache/huggingface/transformers/a803e0468a8fe090683bdc453f4fac622804f49de86d7cecaee92365d4a0f829.a64a22196690e0e82ead56f388a3ef3a50de93335926ccfa20610217db589307
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropou

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

storing https://huggingface.co/bert-base-cased/resolve/main/vocab.txt in cache at /Users/oliver/.cache/huggingface/transformers/6508e60ab3c1200bffa26c95f4b58ac6b6d95fba4db1f195f632fa3cd7bc64cc.437aa611e89f6fc6675a049d2b5545390adbc617e7d655286421c191d2be2791
creating metadata file for /Users/oliver/.cache/huggingface/transformers/6508e60ab3c1200bffa26c95f4b58ac6b6d95fba4db1f195f632fa3cd7bc64cc.437aa611e89f6fc6675a049d2b5545390adbc617e7d655286421c191d2be2791
https://huggingface.co/bert-base-cased/resolve/main/tokenizer.json not found in cache or force_download set to True, downloading to /Users/oliver/.cache/huggingface/transformers/tmp03kopxcu


Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

storing https://huggingface.co/bert-base-cased/resolve/main/tokenizer.json in cache at /Users/oliver/.cache/huggingface/transformers/226a307193a9f4344264cdc76a12988448a25345ba172f2c7421f3b6810fddad.3dab63143af66769bbb35e3811f75f7e16b2320e12b7935e216bd6159ce6d9a6
creating metadata file for /Users/oliver/.cache/huggingface/transformers/226a307193a9f4344264cdc76a12988448a25345ba172f2c7421f3b6810fddad.3dab63143af66769bbb35e3811f75f7e16b2320e12b7935e216bd6159ce6d9a6
loading file https://huggingface.co/bert-base-cased/resolve/main/vocab.txt from cache at /Users/oliver/.cache/huggingface/transformers/6508e60ab3c1200bffa26c95f4b58ac6b6d95fba4db1f195f632fa3cd7bc64cc.437aa611e89f6fc6675a049d2b5545390adbc617e7d655286421c191d2be2791
loading file https://huggingface.co/bert-base-cased/resolve/main/tokenizer.json from cache at /Users/oliver/.cache/huggingface/transformers/226a307193a9f4344264cdc76a12988448a25345ba172f2c7421f3b6810fddad.3dab63143af66769bbb35e3811f75f7e16b2320e12b7935e216bd6159ce6d9a6

In [350]:
len(tokenized_examples['start_positions'])

605

In [351]:
tokenized_examples.keys()

dict_keys(['input_ids', 'attention_mask', 'offset_mapping', 'overflow_to_sample_mapping', 'start_positions', 'end_positions', 'id'])

In [None]:
train_dataset = raw_datasets["train"].map(
    tokenized_examples,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)
len(raw_datasets["train"]), len(train_dataset)

In [None]:
import tensorflow as tf
from transformers import TFAutoModelForQuestionAnswering
from tqdm.auto import tqdm

In [354]:
tokenized_examples.keys()

dict_keys(['input_ids', 'attention_mask', 'offset_mapping', 'overflow_to_sample_mapping', 'start_positions', 'end_positions', 'id'])

In [355]:
tf_train_dataset = tokenized_examples.to_tf_dataset(
    columns=[
        "input_ids",
        "start_positions",
        "end_positions",
        "attention_mask",
        "id",
    ],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=16,
)

AttributeError: 

In [352]:
def compute_metrics(start_logits, end_logits, features, examples):
    example_to_features = collections.defaultdict(list)
    for idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(idx)

    predicted_answers = []
    for example in tqdm(examples):
        example_id = example["id"]
        context = example["context"]
        answers = []

        # Loop through all features associated with that example
        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip answers that are not fully in the context
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Skip answers with a length that is either < 0 or > max_answer_length
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    answer = {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                    answers.append(answer)

        # Select the answer with the best score
        if len(answers) > 0:
            best_answer = max(answers, key=lambda x: x["logit_score"])
            predicted_answers.append(
                {"id": example_id, "prediction_text": best_answer["text"]}
            )
        else:
            predicted_answers.append({"id": example_id, "prediction_text": ""})

    theoretical_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]
    return metric.compute(predictions=predicted_answers, references=theoretical_answers)

In [344]:
context_1 = dataset_dict["context"][0]
question_1 = dataset_dict["question"][0]

inputs_1 = tokenizer(question_1, context_1)
tokenizer.decode(inputs_1["input_ids"])

'<s>Did 60% of Californian\'s get corona virus in December and mysteriously not die or go to the hosptial?</s></s>SAN FRANCISCO -- It\'s a story that has been making headlines across the United States: Did Californians develop herd immunity after possibly being exposed to novel coronavirus last year?On Thursday, a fellow at Stanford\'s Hoover Institute released the theory suggesting that what was perceived to be a nasty and early flu season in California was actually COVID-19 spreading undetected throughout the community.Our sister station KGO-TV in San Francisco sat down with Dr. Alok Pate l for more insight on what herd immunity means and what it would take for it to happen in California and the United States."Herd immunity is basically a percentage of the entire community who would need to be immune to an infection so the infection couldn\'t spread from person to person. And so when we look at coronavirus, we talked about that number around two to three, that\'s a reproductive numbe

In [4]:
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [287]:
inputs = "I'm excited to learn about Hugging Face Transformers!"
tokenized_inputs = tokenizer(inputs, return_tensors="pt")

In [84]:
from datasets import load_dataset, DatasetDict

In [288]:
tokenized_inputs

{'input_ids': tensor([[    0,   100,   437,  2283,     7,  1532,    59, 30581,  3923, 12346,
         34379,   328,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [86]:
imdb_dataset = load_dataset("imdb")

Downloading:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Downloading and preparing dataset imdb/plain_text (download: 80.23 MiB, generated: 127.02 MiB, post-processed: Unknown size, total: 207.25 MiB) to /Users/oliver/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1...


Downloading:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset imdb downloaded and prepared to /Users/oliver/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [87]:
from datasets import Dataset


In [138]:
imdb_dataset['train']

Dataset({
    features: ['text', 'label'],
    num_rows: 25000
})

In [99]:
data2 = data.drop(['title','url'], axis=1)

In [102]:
data2.reset_index()

Unnamed: 0,index,teaser,answer,article
0,0,What Getting Rid of the Electoral College woul...,It would mean the person who gets the most vot...,"(CNN) On Monday night, in a CNN town hall in J..."
1,1,This Couple Waited 10 Years Before Opening The...,This is a piece of shit article. The wife’s gr...,"Squats are an essential part of every workout,..."
2,3,After Girl’s Dad Cuts Her Hair Off For Getting...,"Makes a viral social media post, wins full cus...",Is there anything more important than a birthd...
3,5,Corona beer stops production,Because Mexican government says breweries are ...,New York (CNN Business) Production of Corona b...
4,6,There’s a pretty crazy reason why Game Of Thro...,Their actors used to date and it ended badly.,Who wants to hear some Game Of Thrones-based g...
...,...,...,...,...
2781,13359,Flying this summer? 8 things to know if you ha...,List in post,CLOSE Family trips don't always have to be exp...
2782,13362,‘The Simpsons’ predicted this ‘Game of Thrones...,Daenerys burning down King's Landing,Nostradamus hasn’t got anything on America’s f...
2783,13363,Was Syracuse man ‘executed’ over this vulgar i...,Suck my d*ick,"Syracuse, NY -- Two old friends who committed ..."
2784,13364,Uber and Lyft drivers reveal the most annoying...,List in comments,"Every day, an army of nearly 4 million drivers..."


In [103]:
dataset = Dataset.from_pandas(data2,preserve_index=False)

In [106]:
dataset

Dataset({
    features: ['teaser', 'answer', 'article'],
    num_rows: 2786
})

In [None]:
DatasetDict(
    train=imdb_dataset['train'].shuffle(seed=1111).select(range(128)).map(truncate),
    val=imdb_dataset['train'].shuffle(seed=1111).select(range(128, 160)).map(truncate),
)

In [6]:
DatasetDict(
    train=imdb_dataset['train'].shuffle(seed=1111).select(range(128)).map(truncate),
    val=imdb_dataset['train'].shuffle(seed=1111).select(range(128, 160)).map(truncate),
)

{'input_ids': tensor([[    0,   100,   437,  2283,     7,  1532,    59, 30581,  3923, 12346,
         34379,   328,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [10]:
outputs = model(**tokenized_inputs)

In [129]:
tokenized_dataset = dataset
for name in ['teaser','article','answer']:
    tokenized_dataset = tokenized_dataset.map(
        lambda example: tokenizer(example[name], padding=True, truncation=True),
        batched=True,
        batch_size=16
    )
    tokenized_dataset = tokenized_dataset.remove_columns([name])

  0%|          | 0/175 [00:00<?, ?ba/s]

  0%|          | 0/175 [00:00<?, ?ba/s]

TypeError: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]

In [131]:
tokenized_dataset[0:1]

{'answer': ['It would mean the person who gets the most votes wins'],
 'article': ['(CNN) On Monday night, in a CNN town hall in Jackson, Mississippi, Sen. Elizabeth Warren made some news: She supports getting rid of the electoral college as the method by which we elect presidents.\n\n"My view is that every vote matters and the way we can make that happen is that we can have national voting and that means get rid of the Electoral College -- and every vote counts," the Massachusetts Democrat said, to raucous applause from the audience (The Democratic base is very much up in arms over the Electoral College, after Donald Trump won the White House in 2016 despite losing the popular vote by almost 2.9 million votes to Hillary Clinton).\n\nHow realistic is what Warren is proposing? And how much -- really -- would it change how candidates campaign for the nation\'s top job? To get some answers, I reached out to Sanford Levinson, a constitutional law expert and professor at the University of T

In [None]:
num_epochs = 3
num_training_steps = 3 * len(train_dataloader)
optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)
lr_scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

best_val_loss = float("inf")
progress_bar = tqdm(range(num_training_steps))
for epoch in range(num_epochs):
    # training
    model.train()
    for batch_i, batch in enumerate(train_dataloader):
        
        output = model(**batch)
        
        optimizer.zero_grad()
        output.loss.backward()
        optimizer.step()
        lr_scheduler.step()
        progress_bar.update(1)
    
    # validation
    model.eval()
    for batch_i, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            output = model(**batch)
        loss += output.loss
    
    avg_val_loss = loss / len(eval_dataloader)
    print(f"Validation loss: {avg_val_loss}")
    if avg_val_loss < best_val_loss:
        print("Saving checkpoint!")
        best_val_loss = avg_val_loss
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'val_loss': best_val_loss,
            },
            f"checkpoints/epoch_{epoch}.pt"
        )  

In [8]:
outputs

QuestionAnsweringModelOutput(loss=None, start_logits=tensor([[ 0.9731, -7.1161, -7.9177, -7.2378, -8.2986, -8.3323, -8.5380, -6.8447,
         -8.5838, -7.2198, -6.6224, -8.5347, -6.3678]],
       grad_fn=<CloneBackward0>), end_logits=tensor([[ 1.4810, -8.3040, -8.3725, -8.4429, -8.6743, -8.2721, -8.4562, -8.4142,
         -7.6343, -6.7815, -5.2392, -7.7740, -3.4287]],
       grad_fn=<CloneBackward0>), hidden_states=None, attentions=None)

In [11]:
res

{'score': 0.21171453595161438,
 'start': 59,
 'end': 84,
 'answer': 'gives freedom to the user'}