# BERT Question Answering Bot

In [None]:
# Fail fast unless TensorFlow reports a GPU as its first device.
import tensorflow as tf

device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
  # The expected accelerator is attached: report it and carry on.
  print('Found GPU at: {}'.format(device_name))
else:
  # Nothing (or something unexpected) was reported: stop the notebook here.
  raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


In [None]:
# install huggingface libraries
# NOTE(review): no versions are pinned, so a fresh runtime may resolve to
# different releases than the ones this notebook was written against.
!pip install pytorch-pretrained-bert pytorch-nlp pytorch_transformers



In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_transformers import BertTokenizer, BertConfig, BertModel
from pytorch_transformers import AdamW, BertForQuestionAnswering
from tqdm import tqdm, trange
import pandas as pd
import io
import os
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

Using TensorFlow backend.


In [None]:
# BERT imports
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline

# specify GPU device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)

'Tesla T4'

In [None]:
from google.colab import drive
# Mount Google Drive at /drive; later cells read the dataset and helper modules from there.
drive.mount('/drive')

Drive already mounted at /drive; to attempt to forcibly remount, call drive.mount("/drive", force_remount=True).


In [None]:
# Show the contents of the project folder on the mounted drive.
!ls /drive/My\ Drive/Medium

 cache_train		 train-v2.0.json	   utils_squad.py
'Medium BERT QA.ipynb'	 train-v2.0-Sample.json
 __pycache__		 utils_squad_evaluate.py


In [None]:
import sys
# Put the project folder on the import path so utils_squad and
# utils_squad_evaluate (stored there) can be imported.
sys.path.append('/drive/My Drive/Medium')

In [None]:
from utils_squad import (read_squad_examples, convert_examples_to_features,
                         RawResult, write_predictions,
                         RawResultExtended, write_predictions_extended)
from utils_squad_evaluate import EVAL_OPTS, main as evaluate_on_squad, plot_pr_curve

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [None]:
# Parse the SQuAD v2.0 training file into example objects using the utils_squad helper.
input_file = '/drive/My Drive/Medium/train-v2.0.json'
examples = read_squad_examples(input_file=input_file,
                                is_training=True,
                                version_2_with_negative=True)

In [None]:
# Peek at the first five parsed examples.
examples[:5]

[qas_id: 56be85543aeaaa14008c9063, question_text: When did Beyonce start becoming popular?, doc_tokens: [Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".], start_position: 39, end_position: 42,
 qas_id: 56be85543aeaaa14008c9065, question_text: What areas did Beyonce compete in when she was growing up?, doc_tokens: [Beyoncé Giselle Knowles-Carter (/biːˈ

In [None]:
# One row per example; the columns are the attributes of each example object.
example_records = list(map(vars, examples))
train_data = pd.DataFrame.from_records(example_records)
train_data.head()

Unnamed: 0,qas_id,question_text,doc_tokens,orig_answer_text,start_position,end_position,is_impossible
0,56be85543aeaaa14008c9063,When did Beyonce start becoming popular?,"[Beyoncé, Giselle, Knowles-Carter, (/biːˈjɒnse...",in the late 1990s,39,42,False
1,56be85543aeaaa14008c9065,What areas did Beyonce compete in when she was...,"[Beyoncé, Giselle, Knowles-Carter, (/biːˈjɒnse...",singing and dancing,28,30,False
2,56be85543aeaaa14008c9066,When did Beyonce leave Destiny's Child and bec...,"[Beyoncé, Giselle, Knowles-Carter, (/biːˈjɒnse...",2003,82,82,False
3,56bf6b0f3aeaaa14008c9601,In what city and state did Beyonce grow up?,"[Beyoncé, Giselle, Knowles-Carter, (/biːˈjɒnse...","Houston, Texas",22,23,False
4,56bf6b0f3aeaaa14008c9602,In which decade did Beyonce become famous?,"[Beyoncé, Giselle, Knowles-Carter, (/biːˈjɒnse...",late 1990s,41,42,False


In [None]:
# Pick one random row, then show every row whose context tokens equal that row's.
sample = train_data.sample(frac=1).head(1)
context = sample.doc_tokens.values
train_data[train_data.doc_tokens.values==context]

Unnamed: 0,qas_id,question_text,doc_tokens,orig_answer_text,start_position,end_position,is_impossible
42307,570df4f00dc6ce1900204d63,What percentage of adolescents reporting havin...,"[Dating, violence, is, fairly, prevalent, with...",10-45%,10,10,False
42308,570df4f00dc6ce1900204d64,What percentage of adolescents reported experi...,"[Dating, violence, is, fairly, prevalent, with...",a quarter to a third,25,29,False
42309,570df4f00dc6ce1900204d65,"Hitting, throwing objects, and slaps are examp...","[Dating, violence, is, fairly, prevalent, with...",Physical,59,59,False
42310,570df4f00dc6ce1900204d66,"In heterosexual adolescent couples, is there a...","[Dating, violence, is, fairly, prevalent, with...",no,79,79,False
42311,570df4f00dc6ce1900204d67,Does physical aggression decline or increase d...,"[Dating, violence, is, fairly, prevalent, with...",decline,65,65,False


In [None]:
import random
def print_squad_sample(train_data, line_length=14, separator_length=120, random_state=None):
  """Print one randomly chosen context paragraph with all of its questions and answers.

  Args:
    train_data: DataFrame with `doc_tokens` (list of tokens), `question_text`
      and `orig_answer_text` columns.
    line_length: number of context tokens printed per output line.
    separator_length: width of the '=' separator rows; also sets the width of
      the question column.
    random_state: optional seed forwarded to `DataFrame.sample` so the pick is
      repeatable. The default (None) keeps the pick random.
  """
  sample = train_data.sample(frac=1, random_state=random_state).head(1)
  context_tokens = sample.doc_tokens.values[0]
  separator = '='*separator_length

  print(separator)
  print('CONTEXT: ')
  print(separator)
  for idx in range(0, len(context_tokens), line_length):
    print(' '.join(context_tokens[idx:idx+line_length]))
  print(separator)

  # Compare the token lists row by row instead of relying on array broadcasting
  # over an object column of lists.
  same_context = train_data.doc_tokens.apply(lambda tokens: tokens == context_tokens)
  questions = train_data[same_context]

  question_column = 3*separator_length//4
  print('QUESTION:', ' '*question_column, 'ANSWER:')
  for _, row in questions.iterrows():
    question = row.question_text
    answer = row.orig_answer_text
    # The +9 is len('QUESTION:'), so answers line up under the 'ANSWER:' header.
    print(question, ' '*(question_column-len(question)+9), (answer if answer else 'No answer found'))

In [None]:
print_squad_sample(train_data)

CONTEXT: 
More-or-less independent circadian rhythms are found in many organs and cells in the body
outside the suprachiasmatic nuclei (SCN), the "master clock". These clocks, called peripheral oscillators, are
found in the adrenal gland,[citation needed] oesophagus, lungs, liver, pancreas, spleen, thymus, and skin.[citation
needed] Though oscillators in the skin respond to light, a systemic influence has not
been proven. There is also some evidence that the olfactory bulb and prostate may
experience oscillations when cultured, suggesting that these structures may also be weak oscillators.[citation needed]
QUESTION:                                                                                            ANSWER:
Where else beside the SCN cells are independent circadian rhythms also found?                        organs and cells
What is the term for the independent clocks?                                                         peripheral oscillators
What is the SCN considered to be in

In [None]:

# Size features: context length in tokens (doc_tokens is a list per row) and
# question length in characters (question_text is a string).
train_data['paragraph_len'] = train_data['doc_tokens'].map(len)
train_data['question_len'] = train_data['question_text'].map(len)
train_data.sample(frac=1).head(5)

Unnamed: 0,qas_id,question_text,doc_tokens,orig_answer_text,start_position,end_position,is_impossible,paragraph_len,question_len
71956,5a1105a106e79900185c34e0,What is not needed to provide power boost to D...,"[The, third, Digimon, series,, which, began, a...",,-1,-1,True,301,52
44029,5ad36f0d604f3c001a3fe226,In what year did Nicolae Ceausescu die?,"[In, 1977,, Elizabeth, marked, the, Silver, Ju...",,-1,-1,True,119,39
38244,5a6135e3e9e1cc001a33d012,What enables people of all ages to participate...,"[General, gymnastics, enables, people, of, all...",,-1,-1,True,88,82
69124,5a7a4b5117ab25001a8a0486,"In ancient Rome, Homer wrote what two epics?","[In, ancient, Greece,, the, epics, of, Homer,,...",,-1,-1,True,141,44
67233,5726a7ef708984140094cd09,What country is the birthplace of the Red Cross?,"[The, establishment, of, the, Swiss, Confedera...",Switzerland,77,77,False,140,48


In [None]:
max_seq_length = 256
# Number of rows whose context token count fits within max_seq_length.
fitting_contexts = int((train_data['paragraph_len'] <= max_seq_length).sum())
print("Percentage of context's less than max_seq_length = %s%%" % (fitting_contexts/len(train_data) * 100))

Percentage of context's less than max_seq_length = 98.19289589392184%


In [None]:
# Load the tokenizer published under the 'bert-base-uncased' name.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Settings passed to convert_examples_to_features when building model inputs.
doc_stride = 128
max_seq_length = 256
max_query_length = 64
# batch size of 64 if RAM available.
# batch_size is used by both the training and the validation DataLoader.
batch_size = 16

In [None]:
# Where the converted training features are cached between sessions.
cached_features_file = '/drive/My Drive/Medium/cache_train'

In [None]:
# Reuse the cached training features when present; otherwise build them once
# from the parsed examples and store them for the next session.
if os.path.exists(cached_features_file):
  features = torch.load(cached_features_file)
else:
  features = convert_examples_to_features(
      examples=examples,
      tokenizer=tokenizer,
      max_seq_length=max_seq_length,
      doc_stride=doc_stride,
      max_query_length=max_query_length,
      is_training=True,
  )
  torch.save(features, cached_features_file)

In [None]:
def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: integer applied to every generator so repeated runs draw the
            same random values.
    """
    # Imported locally so this function does not depend on a notebook-level import.
    import random

    # Python's own generator was previously left unseeded.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds every CUDA device; silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)

In [None]:
# Convert to Tensors and build dataset
# Each comprehension gathers one field across all features into a single tensor.
all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)

all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
# Column order matters: the training loop reads batches by position
# (0 = input ids, 1 = input mask, 2 = segment ids, 3 = start, 4 = end).
dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                        all_start_positions, all_end_positions,
                        all_cls_index, all_p_mask)

In [None]:
# Draw training features in random order, batch_size at a time.
# drop_last=True discards a final batch smaller than batch_size.
train_sampler = RandomSampler(dataset)
train_dataloader = DataLoader(dataset, sampler=train_sampler, batch_size=batch_size, drop_last=True)

In [None]:
import glob
checkpoints = sorted(glob.glob('/drive/My Drive/Medium/checkpoint*-[0-9]*'))

In [None]:
def to_list(tensor):
    """Return the tensor's values as plain Python numbers / nested lists.

    The tensor is first detached from the autograd graph and copied to the CPU.
    """
    host_copy = tensor.detach().cpu()
    return host_copy.tolist()

In [None]:
# Resume from the last listed checkpoint when one exists; otherwise start from
# the pretrained 'bert-base-uncased' weights.
if len(checkpoints) > 0:
  ckpt_name = checkpoints[-1]
  # The step count is the suffix after the last '-'. Convert it to int because
  # the training loop does arithmetic on global_step.
  global_step = int(ckpt_name.split('-')[-1])
  print("Loading model from checkpoint %s" % ckpt_name)
  model = BertForQuestionAnswering.from_pretrained(ckpt_name)
  train_loss_set_ckpt = torch.load(ckpt_name + '/training_loss.pt')
  train_loss_set = to_list(train_loss_set_ckpt)
  # tr_loss is a running total (the training loop reports tr_loss/global_step),
  # so restore the sum of the stored losses rather than only the last one.
  tr_loss = sum(train_loss_set)
else:
  global_step = 0
  train_loss_set = []
  tr_loss = 0.0
  model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')

# Move the model to the same device the batches are sent to.
model.to(device)

Loading model from checkpoint /drive/My Drive/Medium/checkpoint-3000


In [None]:
# Inspect the last two named parameters of the model (name, tensor) pairs.
param_optimizer = list(model.named_parameters())
print(param_optimizer[-2])
print(param_optimizer[-1])

('qa_outputs.weight', Parameter containing:
tensor([[ 0.0196, -0.0388, -0.0120,  ...,  0.0055, -0.0216,  0.0114],
        [-0.0309, -0.0284,  0.0185,  ...,  0.0025,  0.0432, -0.0569]],
       device='cuda:0', requires_grad=True))
('qa_outputs.bias', Parameter containing:
tensor([0., 0.], device='cuda:0', requires_grad=True))


In [None]:
learning_rate = 5e-5
adam_epsilon = 1e-8
no_decay = ['bias', 'LayerNorm.weight']


def _is_decay_exempt(param_name):
    """True when the parameter name contains one of the `no_decay` markers."""
    return any(marker in param_name for marker in no_decay)


# Two parameter groups: weight decay 0.01 for everything except the exempt
# names, and no weight decay for the exempt ones.
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not _is_decay_exempt(n)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if _is_decay_exempt(n)], 'weight_decay': 0.0},
]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)

In [None]:
num_train_epochs = 1

print("***** Running training *****")
print("  Num examples = %d" % len(dataset))
print("  Num Epochs = %d" % num_train_epochs)
print("  Batch size = %d" % batch_size)
# One optimizer step is taken per batch, so the total is batches-per-epoch times epochs.
print("  Total optimization steps = %d" % (len(train_dataloader) * num_train_epochs))

model.zero_grad()
train_iterator = trange(num_train_epochs, desc="Epoch")
set_seed()

# global_step is the number of batches already trained on (0 on a fresh run).
# int() also covers a step count that was parsed from a checkpoint name as text.
global_step = int(global_step)
# Only that many batches are fast-forwarded. Comparing the per-epoch batch index
# with global_step instead would skip the first batch of a fresh run and every
# batch of any later epoch.
# NOTE(review): only the model weights are restored on resume; the optimizer state is not.
batches_to_skip = global_step

for _ in train_iterator:
    epoch_iterator = tqdm(train_dataloader, desc="Iteration")
    for batch in epoch_iterator:
      if batches_to_skip > 0:
        batches_to_skip -= 1
        continue

      model.train()
      batch = tuple(t.to(device) for t in batch)

      # Positions follow the column order used when the TensorDataset was built.
      inputs = {'input_ids':       batch[0],
                'attention_mask':  batch[1],
                'token_type_ids':  batch[2],
                'start_positions': batch[3],
                'end_positions':   batch[4]}

      outputs = model(**inputs)

      loss = outputs[0]
      # Record a plain float: storing the tensor itself would keep one device
      # tensor alive for every training step.
      loss_value = loss.item()
      train_loss_set.append(loss_value)
      loss.backward()
      torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

      tr_loss += loss_value
      optimizer.step()
      model.zero_grad()
      global_step += 1

      # Every 1000 steps: report the running mean loss and write a checkpoint.
      if global_step % 1000 == 0:
        print("Train loss: {}".format(tr_loss/global_step))
        output_dir = '/drive/My Drive/Medium/checkpoint-{}'.format(global_step)
        os.makedirs(output_dir, exist_ok=True)
        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(output_dir)
        # float() accepts plain numbers as well as 0-d tensors left over from earlier runs.
        torch.save(torch.tensor([float(value) for value in train_loss_set]), os.path.join(output_dir, 'training_loss.pt'))
        print("Saving model checkpoint to %s" % output_dir)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]
Iteration:   0%|          | 0/9016 [00:00<?, ?it/s][A

***** Running training *****
  Num examples = 144262
  Num Epochs = 1
  Batch size = 16
  Total optimization steps = 9016


	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha)

Iteration:   0%|          | 2/9016 [00:01<1:20:41,  1.86it/s][A
Iteration:   0%|          | 3/9016 [00:02<1:39:08,  1.52it/s][A
Iteration:   0%|          | 4/9016 [00:02<1:51:44,  1.34it/s][A
Iteration:   0%|          | 5/9016 [00:03<2:00:56,  1.24it/s][A
Iteration:   0%|          | 6/9016 [00:04<2:07:24,  1.18it/s][A
Iteration:   0%|          | 7/9016 [00:05<2:12:38,  1.13it/s][A
Iteration:   0%|          | 8/9016 [00:06<2:16:24,  1.10it/s][A
Iteration:   0%|          | 9/9016 [00:07<2:19:03,  1.08it/s][A
Iteration:   0%|          | 10/9016 [00:08<2:21:05,  1.06it/s][A
Iteration:   0%|          | 11/9016 [00:09<2:22:49,  1.05it/s][A
Iteration:   0%|          | 12/9016 [00:10<2:24:06,  1.04it/s][A
Iteration:   0%|          | 13/9016 [00:11<2:25:19,  1.03it/s][A
Iteration:   0%|          | 14/9016 [00:12<2:26:18,  1.03it/s][A
Iteration:   0%|      

Train loss: 1.8166966327428817



Iteration:  11%|█         | 1001/9016 [15:09<3:19:48,  1.50s/it][A

Saving model checkpoint to /drive/My Drive/Medium/checkpoint-1000



Iteration:  11%|█         | 1002/9016 [15:10<2:56:13,  1.32s/it][A
Iteration:  11%|█         | 1003/9016 [15:10<2:39:29,  1.19s/it][A
Iteration:  11%|█         | 1004/9016 [15:11<2:27:43,  1.11s/it][A
Iteration:  11%|█         | 1005/9016 [15:12<2:19:43,  1.05s/it][A
Iteration:  11%|█         | 1006/9016 [15:13<2:13:53,  1.00s/it][A
Iteration:  11%|█         | 1007/9016 [15:14<2:09:40,  1.03it/s][A
Iteration:  11%|█         | 1008/9016 [15:15<2:07:11,  1.05it/s][A
Iteration:  11%|█         | 1009/9016 [15:16<2:04:53,  1.07it/s][A
Iteration:  11%|█         | 1010/9016 [15:17<2:03:23,  1.08it/s][A
Iteration:  11%|█         | 1011/9016 [15:18<2:02:26,  1.09it/s][A
Iteration:  11%|█         | 1012/9016 [15:19<2:02:11,  1.09it/s][A
Iteration:  11%|█         | 1013/9016 [15:20<2:03:06,  1.08it/s][A
Iteration:  11%|█         | 1014/9016 [15:20<2:02:20,  1.09it/s][A
Iteration:  11%|█▏        | 1015/9016 [15:21<2:01:02,  1.10it/s][A
Iteration:  11%|█▏        | 1016/9016 [15:22<2:

Train loss: 1.5788848806917668



Iteration:  22%|██▏       | 2001/9016 [30:16<2:50:10,  1.46s/it][A

Saving model checkpoint to /drive/My Drive/Medium/checkpoint-2000



Iteration:  22%|██▏       | 2002/9016 [30:17<2:29:17,  1.28s/it][A
Iteration:  22%|██▏       | 2003/9016 [30:18<2:16:17,  1.17s/it][A
Iteration:  22%|██▏       | 2004/9016 [30:19<2:07:17,  1.09s/it][A
Iteration:  22%|██▏       | 2005/9016 [30:19<2:01:06,  1.04s/it][A
Iteration:  22%|██▏       | 2006/9016 [30:20<1:56:28,  1.00it/s][A
Iteration:  22%|██▏       | 2007/9016 [30:21<1:53:20,  1.03it/s][A
Iteration:  22%|██▏       | 2008/9016 [30:22<1:50:58,  1.05it/s][A
Iteration:  22%|██▏       | 2009/9016 [30:23<1:49:16,  1.07it/s][A
Iteration:  22%|██▏       | 2010/9016 [30:24<1:48:10,  1.08it/s][A
Iteration:  22%|██▏       | 2011/9016 [30:25<1:47:37,  1.08it/s][A
Iteration:  22%|██▏       | 2012/9016 [30:26<1:47:28,  1.09it/s][A
Iteration:  22%|██▏       | 2013/9016 [30:27<1:46:44,  1.09it/s][A
Iteration:  22%|██▏       | 2014/9016 [30:28<1:46:13,  1.10it/s][A
Iteration:  22%|██▏       | 2015/9016 [30:28<1:46:00,  1.10it/s][A
Iteration:  22%|██▏       | 2016/9016 [30:29<1:

Train loss: 1.460765794535478



Iteration:  33%|███▎      | 3001/9016 [45:23<2:28:39,  1.48s/it][A

Saving model checkpoint to /drive/My Drive/Medium/checkpoint-3000



Iteration:  33%|███▎      | 3002/9016 [45:24<2:11:27,  1.31s/it][A
Iteration:  33%|███▎      | 3003/9016 [45:24<1:59:10,  1.19s/it][A
Iteration:  33%|███▎      | 3004/9016 [45:25<1:50:43,  1.10s/it][A
Iteration:  33%|███▎      | 3005/9016 [45:26<1:44:33,  1.04s/it][A
Iteration:  33%|███▎      | 3006/9016 [45:27<1:40:19,  1.00s/it][A
Iteration:  33%|███▎      | 3007/9016 [45:28<1:37:25,  1.03it/s][A
Iteration:  33%|███▎      | 3008/9016 [45:29<1:35:06,  1.05it/s][A
Iteration:  33%|███▎      | 3009/9016 [45:30<1:33:26,  1.07it/s][A
Iteration:  33%|███▎      | 3010/9016 [45:31<1:32:20,  1.08it/s][A
Iteration:  33%|███▎      | 3011/9016 [45:32<1:31:41,  1.09it/s][A
Iteration:  33%|███▎      | 3012/9016 [45:33<1:31:15,  1.10it/s][A
Iteration:  33%|███▎      | 3013/9016 [45:33<1:30:56,  1.10it/s][A
Iteration:  33%|███▎      | 3014/9016 [45:34<1:30:53,  1.10it/s][A
Iteration:  33%|███▎      | 3015/9016 [45:35<1:30:39,  1.10it/s][A
Iteration:  33%|███▎      | 3016/9016 [45:36<1:

In [None]:
# Persist the final model together with the loss history; the following cell
# reads training_loss.pt from this same directory.
output_dir = '/drive/My Drive/Medium/checkpoint-final'
os.makedirs(output_dir, exist_ok=True)
model_to_save = model.module if hasattr(model, 'module') else model
model_to_save.save_pretrained(output_dir)
# float() accepts both plain numbers and 0-d tensors, whichever the list holds.
torch.save(torch.tensor([float(value) for value in train_loss_set]), os.path.join(output_dir, 'training_loss.pt'))

In [None]:
# Reload the loss history stored with the final checkpoint and convert it to a
# plain Python list for plotting.
train_loss_set_ckpt = torch.load('/drive/My Drive/Medium/checkpoint-final/training_loss.pt')
train_loss_set = to_list(train_loss_set_ckpt)

In [None]:
# Loss recorded for every training batch, in training order.
fig, ax = plt.subplots(figsize=(15,8))
ax.set_title("Training loss")
ax.set_xlabel("Batch")
ax.set_ylabel("Loss")
ax.plot(train_loss_set)
plt.show()

**Load the evaluation dataset (SQuAD v2.0 dev set)**

In [None]:
# Parse the dev file without answer positions (is_training=False).
input_file = '/drive/My Drive/Medium/dev-v2.0.json'
val_examples = read_squad_examples(
    input_file=input_file,
    is_training=False,
    version_2_with_negative=True,
)
doc_stride = 128
max_seq_length = 256
max_query_length = 64
cached_features_file = '/drive/My Drive/Medium/cache_validation'

# Cache features for faster loading: read the cache when it exists, otherwise
# convert the examples once and store the result.
if os.path.exists(cached_features_file):
  features = torch.load(cached_features_file)
else:
  features = convert_examples_to_features(
      examples=val_examples,
      tokenizer=tokenizer,
      max_seq_length=max_seq_length,
      doc_stride=doc_stride,
      max_query_length=max_query_length,
      is_training=False,
  )
  torch.save(features, cached_features_file)

In [None]:
# Convert to Tensors and build dataset
# Each comprehension gathers one field across all validation features into a single tensor.
all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)

# Row number of each feature; evaluate() uses it to look the feature up in `features`.
all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
# NOTE: this rebinds `dataset`, which previously held the training data.
dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                        all_example_index, all_cls_index, all_p_mask)

In [None]:
# Walk the validation features in order. The final, possibly smaller batch must
# be kept (drop_last=False), otherwise the features in it never receive a
# prediction and are missing when results are matched back to features.
validation_sampler = SequentialSampler(dataset)
validation_dataloader = DataLoader(dataset, sampler=validation_sampler, batch_size=batch_size, drop_last=False)

**Evaluate on the SQuAD v2.0 dev set**

In [None]:

def evaluate(model, tokenizer):
  """Run the model over the validation batches and score the predictions.

  Relies on notebook-level state: `dataset`, `batch_size`,
  `validation_dataloader`, `features`, `val_examples` and `device`.

  Args:
    model: question-answering model, called with input ids, attention mask and
      token type ids; outputs[0] and outputs[1] are used as start and end logits.
    tokenizer: kept for interface compatibility; not used by this function.

  Returns:
    The value returned by `evaluate_on_squad` for the written prediction files.
  """
  print("***** Running evaluation *****")
  print("  Num examples = %d" % len(dataset))
  print("  Batch size = %d" % batch_size)
  all_results = []
  predict_file = '/drive/My Drive/Medium//dev-v2.0.json'

  # Switching to evaluation mode is loop-invariant, so do it once up front.
  model.eval()
  for batch in tqdm(validation_dataloader, desc="Evaluating", miniters=100, mininterval=5.0):
    batch = tuple(t.to(device) for t in batch)
    with torch.no_grad():
      inputs = {'input_ids':      batch[0],
                'attention_mask': batch[1],
                'token_type_ids': batch[2]
                }
      # Position 3 holds each row's index into `features`.
      example_indices = batch[3]
      outputs = model(**inputs)

    for i, example_index in enumerate(example_indices):
      eval_feature = features[example_index.item()]
      unique_id = int(eval_feature.unique_id)

      result = RawResult(unique_id    = unique_id,
                         start_logits = to_list(outputs[0][i]),
                         end_logits   = to_list(outputs[1][i]))
      all_results.append(result)

  # Compute predictions
  output_prediction_file = "/drive/My Drive/Medium/predictions.json"
  output_nbest_file = "/drive/My Drive/Medium/nbest_predictions.json"
  output_null_log_odds_file = "/drive/My Drive/Medium/null_odds.json"

  # NOTE(review): the positional values 10, 30, True, False, True, 0.0 are
  # interpreted by utils_squad.write_predictions — confirm their meaning
  # against that helper's signature.
  write_predictions(val_examples, features, all_results, 10,
                  30, True, output_prediction_file,
                  output_nbest_file, output_null_log_odds_file, False,
                  True, 0.0)

  # Evaluate with the official SQuAD script
  evaluate_options = EVAL_OPTS(data_file=predict_file,
                               pred_file=output_prediction_file,
                               na_prob_file=output_null_log_odds_file,
                               out_image_dir=None)
  results = evaluate_on_squad(evaluate_options)
  return results

In [None]:
# Score the current model on the validation data and keep the returned metrics.
results = evaluate(model, tokenizer)

In [None]:
import json
# One single-entry dict per metric, in the order the results mapping yields its keys.
results_json = [{metric: results[metric]} for metric in results.keys()]
print(results_json)
with open('results.json', 'w') as f:
  json.dump(results_json, f)