# Benchmarking with PyTorch

In [1]:
from transformers import BertModel
# Identifier of the pretrained checkpoint passed to `from_pretrained`.
model_name = "bert-base-cased"

# Instantiate the BERT model from that checkpoint.
model = BertModel.from_pretrained(model_name)

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [2]:
# Move the model to the GPU when CUDA is usable; otherwise keep it on the CPU
# so the cell does not raise on a machine without a GPU.
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
# Switch to evaluation mode before benchmarking.
model.eval()
print('Assigned')

if torch.cuda.is_available():
  print("Cuda Avaliable")

Num GPUs Available:  0


## Greedy


In [3]:
from functools import wraps
import time
import numpy as np

# Number of times the wrapped function is called per measurement.
# The first call is treated as a warm-up and excluded from the average.
average_runs = 4

def timeit(my_func):
    """Decorate ``my_func`` so that calling it returns its mean wall-clock time.

    The decorated function calls ``my_func`` ``average_runs`` times with the
    given arguments, discards the first (warm-up) measurement and returns the
    mean of the remaining ones in seconds. The return value of ``my_func``
    itself is dropped.

    Args:
        my_func: Callable to benchmark.

    Returns:
        A wrapper with the same call signature as ``my_func`` that returns the
        mean elapsed time in seconds as a float.
    """
    @wraps(my_func)
    def timed(*args, **kw):
        time_list = []
        for _ in range(average_runs):
            # perf_counter is monotonic and high resolution, unlike time.time
            tstart = time.perf_counter()
            my_func(*args, **kw)
            time_list.append(time.perf_counter() - tstart)

        # Drop the warm-up run; fall back to all runs if there is only one
        measured = time_list[1:] or time_list
        return np.mean(measured)
    return timed

In [None]:
# Collected timings, keyed by benchmark name.
timing_results = dict()

# Benchmark grid: every batch size is combined with every sequence length.
batch_size_list = [
    1,
    4,
    8,
    16,
    32,
    128,
]
max_seq_length_list = [
    20,
    32,
    128,
    384,
    512,
]

@timeit
def call_sequence(input_tensor):
  """Run one forward pass of ``model`` on ``input_tensor``.

  Because of the ``@timeit`` decorator, calling this returns the averaged
  elapsed time rather than the model output.
  """
  # Inference only: skip autograd bookkeeping so it is not part of the timing
  with torch.no_grad():
    result = model(input_tensor)
  # CUDA kernels are launched asynchronously; wait for them to finish so the
  # measured wall-clock time covers the actual computation
  if input_tensor.is_cuda:
    torch.cuda.synchronize(input_tensor.device)
  return result

# Random token ids are drawn from [minval, maxval); torch.randint excludes the
# upper bound. The bound is read from the loaded model's config (it used to be
# hard-coded as 28996) so ids stay in range if `model_name` is changed.
minval = 1
maxval = model.config.vocab_size

# Maps (batch size, sequence length) -> averaged forward-pass time
timing_holder = {}
for batch in batch_size_list:
  for sequence_length in max_seq_length_list:
    # Random input of shape (batch, sequence_length), created on `device`
    input_tensor = torch.randint(minval, maxval, size=(batch, sequence_length), device=device)
    # call_sequence is wrapped by @timeit, so it returns the mean elapsed time
    average_time_taken = call_sequence(input_tensor)
    timing_holder[(batch, sequence_length)] = average_time_taken
    print("Done batch_size {} sequence length {}".format(batch, sequence_length))


timing_results['hf_pt_forward'] = timing_holder

Done batch_size 1 sequence length 20
Done batch_size 1 sequence length 32
Done batch_size 1 sequence length 128
Done batch_size 1 sequence length 384
Done batch_size 1 sequence length 512
Done batch_size 4 sequence length 20
Done batch_size 4 sequence length 32
Done batch_size 4 sequence length 128
Done batch_size 4 sequence length 384
Done batch_size 4 sequence length 512
Done batch_size 8 sequence length 20
Done batch_size 8 sequence length 32
Done batch_size 8 sequence length 128
Done batch_size 8 sequence length 384
Done batch_size 8 sequence length 512
Done batch_size 16 sequence length 20
Done batch_size 16 sequence length 32
Done batch_size 16 sequence length 128
Done batch_size 16 sequence length 384
Done batch_size 16 sequence length 512
Done batch_size 32 sequence length 20
Done batch_size 32 sequence length 32
Done batch_size 32 sequence length 128
Done batch_size 32 sequence length 384
Done batch_size 32 sequence length 512
Done batch_size 128 sequence length 20


In [None]:

import pickle

# Serialize the collected timings to disk so they can be loaded later.
output_path = "/mnt/home/TF_NEW/bert_sequence_length_benchmark/tf_pt_bert_benchmark_gpu.pkl"
with open(output_path, "wb") as handle:
    pickle.dump(timing_results, handle)