# Introduction

1. In this code challenge you are expected to run a PyTorch classifier (DistilBERT), and to perform and report on various optimizations.
2. Some starter code will be provided using HuggingFace's transformers library, but you're more than welcome to swap in code from a different framework if you are more familiar with it.
3. Make sure to set the "Runtime" type of the COLAB to use the GPU.

In [None]:
!pip install transformers==4.4.2

We now import a tokenizer and model, and set up the config.

In [None]:
from transformers import AutoConfig, AutoTokenizer, AutoModel, AutoModelForSequenceClassification, Trainer, TrainingArguments

# Tokenizer and 2-label classification config for DistilBERT.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
config = AutoConfig.from_pretrained('distilbert-base-uncased', num_labels=2)
# Load the pretrained DistilBERT weights together with the 2-label config.
# `from_config` only builds the architecture (weights are randomly
# initialised), which would turn the fine-tuning run below into training
# from scratch.
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', config=config)

We use the well-known Stanford Sentiment Treebank (SST-2) dataset, which has two labels, "positive" and "negative". You can find more about the dataset here: [kaggle version of sst-2](https://www.kaggle.com/atulanandjha/stanford-sentiment-treebank-v2-sst2).

In [None]:
%%capture
!pip install torch
!pip install scikit-learn
!pip install datasets==1.5
from datasets import load_dataset
# Load the GLUE SST-2 dataset (no `split` is specified here).
# NOTE(review): `sst_dataset` is not referenced again in the visible cells;
# the training cell loads the 'train' split separately.
sst_dataset = load_dataset('glue', 'sst2')

In [None]:
!pip install tqdm==4.41.1

In [None]:
import torch
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Training
We set up code now to train the model.

In [None]:
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm_notebook
from transformers import AdamW

# Load the GLUE SST-2 training split and hold out 20% of it as a test set.
dataset = load_dataset('glue', 'sst2', split='train')

# NOTE(review): no seed is passed, so the split presumably differs between
# runs -- confirm whether reproducibility matters here.
split_dataset = dataset.train_test_split(test_size=0.2)

train = split_dataset["train"]
test = split_dataset["test"]

# Set your batch size to reasonable values here
batch_size = 8
# Tokenize the training sentences. The map batch size equals the split size,
# so the tokenizer receives the whole split in one call; with
# padding='longest' that presumably pads every example to the longest
# sentence in the split (truncated at 256 tokens) -- TODO confirm.
train_tokenized = train.map(lambda batch: tokenizer(batch["sentence"], truncation=True, padding='longest', max_length=256), batched=True,
                            batch_size = len(train))
# Rename "label" to "labels" in place and expose only the model inputs as
# torch tensors.
train_tokenized.rename_column_("label", "labels")
train_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

# Same preprocessing for the held-out split.
test_tokenized = test.map(lambda batch: tokenizer(batch["sentence"], truncation=True, padding='longest', max_length=256), batched=True, 
                          batch_size = len(test))
test_tokenized.rename_column_("label", "labels")
test_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

# Move the model to the selected device and switch it to training mode.
model.to(device)
model.train()

# NOTE(review): these loaders are not referenced again in the visible cells;
# the Trainer below is given the tokenized datasets directly.
train_loader = DataLoader(train_tokenized, shuffle=False, batch_size=batch_size)
test_loader = DataLoader(test_tokenized, shuffle=False, batch_size=batch_size)

# NOTE(review): `eval_steps` is set but no evaluation strategy is passed --
# confirm whether periodic evaluation actually runs with these arguments.
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total # of training epochs
    per_device_train_batch_size=batch_size,  # batch size per device during training
    per_device_eval_batch_size=batch_size,   # batch size for evaluation
    save_total_limit=2,
    learning_rate=1e-5,
    eval_steps=250,
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_tokenized,         # training dataset
    eval_dataset=test_tokenized            # evaluation dataset
)

# Run training, then switch the model to evaluation mode for the inference
# cells that follow.
trainer.train()
print("Training finished")
model.eval()


# Inference
We set up code now for running inference.



In [None]:
import numpy as np

# Maps the classifier's output index to a human-readable label.
int_to_labels = {0: "negative", 1: "positive"}
# You will need to tweak this code slightly, to return confidence as well as the predicted label.

def predict_batch(texts, trained_model, tokenizer):
  """Classify a batch of sentences and return their predicted label strings.

  Args:
    texts: list of input sentences.
    trained_model: the classifier to evaluate. Called with `input_ids` and
      `attention_mask` keyword arguments; element 0 of its output is used
      as the per-class logits.
    tokenizer: callable that turns `texts` into padded 'pt' tensors.

  Returns:
    A list with one label from `int_to_labels` per input sentence
    (an empty list when `texts` is empty).
  """
  if not texts:
    return []
  tokenized = tokenizer(texts, truncation=True, padding='longest', max_length=256, return_tensors='pt')
  # Send the inputs to wherever the model's weights live rather than relying
  # on a module-level `device` that may have been rebound since the model
  # was moved.
  first_param = next(trained_model.parameters(), None)
  if first_param is not None:
    tokenized = tokenized.to(first_param.device)
  # Use the model that was passed in (not the global `model`), and skip
  # autograd bookkeeping since this is inference only.
  with torch.no_grad():
    outputs = trained_model(input_ids = tokenized["input_ids"], attention_mask = tokenized["attention_mask"])
  int_predictions = outputs[0].argmax(dim=1).tolist()
  return [int_to_labels[prediction] for prediction in int_predictions]

# Smoke-test call on five short example sentences.
predict_batch(["I am happy", "I love life", "I am sad", "I am disappointed.", "I am joyful."], model, tokenizer)

# Accuracy
We compute the accuracy of the model on the held-out test split. Notice that we also introduce a `batch` function that breaks the dataset into smaller chunks that fit in memory when the model evaluates.

In [None]:
# Collect the held-out sentences and their gold labels (as label strings),
# in dataset order.
sentences = [item['sentence'] for item in test]
labels = [int_to_labels[item['label']] for item in test]

def batch(input_list, batch_size):
    """Yield consecutive slices of `input_list`, each at most `batch_size` long."""
    total = len(input_list)
    starts = range(0, total, batch_size)
    yield from (input_list[start:start + batch_size] for start in starts)

# Predict the held-out sentences 16 at a time and gather the label strings.
results = []
for curr_batch in tqdm_notebook(batch(sentences, 16)):
  results += predict_batch(curr_batch, model, tokenizer)

# Count predictions that match the gold labels and report the percentage.
num_correct = sum(1 for result, label in zip(results, labels) if result == label)
accuracy_pct = round(100*num_correct / len(results), 2)
print(f"Accuracy is {accuracy_pct}%")

# Save / load

You may save and load the model by using the `save_pretrained` and `from_pretrained` methods of Hugging Face models, as follows.

In [None]:
# Save the trained model to a local directory, then load it back from there.
# NOTE(review): `loaded_model` is not referenced again in the visible cells.
directory_name = "my_sst2_tuned_model"
model.save_pretrained(directory_name)
loaded_model = AutoModelForSequenceClassification.from_pretrained(directory_name)

# Qs 1: Inference
Modify the predict_batch function, so that it returns confidences in addition to the labels.

In [None]:
import numpy as np
# Maps the classifier's output index to a human-readable label.
int_to_labels = {0: "negative", 1: "positive"}

# You will need to tweak this code slightly, to return confidence as well as the predicted label.
# NOTE(review): this rebinds the module-level `device` to the CPU for every
# later cell; the function below no longer depends on it.
device = torch.device('cpu')
def predict_batch_confidence(texts, trained_model, tokenizer):
  """Classify a batch of sentences and return (label, confidence) pairs.

  Args:
    texts: list of input sentences.
    trained_model: the classifier to evaluate. Called with `input_ids` and
      `attention_mask` keyword arguments; element 0 of its output is used
      as the per-class logits.
    tokenizer: callable that turns `texts` into padded 'pt' tensors.

  Returns:
    A list of `(label, confidence)` tuples, one per input sentence, where
    `label` comes from `int_to_labels` and `confidence` is the softmax
    probability of that predicted class. Empty when `texts` is empty.
  """
  if not texts:
    return []
  tokenized = tokenizer(texts, truncation=True, padding='longest', max_length=256, return_tensors='pt')
  # Send the inputs to the device holding the model's weights; forcing them
  # onto the CPU fails when the model is still on the GPU.
  first_param = next(trained_model.parameters(), None)
  if first_param is not None:
    tokenized = tokenized.to(first_param.device)
  # Use the model that was passed in (not the global `model`), without
  # autograd bookkeeping since this is inference only.
  with torch.no_grad():
    outputs = trained_model(input_ids = tokenized["input_ids"], attention_mask = tokenized["attention_mask"])
  logits = outputs[0]
  probabilities = torch.softmax(logits, dim=1)
  int_predictions = torch.argmax(logits, dim=1)
  # Pick, for each row, the probability of the class that was predicted.
  confidences = probabilities.gather(1, int_predictions.unsqueeze(1)).squeeze(1)
  return [
    (int_to_labels[prediction], confidence)
    for prediction, confidence in zip(int_predictions.tolist(), confidences.tolist())
  ]
    
  

# Try the confidence-returning predictor on five short example sentences.
predict_batch_confidence(["I am happy", "I love life", "I am sad", "I am disappointed.", "I am joyful."], model, tokenizer)

# Qs 2: Optimization
Graph the throughput (inferences per second) you can get with varying `batch_size`.


In [None]:
!pip install matplotlib
!pip install datasets==1.5

In [None]:
import time

# Measure inference throughput (samples per second) for several batch sizes.
results = []
vary_batch = [8, 16, 32, 64]
# Feed inputs to the device the model's weights are on, so the measurement
# does not depend on what the module-level `device` currently points to.
model_device = next(model.parameters()).device
for size in vary_batch:
  print("starting on:", size)
  test_batch = DataLoader(train_tokenized, shuffle=False, batch_size=size)
  sample_count = 0
  start = time.time()
  # Inference only: skip autograd bookkeeping so it does not distort timing
  # or memory use.
  with torch.no_grad():
    # The loop variable is deliberately not called `batch`, which would
    # overwrite the `batch()` helper defined in the accuracy cell.
    for timed_batch in test_batch:
      timed_batch = {key: val.to(model_device) for key, val in timed_batch.items()}
      model(input_ids = timed_batch["input_ids"], attention_mask = timed_batch["attention_mask"])
      # Count real samples so a short final batch is not overcounted.
      sample_count += timed_batch["input_ids"].shape[0]
      if time.time() - start > 60: # avoid running the model through the whole dataset
        break
  if model_device.type == "cuda":
    # Wait for queued GPU work to finish before stopping the clock.
    torch.cuda.synchronize()
  end = time.time()
  batch_result = sample_count / (end - start)
  results.append(batch_result)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
# Print the measured throughput values, then plot them against batch size.
print(results)
plt.plot(vary_batch, results)
plt.xlabel("batch size")
plt.ylabel("inferences/second")
plt.title("Throughput by batch size")
plt.show()

NOTE: It looks like inferences per second increase as the batch size increases, which is very common.

# Qs 3: Quantization
Can you quantize the model? What accuracy do you lose / gain by doing so, and what is the impact on throughput? You may find this tutorial helpful: https://pytorch.org/tutorials/intermediate/dynamic_quantization_bert_tutorial.html

In [None]:
#copy from the link above
import torch
# Move the model to the CPU before quantizing it.
# NOTE(review): presumably because dynamic quantization targets CPU
# execution -- confirm against the linked tutorial.
model=model.cpu()
# Dynamically quantize the model's torch.nn.Linear modules to int8 (qint8).
quantized_model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)

In [None]:
import os
def print_size_of_model(model):
    """Print the serialized size of `model`'s state dict in megabytes.

    The state dict is written to a file inside a private temporary
    directory, so an existing "temp.p" in the working directory is never
    overwritten or deleted, and the scratch file is removed even if saving
    or measuring raises.

    Args:
        model: object exposing `state_dict()` (e.g. a torch.nn.Module).
    """
    import tempfile

    with tempfile.TemporaryDirectory() as scratch_dir:
        weights_path = os.path.join(scratch_dir, "temp.p")
        torch.save(model.state_dict(), weights_path)
        print('Size (MB):', os.path.getsize(weights_path)/1e6)

# Compare the serialized size of the original and the quantized model.
print_size_of_model(model)
print_size_of_model(quantized_model)

In [None]:
#copied from above
def accuracy(model, temp_device):
  """Compute, print and return `model`'s accuracy on the held-out `test` split.

  Predictions are made here with the model that was passed in, rather than
  through a helper that may fall back to the module-level `model`, so the
  reported number belongs to the model actually being evaluated.

  Args:
    model: classifier to evaluate. Called with `input_ids` and
      `attention_mask`; element 0 of its output is used as the logits.
    temp_device: device the tokenized inputs are moved to before the forward
      pass; the model is expected to already be on this device.

  Returns:
    Accuracy as a percentage (float, unrounded).

  Raises:
    ZeroDivisionError: if the `test` split is empty.
  """
  sentences = [item['sentence'] for item in test]
  labels = [int_to_labels[item['label']] for item in test]

  def batch(input_list, batch_size):
      # Yield consecutive slices of at most `batch_size` items.
      for i in range(0, len(input_list), batch_size):
          yield input_list[i:i + batch_size]

  def predict_labels(texts):
    # Tokenize one chunk, run the given model on `temp_device`, and map the
    # argmax of the logits to label strings.
    tokenized = tokenizer(texts, truncation=True, padding='longest', max_length=256, return_tensors='pt')
    tokenized = tokenized.to(temp_device)
    with torch.no_grad():
      outputs = model(input_ids = tokenized["input_ids"], attention_mask = tokenized["attention_mask"])
    return [int_to_labels[prediction] for prediction in outputs[0].argmax(dim=1).tolist()]

  results = []
  for curr_batch in tqdm_notebook(batch(sentences, 16)):
    results.extend(predict_labels(curr_batch))
  num_correct = sum(result == label for result, label in zip(results, labels))
  accuracy_score = 100*num_correct / len(results)
  print(f"Accuracy is {round(accuracy_score, 2)}%")
  return accuracy_score

In [None]:
# Evaluate the quantized model on the held-out split, passing the current
# module-level `device`.
print("Quantized Model Accuracy")
quant_acc = accuracy(quantized_model, device)

NOTE: Accuracy is the same

In [None]:
#measure throughput with the reuse code from Qs2
def throughput(model, device, batch_sizes=(8, 16, 32, 64), time_limit=60):
  """Measure and plot inference throughput of `model` for several batch sizes.

  For each batch size, batches of `train_tokenized` are fed through the
  model until the data is exhausted or `time_limit` seconds have passed.

  Args:
    model: classifier called with `input_ids` and `attention_mask`.
    device: device the input tensors are moved to; the model is expected to
      already be on it.
    batch_sizes: batch sizes to measure (defaults to the original 8-64 sweep).
    time_limit: per-batch-size time budget in seconds (default 60).

  Returns:
    A list of samples-per-second values, one per entry of `batch_sizes`.
  """
  device = torch.device(device)
  vary_batch = list(batch_sizes)
  results = []
  for size in vary_batch:
    print("starting on:", size)
    test_batch = DataLoader(train_tokenized, shuffle=False, batch_size=size)
    sample_count = 0
    start = time.perf_counter()
    # Inference only: skip autograd bookkeeping so it does not distort the
    # timing or memory use.
    with torch.no_grad():
      for batch in test_batch:
        batch = {key: val.to(device) for key, val in batch.items()}
        model(input_ids = batch["input_ids"], attention_mask = batch["attention_mask"])
        # Count real samples so a short final batch is not overcounted.
        sample_count += batch["input_ids"].shape[0]
        if time.perf_counter() - start > time_limit: # avoid running the model through the whole dataset
          break
    if device.type == "cuda":
      # Wait for queued GPU work to finish before stopping the clock.
      torch.cuda.synchronize()
    end = time.perf_counter()
    results.append(sample_count / (end - start))
  plt.plot(vary_batch, results)
  plt.xlabel("batch size")
  plt.ylabel("inferences/second")
  plt.title("Throughput by batch size")
  plt.show()
  return results

In [None]:

# Run the throughput sweep for the original and the quantized model, passing
# the same `device` to both.
print("Original Model Throughtput")
org_results = throughput(model, device)
print("Quantized Model Throughtput")
quant_result = throughput(quantized_model, device)

After quantization, the model's accuracy was the same as the original model's. However, the quantized model uses only about half the memory of the original model. The quantized model's throughput also decreases as the batch size increases, which was not the case for the original model.

In [None]:
#save quantized model
# Save the quantized model; this rebinds `directory_name` from the earlier
# save/load cell.
# NOTE(review): verify that a dynamically quantized model saved this way can
# be restored with `from_pretrained` -- not demonstrated in this notebook.
directory_name = "my_sst2_quantized_model"
quantized_model.save_pretrained(directory_name)