## Author of this notebook : Ramsri Goutham Golla

Linkedin: https://www.linkedin.com/in/ramsrig/

Twitter : https://twitter.com/ramsri_goutham

## 1. T5 question generation model

In [None]:
# !pip install --quiet transformers==4.5.0
# We are installing this specific commit of transformers because this adds support for exporting of t5 to onnx for FastT5 library.
# https://github.com/huggingface/transformers/commit/5c00918681d6b4027701eb46cea8f795da0d4064
!pip install --quiet git+https://github.com/huggingface/transformers.git@5c00918681d6b4027701eb46cea8f795da0d4064
!pip install --quiet sentencepiece==0.1.95

In [None]:
!pip install --quiet ipython-autotime
%load_ext autotime

time: 332 µs (started: 2021-05-18 13:29:12 +00:00)


In [None]:
from transformers import T5ForConditionalGeneration,T5Tokenizer

# Checkpoint used for question generation; the T5 model is ~900 MB on disk.
QUESTION_MODEL_NAME = 'ramsrigouthamg/t5_squad_v1'

# Model and tokenizer are both loaded from the same checkpoint id.
question_model = T5ForConditionalGeneration.from_pretrained(QUESTION_MODEL_NAME)
question_tokenizer = T5Tokenizer.from_pretrained(QUESTION_MODEL_NAME)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1208.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=891695056.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=791656.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1786.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1856.0, style=ProgressStyle(description…


time: 46.1 s (started: 2021-05-18 13:29:18 +00:00)


In [None]:
def get_question(sentence,answer,mdl,tknizer):
  """Generate a question about `sentence` whose answer is `answer`.

  Args:
    sentence: Context text the question should be about.
    answer: Answer span / keyword the question should target.
    mdl: Object exposing `generate(...)`; it is called with the encoded
      input and fixed beam-search settings.
    tknizer: Object exposing `encode_plus(...)` and `decode(...)`.

  Returns:
    The first generated sequence, decoded, with a leading "question:" label
    removed and surrounding whitespace stripped.
  """
  text = "context: {} answer: {}".format(sentence,answer)
  print (text)
  max_len = 256
  # `padding=False` is the current spelling of the deprecated `pad_to_max_length=False`.
  encoding = tknizer.encode_plus(text,max_length=max_len, padding=False,truncation=True, return_tensors="pt")

  input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"]

  outs = mdl.generate(input_ids=input_ids,
                      attention_mask=attention_mask,
                      early_stopping=True,
                      num_beams=5,
                      num_return_sequences=1,
                      no_repeat_ngram_size=2,
                      max_length=300)

  # Only one sequence is requested (num_return_sequences=1), so decode just the first.
  question = tknizer.decode(outs[0],skip_special_tokens=True).strip()

  # Drop only a leading "question:" label. A blanket str.replace would also delete
  # the text "question:" if it appeared inside the generated question itself.
  prefix = "question:"
  if question.startswith(prefix):
    question = question[len(prefix):]
  return question.strip()


# Sample input: a one-sentence context and the answer the question should target.
context = "Elon Musk said that Tesla will not accept payments in Bitcoin because of environmental concerns."
answer = "Elon Musk"

# Run the PyTorch model once and show the generated question.
ques = get_question(
  sentence=context,
  answer=answer,
  mdl=question_model,
  tknizer=question_tokenizer,
)
print ("question: ",ques)



context: Elon Musk said that Tesla will not accept payments in Bitcoin because of environmental concerns. answer: Elon Musk




question:  Who said that Tesla would not accept Bitcoin payments?
time: 1.89 s (started: 2021-05-18 13:31:48 +00:00)


## 2. First taste of production deployment: creating a UI with a Gradio app
https://www.gradio.app/

In [None]:
!pip install --quiet gradio==1.6.4

[K     |████████████████████████████████| 1.1MB 9.3MB/s 
[K     |████████████████████████████████| 215kB 36.9MB/s 
[K     |████████████████████████████████| 71kB 8.6MB/s 
[K     |████████████████████████████████| 3.2MB 38.6MB/s 
[K     |████████████████████████████████| 962kB 49.8MB/s 
[?25h  Building wheel for Flask-BasicAuth (setup.py) ... [?25l[?25hdone
  Building wheel for ffmpy (setup.py) ... [?25l[?25hdone
  Building wheel for flask-cachebuster (setup.py) ... [?25l[?25hdone
time: 10.3 s (started: 2021-05-18 13:32:07 +00:00)


In [None]:
import gradio as gr

# The widgets get their own names so that the sample `context` / `answer` strings
# defined earlier are not rebound to Gradio components.
context_input = gr.inputs.Textbox(lines=5, placeholder="Enter paragraph/context here...")
answer_input = gr.inputs.Textbox(lines=3, placeholder="Enter answer/keyword here...")
question_output = gr.outputs.Textbox( type="auto", label="Question")

def generate_question(context,answer):
  """Gradio callback: pass the two textbox values to `get_question` with the loaded T5 model and tokenizer."""
  return get_question(context,answer,question_model,question_tokenizer)

iface = gr.Interface(
  fn=generate_question,
  inputs=[context_input,answer_input],
  outputs=question_output)
iface.launch(debug=False)

## 3. Convert the T5 PyTorch model to ONNX format and quantize it using the FastT5 library

https://github.com/Ki6an/fastT5

Reduce the T5 model size by 3X and increase inference speed by up to 5X.

In [None]:
rm -f -r models/

time: 152 ms (started: 2021-05-18 13:32:55 +00:00)


In [None]:
!pip install onnx==1.9.0
!pip install onnxruntime==1.7.0
!pip install fastt5==0.0.4

In [None]:
# Restart runtime by code.
# Sends signal 9 to this notebook's own process, which terminates it so the
# runtime starts fresh. Presumably this is needed so the packages installed in the
# previous cell are picked up on import -- TODO confirm.
# Anything defined in earlier cells is gone afterwards; the cells below
# re-import and re-define what they need.
import os
os.kill(os.getpid(), 9)

In [None]:
from fastT5 import export_and_get_onnx_model,generate_onnx_representation,quantize
from transformers import T5Config,AutoTokenizer

# Checkpoint id of the question-generation model to export.
trained_model_path = 'ramsrigouthamg/t5_squad_v1'

# Step 1. Convert the Hugging Face T5 model to ONNX.
# Later cells look for the exported files under models/.
onnx_model_paths = generate_onnx_representation(trained_model_path)

# Step 2. (recommended) Quantize the converted model for faster inference and a smaller model size.
quant_model_paths = quantize(onnx_model_paths)

# Load the tokenizer and config of the same checkpoint; the next cell saves them into models/.
tokenizer_onnx = AutoTokenizer.from_pretrained(trained_model_path)
config = T5Config.from_pretrained(trained_model_path)



[KExporting to onnx... |################################| 3/3
[KQuantizing... |################################| 3/3
[?25h

In [None]:
# Store the tokenizer and the model config in the models folder as well.
for artifact in (tokenizer_onnx, config):
  artifact.save_pretrained('models/')

**Remove the non-quantized ONNX files – they are not needed here**

In [None]:
rm -f -r models/*decoder.onnx

In [None]:
rm -f -r models/*encoder.onnx

In [None]:
# Show the total on-disk size of the models directory after removing the non-quantized files.
!du -sh models

404M	models


## 4. Onnx Inference

In [None]:
!pip install --quiet fastt5==0.0.4

In [None]:
!pip install --quiet ipython-autotime
%load_ext autotime

In [None]:
from fastT5 import get_onnx_model,get_onnx_runtime_sessions,OnnxT5
from transformers import AutoTokenizer
from pathlib import Path
import os

# Folder holding the quantized ONNX files plus the saved tokenizer and config.
trained_model_path = './models/'

# File-name prefix shared by the exported ONNX files.
pretrained_model_name = "t5_squad_v1"

# File names of the three quantized graphs, in the order encoder, decoder, init-decoder.
model_file_names = (
  f"{pretrained_model_name}-encoder-quantized.onnx",
  f"{pretrained_model_name}-decoder-quantized.onnx",
  f"{pretrained_model_name}-init-decoder-quantized.onnx",
)

model_paths = tuple(
  os.path.join(trained_model_path, file_name) for file_name in model_file_names
)
encoder_path, decoder_path, init_decoder_path = model_paths

# Create the runtime sessions and wrap them in the fastT5 model object.
model_sessions = get_onnx_runtime_sessions(model_paths)
model = OnnxT5(trained_model_path, model_sessions)

tokenizer = AutoTokenizer.from_pretrained(trained_model_path)

In [None]:
# Defined again here because the runtime restart in section 3 discards earlier definitions.
def get_question(sentence,answer,mdl,tknizer):
  """Generate a question about `sentence` whose answer is `answer`.

  Args:
    sentence: Context text the question should be about.
    answer: Answer span / keyword the question should target.
    mdl: Object exposing `generate(...)`; it is called with the encoded
      input and fixed beam-search settings.
    tknizer: Object exposing `encode_plus(...)` and `decode(...)`.

  Returns:
    The first generated sequence, decoded, with a leading "question:" label
    removed and surrounding whitespace stripped.
  """
  text = "context: {} answer: {}".format(sentence,answer)
  print (text)
  max_len = 256
  # `padding=False` is the current spelling of the deprecated `pad_to_max_length=False`.
  encoding = tknizer.encode_plus(text,max_length=max_len, padding=False,truncation=True, return_tensors="pt")

  input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"]

  outs = mdl.generate(input_ids=input_ids,
                      attention_mask=attention_mask,
                      early_stopping=True,
                      num_beams=5,
                      num_return_sequences=1,
                      no_repeat_ngram_size=2,
                      max_length=300)

  # Only one sequence is requested (num_return_sequences=1), so decode just the first.
  question = tknizer.decode(outs[0],skip_special_tokens=True).strip()

  # Drop only a leading "question:" label. A blanket str.replace would also delete
  # the text "question:" if it appeared inside the generated question itself.
  prefix = "question:"
  if question.startswith(prefix):
    question = question[len(prefix):]
  return question.strip()


# Same sample input as for the PyTorch model, this time run through the ONNX model.
context = "Elon Musk said that Tesla will not accept payments in Bitcoin because of environmental concerns."
answer = "Elon Musk"

ques = get_question(
  sentence=context,
  answer=answer,
  mdl=model,
  tknizer=tokenizer,
)
print ("question: ",ques)


context: Elon Musk said that Tesla will not accept payments in Bitcoin because of environmental concerns. answer: Elon Musk
question:  Who said that Tesla will not accept Bitcoin payments?


In [None]:
import gradio as gr

# The widgets get their own names so that the sample `context` / `answer` strings
# defined earlier are not rebound to Gradio components.
context_input = gr.inputs.Textbox(lines=5, placeholder="Enter paragraph/context here...")
answer_input = gr.inputs.Textbox(lines=3, placeholder="Enter answer/keyword here...")
question_output = gr.outputs.Textbox( type="auto", label="Question")

def generate_question(context,answer):
  """Gradio callback: pass the two textbox values to `get_question` with the ONNX model and its tokenizer."""
  return get_question(context,answer,model,tokenizer)

iface = gr.Interface(
  fn=generate_question,
  inputs=[context_input,answer_input],
  outputs=question_output)
iface.launch(debug=False)

Colab notebook detected. To show errors in colab notebook, set `debug=True` in `launch()`
This share link will expire in 24 hours. If you need a permanent link, visit: https://gradio.app/introducing-hosted (NEW!)
Running on External URL: https://27133.gradio.app
Interface loading below...


(<Flask 'gradio.networking'>,
 'http://127.0.0.1:7860/',
 'https://27133.gradio.app')