# Intent Recognition with BERT using Keras and TensorFlow 2

In [103]:
!nvidia-smi

Sun May  2 20:27:42 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   46C    P0    27W /  70W |   4506MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [104]:
!pip install tensorflow-gpu >> /dev/null

In [105]:
!pip install --upgrade grpcio >> /dev/null

[31mERROR: tensorflow 2.4.1 has requirement grpcio~=1.32.0, but you'll have grpcio 1.37.1 which is incompatible.[0m
[31mERROR: tensorflow-gpu 2.4.1 has requirement grpcio~=1.32.0, but you'll have grpcio 1.37.1 which is incompatible.[0m


In [106]:
!pip install tqdm  >> /dev/null

In [107]:
!pip install bert-for-tf2 >> /dev/null

In [108]:
!pip install sentencepiece >> /dev/null

In [109]:
import os
import math
import datetime

from tqdm import tqdm

import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow import keras

import bert
from bert import BertModelLayer
from bert.loader import StockBertConfig, map_stock_config_to_params, load_stock_weights
from bert.tokenization.bert_tokenization import FullTokenizer

import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
from matplotlib import rc

from sklearn.metrics import confusion_matrix, classification_report

%matplotlib inline
%config InlineBackend.figure_format='retina'

sns.set(style='whitegrid', palette='muted', font_scale=1.2)

HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]

sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))

rcParams['figure.figsize'] = 12, 8

RANDOM_SEED = 42

np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

# Data

The data contains various user queries categorized into seven intents. It is hosted on [GitHub](https://github.com/snipsco/nlu-benchmark/tree/master/2017-06-custom-intent-engines) and is first presented in [this paper](https://arxiv.org/abs/1805.10190).

In [110]:
!gdown --id 1uRvd8zkyNdl3RA-TXOXciOrK6hmiWXme --output train.csv 
!gdown --id 1y83r_A02DYX3eqHnh9Usn9Ek1vBB0uV_ --output valid.csv 
!gdown --id 1KFO3rnFTisloLVoKmBUCwn8-qPVfLE0J --output test.csv

Downloading...
From: https://drive.google.com/uc?id=1uRvd8zkyNdl3RA-TXOXciOrK6hmiWXme
To: /content/train.csv
100% 3.84k/3.84k [00:00<00:00, 6.01MB/s]
Downloading...
From: https://drive.google.com/uc?id=1y83r_A02DYX3eqHnh9Usn9Ek1vBB0uV_
To: /content/valid.csv
100% 1.01k/1.01k [00:00<00:00, 1.57MB/s]
Downloading...
From: https://drive.google.com/uc?id=1KFO3rnFTisloLVoKmBUCwn8-qPVfLE0J
To: /content/test.csv
100% 1.13k/1.13k [00:00<00:00, 1.80MB/s]


In [111]:
train = pd.read_csv("train.csv")
valid = pd.read_csv("valid.csv")
test = pd.read_csv("test.csv")

In [112]:
train = train.append(valid).reset_index(drop=True)

In [113]:
train.shape

(94, 2)

In [114]:
train.head()

Unnamed: 0,text,intent
0,this is our project,nuetral
1,we are here,nuetral
2,the team is working,nuetral
3,hello,nuetral
4,good morning,nuetral


# Intent Recognition with BERT

In [115]:
!wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip

--2021-05-02 20:28:00--  https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.195.128, 74.125.20.128, 74.125.142.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.195.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 407727028 (389M) [application/zip]
Saving to: ‘uncased_L-12_H-768_A-12.zip.3’


2021-05-02 20:28:03 (146 MB/s) - ‘uncased_L-12_H-768_A-12.zip.3’ saved [407727028/407727028]



In [116]:
!unzip uncased_L-12_H-768_A-12.zip

Archive:  uncased_L-12_H-768_A-12.zip
replace uncased_L-12_H-768_A-12/bert_model.ckpt.meta? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace uncased_L-12_H-768_A-12/bert_model.ckpt.data-00000-of-00001? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace uncased_L-12_H-768_A-12/vocab.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace uncased_L-12_H-768_A-12/bert_model.ckpt.index? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace uncased_L-12_H-768_A-12/bert_config.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: 
error:  invalid response [{ENTER}]
replace uncased_L-12_H-768_A-12/bert_config.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: n


In [None]:
os.makedirs("model", exist_ok=True)

In [None]:
!mv uncased_L-12_H-768_A-12/ model

In [119]:
bert_model_name="uncased_L-12_H-768_A-12"

bert_ckpt_dir = os.path.join("model/", bert_model_name)
bert_ckpt_file = os.path.join(bert_ckpt_dir, "bert_model.ckpt")
bert_config_file = os.path.join(bert_ckpt_dir, "bert_config.json")

## Preprocessing

In [120]:
class IntentDetectionData:
  DATA_COLUMN = "text"
  LABEL_COLUMN = "intent"

  def __init__(self, train, test, tokenizer: FullTokenizer, classes, max_seq_len=192):
    self.tokenizer = tokenizer
    self.max_seq_len = 0
    self.classes = classes
    
    ((self.train_x, self.train_y), (self.test_x, self.test_y)) = map(self._prepare, [train, test])

    print("max seq_len", self.max_seq_len)
    self.max_seq_len = min(self.max_seq_len, max_seq_len)
    self.train_x, self.test_x = map(self._pad, [self.train_x, self.test_x])

  def _prepare(self, df):
    x, y = [], []
    
    for _, row in tqdm(df.iterrows()):
      text, label = row[IntentDetectionData.DATA_COLUMN], row[IntentDetectionData.LABEL_COLUMN]
      tokens = self.tokenizer.tokenize(text)
      tokens = ["[CLS]"] + tokens + ["[SEP]"]
      token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
      self.max_seq_len = max(self.max_seq_len, len(token_ids))
      x.append(token_ids)
      y.append(self.classes.index(label))

    return np.array(x), np.array(y)

  def _pad(self, ids):
    x = []
    for input_ids in ids:
      input_ids = input_ids[:min(len(input_ids), self.max_seq_len - 2)]
      input_ids = input_ids + [0] * (self.max_seq_len - len(input_ids))
      x.append(np.array(input_ids))
    return np.array(x)

In [121]:
tokenizer = FullTokenizer(vocab_file=os.path.join(bert_ckpt_dir, "vocab.txt"))

In [122]:
tokenizer.tokenize("I can't wait to visit Bulgaria again!")

['i', 'can', "'", 't', 'wait', 'to', 'visit', 'bulgaria', 'again', '!']

In [123]:
tokens = tokenizer.tokenize("I can't wait to visit Bulgaria again!")
tokenizer.convert_tokens_to_ids(tokens)

[1045, 2064, 1005, 1056, 3524, 2000, 3942, 8063, 2153, 999]

In [124]:
def create_model(max_seq_len, bert_ckpt_file):

  with tf.io.gfile.GFile(bert_config_file, "r") as reader:
      bc = StockBertConfig.from_json_string(reader.read())
      bert_params = map_stock_config_to_params(bc)
      bert_params.adapter_size = None
      bert = BertModelLayer.from_params(bert_params, name="bert")
        
  input_ids = keras.layers.Input(shape=(max_seq_len, ), dtype='int32', name="input_ids")
  bert_output = bert(input_ids)

  print("bert shape", bert_output.shape)

  cls_out = keras.layers.Lambda(lambda seq: seq[:, 0, :])(bert_output)
  cls_out = keras.layers.Dropout(0.5)(cls_out)
  logits = keras.layers.Dense(units=768, activation="tanh")(cls_out)
  logits = keras.layers.Dropout(0.5)(logits)
  logits = keras.layers.Dense(units=len(classes), activation="softmax")(logits)

  model = keras.Model(inputs=input_ids, outputs=logits)
  model.build(input_shape=(None, max_seq_len))

  load_stock_weights(bert, bert_ckpt_file)
        
  return model

## Training

In [125]:
classes = train.intent.unique().tolist()

data = IntentDetectionData(train, test, tokenizer, classes, max_seq_len=128)

94it [00:00, 3741.54it/s]
20it [00:00, 3515.76it/s]

max seq_len 21





In [126]:
data.train_x.shape

(94, 21)

In [127]:
data.train_x[0]

array([ 101, 2023, 2003, 2256, 2622,  102,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0])

In [128]:
data.train_y[0]

0

In [129]:
data.max_seq_len

21

In [130]:
model = create_model(data.max_seq_len, bert_ckpt_file)

bert shape (None, 21, 768)
Done loading 196 BERT weights from: model/uncased_L-12_H-768_A-12/bert_model.ckpt into <bert.model.BertModelLayer object at 0x7fad0b0e1ad0> (prefix:bert). Count of weights not found in the checkpoint was: [0]. Count of weights with mismatched shape: [0]
Unused weights from checkpoint: 
	bert/embeddings/token_type_embeddings
	bert/pooler/dense/bias
	bert/pooler/dense/kernel
	cls/predictions/output_bias
	cls/predictions/transform/LayerNorm/beta
	cls/predictions/transform/LayerNorm/gamma
	cls/predictions/transform/dense/bias
	cls/predictions/transform/dense/kernel
	cls/seq_relationship/output_bias
	cls/seq_relationship/output_weights


In [131]:
model.compile(
  optimizer=keras.optimizers.Adam(1e-5),
  loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
  metrics=[keras.metrics.SparseCategoricalAccuracy(name="acc")]
)

In [132]:
log_dir = "log/intent_detection/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%s")
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=log_dir)

history = model.fit(
  x=data.train_x, 
  y=data.train_y,
  validation_split=0.1,
  batch_size=16,
  shuffle=True,
  epochs=5,
  callbacks=[tensorboard_callback]
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


## Evaluation

In [133]:
y_pred = model.predict(data.test_x).argmax(axis=-1)

In [134]:
print(classification_report(data.test_y, y_pred, target_names=classes))

              precision    recall  f1-score   support

     nuetral       0.80      0.40      0.53        10
      biased       0.60      0.90      0.72        10

    accuracy                           0.65        20
   macro avg       0.70      0.65      0.63        20
weighted avg       0.70      0.65      0.63        20



In [135]:
cm = confusion_matrix(data.test_y, y_pred)
df_cm = pd.DataFrame(cm, index=classes, columns=classes)

In [136]:
sentences = [
  "hello",
  "He is good"
]

pred_tokens = map(tokenizer.tokenize, sentences)
pred_tokens = map(lambda tok: ["[CLS]"] + tok + ["[SEP]"], pred_tokens)
pred_token_ids = list(map(tokenizer.convert_tokens_to_ids, pred_tokens))

pred_token_ids = map(lambda tids: tids +[0]*(data.max_seq_len-len(tids)),pred_token_ids)
pred_token_ids = np.array(list(pred_token_ids))

predictions = model.predict(pred_token_ids).argmax(axis=-1)

for text, label in zip(sentences, predictions):
  print("text:", text, "\nintent:", classes[label])
  print()

text: hello 
intent: nuetral

text: He is good 
intent: biased



# References

- https://mccormickml.com/2019/07/22/BERT-fine-tuning/
- https://github.com/snipsco/nlu-benchmark/tree/master/2017-06-custom-intent-engines
- https://jalammar.github.io/illustrated-bert/
- https://towardsdatascience.com/bert-for-dummies-step-by-step-tutorial-fb90890ffe03
- https://www.reddit.com/r/MachineLearning/comments/ao23cp/p_how_to_use_bert_in_kaggle_competitions_a/

In [137]:
!pip install flask-ngrok
!pip install pyrebase5



In [None]:
from flask import Flask, request
from flask_ngrok import run_with_ngrok
import pyrebase

firebaseConfig = {
  "apiKey": "AIzaSyBrey3ZZ5X74WrAQuj7HISWLl70PqP8dnA",
  "authDomain": "trialproject-55deb.firebaseapp.com",
  "databaseURL": "https://trialproject-55deb-default-rtdb.firebaseio.com",
  "projectId": "trialproject-55deb",
  "storageBucket": "trialproject-55deb.appspot.com",
  "messagingSenderId": "930590452475",
  "appId": "1:930590452475:web:d8857d9906874468fd5e5e"
}

firebase = pyrebase.initialize_app(firebaseConfig)
db = firebase.database()

app = Flask(__name__)
run_with_ngrok(app)

@app.route('/')
def hello():
  return "hello, world"

@app.route('/get_bias', methods = ['GET', 'POST'])
def get_bias():
  
  get_str = db.child("list").get().val()
  sentences = get_str.split("/")

  pred_tokens = map(tokenizer.tokenize, sentences)
  pred_tokens = map(lambda tok: ["[CLS]"] + tok + ["[SEP]"], pred_tokens)
  pred_token_ids = list(map(tokenizer.convert_tokens_to_ids, pred_tokens))

  pred_token_ids = map(lambda tids: tids +[0]*(data.max_seq_len-len(tids)),pred_token_ids)
  pred_token_ids = np.array(list(pred_token_ids))

  predictions = model.predict(pred_token_ids).argmax(axis=-1)

  intent_list = []

  for text, label in zip(sentences, predictions):
    print("text:", text, "\n intent:", classes[label])
    if classes[label] == "biased":
      intent_list.append(text)
  db.child("result").set(intent_list)
  return {"bias" : intent_list}
#   return{"bias":[
#   "everyone should bring his notebook",
#   "tomorrow we have an Asian speaker ",
#   "job for a woman "
# ]}

app.run()

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)


 * Running on http://3b7d1a7b62f4.ngrok.io
 * Traffic stats available on http://127.0.0.1:4040


127.0.0.1 - - [02/May/2021 20:29:44] "[37mGET / HTTP/1.1[0m" 200 -
127.0.0.1 - - [02/May/2021 20:29:44] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
127.0.0.1 - - [02/May/2021 20:29:45] "[37mGET / HTTP/1.1[0m" 200 -
127.0.0.1 - - [02/May/2021 20:30:00] "[37mGET /get_bias HTTP/1.1[0m" 200 -


text: hi everyone let s get  
 intent: nuetral
text: started ghost us next sunday  
 intent: nuetral
text: everyone bring his notebook tomorrow  
 intent: biased
text: we asian speaker coming lecture  
 intent: nuetral
text: she s good job woman  
 intent: nuetral
text:   
 intent: nuetral
