In [None]:
 # https://s3.amazonaws.com/fast-ai-nlp/yelp_review_polarity_csv.tgz
! pip install bert-for-tf2

Collecting bert-for-tf2
[?25l  Downloading https://files.pythonhosted.org/packages/af/c1/015648a2186b25c6de79d15bec40d3d946fcf1dd5067d1c1b28009506486/bert-for-tf2-0.14.6.tar.gz (40kB)
[K     |████████                        | 10kB 26.5MB/s eta 0:00:01[K     |████████████████                | 20kB 2.9MB/s eta 0:00:01[K     |████████████████████████▏       | 30kB 3.6MB/s eta 0:00:01[K     |████████████████████████████████| 40kB 2.4MB/s 
[?25hCollecting py-params>=0.9.6
  Downloading https://files.pythonhosted.org/packages/a4/bf/c1c70d5315a8677310ea10a41cfc41c5970d9b37c31f9c90d4ab98021fd1/py-params-0.9.7.tar.gz
Collecting params-flow>=0.8.0
  Downloading https://files.pythonhosted.org/packages/a9/95/ff49f5ebd501f142a6f0aaf42bcfd1c192dc54909d1d9eb84ab031d46056/params-flow-0.8.2.tar.gz
Building wheels for collected packages: bert-for-tf2, py-params, params-flow
  Building wheel for bert-for-tf2 (setup.py) ... [?25l[?25hdone
  Created wheel for bert-for-tf2: filename=bert_for_tf2

In [None]:
import pandas as pd
import numpy as np
import os
import math
import datetime
from tqdm import tqdm
import requests
import tensorflow as tf
import tarfile
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import matplotlib as mp
from sklearn.metrics import confusion_matrix,classification_report
import keras
# imports for bert
import bert
from bert import BertModelLayer
from bert.loader import StockBertConfig,map_stock_config_to_params,load_stock_weights
from bert.tokenization.bert_tokenization import FullTokenizer


In [None]:
# Download the Yelp review polarity archive and unpack it under /content.
url = 'https://s3.amazonaws.com/fast-ai-nlp/yelp_review_polarity_csv.tgz'
archive_path = '/content/text.tgz'

r = requests.get(url, allow_redirects=True)
# Fail loudly on an HTTP error instead of saving an error page as the archive.
r.raise_for_status()

# Write to the same absolute path that is opened below (a cwd-relative path
# only matches when the working directory happens to be /content), and close
# the handle deterministically.
with open(archive_path, 'wb') as archive_file:
  archive_file.write(r.content)

# The context manager closes the tarfile even if extraction raises.
with tarfile.open(archive_path) as my_tar:
  my_tar.extractall('/content') # specify which folder to extract to


In [None]:
# Show the dataset readme; the context manager closes the file handle once
# the contents have been printed.
with open("/content/yelp_review_polarity_csv/readme.txt", "r") as f:
  print(f.read())

Yelp Review Polarity Dataset

Version 1, Updated 09/09/2015

ORIGIN

The Yelp reviews dataset consists of reviews from Yelp. It is extracted from the Yelp Dataset Challenge 2015 data. For more information, please refer to http://www.yelp.com/dataset_challenge

The Yelp reviews polarity dataset is constructed by Xiang Zhang (xiang.zhang@nyu.edu) from the above dataset. It is first used as a text classification benchmark in the following paper: Xiang Zhang, Junbo Zhao, Yann LeCun. Character-level Convolutional Networks for Text Classification. Advances in Neural Information Processing Systems 28 (NIPS 2015).


DESCRIPTION

The Yelp reviews polarity dataset is constructed by considering stars 1 and 2 negative, and 3 and 4 positive. For each polarity 280,000 training samples and 19,000 testing samples are take randomly. In total there are 560,000 trainig samples and 38,000 testing samples. Negative polarity is class 1, and positive class 2.

The files train.csv and test.csv contain all the

In [None]:
# Load the header-less CSVs (first column is the label, second the review
# text) and put the columns in (text, intent) order.
train = pd.read_csv(
    '/content/yelp_review_polarity_csv/train.csv',
    header=None,
    names=['intent', 'text'],
)
test = pd.read_csv(
    '/content/yelp_review_polarity_csv/test.csv',
    header=None,
    names=['intent', 'text'],
)

train = train[['text', 'intent']]
test = test[['text', 'intent']]


In [None]:
# Fetch the pre-trained uncased BERT checkpoint (L-12, H-768, A-12) and unpack
# it into ./uncased_L-12_H-768_A-12/ (vocab.txt, bert_config.json, ckpt files).
!wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
!unzip uncased_L-12_H-768_A-12.zip

--2020-09-24 16:20:37--  https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.195.128, 74.125.142.128, 74.125.20.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.195.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 407727028 (389M) [application/zip]
Saving to: ‘uncased_L-12_H-768_A-12.zip’


2020-09-24 16:20:40 (125 MB/s) - ‘uncased_L-12_H-768_A-12.zip’ saved [407727028/407727028]

Archive:  uncased_L-12_H-768_A-12.zip
   creating: uncased_L-12_H-768_A-12/
  inflating: uncased_L-12_H-768_A-12/bert_model.ckpt.meta  
  inflating: uncased_L-12_H-768_A-12/bert_model.ckpt.data-00000-of-00001  
  inflating: uncased_L-12_H-768_A-12/vocab.txt  
  inflating: uncased_L-12_H-768_A-12/bert_model.ckpt.index  
  inflating: uncased_L-12_H-768_A-12/bert_config.json  


In [None]:
# Build the tokenizer from the vocabulary shipped with the downloaded
# BERT checkpoint.
tokenizer = FullTokenizer(vocab_file='/content/uncased_L-12_H-768_A-12/vocab.txt')

In [None]:
# Sanity check: show how the tokenizer splits a sample sentence into tokens.
tokenizer.tokenize("I can't wait to exercise!")

['i', 'can', "'", 't', 'wait', 'to', 'exercise', '!']

In [None]:
# Sanity check: tokenize a sample sentence, then map each token to its
# vocabulary id (the repeated word "wait" yields a repeated id in the output).
tokens=tokenizer.tokenize("I can't wait wait to exercise!")
tokenizer.convert_tokens_to_ids(tokens)

[1045, 2064, 1005, 1056, 3524, 3524, 2000, 6912, 999]

In [None]:
# Location of the stock BERT configuration inside the unpacked checkpoint.
bert_config_file = '/content/uncased_L-12_H-768_A-12/bert_config.json'

# Preprocessing

In [None]:
def preprocess_input(x, max_len=200):
  """Encode one row's text as a fixed-length list of BERT token ids.

  The text is tokenized with the module-level `tokenizer`, wrapped in
  [CLS] ... [SEP], truncated so the result never exceeds `max_len`
  (keeping the trailing [SEP]), and right-padded with 0 up to `max_len`.

  The previous version had its length test inverted, so short inputs were
  never padded and long inputs were never truncated.

  Args:
    x: mapping/row exposing the review under the 'text' key.
    max_len: total output length, including [CLS] and [SEP].

  Returns:
    A list of exactly `max_len` ints.

  Raises:
    ValueError: if `max_len` leaves no room for [CLS] and [SEP].
  """
  if max_len < 2:
    raise ValueError("max_len must be at least 2 to hold [CLS] and [SEP]")

  token_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(x['text']))
  cls_id = tokenizer.convert_tokens_to_ids(["[CLS]"])
  sep_id = tokenizer.convert_tokens_to_ids(["[SEP]"])

  # Reserve two slots for the special tokens so [SEP] survives truncation.
  k = cls_id + token_ids[:max_len - 2] + sep_id
  # Right-pad with 0; the multiplier is 0 when k is already max_len long.
  return k + [0] * (max_len - len(k))

In [None]:
def preprocess_input_2(x):
  """Turn a row's 'text' into exactly 200 token ids for BERT.

  Layout: [CLS] id, up to 198 ids for the text, [SEP] id, then 0-padding
  on the right until the list is 200 long. Uses the module-level
  `tokenizer`.
  """
  # Keep at most 198 text ids so the two special tokens still fit in 200.
  text_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(x['text']))[:198]

  cls_ids = tokenizer.convert_tokens_to_ids(["[CLS]"])
  sep_ids = tokenizer.convert_tokens_to_ids(["[SEP]"])
  ids = cls_ids + text_ids + sep_ids

  # A non-positive multiplier yields an empty list, so full-length
  # sequences are left as they are.
  ids += [0] * (200 - len(ids))
  return ids

In [None]:
#train_1=pd.DataFrame(train.iloc[:1000,:].apply(lambda x: pd.Series([tokenizer.convert_tokens_to_ids(["[CLS]"]+tokenizer.tokenize(x['text'])+["[SEP]"]),x['intent']], index=['text', 'intent']), axis=1))

In [None]:
# Encode the first 50,000 training rows: 'text' becomes the fixed-length id
# list produced by preprocess_input_2, 'intent' is carried over unchanged.
train_subset = train.iloc[:50000, :]
train_processed = pd.DataFrame(
    train_subset.apply(
        lambda row: pd.Series(
            [preprocess_input_2(row), row['intent']],
            index=['text', 'intent'],
        ),
        axis=1,
    )
)

In [None]:
# Encode the first 50 test rows the same way as the training rows:
# 'text' -> fixed-length id list, 'intent' carried over unchanged.
test_subset = test.iloc[:50, :]
test_processed = pd.DataFrame(
    test_subset.apply(
        lambda row: pd.Series(
            [preprocess_input_2(row), row['intent']],
            index=['text', 'intent'],
        ),
        axis=1,
    )
)

In [None]:
# Spot check: length of the encoded id list in the first test row
# (row 0, column 0 is the 'text' column).
len(test_processed.iloc[0,0])

200

# Model Training

In [None]:
# Read the stock BERT config JSON; only the file read needs the open handle.
with tf.io.gfile.GFile(bert_config_file, "r") as reader:
  config_json = reader.read()

# Convert the stock config into bert-for-tf2 layer parameters.
bc = StockBertConfig.from_json_string(config_json)
bert_params = map_stock_config_to_params(bc)
# NOTE(review): presumably None disables adapter layers - confirm against the
# bert-for-tf2 documentation.
bert_params.adapter_size = None
# NOTE: this rebinds the name `bert`, which until now referred to the imported
# `bert` module; from here on `bert` is the layer instance.
bert = BertModelLayer.from_params(bert_params, name="bert")

In [None]:
# Symbolic model input: one row of 200 int32 token ids per example, matching
# the fixed length produced by preprocess_input_2.
input_ids = keras.layers.Input(shape=(200,), dtype='int32', name="input_ids")

In [None]:
# Run the BERT layer on the symbolic input; `bert` here is the BertModelLayer
# instance created above, not the imported module.
bert_output = bert(input_ids)

In [None]:
# Show the symbolic output shape of the BERT layer.
print("bert shape", bert_output.shape)

bert shape (None, 200, 768)


In [None]:
# Classification head. Position 0 of the sequence is where preprocessing
# places the [CLS] token, so its vector is used as the summary of the review.
first_token = keras.layers.Lambda(lambda seq: seq[:, 0, :])(bert_output)
cls_out = keras.layers.Dropout(0.5)(first_token)

# Hidden projection with tanh, followed by a second dropout.
hidden = keras.layers.Dense(units=768, activation="tanh")(cls_out)
hidden = keras.layers.Dropout(0.5)(hidden)

# Two-way output. Despite the variable name, the softmax activation means
# this tensor holds class probabilities rather than raw logits.
logits = keras.layers.Dense(units=2, activation="softmax")(hidden)

In [None]:
# Assemble the end-to-end model: token ids in, two-class softmax output out.
model = keras.Model(inputs=input_ids, outputs=logits)

In [None]:
# Build with a (batch, 200) input shape.
# NOTE(review): the model was created from an Input with a fixed shape, so
# this call may be redundant - confirm before removing.
model.build(input_shape=(None, 200))

In [None]:
# Fine-tune with a small learning rate (1e-5).
# The model's last Dense layer uses activation="softmax", so the loss
# receives probabilities. from_logits must therefore be False; with True the
# loss would apply softmax a second time to already-normalized outputs,
# which distorts the loss and flattens the gradients.
model.compile(
  optimizer=keras.optimizers.Adam(1e-5),
  loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False),
  metrics=[keras.metrics.SparseCategoricalAccuracy(name="acc")]
)

In [None]:
# Stack the per-row id lists into a 2-D array and shift the labels from
# {1, 2} to {0, 1}, as sparse categorical losses expect 0-based class ids.
train_features = np.array(train_processed['text'].tolist())
train_labels = train_processed['intent'].to_numpy() - 1

# Hold out 2% of the rows for validation and train for three epochs.
model.fit(
  x=train_features,
  y=train_labels,
  validation_split=0.02,
  batch_size=16,
  shuffle=True,
  epochs=3,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7fc1e3f177f0>

# Prediction

In [None]:
# Prepare inputs and 0-based labels for both splits.
train_inputs = np.array(train_processed['text'].tolist())
train_targets = train_processed['intent'].to_numpy() - 1
test_inputs = np.array(test_processed['text'].tolist())
test_targets = test_processed['intent'].to_numpy() - 1

# evaluate() returns (loss, acc); only the accuracy is reported.
_, train_acc = model.evaluate(train_inputs, train_targets)
_, test_acc = model.evaluate(test_inputs, test_targets)

print("train acc", train_acc)
print("test acc", test_acc)

train acc 0.9012600183486938
test acc 0.9599999785423279


# Scratch: unused code dump

In [None]:
# Disabled scratch code: the entire cell is one triple-quoted string literal,
# so nothing below is executed. It sketches an alternative preprocessing
# class (tokenize, add [CLS]/[SEP], pad to a common length) that the notebook
# does not use.
'''class IntentDetectionData:
  DATA_COLUMN = "text"
  LABEL_COLUMN = "intent"
  def __init__(
    self,
    train,
    test,
    tokenizer: FullTokenizer,
    classes,
    max_seq_len=192
  ):
    self.tokenizer = tokenizer
    self.max_seq_len = 0
    self.classes = classes
    ((self.train_x, self.train_y), (self.test_x, self.test_y)) =\
     map(self._prepare, [train, test])
    print("max seq_len", self.max_seq_len)
    self.max_seq_len = min(self.max_seq_len, max_seq_len)
    self.train_x, self.test_x = map(
      self._pad,
      [self.train_x, self.test_x]
    )
  def _prepare(self, df):
    x, y = [], []
    for _, row in tqdm(df.iterrows()):
      text, label =\
       row[IntentDetectionData.DATA_COLUMN], \
       row[IntentDetectionData.LABEL_COLUMN]
      tokens = self.tokenizer.tokenize(text)
      tokens = ["[CLS]"] + tokens + ["[SEP]"]
      token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
      self.max_seq_len = max(self.max_seq_len, len(token_ids))
      x.append(token_ids)
      y.append(self.classes.index(label))
    return np.array(x), np.array(y)
  def _pad(self, ids):
    x = []
    for input_ids in ids:
      input_ids = input_ids[:min(len(input_ids), self.max_seq_len - 2)]
      input_ids = input_ids + [0] * (self.max_seq_len - len(input_ids))
      x.append(np.array(input_ids))
    return np.array(x)
    
    
  classes = train.intent.unique().tolist()
data = IntentDetectionData(
  train.iloc[:100,:],
  test.iloc[:100,:],
  tokenizer,
  classes,
  max_seq_len=128
)  
 '''