In [None]:
import os
import math
import datetime
from tqdm import tqdm
import pandas as pd
import numpy as np
import tensorflow as tf
#print(tf._version_)
import tensorflow  as keras

import bert as bert
from bert import BertModelLayer
from bert.loader import StockBertConfig, map_stock_config_to_params, load_stock_weights
from bert.tokenization.bert_tokenization import FullTokenizer
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
from matplotlib import rc
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# combined.csv is the only data file available, so read it once and carve a
# hold-out set out of it. Loading the same file as both `train` and `test`
# would make every test-set number a measurement on training rows.
combined = pd.read_csv('./sample_data/combined.csv')

# Stratified 20% hold-out: sampling per label keeps the class mix similar in
# both splits and always leaves at least one row of every label in `train`
# (round(0.2 * n) < n for any n >= 1). The fixed seed keeps the split
# reproducible across Restart & Run All.
test = combined.groupby('Column3', group_keys=False).sample(frac=0.2, random_state=42)
train = combined.drop(test.index)

In [None]:
# Quick sanity check: (rows, columns) of the training frame.
train.shape

In [None]:
# Preview the first rows to see which columns hold the text and the label.
train.head()

In [None]:
# Class balance of the label column. The Series is passed as `x=` explicitly:
# recent seaborn releases interpret the first positional argument as `data`,
# so the positional form no longer plots one bar per label.
chart = sns.countplot(x=train.Column3)
plt.title("Number of examples per Categories")
# Tilt the category names so long labels do not overlap; the trailing `;`
# suppresses the list-of-Text repr in the cell output.
chart.set_xticklabels(chart.get_xticklabels(),rotation = 30,horizontalalignment='right');

In [None]:
!wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
!unzip uncased_L-12_H-768_A-12.zip

In [None]:
# Directory that will hold the extracted checkpoint; exist_ok makes the cell safe to re-run.
os.makedirs('model',exist_ok=True)

In [None]:
!mv uncased_L-12_H-768_A-12/ model

In [1]:
# Locations of the pre-trained BERT files extracted under model/.
bert_model_name = 'uncased_L-12_H-768_A-12'
bert_ckpt_dir = os.path.join('model/',bert_model_name)
# TensorFlow checkpoints are addressed by their prefix ("bert_model.ckpt"),
# not by the ".index" shard; the previous value also misspelled "ckpt".
bert_ckpt_file = os.path.join(bert_ckpt_dir,'bert_model.ckpt')
bert_config_file = os.path.join(bert_ckpt_dir,'bert_config.json')

bert_ckpt_dir

'model/uncased_L-12_H-768_A-12'

In [None]:
# Same preview as earlier in the notebook, repeated here as a reminder of the
# column layout right before the preprocessing class.
train.head()

In [None]:
class sentiment:
  """Turn text/label data frames into fixed-length token-id arrays.

  Each row's text is tokenised, wrapped in "[CLS]" ... "[SEP]", converted to
  ids and then padded with 0 / truncated to a common length. Labels are
  encoded as their position in `classes`.

  Attributes set by __init__:
    train_x, test_x: 2-D arrays of token ids, one row per example.
    train_y, test_y: 1-D arrays of label indices into `classes`.
    max_seq_len: common sequence length = min(longest tokenised example
      seen in train/test, the `max_seq_len` argument).
  """
  data_column = 'Column2'    # column read as the input text
  label_column = 'Column3'   # column read as the class label

  def __init__(self,train,test,tokenizer: FullTokenizer,classes, max_seq_len = 192):
    """Tokenise and pad both frames.

    Args:
      train, test: data frames containing `data_column` and `label_column`.
      tokenizer: object providing `tokenize` and `convert_tokens_to_ids`.
      classes: list of labels; a label's index in this list is its target id.
      max_seq_len: upper bound for the padded sequence length.

    Raises:
      ValueError: if a row's label is not present in `classes`.
    """
    self.tokenizer = tokenizer
    self.max_seq_len = 0  # grows to the longest tokenised example in _prepare
    self.classes = classes

    (self.train_x, self.train_y), (self.test_x, self.test_y) = \
      map(self._prepare, [train, test])

    # Never pad beyond what the data needs, never exceed the caller's cap.
    self.max_seq_len = min(self.max_seq_len, max_seq_len)
    self.train_x, self.test_x = map(self._pad, [self.train_x, self.test_x])


  def _prepare(self,df):
    """Tokenise every row of `df`.

    Returns:
      (x, y): `x` is a list of variable-length token-id lists, `y` an array
      of label indices. `x` is deliberately left as a plain list: building a
      NumPy array from ragged rows raises on recent NumPy versions, and
      `_pad` turns it into a rectangular array afterwards.
    """
    x, y = [], []
    texts = df[sentiment.data_column]
    labels = df[sentiment.label_column]

    # Iterating the two columns avoids building a Series per row (iterrows),
    # and `total` lets tqdm show real progress.
    for text, label in tqdm(zip(texts, labels), total=len(df)):
      tokens = ["[CLS]"] + self.tokenizer.tokenize(text) + ["[SEP]"]
      token_ids = self.tokenizer.convert_tokens_to_ids(tokens)

      self.max_seq_len = max(self.max_seq_len, len(token_ids))

      x.append(token_ids)
      y.append(self.classes.index(label))

    return x, np.array(y)


  def _pad(self,ids):
    """Pad with 0 / truncate every sequence to `self.max_seq_len`.

    Sequences that already fit are kept intact. Longer ones are cut to the
    limit while keeping their final id, which is "[SEP]" for sequences
    produced by `_prepare`. (Previously every sequence was cut to
    `max_seq_len - 2`, which dropped the trailing tokens, including the
    separator, even from sequences that fit.)

    Args:
      ids: iterable of token-id sequences (lists, tuples or arrays).

    Returns:
      2-D array with one row of length `self.max_seq_len` per sequence.
    """
    limit = self.max_seq_len
    x = []

    for input_ids in ids:
      input_ids = list(input_ids)  # accept tuples / array rows as well as lists
      if len(input_ids) > limit:
        # Keep the closing token; a non-positive limit leaves nothing to keep.
        input_ids = (input_ids[:limit - 1] + input_ids[-1:]) if limit > 0 else []
      input_ids = input_ids + [0] * (limit - len(input_ids))
      x.append(np.array(input_ids))

    return np.array(x)

In [None]:
# `tokenizer` was used throughout the notebook but never created, so a fresh
# kernel failed here with NameError. Build it from the vocabulary file that
# ships inside the downloaded checkpoint directory.
tokenizer = FullTokenizer(vocab_file=os.path.join(bert_ckpt_dir, "vocab.txt"))

# Example: WordPiece tokens for a short sentence.
tokenizer.tokenize("I Can't wait to go!")

In [None]:
# Same example sentence, taken one step further: tokens -> vocabulary ids.
tokens = tokenizer.tokenize("I Can't wait to go!")
tokenizer.convert_tokens_to_ids(tokens)

In [None]:
def create_model(max_seq_len, bert_ckpt_file):
  """Assemble the BERT classifier and load the pre-trained BERT weights.

  Reads the notebook-level `bert_config_file` for the BERT configuration and
  the notebook-level `classes` for the width of the output layer.

  Args:
    max_seq_len: length of the token-id sequences fed to the model.
    bert_ckpt_file: checkpoint path handed to `load_stock_weights`.

  Returns:
    A `tf.keras.Model` mapping int32 ids of shape (batch, max_seq_len) to
    softmax probabilities over `classes` (the last layer applies softmax,
    so the output is not raw logits despite the variable names).
  """
  # Only the JSON parsing needs the open file handle.
  with tf.io.gfile.GFile(bert_config_file, "r") as reader:
    stock_config = StockBertConfig.from_json_string(reader.read())

  layer_params = map_stock_config_to_params(stock_config)
  layer_params.adapter_size = None
  bert_layer = BertModelLayer.from_params(layer_params, name="bert")

  token_input = tf.keras.layers.Input(shape=(max_seq_len, ), dtype='int32', name="input_ids")
  sequence_output = bert_layer(token_input)
  print("bert shape", sequence_output.shape)

  # Classification head: take position 0 along the sequence axis (the "[CLS]"
  # slot when inputs start with that token), then dropout -> tanh dense ->
  # dropout -> softmax dense.
  first_token = tf.keras.layers.Lambda(lambda seq: seq[:, 0, :])(sequence_output)
  first_token = tf.keras.layers.Dropout(0.5)(first_token)
  hidden = tf.keras.layers.Dense(units=768, activation="tanh")(first_token)
  hidden = tf.keras.layers.Dropout(0.5)(hidden)
  class_probs = tf.keras.layers.Dense(units=len(classes), activation="softmax")(hidden)

  classifier = tf.keras.Model(inputs=token_input, outputs=class_probs)
  classifier.build(input_shape=(None, max_seq_len))

  # Weights are loaded after the build call above.
  load_stock_weights(bert_layer, bert_ckpt_file)
  return classifier

In [None]:
# Label vocabulary: every distinct value of the label column, in order of first appearance.
classes = train.Column3.unique().tolist()

# Cap sequences at 512 tokens. The BERT-base checkpoint used here only has 512
# position embeddings, so the previous cap of 768 (BERT's hidden size, not a
# sequence length) could produce inputs the model cannot accept.
data = sentiment(train,test,tokenizer,classes,max_seq_len=512)

In [None]:
# Build the classifier for the sequence length chosen by the dataset and load
# the pre-trained weights. The checkpoint prefix is derived from
# `bert_ckpt_dir` instead of repeating the directory name as a literal, so the
# model location is configured in exactly one place.
model = create_model(data.max_seq_len, os.path.join(bert_ckpt_dir, 'bert_model.ckpt'))

In [None]:
# summary() prints the layer table itself; the shape goes last because a
# notebook cell only displays its final expression (it was silently dropped
# when it came first).
model.summary()
data.train_x.shape

In [None]:
# The model's final Dense layer already applies softmax, so the loss must treat
# its output as probabilities (from_logits=False); with from_logits=True the
# softmax would effectively be applied twice and distort the loss/gradients.
# Metrics are taken from tf.keras explicitly: the notebook's `keras` name is
# an alias of the top-level tensorflow module, not of tf.keras.
model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy(name='acc')],
)

In [None]:
# One TensorBoard log directory per run, suffixed with the start time.
# "%S" is zero-padded seconds; the previous lowercase "%s" is a non-portable
# extension (epoch seconds on glibc, unsupported on other platforms).
log_dir = 'log/sentiment'+datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir)

# Fine-tune for 5 epochs, holding out 10% of the training arrays for validation.
history = model.fit(
    x=data.train_x,
    y=data.train_y,
    validation_split = 0.1,
    batch_size = 16,
    shuffle = True,
    epochs = 5,
    callbacks= [tensorboard_callback]
)

In [None]:
# Ad-hoc inference on a few raw sentences.
sentences = ['bad','i hate you']

# Same preprocessing as training: tokenise, wrap in [CLS] ... [SEP], map to ids.
tokens = [["[CLS]"] + tokenizer.tokenize(sentence) + ["[SEP]"] for sentence in sentences]
token_ids = [tokenizer.convert_tokens_to_ids(sentence_tokens) for sentence_tokens in tokens]

# Reuse the dataset's own padding so inference inputs always have exactly
# data.max_seq_len ids. The previous manual `[0] * (max_len - len)` padding
# never truncated, so a sentence longer than max_seq_len produced ragged input.
token_ids = data._pad(token_ids)

predictions = model.predict(token_ids).argmax(axis=-1)
for text,label in zip(sentences,predictions):
  print("text:",text)
  print("intent",classes[label])