In [None]:
import tensorflow as tf
import pandas as pd
import tensorflow_hub as hub
import bert
import os
import re
import numpy as np
from tqdm import tqdm
from tqdm import tqdm_notebook
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Input, Dense, Embedding, Activation, LSTM, SimpleRNN, Dropout
from tensorflow.keras.models import Sequential, Model


In [None]:
class LoadData():
    """Loads a two-column review CSV and splits it into two labelled frames.

    Attributes:
        df: the raw frame read from ``csv_file``.
        train_df: rows whose sentiment is 'positive', with ``polarity`` 1
            (``None`` until ``load_data`` is called).
        test_df: rows whose sentiment is 'negative', with ``polarity`` 0
            (``None`` until ``load_data`` is called).
    """

    def __init__(self,csv_file):
        """Read ``csv_file``, resolved relative to the current working directory."""
        self.df = pd.read_csv(os.path.join(os.getcwd(),csv_file))
        self.train_df = None
        self.test_df = None

    def load_data(self):
        """Rename the columns and populate ``train_df`` / ``test_df``.

        The CSV must have exactly two columns; they are renamed to
        'sentence' and 'sentiment'. ``self.df`` itself never receives the
        'polarity' column.
        """
        self.df.columns = ['sentence','sentiment']

        # NOTE(review): the split is by class, so train_df holds only positive
        # rows and test_df only negative rows. A classifier trained on
        # train_df alone sees a single label - confirm this is intended.
        positive_rows = self.df['sentiment'] == 'positive'
        negative_rows = self.df['sentiment'] == 'negative'

        # Take explicit copies so the column assignment below writes to an
        # independent frame instead of a slice of self.df (which triggers
        # pandas' SettingWithCopyWarning).
        self.train_df = self.df[positive_rows].copy()
        self.test_df = self.df[negative_rows].copy()

        # Labels are stored as floats (1.0 / 0.0).
        self.train_df['polarity'] = 1.0
        self.test_df['polarity'] = 0.0


In [None]:
# Read the review CSV (path is resolved against the current working directory
# inside LoadData.__init__) and build the train/test frames.
# `loaddata_obj` is a notebook-level global that later cells read.
loaddata_obj = LoadData("imdb_dataset_small.csv") 
loaddata_obj.load_data()

In [None]:
# Preview the first rows of the training frame (the rows whose sentiment is 'positive').
loaddata_obj.train_df.head()

In [None]:
# Preview the first rows of the test frame (the rows whose sentiment is 'negative').
loaddata_obj.test_df.head()

In [None]:
class BertModel(object):
    """Wraps a TF-Hub BERT encoder and its tokenizer, and builds model inputs.

    Attributes:
        max_len: fixed length of every encoded sequence, including the
            "[CLS]" and "[SEP]" markers.
        bert_module: the trainable ``hub.KerasLayer`` loaded from TF-Hub.
        vocab_file: vocabulary asset path read from the hub module.
        do_lower_case: lower-casing flag read from the hub module.
        tokenizer: ``FullTokenizer`` built from the two values above.
    """

    def __init__(self):
        """Load the hub module and build the matching tokenizer."""
        self.max_len = 128
        bert_path = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1"
        FullTokenizer = bert.bert_tokenization.FullTokenizer

        self.bert_module = hub.KerasLayer(bert_path, trainable=True)

        # The tokenizer must use the vocabulary and casing of the loaded module.
        self.vocab_file = self.bert_module.resolved_object.vocab_file.asset_path.numpy()
        self.do_lower_case = self.bert_module.resolved_object.do_lower_case.numpy()

        self.tokenizer = FullTokenizer(self.vocab_file, self.do_lower_case)

    def get_masks(self, tokens, max_seq_length):
        """Attention mask: 1 for every token, 0 for the padding up to max_seq_length."""
        return [1] * len(tokens) + [0] * (max_seq_length - len(tokens))

    def get_segments(self, tokens, max_seq_length):
        """Segments: 0 for the first sequence, 1 for the second"""
        segments = []
        current_segment_id = 0
        for token in tokens:
            segments.append(current_segment_id)
            # The "[SEP]" itself still belongs to the segment it closes.
            if token == "[SEP]":
                current_segment_id = 1
        return segments + [0] * (max_seq_length - len(tokens))

    def get_ids(self, tokens, tokenizer, max_seq_length):
        """Token ids from Tokenizer vocab"""
        token_ids = tokenizer.convert_tokens_to_ids(tokens)
        input_ids = token_ids + [0] * (max_seq_length - len(token_ids))
        return input_ids

    def create_single_input(self, sentence, maxlen):
        """Encode one sentence as (ids, masks, segments), each of length max_len.

        Args:
            sentence: raw text to tokenize.
            maxlen: maximum number of word-piece tokens to keep, not counting
                "[CLS]" and "[SEP]". Values above ``max_len - 2`` are clamped
                so the result never exceeds ``max_len`` entries.

        Returns:
            A tuple ``(ids, masks, segments)`` of equal-length lists.
        """
        # Reserve two positions for the [CLS]/[SEP] markers added below.
        # Without the clamp an oversized maxlen gives rows longer than
        # max_len, which cannot be stacked with the other rows.
        token_budget = max(0, min(maxlen, self.max_len - 2))

        stokens = self.tokenizer.tokenize(sentence)
        stokens = stokens[:token_budget]
        stokens = ["[CLS]"] + stokens + ["[SEP]"]

        ids = self.get_ids(stokens, self.tokenizer, self.max_len)
        masks = self.get_masks(stokens, self.max_len)
        segments = self.get_segments(stokens, self.max_len)

        return ids, masks, segments

    def create_input_array(self, sentences):
        """Encode many sentences into three int32 arrays.

        Returns:
            A list ``[ids, masks, segments]``; each array has one row per
            sentence and ``max_len`` columns.
        """
        input_ids, input_masks, input_segments = [], [], []

        for sentence in tqdm(sentences, position=0, leave=True):
            ids, masks, segments = self.create_single_input(sentence, self.max_len - 2)

            input_ids.append(ids)
            input_masks.append(masks)
            input_segments.append(segments)

        tensor = [np.asarray(input_ids, dtype=np.int32),
                  np.asarray(input_masks, dtype=np.int32),
                  np.asarray(input_segments, dtype=np.int32)]
        return tensor

In [None]:
class PreprocessingBertData():
    """Turns raw sentences and labels into arrays ready for model training."""

    def prepare_data_x(self,train_sentences):
        """Encode sentences with the notebook-level ``bert_model_obj``."""
        return bert_model_obj.create_input_array(train_sentences)

    def prepare_data_y(self,train_labels):
        """Collect the labels, in order, into a numpy array."""
        collected = [label for label in train_labels]
        return np.array(collected)
        

In [None]:
class BertFineTune(tf.keras.layers.Layer):
    """Keras layer around a hub BERT module that fine-tunes only its top layers.

    Args:
        n_fine_tune_layers: how many encoder layers, counted down from
            ``encoder/layer_11``, are left trainable.
        pooling: "first" returns the module's ``pooled_output``; "mean"
            returns the mask-weighted mean of ``sequence_output``.
        bert_path: TF-Hub handle of the BERT module.
        **kwargs: forwarded to ``tf.keras.layers.Layer`` (including
            ``trainable``, which is passed on to the hub module in ``build``).

    Raises:
        NameError: if ``pooling`` is neither "first" nor "mean".

    NOTE(review): this layer uses ``hub.Module`` while the rest of the
    notebook uses ``hub.KerasLayer`` - confirm it works with the installed
    TensorFlow / tensorflow_hub versions.
    """

    def __init__(
        self,
        n_fine_tune_layers=10,
        pooling="first",
        bert_path="https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1",
        **kwargs,
    ):
        if pooling not in ["first", "mean"]:
            raise NameError(
                f"Undefined pooling type (must be either first or mean, but is {pooling})"
            )

        # Initialise the Keras base class before setting attributes. The
        # original called super(BertLayer, self), a class that does not
        # exist, so construction always failed with NameError.
        # `trainable` is left to Layer.__init__ (default True, or the value
        # passed in kwargs), which is the value the original ended up with.
        super().__init__(**kwargs)

        self.n_fine_tune_layers = n_fine_tune_layers
        self.output_size = 768
        self.pooling = pooling
        self.bert_path = bert_path

    def build(self, input_shape):
        """Load the hub module and register trainable / frozen variables."""
        self.bert = hub.Module(self.bert_path, trainable=self.trainable, name=f"{self.name}_module")

        # Remove unused layers
        trainable_vars = self.bert.variables
        if self.pooling == "first":
            trainable_vars = [var for var in trainable_vars if "/cls/" not in var.name]
            trainable_layers = ["pooler/dense"]

        elif self.pooling == "mean":
            # Mean pooling does not use the pooler head, so drop it as well.
            trainable_vars = [
                var
                for var in trainable_vars
                if "/cls/" not in var.name and "/pooler/" not in var.name
            ]
            trainable_layers = []
        else:
            raise NameError(
                f"Undefined pooling type (must be either first or mean, but is {self.pooling})"
            )

        # Select how many layers to fine tune (top-most encoder layers first)
        for i in range(self.n_fine_tune_layers):
            trainable_layers.append(f"encoder/layer_{str(11 - i)}")

        # Update trainable vars to contain only the specified layers
        trainable_vars = [
            var
            for var in trainable_vars
            if any(layer_name in var.name for layer_name in trainable_layers)
        ]

        # Add to trainable weights
        for var in trainable_vars:
            self._trainable_weights.append(var)

        # Everything else in the module is kept frozen.
        for var in self.bert.variables:
            if var not in self._trainable_weights:
                self._non_trainable_weights.append(var)

        super().build(input_shape)

    def call(self, inputs):
        """Run BERT on ``[input_ids, input_mask, segment_ids]`` and pool the output."""
        inputs = [K.cast(x, dtype="int32") for x in inputs]
        input_ids, input_mask, segment_ids = inputs
        bert_inputs = dict(input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids)

        if self.pooling == "first":
            pooled = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)["pooled_output"]
        elif self.pooling == "mean":
            result = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)["sequence_output"]

            # Zero out padded positions, then divide by the number of real
            # tokens; the epsilon guards against an all-zero mask.
            mul_mask = lambda x, m: x * tf.expand_dims(m, axis=-1)
            masked_reduce_mean = lambda x, m: tf.reduce_sum(mul_mask(x, m), axis=1) / (
                    tf.reduce_sum(m, axis=1, keepdims=True) + 1e-10)
            input_mask = tf.cast(input_mask, tf.float32)
            pooled = masked_reduce_mean(result, input_mask)
        else:
            raise NameError(f"Undefined pooling type (must be either first or mean, but is {self.pooling})")

        return pooled

    def compute_output_shape(self, input_shape):
        """Output is ``(batch, output_size)``."""
        return (input_shape[0], self.output_size)

In [None]:
# Build the shared BERT wrapper (hub module + tokenizer) and the preprocessing helper.
# PreprocessingBertData.prepare_data_x reads the global `bert_model_obj`, so this
# cell must run before any preprocessing call.
bert_model_obj = BertModel()
preprocess_bert_data_obj = PreprocessingBertData()

In [None]:
# Pull the review texts and their numeric labels out of the training frame.
# NOTE: train_df holds only the rows whose sentiment is 'positive'
# (see LoadData.load_data), so every label collected here is 1.
train_sentences = loaddata_obj.train_df["sentence"].tolist()
train_labels = loaddata_obj.train_df["polarity"].tolist()

In [None]:
# Encode the sentences into BERT inputs and convert the labels to a numpy array.
x = preprocess_bert_data_obj.prepare_data_x(train_sentences)
y = preprocess_bert_data_obj.prepare_data_y(train_labels)

# x is the list [ids, masks, segments] returned by BertModel.create_input_array.
# These four names are read as globals by DesignModel.__init__.
# Note that `train_labels` is rebound here from a Python list to a numpy array.
train_input_ids, train_input_masks, train_segment_ids = x
train_labels = y

In [None]:
class DesignModel():
    """Builds, trains and saves the BERT-based binary sentiment classifier.

    The training arrays are taken from the notebook-level globals
    ``train_input_ids``, ``train_input_masks``, ``train_segment_ids`` and
    ``train_labels`` at construction time.
    """

    def __init__(self):
        self.model = None
        self.train_data = [train_input_ids, train_input_masks, train_segment_ids]
        self.train_labels = train_labels

    def bert_model(self,max_seq_length,learning_rate=None):
        """Assemble and compile the classifier, storing it in ``self.model``.

        Args:
            max_seq_length: length of each of the three integer inputs.
            learning_rate: optional Adam learning rate. ``None`` (default)
                keeps the previous behaviour of compiling with the string
                'adam', i.e. Keras' default Adam settings.
        """
        in_id = Input(shape=(max_seq_length,), dtype=tf.int32, name="input_ids")
        in_mask = Input(shape=(max_seq_length,), dtype=tf.int32, name="input_masks")
        in_segment = Input(shape=(max_seq_length,), dtype=tf.int32, name="segment_ids")

        bert_inputs = [in_id, in_mask, in_segment]
        # Only sequence_output is used; pooled_output is discarded.
        pooled_output, sequence_output = bert_model_obj.bert_module(bert_inputs)

        x = tf.keras.layers.GlobalAveragePooling1D()(sequence_output)
        x = tf.keras.layers.Dropout(0.2)(x)
        out = tf.keras.layers.Dense(1, activation="sigmoid", name="dense_output")(x)
        self.model = tf.keras.models.Model(inputs=bert_inputs, outputs=out)

        # NOTE(review): the hub layer is created with trainable=True, so the
        # whole encoder is fine-tuned; pass a small learning_rate if the
        # default Adam step proves too large for that.
        if learning_rate is None:
            optimizer = 'adam'
        else:
            optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

        self.model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

        self.model.summary()

    def model_train(self,batch_size,num_epoch):
        """Fit ``self.model`` on the stored training data with a 20% validation split."""
        print("Fitting to model")

        self.model.fit(self.train_data,self.train_labels,epochs=num_epoch,batch_size=batch_size,validation_split=0.2,shuffle=True)

        print("Model Training complete.")

    def save_model(self,model,model_name):
        """Save ``model`` to ``<model_name>.h5``.

        Args:
            model: the Keras model to save. If ``None``, ``self.model`` is
                saved instead. (The argument used to be ignored.)
            model_name: output file name without the ".h5" extension.
        """
        target = self.model if model is None else model
        output_path = model_name+".h5"
        target.save(output_path)
        # Report the real destination; the old message named a "Model folder"
        # that the code never wrote to.
        print("Model saved to " + output_path)

In [None]:
# Build the classifier for sequences of bert_model_obj.max_len tokens,
# then train it with batch size 32 for 1 epoch.
model_obj = DesignModel()
model_obj.bert_model(bert_model_obj.max_len)
model_obj.model_train(32,1)

In [None]:
# Persist the trained model; save_model appends ".h5", so this writes "bert.h5".
model_obj.save_model(model_obj.model,"bert")

In [None]:
class Evaluation():
    """Scoring helpers for comparing predicted labels with true labels."""

    def get_accuracy(self,actuals, predictions):
        """Return the fraction of positions where the two label sequences agree.

        Args:
            actuals: sequence of true labels.
            predictions: sequence of predicted labels, same length.

        Returns:
            Accuracy as a float between 0.0 and 1.0.

        Raises:
            ValueError: if the sequences differ in length or are empty.
        """
        # Computed with numpy: the accuracy_score name used previously was
        # never imported in this notebook, so the call raised NameError.
        truth = np.asarray(actuals).ravel()
        guess = np.asarray(predictions).ravel()
        if truth.shape != guess.shape:
            raise ValueError(
                "actuals and predictions differ in length: {} vs {}".format(truth.size, guess.size)
            )
        if truth.size == 0:
            raise ValueError("cannot compute accuracy of empty inputs")
        acc = float(np.mean(truth == guess))
        return acc

In [None]:
class Prediction():
    """Runs the trained sentiment classifier on held-out reviews or a single query.

    Args:
        threshold: probability at or above which the sigmoid output counts
            as the positive class (polarity 1). Defaults to 0.5.
    """

    def __init__(self, threshold=0.5):
        self.model = model_obj.model
        self.threshold = threshold

    def _to_labels(self, probabilities):
        """Convert sigmoid outputs of any shape to a flat array of 0/1 labels."""
        # The model has a single sigmoid unit, so argmax over the last axis
        # would always give 0; threshold the probability instead.
        flat = np.asarray(probabilities).reshape(-1)
        return (flat >= self.threshold).astype(int)

    def predict_validation(self):
        """Predict labels for ``loaddata_obj.test_df``.

        Returns:
            ``(valid_labels, prediction_labels)``: the stored polarity
            values and the predicted 0/1 labels, both as lists.
        """
        # The held-out data lives in loaddata_obj.test_df, under the
        # 'sentence' / 'polarity' columns created by LoadData.load_data.
        valid_sentences = loaddata_obj.test_df["sentence"].tolist()
        valid_labels = loaddata_obj.test_df["polarity"].tolist()

        preprocess_bert_data_obj = PreprocessingBertData()
        val_x = preprocess_bert_data_obj.prepare_data_x(valid_sentences)
        prediction_labels = self._to_labels(self.model.predict(val_x)).tolist()
        return valid_labels,prediction_labels

    def predict(self,query):
        """Classify one piece of text; returns 'positive' or 'negative'."""
        query_seq = bert_model_obj.create_input_array([query])
        pred = self._to_labels(self.model.predict(query_seq))[0]
        # Same strings as the dataset's sentiment column; polarity 1 is 'positive'.
        result = "positive" if pred == 1 else "negative"
        return result

In [None]:
# Prediction.__init__ captures model_obj.model, so the model must already be built.
pred_obj = Prediction()

In [None]:
# Score the model: compare the true labels with the predicted labels
# returned by predict_validation.
eval_obj = Evaluation()
ytest,ypred = pred_obj.predict_validation()
acc = eval_obj.get_accuracy(ytest,ypred)
# The value is classification accuracy, so label it as such (it was printed as "Auc").
print("Accuracy: {:.2%}".format(acc))