In [None]:
#Install bert package for tensorflow v1
!pip install bert-tensorflow==1.0.1
import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization

from datetime import datetime
import keras
from keras import layers
from keras.callbacks import ReduceLROnPlateau
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from tqdm.notebook import tqdm #adds progress bars to show loop status
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

class GenerateBertFeatures(object):

    """Convert raw text data into the feature format consumed by BERT.

    BERT cannot consume plain text directly, so the input texts are first
    wrapped into input examples and then converted into input features.

    Order of execution from top to bottom -
    GetFeatures -> CreateTokenizerFromHubModule -> GetInputExamples

    The constructor arguments are only stored; the methods below read
    `data.train_df`, `data.test_df`, `config.labelList` and
    `config.maxSeqLength` from them.
    """

    # Names of the DataFrame columns holding the input text and its label.
    # Kept as class attributes so a subclass can override them.
    DATA_COLUMN = "text"
    LABEL_COLUMN = "label"

    def __init__(self, data, config):
        """Store the data holder and the configuration object."""
        self.config = config
        self.data = data

    def _ToInputExamples(self, df):
        """Wrap every row of `df` into a bert.run_classifier.InputExample.

        Returns the result of the row-wise `DataFrame.apply`, i.e. one
        InputExample per row of `df`.
        """

        def row_to_example(row):
            # Single-sequence task: only text_a is filled in, and no guid
            # is assigned.
            return bert.run_classifier.InputExample(
                guid=None,
                text_a=row[self.DATA_COLUMN],
                text_b=None,
                label=row[self.LABEL_COLUMN])

        return df.apply(row_to_example, axis=1)

    def _BuildInputFn(self, features, is_training):
        """Build an input function over `features`.

        drop_remainder is always False here; the original note says it
        should be True when using TPUs.
        """
        return bert.run_classifier.input_fn_builder(
            features=features,
            seq_length=self.config.maxSeqLength,
            is_training=is_training,
            drop_remainder=False)

    def GetInputExamples(self):

        """
        Convert the input texts into input examples, each of which consists
        of 4 different entities:
        guid: Unique id for the example (set to None here).
        text_a: String data. The untokenized text of the first sequence. For single sequence tasks, only this sequence must be specified.
        text_b: (Optional) string data. The untokenized text of the second sequence. Only must be specified for sequence pair tasks (set to None here).
        label: String data. The label of the example. This should be specified for train and evaluate examples, but not for test examples.

        Returns a (train examples, validation examples) tuple. The training
        examples are built from `self.data.train_df` and the validation
        examples from `self.data.test_df`.
        """

        train_InputExamples = self._ToInputExamples(self.data.train_df)
        val_InputExamples = self._ToInputExamples(self.data.test_df)

        return train_InputExamples, val_InputExamples

    def CreateTokenizerFromHubModule(self, BERT_MODEL_HUB):

        """Get the vocab file and casing info from the Hub module.

        BERT_MODEL_HUB is passed to `hub.Module` as the module to load.
        Returns a bert.tokenization.FullTokenizer built from the module's
        "tokenization_info" signature.
        """

        # The module is loaded inside a throwaway graph and session that are
        # only used to evaluate the two tokenization tensors.
        with tf.Graph().as_default():
            bert_module = hub.Module(BERT_MODEL_HUB)
            tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
            with tf.Session() as sess:
                vocab_file, do_lower_case = sess.run(
                    [tokenization_info["vocab_file"],
                     tokenization_info["do_lower_case"]])

        return bert.tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)

    def GetFeatures(self, BERT_MODEL_HUB):

        """Convert the input examples into BERT features and input functions.

        Uses the tokenizer from CreateTokenizerFromHubModule() and the
        examples from GetInputExamples(). BERT_MODEL_HUB is forwarded to
        CreateTokenizerFromHubModule().

        Nothing is returned; the results are stored on the instance as
        `tokenizer`, `train_features`, `val_features`, `train_input_fn`
        and `val_input_fn`.
        """

        self.tokenizer = self.CreateTokenizerFromHubModule(BERT_MODEL_HUB)
        train_InputExamples, val_InputExamples = self.GetInputExamples()

        self.train_features = bert.run_classifier.convert_examples_to_features(
            train_InputExamples, self.config.labelList, self.config.maxSeqLength, self.tokenizer)
        self.val_features = bert.run_classifier.convert_examples_to_features(
            val_InputExamples, self.config.labelList, self.config.maxSeqLength, self.tokenizer)

        # Input function for training.
        self.train_input_fn = self._BuildInputFn(self.train_features, is_training=True)

        # Input function for validating.
        self.val_input_fn = self._BuildInputFn(self.val_features, is_training=False)