In [None]:
#Install bert package for tensorflow v1
!pip install bert-tensorflow==1.0.1
import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization

from datetime import datetime
import keras
from keras import layers
from keras.callbacks import ReduceLROnPlateau
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from tqdm.notebook import tqdm #adds progress bars to show loop status
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

class Generator(object):
    """Endless generators of padded, fixed-width batches for an LSTM model.

    An LSTM takes inputs of fixed width only, but each report is represented
    by a variable number of embedding rows (different reports have different
    numbers of words and therefore different numbers of text splits).  Every
    generator below walks the data frame in consecutive batches, finds the
    longest sequence *inside the current batch*, and pads the other sequences
    of that batch up to that length with ``PAD_VALUE``.  All sequences within
    one batch therefore share the same length, while different batches may
    have different lengths.

    The data frame passed to each generator must provide:

    * an ``emb`` column whose entries are sequences of rows, each row being
      ``NUM_FEATURES`` wide;
    * a ``label`` column holding one scalar label per sequence.

    ``config.training`` must provide ``batch_size_<split>`` and
    ``batches_per_epoch_<split>`` for ``<split>`` in ``train``, ``val`` and
    ``test``.
    """

    # Width of a single embedding row.
    # NOTE(review): presumably the hidden size of the BERT model that produced
    # the embeddings -- confirm against the embedding code.
    NUM_FEATURES = 768

    # Filler written into the padded time steps.
    # NOTE(review): presumably matched by a masking layer in the model --
    # confirm against the model definition.
    PAD_VALUE = -99.

    def __init__(self, config):
        self.config = config

    def _padded_batches(self, df, batch_size, batches_per_epoch):
        """Yield ``(x, y)`` batches from *df* forever, padding per batch.

        Args:
            df: data frame with ``emb`` and ``label`` columns.
            batch_size: number of sequences per yielded batch.
            batches_per_epoch: number of batches that make up one full pass.

        Yields:
            Tuples ``(x, y)`` where ``x`` has shape
            ``(batch_size, longest_sequence_in_batch, NUM_FEATURES)`` and
            ``y`` has shape ``(batch_size, 1)``.  After the last batch the
            iteration restarts from the first one.

        Raises:
            AssertionError: on first iteration, if
                ``batch_size * batches_per_epoch`` differs from the number of
                sequences in *df*.
        """
        # Materialise the columns once; they are reused for every batch.
        sequences = df['emb'].to_list()
        labels = df.label.to_list()

        # Make sure that all input data passes through the generator.  Raised
        # explicitly (rather than via ``assert``) so the check survives
        # ``python -O`` while keeping the exception type unchanged.
        if batch_size * batches_per_epoch != len(sequences):
            raise AssertionError(
                "batch_size * batches_per_epoch (%d * %d) must equal the "
                "number of sequences (%d)"
                % (batch_size, batches_per_epoch, len(sequences))
            )

        # Nothing to yield: stop instead of spinning forever in the loop below.
        if not sequences:
            return

        while True:
            for b in range(batches_per_epoch):
                start = b * batch_size
                stop = start + batch_size
                batch_sequences = sequences[start:stop]
                batch_labels = labels[start:stop]

                # Pad only up to the longest sequence of *this* batch.
                timesteps = max(len(seq) for seq in batch_sequences)
                x_batch = np.full(
                    (batch_size, timesteps, self.NUM_FEATURES), self.PAD_VALUE
                )
                y_batch = np.zeros((batch_size, 1))

                for i, (seq, label) in enumerate(
                        zip(batch_sequences, batch_labels)):
                    x_batch[i, 0:len(seq), :] = seq
                    y_batch[i] = label
                yield x_batch, y_batch

    def train_generator(self, df):
        """Yield padded training batches from *df* forever.

        Uses ``config.training.batch_size_train`` and
        ``config.training.batches_per_epoch_train``; their product must equal
        the number of rows in *df*, otherwise ``AssertionError`` is raised on
        the first iteration.
        """
        training = self.config.training
        yield from self._padded_batches(
            df, training.batch_size_train, training.batches_per_epoch_train
        )

    def val_generator(self, df):
        """Yield padded validation batches from *df* forever.

        Uses ``config.training.batch_size_val`` and
        ``config.training.batches_per_epoch_val``; their product must equal
        the number of rows in *df*, otherwise ``AssertionError`` is raised on
        the first iteration.
        """
        training = self.config.training
        yield from self._padded_batches(
            df, training.batch_size_val, training.batches_per_epoch_val
        )

    def test_generator(self, df):
        """Yield padded test batches from *df* forever.

        Uses ``config.training.batch_size_test`` and
        ``config.training.batches_per_epoch_test``; their product must equal
        the number of rows in *df*, otherwise ``AssertionError`` is raised on
        the first iteration.
        """
        training = self.config.training
        yield from self._padded_batches(
            df, training.batch_size_test, training.batches_per_epoch_test
        )