In [None]:
#Install bert package for tensorflow v1
!pip install bert-tensorflow==1.0.1
import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization

from datetime import datetime
import keras
from keras import layers
from keras.callbacks import ReduceLROnPlateau
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from tqdm.notebook import tqdm #adds progress bars to show loop status
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

class Dataset:

    """Load report data from a csv file and prepare chunked train/test frames.

    The class reads the csv, drops rows without report text, label-encodes the
    'Industry' column, splits the rows into training and test sets, breaks every
    report into overlapping word chunks and finally flattens those chunks into
    one row per chunk.

    Call chain (each method calls the next one):
    GetTrainTestData -> ApplySplit -> TrainTestSplit -> LabelData -> ReadData
    ApplySplit additionally applies GetTextSplit to every report.

    The config object must expose: dataSource, testSize, maxSeqLength, overlap.
    """

    def __init__(self, config):
        self.config = config
        self._dataSource = config.dataSource
        self._testSize = config.testSize

    def ReadData(self, filePath=None):

        """Read data from a csv file and drop rows where report data is not present.

        filePath: csv location (anything pandas.read_csv accepts). When omitted,
        the data source configured on this instance is used.
        Returns the loaded DataFrame without the rows whose 'Data' is missing.
        """

        # Honour the argument; previously it was ignored in favour of the configured source.
        source = self._dataSource if filePath is None else filePath
        data = pd.read_csv(source)
        data.dropna(subset=["Data"], inplace=True)
        return data

    def LabelData(self):

        """Encode industry labels using label encoding.

        The fitted encoder is kept on self.LE so that the integer labels can be
        mapped back to the original industry names later.
        Returns the DataFrame with 'Industry' replaced by integer codes.
        """

        self.LE = LabelEncoder()
        trainData = self.ReadData(self._dataSource)
        trainData['Industry'] = self.LE.fit_transform(trainData['Industry'])
        return trainData

    def GetTextSplit(self, text1):

        """Split a text into overlapping chunks of whitespace-separated words.

        Every chunk holds at most config.maxSeqLength words and consecutive
        chunks share config.overlap words. The whole text is covered: the last
        chunk may be shorter than maxSeqLength. A text without any words yields
        a single empty chunk.

        text1: the text to split.
        Returns a list of chunk strings (words joined by single spaces).
        Raises ValueError if overlap is negative or not smaller than maxSeqLength.
        """

        maxSeqLength = self.config.maxSeqLength
        overlap = self.config.overlap
        if not 0 <= overlap < maxSeqLength:
            # A non-positive stride would never advance through the text.
            raise ValueError("overlap must satisfy 0 <= overlap < maxSeqLength")

        words = text1.split()  # tokenise once instead of on every iteration
        stride = maxSeqLength - overlap
        # A chunk is only started if it contributes at least one word that the
        # previous chunk did not already contain; always emit one chunk.
        stop = max(len(words) - overlap, 1)
        return [" ".join(words[start:start + maxSeqLength])
                for start in range(0, stop, stride)]

    def TrainTestSplit(self):

        """Split the labelled data into training and test datasets.

        The labelled frame is stored on self.trainData. The split uses the
        configured test size and a fixed random_state so it is reproducible.
        Returns (df_train, df_test).
        """

        self.trainData = self.LabelData()
        df_train, df_test = train_test_split(self.trainData, test_size=self._testSize, random_state=123)
        return df_train, df_test

    def ApplySplit(self):

        """Apply the text split to every report in the training and test datasets.

        Returns (df_train, df_test) where each frame has a new 'text_split'
        column holding the list of chunks of the report in that row.
        """

        df_train, df_test = self.TrainTestSplit()
        # Work on explicit copies: adding a column to the frames returned by the
        # split is chained assignment and can trigger SettingWithCopyWarning.
        df_train = df_train.copy()
        df_test = df_test.copy()
        df_train['text_split'] = df_train["Data"].apply(self.GetTextSplit)
        df_test['text_split'] = df_test["Data"].apply(self.GetTextSplit)
        return df_train, df_test

    @staticmethod
    def _FlattenChunks(frame):

        """Expand the 'text_split' lists of frame into parallel flat lists.

        Returns (texts, labels, indices): one entry per chunk, where labels and
        indices repeat the 'Industry' value and the index of the source row.
        """

        texts, labels, indices = [], [], []
        for idx, chunks, label in zip(frame.index, frame['text_split'], frame['Industry']):
            texts.extend(chunks)
            labels.extend([label] * len(chunks))
            indices.extend([idx] * len(chunks))
        return texts, labels, indices

    def GetTrainTestData(self):

        """Break the chunk lists into one row per chunk with its industry label.

        Sets on the instance:
        df_train, df_test       - report-level frames including 'text_split'
        train_df, test_df       - chunk-level frames with columns 'text' and 'label'
        index_l, val_index_l    - for every chunk, the index of its source report
                                  in df_train / df_test respectively
        """

        self.df_train, self.df_test = self.ApplySplit()

        train_l, label_l, self.index_l = self._FlattenChunks(self.df_train)
        self.train_df = pd.DataFrame({"text": train_l, "label": label_l})

        val_l, val_label_l, self.val_index_l = self._FlattenChunks(self.df_test)
        self.test_df = pd.DataFrame({"text": val_l, "label": val_label_l})