# In [None]:
import tensorflow as tf
from sklearn.model_selection import train_test_split

from Config import Config
from Dataset import Dataset
from BertFeatures import GenerateBertFeatures
from BertClasiffier import BERTClasiffier
from BertOutputEmbeddings import getEmbeddings
from LSTM import LSTM
from LSTMInputGenerator import Generator

# Stage 1: fine-tune BERT on the dataset and report its validation metrics.

# TF-Hub path to an uncased (all lowercase) version of BERT.
BERT_MODEL_HUB = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"


# Configuration object shared by every pipeline stage below.
config = Config()

# Load the train/test datasets.
data = Dataset(config)
data.GetTrainTestData()

# Convert the text data to BERT features.
# Named `bert_features` (not `input`) so the builtin input() is not shadowed.
bert_features = GenerateBertFeatures(data, config)
bert_features.GetFeatures(BERT_MODEL_HUB)

# Train the fine-tuned BERT classifier.
classifier = BERTClasiffier(data, config, bert_features)
classifier.train(BERT_MODEL_HUB)

# Evaluate the fine-tuned BERT on the validation set and show the metrics.
eval_metrics = classifier.evaluate()
print(eval_metrics)

# Stage 2: extract BERT embeddings and carve out the LSTM validation/test sets.

# Restrict TensorFlow logging to errors. The level constant is taken from the
# same tf.compat.v1 namespace as set_verbosity because the top-level
# `tf.logging` alias is not available in TensorFlow 2.x.
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

# Extract embeddings for the training dataset.
df_trn = getEmbeddings(data.df_train, data.train_df, data.trainData, data.index_l, classifier)
# Extract embeddings for the validation dataset.
# NOTE(review): this call passes data.trainData, exactly like the training
# call above, while every other argument is the test/validation counterpart.
# Confirm this is intended and not a copy-paste slip.
df_val = getEmbeddings(data.df_test, data.test_df, data.trainData, data.val_index_l, classifier)

# Split the validation set into a validation set and a test set for the LSTM:
#  - the validation set is used while training the LSTM to optimize the model
#  - the test set is kept as an unseen dataset for evaluating the LSTM
# The fixed random_state keeps the split reproducible between runs.
df_val, df_test = train_test_split(df_val, test_size=config.testSizeLSTM, random_state=35)

# Stage 3: train the LSTM, evaluate it and get the predicted probabilities.
model = LSTM(config)
model.buildModel()

gen = Generator(config)

model.train(gen.train_generator(df_trn), gen.val_generator(df_val))
evaluation_metrics = model.evaluate(gen.test_generator(df_test), df_test)
test_predictions = model.predict(gen.test_generator(df_test))

# For every report keep the full probability vector and the index of its
# maximum, i.e. the most likely (label-encoded) industry. list.index returns
# the first position of the maximum, so ties resolve to the lowest index.
prediction_prob = [list(probabilities) for probabilities in test_predictions]
predictions = [row.index(max(row)) for row in prediction_prob]

# df_test came out of a split of df_val; work on an explicit copy so the
# column assignments below do not write into a slice of another frame
# (avoids pandas' SettingWithCopyWarning; assumes df_test is a DataFrame).
df_test = df_test.copy()

# Form the final dataframe and decode the previously label-encoded industry
# labels back to their original values.
df_test["predictions"] = predictions
df_test["prediction_prob"] = prediction_prob
df_test["Actual Industry"] = data.LE.inverse_transform(df_test["label"])
df_test["Predicted Industry"] = data.LE.inverse_transform(df_test["predictions"])

# Function-call form works on both Python 2 and Python 3 (the bare
# `print df_test` statement is a SyntaxError on Python 3).
print(df_test)