<a href="https://colab.research.google.com/github/hossein20s/BERT-Stock-Prediction-Using-NLP/blob/master/BERTlib.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Version tag of this helper notebook (the trailing #@param marks it as a Colab form field).
VERSION = 'Allen1.0' #@param {type:"string"}

print('BERTlib Version {}'.format(VERSION))

In [0]:
# The following names must already be defined by the calling Python file
# before this cell is executed:
#DATA_DIR
#TASK
#START_FROM_PRETRAINED
# NOTE(review): later cells also read NUM_TRAIN_EPOCHS, MODEL_INPUT_DIR and
# MODEL_OUTPUT_DIR, none of which is assigned in the visible part of this
# notebook -- presumably the caller has to provide them as well; confirm.

import os

# Batch sizes for the three phases (training / evaluation / prediction).
TRAIN_BATCH_SIZE = 32
EVAL_BATCH_SIZE = 8
PREDICT_BATCH_SIZE = 8
LEARNING_RATE = 2e-5
# NUM_TRAIN_EPOCHS = 3.0
MAX_SEQ_LENGTH = 128 # or 400
# Warmup is a period of time where the learning rate
# is small and gradually increases -- this usually helps training.
WARMUP_PROPORTION = 0.1
# Model configs
SAVE_CHECKPOINTS_STEPS = 1000
SAVE_SUMMARY_STEPS = 500 # or 1000
NUM_TPU_CORES = 8
ITERATIONS_PER_LOOP = 1000

# Working directories below DATA_DIR. Each one is created immediately after
# its path is built; exist_ok=True makes re-running the cell harmless.
ESTIMATOR_DIR = '{}/data4estimator/'.format(DATA_DIR)
os.makedirs(ESTIMATOR_DIR, exist_ok=True)
RESULT_DIR = '{}/results/'.format(DATA_DIR)
os.makedirs(RESULT_DIR, exist_ok=True)
INPUT_DATA_DIR = '{}/inputData/'.format(DATA_DIR)
os.makedirs(INPUT_DATA_DIR, exist_ok=True)


In [0]:
import os

# Pretrained checkpoints that can be selected through BERT_MODEL:
#   uncased_L-12_H-768_A-12: uncased BERT base model
#   uncased_L-24_H-1024_A-16: uncased BERT large model
#   cased_L-12_H-768_A-12: cased BERT base model
BERT_MODEL = 'uncased_L-12_H-768_A-12' #@param {type:"string"}
BERT_MODEL_HUB = 'https://tfhub.dev/google/bert_{}/1'.format(BERT_MODEL)

# Directory holding the pretrained checkpoint, its config and its vocabulary.
#BERT_PRETRAINED_DIR = 'gs://medicalblockchain_dev/bert-checkpoints/models/SciBert'
BERT_PRETRAINED_DIR = 'gs://cloud-tpu-checkpoints/bert/{}'.format(BERT_MODEL)
print('***** BERT pretrained directory: {} *****'.format(BERT_PRETRAINED_DIR))

# Both files are looked up directly inside BERT_PRETRAINED_DIR.
CONFIG_FILE = os.path.join(BERT_PRETRAINED_DIR, 'bert_config.json')
VOCAB_FILE = os.path.join(BERT_PRETRAINED_DIR, 'vocab.txt')

In [0]:
import tensorflow as tf
import os
from google.colab import auth
import pprint
import json

# Stop right away when the runtime exposes no TPU address in its environment.
assert 'COLAB_TPU_ADDR' in os.environ, 'ERROR: Not connected to a TPU runtime; please see the first cell in this notebook for instructions!'
TPU_ADDRESS = 'grpc://{}'.format(os.environ['COLAB_TPU_ADDR'])

print('TPU address is', TPU_ADDRESS)
print(tf.__version__)

# Authenticate the Colab user first; the session below then forwards the
# credentials to the TPU so that it can access Google Cloud Storage.
auth.authenticate_user()
with tf.Session(TPU_ADDRESS) as session:
  print('TPU devices:')
  pprint.pprint(session.list_devices())

  # Upload credentials to TPU.
  # NOTE(review): /content/adc.json is presumably written by
  # auth.authenticate_user() above -- confirm.
  with open('/content/adc.json') as f:
    auth_info = json.load(f)
  tf.contrib.cloud.configure_gcs(session, credentials=auth_info)
  # Now credentials are set for all future sessions on this TPU.
  # Now credentials are set for all future sessions on this TPU.

In [0]:
import sys

# Source folder (under /content/gdrive) and the BERT repository inside it.
SRC_DIR = '/content/gdrive/My Drive/src/'
REPO_DIR = '{}/bert_repo/'.format(SRC_DIR)
# Make the repository importable; the membership test keeps repeated runs of
# this cell from appending the same entry again.
if REPO_DIR not in sys.path:
  sys.path.append(REPO_DIR)
  
### Delete all previously defined flags before they are declared again ###

def del_all_flags(FLAGS):
    """Remove every flag currently registered on ``FLAGS``.

    The flag names are read from ``FLAGS._flags()`` and copied into a list
    first, so deleting them does not disturb the iteration.

    Args:
        FLAGS: flags container that exposes ``_flags()`` (iterating its result
            yields the flag names) and supports deleting an attribute by
            flag name.
    """
    registered_names = list(FLAGS._flags())
    for flag_name in registered_names:
        delattr(FLAGS, flag_name)

# Clear every flag registered on tf.flags.FLAGS before run_classifier is
# imported.
# NOTE(review): presumably run_classifier declares its flags at import time and
# would clash with leftovers from an earlier run of this cell -- confirm.
del_all_flags(tf.flags.FLAGS)


import run_classifier
#import importlib
#importlib.reload(run_classifier)
#print('reloading .....')


# Supported task names (lower-case) mapped to their run_classifier processor class.
processors = dict(
  cola=run_classifier.ColaProcessor,
  mnli=run_classifier.MnliProcessor,
  mrpc=run_classifier.MrpcProcessor,
)
# TASK is matched case-insensitively; an unknown task raises KeyError here.
processor = processors[TASK.lower()]()
label_list = processor.get_labels()


In [0]:
#OUTPUT_DIR = OUTPUT_DIR_TFHUB.replace('bert-tfhub', 'bert-checkpoints')


# Choose the checkpoint to initialise from: the pretrained BERT checkpoint in
# BERT_PRETRAINED_DIR, or whatever the caller supplied as MODEL_INPUT_DIR.
if START_FROM_PRETRAINED:
  INIT_CHECKPOINT = os.path.join(BERT_PRETRAINED_DIR, 'bert_model.ckpt')
else:
  INIT_CHECKPOINT = MODEL_INPUT_DIR

# Create the output directory up front (tf.gfile is used so the path does not
# have to be on the local file system).
tf.gfile.MakeDirs(MODEL_OUTPUT_DIR)

import modeling
bert_config = modeling.BertConfig.from_json_file(CONFIG_FILE)

import tokenization
import run_classifier_with_tfhub
#tokenizer = run_classifier_with_tfhub.create_tokenizer_from_hub_module(BERT_MODEL_HUB)
# NOTE(review): do_lower_case is hard-coded to True, which fits the default
# uncased BERT_MODEL; presumably it must be False for a cased checkpoint -- confirm.
tokenizer = tokenization.FullTokenizer(vocab_file=VOCAB_FILE, do_lower_case=True) # causing problem
train_examples = processor.get_train_examples(ESTIMATOR_DIR)
# Total training steps = examples / batch size * epochs; the warm-up phase
# covers the first WARMUP_PROPORTION of those steps.
num_train_steps = int(len(train_examples) / TRAIN_BATCH_SIZE * NUM_TRAIN_EPOCHS)
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

# The original call passed the checkpoint as a second print() argument, so the
# '{}' placeholder was printed literally; format it into the message instead.
print('loading model from checkpoint directory {}'.format(INIT_CHECKPOINT))

# Alternative model_fn builders that were tried and are kept for reference:
#import run_classifier_with_tfhub
#model_fn = run_classifier_with_tfhub.model_fn_builder(
#  bert_hub_module_handle=BERT_MODEL_HUB
#import run_pretraining
#model_fn = run_pretraining.model_fn_builder( # not sure what is this class for
import run_classifier
model_fn = run_classifier.model_fn_builder(
  bert_config=bert_config,
  num_labels=len(label_list),
  init_checkpoint=INIT_CHECKPOINT,
  learning_rate=LEARNING_RATE,
  num_train_steps=num_train_steps,
  num_warmup_steps=num_warmup_steps,
  use_tpu=True,
  use_one_hot_embeddings=True)


In [0]:
import datetime
import run_classifier
from IPython.display import clear_output

import time

# Train the model
def model_train(estimator):
  """Train ``estimator`` on the module-level ``train_examples``.

  The examples are converted to features with the module-level ``label_list``,
  ``MAX_SEQ_LENGTH`` and ``tokenizer``; training then runs until
  ``num_train_steps`` is reached. Start/finish banners and the elapsed
  wall-clock time of the training phase are printed.

  Args:
    estimator: object providing ``train(input_fn=..., max_steps=...)``.

  Returns:
    None.
  """
  print('MRPC/CoLA on BERT base model normally takes about 2-3 minutes. Please wait...')

  features = run_classifier.convert_examples_to_features(
      train_examples, label_list, MAX_SEQ_LENGTH, tokenizer)

  # The clock starts after feature conversion, so only training is timed.
  started_at = time.time()
  print('***** Started training at {} *****'.format(datetime.datetime.now()))
  tf.logging.info("  Num steps = %d", num_train_steps)

  input_fn = run_classifier.input_fn_builder(
      features=features,
      seq_length=MAX_SEQ_LENGTH,
      is_training=True,
      drop_remainder=True)
  estimator.train(input_fn=input_fn, max_steps=num_train_steps)
  #clear_output()
  finished_at = time.time()

  print('  Num examples = {}'.format(len(train_examples)))
  print('  Batch size = {} and TASK = {}'.format(TRAIN_BATCH_SIZE,TASK))
  print('***** Finished training at {} takes {} seconds *****'.format(
      datetime.datetime.now(), finished_at - started_at))


def model_eval(estimator):
  """Evaluate ``estimator`` on the task's dev examples and save the metrics.

  Dev examples are loaded from ``ESTIMATOR_DIR`` through the module-level
  ``processor``. Every metric returned by ``estimator.evaluate`` is printed
  and also written, sorted by key, to
  ``<MODEL_OUTPUT_DIR>/eval_results.<timestamp>.txt``.

  Args:
    estimator: object providing ``evaluate(input_fn=..., steps=...)`` that
      returns a dict of metrics.

  Returns:
    None.
  """
  dev_examples = processor.get_dev_examples(ESTIMATOR_DIR)
  dev_features = run_classifier.convert_examples_to_features(
      dev_examples, label_list, MAX_SEQ_LENGTH, tokenizer)

  started_at = time.time()
  print('***** Started evaluation at {} *****'.format(datetime.datetime.now()))

  # Only whole batches are evaluated (drop_remainder=True and a step count
  # rounded down), so the result is slightly off when the number of examples
  # is not a multiple of EVAL_BATCH_SIZE.
  num_eval_steps = int(len(dev_examples) / EVAL_BATCH_SIZE)
  dev_input_fn = run_classifier.input_fn_builder(
      features=dev_features,
      seq_length=MAX_SEQ_LENGTH,
      is_training=False,
      drop_remainder=True)
  metrics = estimator.evaluate(input_fn=dev_input_fn, steps=num_eval_steps)
  #clear_output()
  finished_at = time.time()

  print('  Num examples = {}'.format(len(dev_examples)))
  print('  Batch size = {}'.format(EVAL_BATCH_SIZE))
  print('***** Finished evaluation at {} takes {} seconds *****'.format(
      datetime.datetime.now(), finished_at - started_at))

  # A timestamp in the file name keeps results from separate runs apart.
  stamp = time.strftime("%Y%m%d-%H%M%S")
  results_path = os.path.join(MODEL_OUTPUT_DIR, "eval_results." + stamp + ".txt")
  with tf.gfile.GFile(results_path, "w") as writer:
    print("***** Eval results *****")
    for metric_name in sorted(metrics):
      metric_text = str(metrics[metric_name])
      print('  {} = {}'.format(metric_name, metric_text))
      writer.write("%s = %s\n" % (metric_name, metric_text))
      
def model_predict(estimator, fileName):
    """Write class probabilities for ``<INPUT_DATA_DIR>/<fileName>.tsv``.

    The TSV file is parsed with the module-level ``processor``, padded with
    ``run_classifier.PaddingInputExample`` entries up to a multiple of
    ``PREDICT_BATCH_SIZE``, converted to a TFRecord file inside
    ``MODEL_OUTPUT_DIR`` and fed to ``estimator.predict``. For every real
    (non-padding) example one line of tab-separated probabilities is written
    to ``predict_results.<fileName>.tsv`` in both ``RESULT_DIR`` and
    ``MODEL_OUTPUT_DIR``.

    Args:
        estimator: object whose ``predict(input_fn=...)`` yields dicts that
            contain a ``"probabilities"`` entry.
        fileName: base name of the input file, without the ``.tsv`` suffix.

    Returns:
        None.

    Raises:
        AssertionError: if fewer prediction lines were written than there are
            real input examples.
    """
    use_tpu = True

    start0 = time.time()

    #predict_examples = processor.get_test_examples(DATA_DIR)
    inputFile = os.path.join(INPUT_DATA_DIR, fileName + ".tsv")
    print('reading from {}'.format(inputFile))
    predict_examples = processor._create_examples(processor._read_tsv(inputFile), "test")
    num_actual_predict_examples = len(predict_examples)
    if use_tpu:
        # TPU requires a fixed batch size for all batches, therefore the number
        # of examples must be a multiple of the batch size, or else examples
        # will get dropped. So we pad with fake examples which are ignored
        # later on.
        while len(predict_examples) % PREDICT_BATCH_SIZE != 0:
            predict_examples.append(run_classifier.PaddingInputExample())

    # The timestamp keeps TFRecord files of separate runs from overwriting each other.
    timestr = time.strftime("%Y%m%d-%H%M%S")
    predict_file = os.path.join(MODEL_OUTPUT_DIR, "predict.tf_record." + fileName + '.' + timestr)
    run_classifier.file_based_convert_examples_to_features(
        predict_examples, label_list, MAX_SEQ_LENGTH, tokenizer, predict_file)

    start = time.time()
    # Fix: report the elapsed seconds; the original formatted the current
    # datetime into the placeholder and silently dropped the duration.
    print('** Convert examples to features took {} seconds'.format(start - start0))
    print("***** Running prediction*****")

    predict_input_fn = run_classifier.file_based_input_fn_builder(
        input_file=predict_file,
        seq_length=MAX_SEQ_LENGTH,
        is_training=False,
        drop_remainder=use_tpu)

    result = estimator.predict(input_fn=predict_input_fn)

    output_predict_file = os.path.join(RESULT_DIR, "predict_results." + fileName + ".tsv")
    output_predict_file1 = os.path.join(MODEL_OUTPUT_DIR, "predict_results." + fileName + ".tsv")
    with tf.gfile.GFile(output_predict_file, "w") as writer, tf.gfile.GFile(output_predict_file1, "w") as writer1:
        #clear_output()
        num_written_lines = 0
        print("***** Predict results *****")
        for (i, prediction) in enumerate(result):
            # Everything from this index on is a padding example; stop here.
            if i >= num_actual_predict_examples:
                break
            output_line = "\t".join(
                str(class_probability)
                for class_probability in prediction["probabilities"]) + "\n"
            writer.write(output_line)
            writer1.write(output_line)
            num_written_lines += 1
    assert num_written_lines == num_actual_predict_examples
    end = time.time()
    # Fix: print() does not interpolate logging-style arguments, so the
    # %-placeholders are filled in explicitly.
    print("  Num examples = %d (%d actual, %d padding)" % (
        len(predict_examples), num_actual_predict_examples,
        len(predict_examples) - num_actual_predict_examples))
    print("  Batch size = %d" % PREDICT_BATCH_SIZE)
    print('***** Finished testing at {} takes {} seconds *****'.format(datetime.datetime.now(),end-start))
  


In [0]:
# Setup TPU related config
# Cluster resolver built from TPU_ADDRESS; get_run_config() below hands it to
# RunConfig as its `cluster` argument.
tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(TPU_ADDRESS)

def get_run_config(output_dir):
  """Build the TPU run configuration for an estimator.

  Args:
    output_dir: passed to the RunConfig as ``model_dir``.

  Returns:
    A ``tf.contrib.tpu.RunConfig`` that uses the module-level
    ``tpu_cluster_resolver`` and is parameterised by SAVE_CHECKPOINTS_STEPS,
    SAVE_SUMMARY_STEPS, ITERATIONS_PER_LOOP and NUM_TPU_CORES.
  """
  tpu_settings = tf.contrib.tpu.TPUConfig(
      num_shards=NUM_TPU_CORES,
      iterations_per_loop=ITERATIONS_PER_LOOP,
      per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2)
  return tf.contrib.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      model_dir=output_dir,
      save_summary_steps=SAVE_SUMMARY_STEPS,
      save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
      tpu_config=tpu_settings)


# TPU estimator shared by model_train / model_eval / model_predict callers;
# checkpoints and summaries go to MODEL_OUTPUT_DIR via the run config.
estimator = tf.contrib.tpu.TPUEstimator(
  model_fn=model_fn,
  config=get_run_config(MODEL_OUTPUT_DIR),
  use_tpu=True,
  predict_batch_size=PREDICT_BATCH_SIZE,
  eval_batch_size=EVAL_BATCH_SIZE,
  train_batch_size=TRAIN_BATCH_SIZE,
)