In [0]:
# Mount Google Drive so the notebook can read and write under /content/gdrive/.
from google.colab import drive
drive.mount('/content/gdrive/')

In [0]:
# Load the TensorBoard notebook extension (provides the %tensorboard magic).
%load_ext tensorboard

In [0]:
# Open TensorBoard on the logs stored in the E4 folder on Drive.
%tensorboard --logdir /content/gdrive/My\ Drive/Puc/Projeto\ Final/models/bert/E4

In [0]:
# Recursively copy the local BERT directory to the M1 folder on Drive.
# NOTE(review): no leading "!" here, unlike the copy cells at the end of the
# notebook -- presumably this relies on IPython's built-in `cp` alias; confirm.
# NOTE(review): "BERT" is the model_dir written by the training cell further
# down, so on a fresh runtime this cell runs before that directory exists.
cp -R BERT /content/gdrive/My\ Drive/Puc/Projeto\ Final/models/bert/M1

# Header

## Install

In [0]:
# Mount Google Drive (the same mount is also performed in the first cell).
from google.colab import drive
drive.mount('/content/gdrive/')

# Install the packages behind the `bert` and `tf_metrics` imports used below.
# NOTE(review): neither install is version-pinned, so results may change as
# the upstream packages change.
!pip install bert-tensorflow
!pip install git+https://github.com/guillaumegenthial/tf_metrics.git
  
# Provides the %tensorboard magic used in later cells.
%load_ext tensorboard

## Import

In [0]:
import tensorflow as tf

import pandas as pd
from tqdm import tqdm
import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization

from datetime import datetime
import tensorflow_hub as hub
import shutil
import sys
import pickle
import tf_metrics
import random
import os
from bert.optimization import AdamWeightDecayOptimizer

## Parameters

In [0]:
# --- Shared mutable state ----------------------------------------------------
# `hooks` is the module-level list that create_model / model_fn append
# LoggingTensorHook instances to; `debug` is a free-form scratch dict.
debug = {}
hooks = []

# --- Locations ---------------------------------------------------------------
root = r"/content/gdrive/My Drive/Puc/Projeto Final"
data_folder = f"{root}/Datasets/finetuning/train/"
# Local model/checkpoint directory.
# Drive alternative: f"{root}/models/bert/multi_cased_devel"
OUTPUT_DIR = "BERT"
# Last command-line argument of the running process.
features_path = sys.argv[-1]

# --- Pre-trained encoder -----------------------------------------------------
# Uncased alternative: "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"
BERT_MODEL_HUB = "https://tfhub.dev/google/bert_multi_cased_L-12_H-768_A-12/1"
MAX_SEQ_LEN = 512
MAX_CLASSES = 120

# --- Optimisation ------------------------------------------------------------
BATCH_SIZE = 4
LEARNING_RATE = 2e-5
# Warmup is a period of time where the learning rate is small and gradually
# increases -- usually helps training.
WARMUP_PROPORTION = 0.1
NON_ROOT_WEIGHT = 3
min_loss = 1000

# --- Data selection ----------------------------------------------------------
MAX_EXAMPLES = 20000
SPLIT_SIZE = 9
RANDOM_START = int(random.uniform(0, SPLIT_SIZE))

# --- Checkpoints and summaries -----------------------------------------------
SAVE_CHECKPOINTS_STEPS = 500
SAVE_SUMMARY_STEPS = 100

# --- Evaluation metrics (arguments forwarded to tf_metrics) ------------------
num_classes = 768
pos_indices = [*range(1, num_classes)]
average = 'micro'

tf.logging.set_verbosity(tf.logging.INFO)
# shutil.rmtree(OUTPUT_DIR, ignore_errors=True)

# Functions

In [0]:
def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu):
  """Builds the training op and the optimizer behind it.

  The learning rate decays linearly from `init_lr` to 0 over
  `num_train_steps`; when `num_warmup_steps` is truthy, the first
  `num_warmup_steps` steps instead use `global_step / num_warmup_steps * init_lr`.

  Args:
    loss: scalar tensor to minimise.
    init_lr: peak learning rate.
    num_train_steps: number of steps over which the rate decays to zero.
    num_warmup_steps: number of warmup steps; falsy disables warmup.
    use_tpu: when true, wraps the optimizer in a CrossShardOptimizer.

  Returns:
    A `(train_op, optimizer)` tuple.
  """
  step = tf.train.get_or_create_global_step()

  # Linear decay schedule (polynomial decay with power 1, no cycling).
  lr = tf.train.polynomial_decay(
      tf.constant(value=init_lr, shape=[], dtype=tf.float32),
      step,
      num_train_steps,
      end_learning_rate=0.0,
      power=1.0,
      cycle=False)

  if num_warmup_steps:
    step_i = tf.cast(step, tf.int32)
    warmup_i = tf.constant(num_warmup_steps, dtype=tf.int32)

    step_f = tf.cast(step_i, tf.float32)
    warmup_f = tf.cast(warmup_i, tf.float32)

    ramp_lr = init_lr * (step_f / warmup_f)

    # 1.0 while still warming up, 0.0 afterwards: selects between the schedules.
    in_warmup = tf.cast(step_i < warmup_i, tf.float32)
    lr = (1.0 - in_warmup) * lr + in_warmup * ramp_lr

  # Same optimizer family the model was trained with; LayerNorm and bias
  # variables are excluded from weight decay.
  optimizer = AdamWeightDecayOptimizer(
      learning_rate=lr,
      weight_decay_rate=0.01,
      beta_1=0.9,
      beta_2=0.999,
      epsilon=1e-6,
      exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])

  if use_tpu:
    optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)

  variables = tf.trainable_variables()
  raw_grads = tf.gradients(loss, variables)

  # Clip by global norm 1.0, matching pre-training.
  clipped_grads, _ = tf.clip_by_global_norm(raw_grads, clip_norm=1.0)

  apply_op = optimizer.apply_gradients(
      zip(clipped_grads, variables), global_step=step)

  # `AdamWeightDecayOptimizer.apply_gradients` does not advance the global
  # step, so it is incremented explicitly and grouped with the update.
  advance_step = step.assign(step + 1)
  return tf.group(apply_op, [advance_step]), optimizer

def build_estimator(labels):
    """Creates the `tf.estimator.Estimator` used for fine-tuning.

    The number of training steps is derived from the total row count of
    `labels`, the module-level `NUM_TRAIN_EPOCHS` (which must be defined
    before this is called) and `BATCH_SIZE`; the warmup step count is
    `WARMUP_PROPORTION` of that.

    Args:
      labels: iterable of sized objects (one per input file); their lengths
        are summed to get the sample count.

    Returns:
      An estimator writing to `OUTPUT_DIR`.
    """
    sample_count = sum(len(frame) for frame in labels)
    train_steps = int(sample_count * NUM_TRAIN_EPOCHS / BATCH_SIZE)
    warmup_steps = int(train_steps * WARMUP_PROPORTION)

    # Output directory plus summary / checkpoint cadence.
    config = tf.estimator.RunConfig(
        model_dir=OUTPUT_DIR,
        save_summary_steps=SAVE_SUMMARY_STEPS,
        save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS)

    return tf.estimator.Estimator(
        model_fn=model_fn_builder(
            num_labels=0,
            learning_rate=LEARNING_RATE,
            num_train_steps=train_steps,
            num_warmup_steps=warmup_steps),
        config=config,
        params={"batch_size": BATCH_SIZE})


def load_features(path, max_size=20000):
    """Loads label / token / segment-id frames, using a local pickle cache.

    If "all_features.dmp" exists in the working directory it is unpickled and
    `path` / `max_size` are not consulted. Otherwise every file in `path` is
    read as a header-less CSV with MAX_SEQ_LEN columns and routed by the end
    of its file name ("y" -> labels, "x1" -> tokens, "x2" -> ids), stopping
    once all three lists hold at least `max_size` frames; the result is then
    written to the cache.

    After loading, rows whose ids sum to 0 or less are removed from all three
    frames of a file, and files left without rows are dropped.

    NOTE(review): the cache is read with pickle; only use a cache file that
    this notebook produced itself.

    Returns:
      `(tokens, ids, labels)` -- three lists of DataFrames.
    """
    cache_file = "all_features.dmp"
    labels, tokens, ids = [], [], []

    def remap_non_positive(column):
        # Maps x <= 0 to -(x + 1) (so 0 -> -1 and -1 -> 0); other values pass through.
        return [-(value + 1) if value <= 0 else value for value in column]

    if not os.path.isfile(cache_file):
        for file_name in tqdm(sorted(list(os.listdir(path)))):
            frame = pd.read_csv(
                f"{path}/{file_name}",
                header=None,
                names=list(range(MAX_SEQ_LEN)),
                index_col=None)

            if file_name.endswith("y"):
                # Remap, then shift everything up by one.
                labels.append(frame.apply(remap_non_positive) + 1)
            elif file_name.endswith("x1"):
                tokens.append(frame)
            elif file_name.endswith("x2"):
                ids.append(frame)

            # Stop as soon as the shortest of the three lists is full.
            if min(len(tokens), len(labels), len(ids)) >= max_size:
                break

        with open(cache_file, "wb") as handle:
            pickle.dump((labels, tokens, ids), handle)
    else:
        with open(cache_file, "rb") as handle:
            labels, tokens, ids = pickle.load(handle)

    # One boolean row mask per file: keep rows whose ids sum is positive.
    keep_rows = [frame.sum(axis=1).values > 0 for frame in ids]
    for idx, keep in enumerate(keep_rows):
        labels[idx] = labels[idx][keep]
        tokens[idx] = tokens[idx][keep]
        ids[idx] = ids[idx][keep]

    file_indices = range(len(keep_rows))
    labels = [labels[idx] for idx in file_indices if len(labels[idx]) > 0]
    tokens = [tokens[idx] for idx in file_indices if len(tokens[idx]) > 0]
    ids = [ids[idx] for idx in file_indices if len(ids[idx]) > 0]

    return tokens, ids, labels


def slice_input(epoch, o_tokens, o_ids, o_labels, train=True):
    """Builds an input function over one row window of every file.

    For each frame, a window of SPLIT_SIZE rows is chosen from
    `epoch + RANDOM_START`. With `train=True` the first SPLIT_SIZE - 1 rows of
    the window are used; otherwise only the window's last row is used.
    The per-file pieces are concatenated, NaNs become 0 and everything is
    cast to int32.

    The mask is 1 where the ids are positive. The loss weight is
    NON_ROOT_WEIGHT where ids > 1, 1 where the mask is set and ids <= 1,
    and 0 elsewhere.

    Returns:
      The closure produced by `input_fn_builder`.
    """
    offset = epoch + RANDOM_START
    print(f"Slicing {offset}")

    def take_window(frame):
        window_count = len(frame) / SPLIT_SIZE
        first = int(offset % window_count) * SPLIT_SIZE
        last = first + (SPLIT_SIZE - 1)
        if train:
            return frame.iloc[first:last, :]
        # Evaluation: the single row right after the training rows.
        return frame.iloc[last:last + 1, :]

    def stack(frames):
        return pd.concat(frames).fillna(0).values.astype('int32')

    label_parts, token_parts, id_parts = [], [], []
    for idx in range(len(o_tokens)):
        label_parts.append(take_window(o_labels[idx]))
        token_parts.append(take_window(o_tokens[idx]))
        id_parts.append(take_window(o_ids[idx]))

    sliced_labels = stack(label_parts)
    sliced_tokens = stack(token_parts)
    sliced_ids = stack(id_parts)
    sliced_mask = (sliced_ids > 0).astype('int32')

    # Start at NON_ROOT_WEIGHT - 1 for ids > 1; the mask added below supplies the final +1.
    sliced_loss_weight = (sliced_ids > 1) * (NON_ROOT_WEIGHT - 1)
    sliced_loss_weight += sliced_mask

    print("Sliced")
    return input_fn_builder(
        sliced_tokens, sliced_ids, sliced_labels, sliced_mask,
        train, False, sliced_loss_weight)


def input_fn_builder(tokens, ids, labels, mask, is_training, drop_remainder, loss_weight):
    """Returns an `input_fn(params)` closure over in-memory feature arrays.

    Args:
      tokens: two-dimensional array served as "input_ids".
      ids: array served as "segment_ids".
      labels: array served as "label_ids".
      mask: array served as "input_mask".
      is_training: when true the dataset is shuffled (buffer of 100).
      drop_remainder: forwarded to `Dataset.batch`.
      loss_weight: array served as "loss_weight".
    """
    # Unpacking doubles as a check that `tokens` has exactly two dimensions.
    _num_examples, _seq_length = tokens.shape

    def input_fn(params):
        """Builds the batched dataset; the batch size comes from `params`."""
        per_batch = params["batch_size"]

        # Everything is held in memory, so this does not scale to large
        # data sets.
        dataset = tf.data.Dataset.from_tensor_slices({
            "input_ids": tokens,
            "input_mask": mask,
            "segment_ids": ids,
            "label_ids": labels,
            "loss_weight": loss_weight
        })

        if is_training:
            # No repeat(): one pass over the data per call.
            dataset = dataset.shuffle(buffer_size=100)

        batched = dataset.batch(batch_size=per_batch, drop_remainder=drop_remainder)
        return batched.prefetch(per_batch)

    return input_fn


def create_model(is_predicting, input_ids, input_mask, segment_ids, labels, loss_weight):
    """Builds the token-level classifier on top of the TF Hub BERT module.

    Per-token scores are `relu(sequence_output @ output_weights^T + output_bias)`;
    the predicted label is the argmax of the scores, multiplied by
    `input_mask`. Two LoggingTensorHooks are appended to the module-level
    `hooks` list as a side effect (only the first one when predicting).

    Note that `loss_weight` is only logged; it is not applied to the loss.

    Returns:
      `predicted_labels` alone when `is_predicting` is true, otherwise a
      `(loss, predicted_labels)` tuple where `loss` is the softmax
      cross-entropy summed over masked positions and divided by the number
      of masked positions.
    """
    encoder = hub.Module(
        BERT_MODEL_HUB,
        trainable=True)
    encoded = encoder(
        inputs={
            "input_ids": input_ids,
            "input_mask": input_mask,
            "segment_ids": segment_ids,
        },
        signature="tokens",
        as_dict=True)

    targets = tf.one_hot(labels, MAX_CLASSES, name="My_OneHot")

    # "sequence_output" is the token-level output ("pooled_output" would be
    # the whole-sentence one).
    token_states = encoded["sequence_output"]
    state_width = token_states.shape[-1].value

    output_weights = tf.get_variable(
        "output_weights", [MAX_CLASSES, state_width],
        initializer=tf.truncated_normal_initializer(mean=0.0, stddev=1))

    # One bias row per sequence position.
    output_bias = tf.get_variable(
        "output_bias", [MAX_SEQ_LEN, MAX_CLASSES], initializer=tf.zeros_initializer())

    with tf.variable_scope("loss"):
        projected = tf.matmul(token_states, output_weights, transpose_b=True)
        biased = tf.add(projected, output_bias)
        scores = tf.nn.relu(biased, name="My_output_layer")

        best_class = tf.math.argmax(scores, axis=-1, name="My_Argmax")

        # Masked-out positions are forced to label 0.
        predicted_labels = tf.math.multiply(
            best_class, tf.cast(input_mask, tf.int64), name="d")

        hooks.append(
            tf.estimator.LoggingTensorHook({
                "predicted": predicted_labels[0],
                "output_layer_2": scores[0][0],
                "output_weights": output_weights[0],
                "output_bias": output_bias[0],
            }, every_n_iter=10))

        if is_predicting:
            return predicted_labels

        token_loss = tf.nn.softmax_cross_entropy_with_logits_v2(
            labels=targets, logits=scores, name="My_softmax_loss")
        masked_loss = tf.multiply(
            token_loss, tf.cast(input_mask, tf.float32), name="My_masked_loss")
        # Divide by the number of unmasked positions so the sum below is a mean.
        scaled_loss = tf.divide(
            masked_loss, tf.cast(tf.reduce_sum(input_mask), tf.float32), name="My_wloss")
        final_loss = tf.reduce_sum(scaled_loss, name="My_final_loss")

        hooks.append(
            tf.estimator.LoggingTensorHook({
                "no_loss": masked_loss[0],
                "loss_weight": loss_weight[0],
                "final_loss": final_loss
            }, every_n_iter=10))

        return final_loss, predicted_labels


# model_fn_builder actually creates our model function
# using the passed parameters for num_labels, learning_rate, etc.
def model_fn_builder(num_labels, learning_rate, num_train_steps,
                     num_warmup_steps):
    """Returns a `model_fn` closure for `tf.estimator.Estimator`.

    Args:
      num_labels: kept for interface compatibility; not used by the
        returned model function.
      learning_rate: peak learning rate passed to `create_optimizer`.
      num_train_steps: total training steps passed to `create_optimizer`.
      num_warmup_steps: warmup steps passed to `create_optimizer`.

    Returns:
      `model_fn(features, labels, mode, params)`. `features` must contain
      "input_ids", "input_mask", "segment_ids", "label_ids" and
      "loss_weight" in every mode, because `create_model` consumes them all.
    """

    def metric_fn(label_ids, predicted_labels):
        """Builds the precision / recall / F2 / F1 evaluation metrics."""
        y_true, y_pred = label_ids, predicted_labels

        precision = tf_metrics.precision(
            y_true, y_pred, num_classes, pos_indices, average=average)
        recall = tf_metrics.recall(
            y_true, y_pred, num_classes, pos_indices, average=average)
        f2 = tf_metrics.fbeta(
            y_true, y_pred, num_classes, pos_indices, average=average, beta=2)
        f1 = tf_metrics.f1(
            y_true, y_pred, num_classes, pos_indices, average=average)

        return {
            "precision": precision,
            "recall": recall,
            "f2": f2,
            "f1": f1
        }

    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
        """Builds the EstimatorSpec for TRAIN, EVAL or PREDICT."""

        input_ids = features["input_ids"]
        input_mask = features["input_mask"]
        segment_ids = features["segment_ids"]
        label_ids = features["label_ids"]
        loss_weight = features["loss_weight"]

        if mode == tf.estimator.ModeKeys.PREDICT:
            # In predict mode `create_model` returns the predicted labels
            # only (a single tensor), so there is nothing to unpack and no
            # probabilities tensor to expose.
            predicted_labels = create_model(
                True, input_ids, input_mask, segment_ids, label_ids, loss_weight)

            predictions = {
                'labels': predicted_labels
            }
            return tf.estimator.EstimatorSpec(mode, predictions=predictions)

        # TRAIN and EVAL share the loss graph.
        (loss, predicted_labels) = create_model(
            False, input_ids, input_mask, segment_ids, label_ids, loss_weight)

        # Drop the debugging hooks `create_model` registered; only the
        # loss / learning-rate hook below is attached to training.
        hooks.clear()

        if mode == tf.estimator.ModeKeys.TRAIN:
            # The optimizer and gradient ops are only needed for training.
            train_op, optimizer = create_optimizer(
                loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu=False)

            hooks.append(tf.train.LoggingTensorHook(
                {"loss": loss, "lr": optimizer.learning_rate}, every_n_iter=10))

            return tf.estimator.EstimatorSpec(mode=mode,
                                              loss=loss,
                                              train_op=train_op,
                                              training_hooks=hooks)

        # EVAL: metrics are only needed (and only reported) here.
        eval_metrics = metric_fn(label_ids, predicted_labels)
        return tf.estimator.EstimatorSpec(mode=mode,
                                          loss=loss,
                                          eval_metric_ops=eval_metrics)

    # Return the actual model function in the closure
    return model_fn

# Main

In [0]:
# Wall-clock start, used for the elapsed-time report at the end of the cell.
current_time = datetime.now()

print("Loading Features")
# Three parallel lists of DataFrames, one entry per input file.
tokens, ids, labels = load_features(data_folder, MAX_EXAMPLES)

# Number of SPLIT_SIZE-row windows in the longest label frame. This must be
# defined before build_estimator() is called, which reads it as a global.
NUM_TRAIN_EPOCHS = int(max([len(x) for x in labels])/SPLIT_SIZE)
print(f'Beginning Training! {NUM_TRAIN_EPOCHS}')

estimator = build_estimator(labels)

# Visit the windows in random order.
# NOTE(review): `random` is not seeded anywhere in the visible code, so the
# order (and RANDOM_START) differ between runs.
seq_epochs = list(range(NUM_TRAIN_EPOCHS))
random.shuffle(seq_epochs)

for i,epoch in enumerate(seq_epochs):
    print(f"New Epoch {i}")
    # Same window for both: the training rows, then the held-out evaluation row.
    train_input_fn = slice_input(epoch, tokens, ids, labels, True)
    eval_input_fn = slice_input(epoch, tokens, ids, labels, False)

    #print(f"hooks:{len(hooks)}")
    # No max_steps is given; the training dataset is not repeated, so each
    # call makes a single pass over this window's rows.
    train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn)
    eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn)
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

    # estimator.train(input_fn=slice_input(epoch, tokens, ids, labels, True), hooks=hooks)
    # Reset the shared hook list before the next window's graph is built.
    hooks.clear()
    
print("Training took time ", datetime.now() - current_time)

In [0]:
# Open TensorBoard on the local BERT directory (the value of OUTPUT_DIR).
%tensorboard --logdir BERT

In [0]:
# Show the local run ("new") next to the copy stored on Drive ("old").
# NOTE(review): relies on the name:path form of --logdir; confirm that the
# installed TensorBoard version still accepts it.
%tensorboard --logdir new:BERT,old:/content/gdrive/My\ Drive/Puc/Projeto\ Final/models/bert/train/

In [0]:
# Recursively copy the BERT directory to the train/ folder on Drive.
# NOTE(review): if train/ already exists this presumably creates train/BERT,
# unlike the next cell, which copies the contents of BERT/ -- confirm which
# layout is wanted.
!cp -R BERT/ /content/gdrive/My\ Drive/Puc/Projeto\ Final/models/bert/train/

In [0]:
# Recursively copy the contents of BERT/ directly into train/ on Drive.
!cp -R BERT/* /content/gdrive/My\ Drive/Puc/Projeto\ Final/models/bert/train/