In [4]:

from sklearn.model_selection import train_test_split
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from datetime import datetime
import tensorflow_datasets as tfds
import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization
#https://github.com/google-research/bert.git
import sys
sys.path.append('./bert')
from tensorflow import keras
import os
import re
from transformers import *
import numpy as np
from tensorflow.python.lib.io import file_io
import pickle
import datetime

# Initialize TPU

In [6]:
# TPU settings for this run.
# Adapted from:
#   https://towardsdatascience.com/https-medium-com-chaturangarajapakshe-text-classification-with-transformer-models-d370944b50ca
#   https://colab.research.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb#scrollTo=191zq3ZErihP
USE_TPU = True                # selects the TPU code paths further down
NUM_TPU_CORES = 8             # used as the shard count and to scale batch sizes
TPU_ADDRESS = "node-2"        # handed to TPUClusterResolver as `tpu`
TPU_ZONE = "us-central1-f"    # handed to TPUClusterResolver as `zone`

INFO:tensorflow:Initializing the TPU system: node-2


KeyboardInterrupt: 

# Setting output directory

In [None]:
def getDir(bucket, output_dir):
    """Return the ``gs://`` URI of ``output_dir`` inside the bucket ``bucket``."""
    return f'gs://{bucket}/{output_dir}'

In [None]:
# Where checkpoints and evaluation results are written.
OUTPUT_DIR = "bertResults"
DO_DELETE = False   # True: wipe OUTPUT_DIR before creating it again
USE_BUCKET = True   # False: keep OUTPUT_DIR as the plain relative path above
BUCKET = "patents-research"

if USE_BUCKET:
    OUTPUT_DIR = getDir(BUCKET, OUTPUT_DIR)

if DO_DELETE:
    try:
        tf.gfile.DeleteRecursively(OUTPUT_DIR)
    except tf.errors.OpError as err:
        # Best effort: a missing directory is fine. Only TensorFlow file-system
        # errors are tolerated (and reported); KeyboardInterrupt and programming
        # errors are no longer silently swallowed by a bare `except`.
        print('Could not delete {}: {}'.format(OUTPUT_DIR, err))
tf.gfile.MakeDirs(OUTPUT_DIR)
print('***** Model output directory: {} *****'.format(OUTPUT_DIR))


# Load Data Set

We classify high-impact patents following Ahuja and Lampert (2001): within a given cohort, they are the patents
receiving the most citations from other patents within the following 5-year window. For every year, we sort
the patents applied for in that year on the basis
of their citation weights and identify the top 1
percent of patents for that year as breakthrough
inventions. This procedure ensures that each patent is compared in its importance only to other
patents of the same year.
https://towardsdatascience.com/https-medium-com-chaturangarajapakshe-text-classification-with-transformer-models-d370944b50ca

In [None]:
def loadData(gsPath):
    """Read the tab-separated file at ``gsPath`` and return it as a DataFrame."""
    frame = pd.read_csv(gsPath, sep="\t")
    return frame

def generateLable(dataset):
    """Label each patent by its forward-citation rank within its publication year.

    The input frame must have a ``publication_date`` column (``%Y-%m-%d`` strings)
    and a numeric ``fwrdCitations_5`` column.

    Args:
        dataset: DataFrame to label. It is NOT modified; a new frame is returned.

    Returns:
        A new DataFrame sorted by ``publication_date`` (newest first), with
        ``publication_date`` converted to datetime, rows whose date could not be
        parsed removed, and an integer ``label`` column:
          1 -> ``fwrdCitations_5`` at or above the 99th percentile of its year,
          2 -> at or above the 95th percentile of its year but not label 1,
          0 -> everything else.
    """
    # Parse the dates on the side so the caller's frame is left untouched;
    # unparseable or out-of-range dates (e.g. 1082-03-15) become NaT.
    parsed_dates = pd.to_datetime(
        dataset['publication_date'], errors="coerce", format="%Y-%m-%d")
    labelled = dataset.assign(publication_date=parsed_dates)
    labelled = labelled.sort_values('publication_date', ascending=False)

    # Drop rows without a usable date.
    labelled = labelled[labelled['publication_date'].notnull()]

    # Per-year citation cut-offs, broadcast back onto every row.
    by_year = labelled.groupby(labelled['publication_date'].dt.year)["fwrdCitations_5"]
    top1_cutoff = by_year.transform(lambda x: x.quantile(.99)).values
    top5_cutoff = by_year.transform(lambda x: x.quantile(.95)).values
    citations = labelled["fwrdCitations_5"].values

    # The top-1% test takes precedence over the top-5% test.
    labels = np.where(citations >= top1_cutoff, 1,
                      np.where(citations >= top5_cutoff, 2, 0))

    # `assign` returns a fresh frame, so no SettingWithCopyWarning is triggered.
    return labelled.assign(label=labels)

def saveToGloud(path, data, isPandas=False):
    """Persist ``data`` at ``path`` through TensorFlow's ``file_io``.

    Saving intermediate results lets later runs skip the slow preprocessing.

    Args:
        path: Destination path (the notebook passes ``gs://`` URIs).
        data: Object to store.
        isPandas: When True, ``data`` is treated as a pandas DataFrame and is
            written as tab-separated text without the index. When False
            (default), ``data`` is pickled.
    """
    if isPandas:
        with file_io.FileIO(path, mode='w') as f:
            f.write(data.to_csv(sep="\t", index=False))
        return

    # pickle produces bytes, so open the target in binary mode.
    with file_io.FileIO(path, mode='wb') as f:
        pickle.dump(data, f)


def readTFRecord(path):
    """Load and return the pickled object stored at ``path``.

    Despite the name, the file is read with ``pickle``, not as a TFRecord.
    """
    # NOTE: unpickling executes code embedded in the file; only use on trusted data.
    with file_io.FileIO(path, mode='rb') as handle:
        payload = pickle.load(handle)
    return payload
        


In [2]:
# Locations of the raw data and of the persisted train/test splits
DATA_PATH = "gs://patents-research/patent_research/data_frwdcorrect.tsv"
TRAIN_DF_PATH = "gs://patents-research/patent_research/{}".format("bert_train_df.tsv")
TEST_DF_PATH = "gs://patents-research/patent_research/{}".format("bert_test_df.tsv")
DATA_COLUMN = 'text'
LABEL_COLUMN = 'label'
# label_list is the list of labels produced by generateLable:
# 0 = other, 1 = top 1% of its year, 2 = top 5% (but not top 1%) of its year
label_list = [0, 1, 2]
# Fixed seed so the train/test split is identical on every run.
SPLIT_SEED = 42


print(f'Loading data!')
dataset = loadData(DATA_PATH)
print(f'Finised loading data!')
dataset = generateLable(dataset)
print(f'Test/Train Split!')
train, test = train_test_split(dataset, test_size=0.2, random_state=SPLIT_SEED)
print(f'Finished Test/Train Split!')

print('Saving Test/Train Split to gCloud')
# Write the splits as TSV (matching the .tsv paths) directly with pandas, the
# same way loadData reads its gs:// input.
train.to_csv(TRAIN_DF_PATH, sep="\t", index=False)
test.to_csv(TEST_DF_PATH, sep="\t", index=False)
print('Finished Saving Test/Train Split to gCloud!')
del dataset

***** Model output directory: gs://patents-research/bertResults *****


# Data Preprocessing

Convert the data into the format that BERT understands. We use SciBERT (https://github.com/allenai/scibert) instead of the standard tokenizer/trained model; reference: https://www.aclweb.org/anthology/D19-1371.pdf

In [None]:
# Alternative (currently disabled) SciBERT weights:
#HUB_MODULE = "https://s3-us-west-2.amazonaws.com/ai2-s2-research/scibert/tensorflow_models/scibert_scivocab_uncased.tar.gz"
HUB_MODULE = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"

# Pickled feature lists for the two splits.
TRAIN_TFRecord_PATH = "gs://patents-research/patent_research/train_features.pickle"
TEST_TFRecord_PATH = "gs://patents-research/patent_research/test_features.pickle"

# Sequences are capped at this many tokens.
MAX_SEQ_LENGTH = 128

In [None]:
print("Use the InputExample class from BERT's run_classifier code to create examples from the data")
# One InputExample per row: single-sentence input (text_b=None), no guid.
train_InputExamples, test_InputExamples = (
    frame.apply(
        lambda row: bert.run_classifier.InputExample(
            guid=None,
            text_a=row[DATA_COLUMN],
            text_b=None,
            label=row[LABEL_COLUMN]),
        axis=1)
    for frame in (train, test)
)
print("Finished using  InputExample class from BERT's run_classifier code to create examples from the data")


print("Convert our train and test features to InputFeatures that BERT understands")
# NOTE(review): this is the SciBERT (scivocab) tokenizer, while create_model loads
# the google/bert_uncased hub module -- confirm that the token ids produced here
# match the vocabulary of the model that is actually fine-tuned.
tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')

# Tokenise each split and persist the result right away.
train_features = bert.run_classifier.convert_examples_to_features(
    train_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)
saveToGloud(TRAIN_TFRecord_PATH, train_features)

test_features = bert.run_classifier.convert_examples_to_features(
    test_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)
saveToGloud(TEST_TFRecord_PATH, test_features)

# Read the features back from storage.
# NOTE(review): this re-downloads what was just uploaded; presumably meant as the
# entry point for runs that skip the conversion above -- confirm.
train_features = readTFRecord(TRAIN_TFRecord_PATH)
test_features = readTFRecord(TEST_TFRecord_PATH)

print("Finsed converting  train and test features to InputFeatures that BERT understands")

# Free the intermediates that are no longer needed.
del train_InputExamples
del test_InputExamples
del train
del test

# Creating a model

In [None]:
def create_model(is_predicting, input_ids, input_mask, segment_ids, labels,
                 num_labels, dropout=1.0):
    """Creates a BERT-based classification model.

    Args:
        is_predicting: When True, only predictions are returned (no loss).
        input_ids: Fed to the hub module's "tokens" signature as `input_ids`.
        input_mask: Fed to the hub module's "tokens" signature as `input_mask`.
        segment_ids: Fed to the hub module's "tokens" signature as `segment_ids`.
        labels: Integer class ids; one-hot encoded to compute the loss.
        num_labels: Number of output classes.
        dropout: KEEP probability passed to `tf.nn.dropout` (not the drop rate).
            Defaults to 1.0, i.e. no dropout, so callers that omit it (such as a
            prediction call) still work.

    Returns:
        `(predicted_labels, log_probs)` when `is_predicting` is True, otherwise
        `(loss, predicted_labels, log_probs)`.
    """

    bert_module = hub.Module(
      "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1",
      trainable=True)
    bert_inputs = dict(
      input_ids=input_ids,
      input_mask=input_mask,
      segment_ids=segment_ids)
    bert_outputs = bert_module(
      inputs=bert_inputs,
      signature="tokens",
      as_dict=True)

    # Use "pooled_output" for classification tasks on an entire sentence.
    # Use "sequence_outputs" for token-level output.
    output_layer = bert_outputs["pooled_output"]

    hidden_size = output_layer.shape[-1].value

    # Classification head that is trained on top of BERT.
    output_weights = tf.get_variable(
      "output_weights", [num_labels, hidden_size],
      initializer=tf.truncated_normal_initializer(stddev=0.02))

    output_bias = tf.get_variable(
      "output_bias", [num_labels], initializer=tf.zeros_initializer())

    with tf.variable_scope("loss"):

        # Dropout helps prevent overfitting; keep_prob=1.0 leaves the layer unchanged.
        output_layer = tf.nn.dropout(output_layer, keep_prob=dropout)

        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        # argmax over the class axis already yields one id per example. No
        # tf.squeeze here: it would turn a batch of one into a scalar and lose
        # the batch dimension that labels and metrics expect.
        predicted_labels = tf.argmax(log_probs, axis=-1, output_type=tf.int32)

        # If we're predicting, we want predicted labels and the log-probabilities.
        if is_predicting:
            return (predicted_labels, log_probs)

        # Convert labels into one-hot encoding
        one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)

        # If we're train/eval, compute the cross-entropy between predicted and
        # actual label, averaged over the batch.
        per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        loss = tf.reduce_mean(per_example_loss)
        return (loss, predicted_labels, log_probs)
    


In [None]:
# https://www.tensorflow.org/versions/r1.15/api_docs/python/tf/config/experimental/list_logical_devices

# model_fn_builder actually creates our model function
# using the passed parameters for num_labels, learning_rate, etc.
def model_fn_builder(num_labels, learning_rate, num_train_steps, num_warmup_steps, dropout=0.9, use_tpu=False):
    """Returns `model_fn` closure for TPUEstimator (or a plain Estimator).

    Args:
        num_labels: Number of output classes.
        learning_rate: Passed to `bert.optimization.create_optimizer`.
        num_train_steps: Passed to `bert.optimization.create_optimizer`.
        num_warmup_steps: Passed to `bert.optimization.create_optimizer`.
        dropout: KEEP probability used while training; evaluation and
            prediction run with a keep probability of 1.0.
        use_tpu: When True the closure returns `TPUEstimatorSpec`s, otherwise
            `EstimatorSpec`s.

    Returns:
        A `model_fn(features, labels, mode, params)` callable.
    """
    spec_cls = tf.estimator.tpu.TPUEstimatorSpec if use_tpu else tf.estimator.EstimatorSpec

    def metric_fn(label_ids, predicted_labels):
        """Builds the evaluation metric ops from true and predicted class ids."""
        accuracy = tf.metrics.accuracy(label_ids, predicted_labels)

        # The remaining metrics are binary. `tf.metrics.auc` and
        # `tf.contrib.metrics.f1_score` require predictions in [0, 1], which raw
        # class ids (0, 1, 2) violate, so collapse the ids to "any non-zero class".
        # NOTE(review): this treats labels 1 and 2 alike as positive -- confirm
        # that this is the intended reporting for the three-class problem.
        actual_pos = tf.cast(tf.not_equal(label_ids, 0), tf.int32)
        predicted_pos = tf.cast(tf.not_equal(predicted_labels, 0), tf.int32)

        f1_score = tf.contrib.metrics.f1_score(actual_pos, predicted_pos)
        auc = tf.metrics.auc(actual_pos, predicted_pos)
        recall = tf.metrics.recall(actual_pos, predicted_pos)
        precision = tf.metrics.precision(actual_pos, predicted_pos)
        true_pos = tf.metrics.true_positives(actual_pos, predicted_pos)
        true_neg = tf.metrics.true_negatives(actual_pos, predicted_pos)
        false_pos = tf.metrics.false_positives(actual_pos, predicted_pos)
        false_neg = tf.metrics.false_negatives(actual_pos, predicted_pos)
        return {
            "eval_accuracy": accuracy,
            "f1_score": f1_score,
            "auc": auc,
            "precision": precision,
            "recall": recall,
            "true_positives": true_pos,
            "true_negatives": true_neg,
            "false_positives": false_pos,
            "false_negatives": false_neg
        }

    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
        """The `model_fn` for TPUEstimator."""
        input_ids = features["input_ids"]
        input_mask = features["input_mask"]
        segment_ids = features["segment_ids"]
        label_ids = features["label_ids"]

        # PREDICT: no loss, no dropout (keep probability 1.0).
        if mode == tf.estimator.ModeKeys.PREDICT:
            (predicted_labels, log_probs) = create_model(
                True, input_ids, input_mask, segment_ids, label_ids, num_labels, 1.0)
            predictions = {
                'probabilities': log_probs,
                'labels': predicted_labels
            }
            return spec_cls(mode=mode, predictions=predictions)

        # TRAIN and EVAL: dropout is only active while training.
        is_training = (mode == tf.estimator.ModeKeys.TRAIN)
        keep_prob = dropout if is_training else 1.0
        (loss, predicted_labels, log_probs) = create_model(
            False, input_ids, input_mask, segment_ids, label_ids, num_labels, keep_prob)

        if is_training:
            # The optimizer is only needed (and only built) for training.
            train_op = bert.optimization.create_optimizer(
                loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu=use_tpu)
            return spec_cls(mode=mode, loss=loss, train_op=train_op)

        # Guarantee a rank-1 [batch] tensor even when the per-core batch is 1.
        predicted_labels = tf.reshape(predicted_labels, [-1])

        if use_tpu:
            # TPUEstimatorSpec has no `eval_metric_ops`; it takes the metric
            # function together with the tensors to call it with.
            return spec_cls(
                mode=mode,
                loss=loss,
                eval_metrics=(metric_fn, [label_ids, predicted_labels]))
        return spec_cls(
            mode=mode,
            loss=loss,
            eval_metric_ops=metric_fn(label_ids, predicted_labels))

    # Return the actual model function in the closure
    return model_fn

# MODEL PARAMETERS

In [None]:
# Training hyperparameters.
# Copied from this colab notebook (https://colab.sandbox.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb)

# Base batch of 32, multiplied by the number of TPU cores when running on a TPU
# so the extra compute is used by a proportionally larger batch.
BATCH_SIZE = 32 * NUM_TPU_CORES if USE_TPU else 32
EVAL_BATCH_SIZE = NUM_TPU_CORES
PREDICT_BATCH_SIZE = NUM_TPU_CORES

LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 3.0
DROPOUT_KEEP_PROB = 0.7
# Warmup is a period during which the learning rate is small and gradually
# increases -- this usually helps training.
WARMUP_PROPORTION = 0.1

# Checkpoint / summary cadence, in steps.
SAVE_CHECKPOINTS_STEPS = 1000
SAVE_SUMMARY_STEPS = 100



In [None]:
# Total training steps = examples / batch size * epochs (truncated to an int);
# the warmup steps are the WARMUP_PROPORTION share of that total.
num_train_steps = int(
    len(train_features) / BATCH_SIZE * NUM_TRAIN_EPOCHS
)
num_warmup_steps = int(
    num_train_steps * WARMUP_PROPORTION
)

TPU-related configuration, based on https://colab.research.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb#scrollTo=pYVYULZiKvUi

In [None]:
# Set up the TPU-related config.
# Resolve the TPU named TPU_ADDRESS in zone TPU_ZONE, initialise the TPU system
# and build a distribution strategy from the same resolver.
# NOTE(review): `tpu_strategy` is not referenced in the cells visible here --
# confirm whether it (and the explicit initialisation) is still needed.
tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=TPU_ADDRESS, zone=TPU_ZONE)
tf.tpu.experimental.initialize_tpu_system(tpu_cluster_resolver)
tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu_cluster_resolver)

# Passed to TPUConfig as `iterations_per_loop` in get_run_config.
ITERATIONS_PER_LOOP = 1000
# Force TF Hub writes to the GS bucket we provide.
os.environ['TFHUB_CACHE_DIR'] =  OUTPUT_DIR





In [None]:
# Model function shared by training and evaluation, bound to the
# hyperparameters defined above.
MODEL_FN = model_fn_builder(
    num_labels=len(label_list),
    learning_rate=LEARNING_RATE,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    dropout=DROPOUT_KEEP_PROB,
    use_tpu=USE_TPU,
)

# Functions to Train + Evaluate Model

In [None]:
def get_run_config(output_dir):
    """
    Build the TPU `RunConfig` that writes its model files to `output_dir`.

    Checkpoint cadence, iterations per loop and the shard count come from the
    notebook-level constants; the cluster is `tpu_cluster_resolver`.
    """
    contrib_tpu = tf.contrib.tpu
    tpu_settings = contrib_tpu.TPUConfig(
        iterations_per_loop=ITERATIONS_PER_LOOP,
        num_shards=NUM_TPU_CORES,
        per_host_input_for_training=contrib_tpu.InputPipelineConfig.PER_HOST_V2)
    return contrib_tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=output_dir,
        save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
        tpu_config=tpu_settings)

def getEstimator():
    """
    Build the estimator used to train and evaluate the model.

    Returns a plain `Estimator` when `USE_TPU` is falsy, otherwise a
    `TPUEstimator` configured through `get_run_config(OUTPUT_DIR)`.
    """
    if not USE_TPU:
        cpu_config = tf.estimator.RunConfig(
            model_dir=OUTPUT_DIR,
            save_summary_steps=SAVE_SUMMARY_STEPS,
            save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS)
        return tf.estimator.Estimator(
            model_fn=MODEL_FN,
            config=cpu_config,
            params={"batch_size": BATCH_SIZE})

    return tf.estimator.tpu.TPUEstimator(
        use_tpu=USE_TPU,
        model_fn=MODEL_FN,
        config=get_run_config(OUTPUT_DIR),
        train_batch_size=BATCH_SIZE,
        eval_batch_size=EVAL_BATCH_SIZE,
        predict_batch_size=PREDICT_BATCH_SIZE,
    )

def model_train(estimator):
    """
    Trains `estimator` on `train_features` for `num_train_steps` steps and
    prints how long training took.
    """
    # drop_remainder must be True on TPU to avoid a shape error:
    # https://stackoverflow.com/questions/58029896/bert-fine-tuning-with-estimators-on-tpus-on-colab-typeerror-unsupported-operand

    print(f'Beginning Training!')
    # `datetime` is bound to the module here (the later `import datetime` wins
    # over `from datetime import datetime`), so the class must be reached as
    # `datetime.datetime`.
    start_time = datetime.datetime.now()
    train_input_fn = bert.run_classifier.input_fn_builder(
        features=train_features,
        seq_length=MAX_SEQ_LENGTH,
        is_training=True,
        drop_remainder=USE_TPU)
    estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
    print("Training took time ", datetime.datetime.now() - start_time)

def model_evaluate(estimator):
    """
    Evaluates `estimator` on `test_features`, prints the metrics and writes them
    to `eval_results.txt` inside `OUTPUT_DIR`.

    Returns:
        The metrics dict returned by `estimator.evaluate`.
    """
    print('***** Started evaluation at {} *****'.format(datetime.datetime.now()))

    # Only whole batches are evaluated (drop_remainder=True below).
    eval_steps = int(len(test_features) / EVAL_BATCH_SIZE)

    eval_input_fn = run_classifier.input_fn_builder(
        features=test_features,
        seq_length=MAX_SEQ_LENGTH,
        is_training=False,
        drop_remainder=True)
    result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
    print('***** Finished evaluation at {} *****'.format(datetime.datetime.now()))

    output_eval_file = os.path.join(OUTPUT_DIR, "eval_results.txt")
    with tf.gfile.GFile(output_eval_file, "w") as writer:
        print("***** Eval results *****")
        # Iterate over `result` (the original referenced an undefined `results`).
        for key in sorted(result.keys()):
            print('  {} = {}'.format(key, str(result[key])))
            writer.write("%s = %s\n" % (key, str(result[key])))
    return result
            

# Train + Eval model

In [None]:
# Build the estimator, fine-tune it on the training features, then evaluate it
# on the test features. The order matters: evaluation uses the trained model.
estimator = getEstimator()
model_train(estimator)
model_evaluate(estimator)