# BERT Text Processor Module

In [14]:
class PaddingInputExample:
    """Placeholder example used to pad a dataset to a multiple of the batch size.

    When running eval/predict on a TPU, the number of examples has to be a
    multiple of the batch size, because the TPU requires a fixed batch size.

    The alternative is to drop the last batch, which is bad because it means
    that not all of the output data would be generated.

    This marker class is used instead of `None` because treating `None` as
    padding could cause silent errors.

    Padding is not normally required on CPU/GPU.
    """
    
    
class InputExample:
    """Raw (untokenized) example for simple sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Store the fields of one example unchanged.

        Args:
          guid: Unique id for the example.
          text_a: string. Untokenized text of the first sequence. This is the
            only text required for single-sequence tasks.
          text_b: (Optional) string. Untokenized text of the second sequence;
            only needed for sequence-pair tasks.
          label: (Optional) string. Label of the example. Expected for train
            and dev examples, omitted for test examples.
        """
        self.guid, self.text_a = guid, text_a
        self.text_b, self.label = text_b, label

In [15]:
import tensorflow_hub as tf_hub
from tqdm import tqdm
import bert
from bert import tokenization
import numpy as np

class BertTextProcessor:
    """Turns raw texts into the fixed-length arrays fed to a BERT model.

    Intended call order:
      1. `create_bert_tokenizer()`
      2. `convert_text_to_input_examples(texts, labels)`
      3. `convert_examples_to_features()`

    After step 3 the results are available as the numpy arrays `input_ids`,
    `input_masks`, `segment_ids` (one row of `max_seq_length` per example)
    and `labels` (one column).
    """

    def __init__(self, tf_session, bert_model_path, max_seq_length=128):
        """Store configuration; no model or tokenizer is loaded here.

        Args:
          tf_session: session used to evaluate the hub module's
            tokenization info in `create_bert_tokenizer()`.
          bert_model_path: path/handle passed to `tf_hub.Module`.
          max_seq_length: length every feature row is truncated/padded to.
        """
        self.tokenizer = None
        self.bert_path = bert_model_path
        self.tf_sess = tf_session
        self.input_examples = []
        self.max_seq_length = max_seq_length
        self.input_ids = None
        self.input_masks = None
        self.segment_ids = None
        self.labels = None

    def create_bert_tokenizer(self):
        """Build `self.tokenizer` from the vocab file and casing info
           exposed by the BERT tensorflow hub model."""
        print('Loading Base BERT Model')
        bert_model = tf_hub.Module(self.bert_path)
        tokenization_info = bert_model(signature="tokenization_info", as_dict=True)
        vocab_file, do_lower_case = self.tf_sess.run(
            [
                tokenization_info["vocab_file"],
                tokenization_info["do_lower_case"],
            ]
        )
        print('Loading BERT WordPiece Tokenizer')
        self.tokenizer = bert.tokenization.FullTokenizer(vocab_file=vocab_file,
                                                         do_lower_case=do_lower_case)

    def convert_text_to_input_examples(self, texts, labels=None):
        """Append one `InputExample` per text to `self.input_examples`.

        Examples accumulate across calls; nothing is cleared here.

        Args:
          texts: iterable of untokenized strings.
          labels: (Optional) iterable of labels, one per text. `None` or an
            empty sequence means "no labels" (inference); every example then
            gets `label=None`. Lists, tuples and numpy arrays are accepted.

        Raises:
          ValueError: if labels are given but their count differs from the
            number of texts (previously the longer input was silently
            truncated by `zip`).
        """
        texts = list(texts)
        # Materialise labels so that emptiness/length checks also work for
        # numpy arrays, whose truth value is ambiguous.
        labels = [] if labels is None else list(labels)
        if not labels:
            labels = [None] * len(texts)
        elif len(labels) != len(texts):
            raise ValueError(
                "texts and labels must have the same length "
                "(got %d texts and %d labels)" % (len(texts), len(labels))
            )

        print('Creating Input Examples from data')
        # zip() has no length, so pass total explicitly to get a real progress bar.
        for text, label in tqdm(zip(texts, labels), total=len(texts),
                                desc="Converting text to examples"):
            self.input_examples.append(
                InputExample(guid=None, text_a=text, text_b=None, label=label)
            )

    def convert_single_example(self, tokenizer, example, max_seq_length):
        """Convert one `InputExample` into BERT features.

        Only `example.text_a` is used; `text_b` is ignored, so every segment
        id is 0. Mirrors `bert.run_classifier.convert_single_example()` for
        the single-sequence case.

        Args:
          tokenizer: object providing `tokenize(text)` and
            `convert_tokens_to_ids(tokens)`.
          example: an `InputExample`, or a `PaddingInputExample` marker.
          max_seq_length: output length, including the [CLS] and [SEP] tokens.

        Returns:
          Tuple `(input_ids, input_mask, segment_ids, label)` where the first
          three are lists of length `max_seq_length` and `label` is
          `example.label` (`None` for inference, 0 for padding examples).

        Raises:
          ValueError: if `max_seq_length` is too small to hold [CLS] and [SEP].
        """
        if isinstance(example, PaddingInputExample):
            input_ids = [0] * max_seq_length
            input_mask = [0] * max_seq_length
            segment_ids = [0] * max_seq_length
            label = 0
            return input_ids, input_mask, segment_ids, label

        if max_seq_length < 2:
            raise ValueError(
                "max_seq_length must be at least 2 to fit [CLS] and [SEP], "
                "got %r" % (max_seq_length,)
            )

        # Reserve two positions for the [CLS] and [SEP] markers.
        tokens_a = tokenizer.tokenize(example.text_a)[: max_seq_length - 2]
        tokens = ["[CLS]", *tokens_a, "[SEP]"]

        real_ids = list(tokenizer.convert_tokens_to_ids(tokens))
        n_padding = max_seq_length - len(real_ids)

        # The mask has 1 for real tokens and 0 for padding tokens.
        # Only real tokens are attended to in the attention layers.
        input_ids = real_ids + [0] * n_padding
        input_mask = [1] * len(real_ids) + [0] * n_padding
        # Single-sequence input: every position belongs to segment 0.
        segment_ids = [0] * max_seq_length

        # double check lengths are alright
        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        return input_ids, input_mask, segment_ids, example.label

    def convert_examples_to_features(self):
        """Convert every example in `self.input_examples` to features using
           `convert_single_example(...)` and store them as numpy arrays in
           `input_ids`, `input_masks`, `segment_ids` and `labels`
           (`labels` is reshaped to a single column).

        Raises:
          RuntimeError: if there are examples to convert but the tokenizer
            has not been created yet.
        """
        if self.tokenizer is None and self.input_examples:
            raise RuntimeError(
                "Tokenizer not initialised; call create_bert_tokenizer() first")

        print('Creating BERT Input Features from Input Examples')
        input_ids, input_masks, segment_ids, labels = [], [], [], []
        for example in tqdm(self.input_examples, desc="Converting examples to features"):
            input_id, input_mask, segment_id, label = self.convert_single_example(
                self.tokenizer, example, self.max_seq_length
            )
            input_ids.append(input_id)
            input_masks.append(input_mask)
            segment_ids.append(segment_id)
            labels.append(label)

        self.input_ids = np.array(input_ids)
        self.input_masks = np.array(input_masks)
        self.segment_ids = np.array(segment_ids)
        self.labels = np.array(labels).reshape(-1, 1)

# BERT CVE Classifier Module

In [16]:
import tensorflow as tf
import tensorflow_hub as tf_hub
from tensorflow.keras import backend as K


class BertLayer(tf.keras.layers.Layer):
    """Keras layer wrapping a BERT `tf_hub.Module` and returning its pooled output.

    The layer takes three inputs (input ids, input mask, segment ids) and
    registers a subset of the hub module's variables as trainable weights,
    selected by matching name fragments (see `build`).
    """
    
    def __init__(self, bert_model_path, n_fine_tune_encoders=10, **kwargs,):
        """Store the layer configuration.

        Args:
          bert_model_path: path/handle passed to `tf_hub.Module` in `build`.
          n_fine_tune_encoders: controls which "encoder/layer_<k>" name
            fragments are marked trainable in `build`.
          **kwargs: forwarded to `tf.keras.layers.Layer.__init__`.
        """
        
        self.n_fine_tune_encoders = n_fine_tune_encoders
        # NOTE(review): set before super().__init__(), which runs afterwards and
        # may overwrite it (e.g. if `trainable` is passed in kwargs) -- confirm.
        self.trainable = True
        # change only based on base bert output layer shape
        self.output_size = 768
        self.bert_path = bert_model_path
        super(BertLayer, self).__init__(**kwargs)

        
    def build(self, input_shape):
        """Load the hub module and split its variables into trainable and
        non-trainable weights of this layer."""
        print('Loading Base BERT Model')
        self.bert = tf_hub.Module(self.bert_path,
                                  trainable=self.trainable, 
                                  name=f"{self.name}_module")

        # Remove unused layers
        # CLS layers cause an error if you try to tune them
        trainable_vars = self.bert.variables
        trainable_vars = [var for var in trainable_vars 
                                  if not "/cls/" in var.name]
        # Name fragments; a variable is trainable if its name contains any of them.
        trainable_layers = ["embeddings", "pooler/dense"]


        # Select how many layers to fine tune
        # we fine-tune all layers per encoder
        # This adds the fragments "encoder/layer_10" down to
        # "encoder/layer_<10 - n_fine_tune_encoders>" (n_fine_tune_encoders + 1
        # fragments in total).
        # NOTE(review): the start index 10 is hard-coded, and matching below is
        # by substring, so e.g. "encoder/layer_1" also matches names containing
        # "encoder/layer_10" or "encoder/layer_11". With the default of 10 this
        # presumably ends up selecting every encoder layer -- confirm before
        # relying on smaller values. Changing the selection also changes the
        # order of this layer's weights, which presumably matters for
        # previously saved weight files -- verify.
        for i in range(self.n_fine_tune_encoders+1):
            trainable_layers.append(f"encoder/layer_{str(10 - i)}")

        # Update trainable vars to contain only the specified layers
        trainable_vars = [var for var in trainable_vars
                                  if any([l in var.name 
                                              for l in trainable_layers])]

        # Add to trainable weights
        for var in trainable_vars:
            self._trainable_weights.append(var)

        # Everything else from the module (including the "/cls/" variables
        # filtered out above) becomes a non-trainable weight.
        for var in self.bert.variables:
            if var not in self._trainable_weights:# and 'encoder/layer' not in var.name:
                self._non_trainable_weights.append(var)
        # These counts are numbers of variables, although the message says "layers".
        print('Trainable layers:', len(self._trainable_weights))
        print('Non Trainable layers:', len(self._non_trainable_weights))

        super(BertLayer, self).build(input_shape)

        
    def call(self, inputs):
        """Run the hub module on `[input_ids, input_mask, segment_ids]` and
        return its "pooled_output" tensor."""
        print('Constructing Base BERT architecture')
        # Cast every input to int32 before handing it to the module.
        inputs = [K.cast(x, dtype="int32") for x in inputs]
        input_ids, input_mask, segment_ids = inputs
        bert_inputs = dict(input_ids=input_ids, 
                           input_mask=input_mask, 
                           segment_ids=segment_ids)
        
        pooled = self.bert(inputs=bert_inputs, 
                           signature="tokens", 
                           as_dict=True)["pooled_output"]

        return pooled

    
    def compute_output_shape(self, input_shape):
        """Return `(input_shape[0], self.output_size)`."""
        # NOTE(review): this layer is called with a list of three inputs, so
        # input_shape is presumably a list of shapes and input_shape[0] would be
        # the first input's shape rather than the batch dimension -- confirm.
        return (input_shape[0], self.output_size) 

In [17]:
class ModelNotBuiltException(Exception):
    """Raised when the classifier model is requested before it has been built."""

In [18]:
class BERTClassifier:
    """Binary text classifier on top of `BertLayer`.

    Architecture built by `build_model_architecture()`:
    three integer inputs of length `max_seq_length` -> `BertLayer` pooled
    output -> Dense(256, relu) -> Dense(1, sigmoid), compiled with binary
    cross-entropy and Adam.
    """

    def __init__(self, bert_model_path, max_seq_length=128,
                 n_fine_tune_encoders=10, model_weights_path=None):
        """Store configuration; the Keras model is not built here.

        Args:
          bert_model_path: path/handle forwarded to `BertLayer`.
          max_seq_length: length of each of the three input sequences.
          n_fine_tune_encoders: forwarded to `BertLayer`.
          model_weights_path: (Optional) default weights file used by
            `load_model_weights()` when no path is passed to it.
        """
        self.bert_path = bert_model_path
        self.max_seq_length = max_seq_length
        self.n_fine_tune_encoders = n_fine_tune_encoders
        self.model_estimator = None
        self.model_weights_path = model_weights_path

    def build_model_architecture(self):
        """Build and compile the Keras model and store it in `model_estimator`."""
        print('Build BERT Classifier CVE Model Architecture')
        inp_id = tf.keras.layers.Input(shape=(self.max_seq_length,),
                                       name="input_ids")
        inp_mask = tf.keras.layers.Input(shape=(self.max_seq_length,),
                                         name="input_masks")
        inp_segment = tf.keras.layers.Input(shape=(self.max_seq_length,),
                                            name="segment_ids")
        bert_inputs = [inp_id, inp_mask, inp_segment]

        bert_output = BertLayer(bert_model_path=self.bert_path,
                                n_fine_tune_encoders=self.n_fine_tune_encoders)(bert_inputs)

        dense = tf.keras.layers.Dense(256, activation='relu')(bert_output)
        pred = tf.keras.layers.Dense(1, activation='sigmoid')(dense)

        model = tf.keras.models.Model(inputs=bert_inputs, outputs=pred)
        # `lr` is kept as-is: it is the argument name this notebook was run with.
        model.compile(loss='binary_crossentropy',
                      optimizer=tf.keras.optimizers.Adam(lr=2e-5),
                      metrics=['accuracy'])
        self.model_estimator = model

    def load_model_weights(self, model_weights_path=None):
        """Load weights into the model, building the architecture if needed.

        Args:
          model_weights_path: (Optional) weights file. When given it also
            replaces `self.model_weights_path`; otherwise the path stored on
            the instance is used.

        Raises:
          ValueError: if no weights path was given here or at construction.
            This is checked before the (expensive) model build.
        """
        print('Loading BERT Classifier CVE Model Weights')
        self.model_weights_path = model_weights_path or self.model_weights_path
        if not self.model_weights_path:
            raise ValueError(
                "No model weights path given; pass model_weights_path to "
                "load_model_weights() or to the constructor")
        if self.model_estimator is None:
            self.build_model_architecture()
        self.model_estimator.load_weights(self.model_weights_path)

    def get_model(self):
        """Return the built Keras model.

        Raises:
          ModelNotBuiltException: if the model has not been built yet.
        """
        if self.model_estimator is None:
            raise ModelNotBuiltException(
                "BERT Classifier CVE Model doesn't exist. Please build model first")
        return self.model_estimator



In [19]:
def initialize_vars(tf_session):
    """Run the local-variable, global-variable and table initializers in
    `tf_session` (in that order), then register the session with Keras."""
    initializer_factories = (
        tf.local_variables_initializer,
        tf.global_variables_initializer,
        tf.tables_initializer,
    )
    for make_initializer in initializer_factories:
        tf_session.run(make_initializer())
    K.set_session(tf_session)

# GPU Inference Pipeline

In [20]:
# Params for bert model and tokenization
# Base BERT TF-Hub module; passed to both BertTextProcessor and BERTClassifier below.
BERT_PATH = "models/model_assets/gokube-phase2/base_bert_tfhub_models/bert_uncased_L12_H768_A12"
# Weights file handed to BERTClassifier.load_model_weights().
BERT_CVE_MODEL_PATH = "../../saved_models/bert_cve_model_weights_seq512b15.h5"
# Sequence length used for both feature creation and the classifier's input layers.
MAX_SEQ_LENGTH = 512
# random_state for the train/test split.
SEED = 42

In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the preprocessed, labeled issues/PRs file without NA detection.
dataset = pd.read_csv(
    '../../data/GH_complete_labeled_issues_prs - preprocessed.csv',
    encoding='utf-8',
    na_filter=False,
)

# Drop rows labeled 0, then binarize what is left:
# original label 1 -> 0, every other label -> 1.
dataset = dataset[dataset['label'] != 0]
texts = dataset['description'].tolist()
labels = [int(raw_label != 1) for raw_label in dataset['label'].tolist()]

# 75/25 split, reproducible through SEED.
train_text, test_text, train_labels, test_labels = train_test_split(
    texts,
    labels,
    test_size=0.25,
    random_state=SEED,
)
len(train_text), len(test_text)

(17432, 5811)

In [22]:
import tensorflow as tf

In [23]:
# One session is shared by the tokenizer lookup and by Keras.
sess = tf.Session()
initialize_vars(sess)

# Text -> BERT features for the first 100 held-out texts.
btp = BertTextProcessor(
    tf_session=sess,
    bert_model_path=BERT_PATH,
    max_seq_length=MAX_SEQ_LENGTH,
)
btp.create_bert_tokenizer()
btp.convert_text_to_input_examples(test_text[:100])
btp.convert_examples_to_features()

# Build the classifier and restore the fine-tuned weights.
bc = BERTClassifier(
    bert_model_path=BERT_PATH,
    max_seq_length=MAX_SEQ_LENGTH,
)
bc.build_model_architecture()
bc.load_model_weights(model_weights_path=BERT_CVE_MODEL_PATH)

# Score the examples; the model is built at this point, so get_model() returns it.
test_predictions = bc.get_model().predict(
    x=[btp.input_ids, btp.input_masks, btp.segment_ids],
    batch_size=512,
    verbose=1,
)

Loading Base BERT Model


Converting text to examples: 100it [00:00, 267323.39it/s]
Converting examples to features:   0%|          | 0/100 [00:00<?, ?it/s]

Loading BERT WordPiece Tokenizer
Creating Input Examples from data
Creating BERT Input Features from Input Examples


Converting examples to features: 100%|██████████| 100/100 [00:02<00:00, 47.88it/s]


Build BERT Classifier CVE Model Architecture
Loading Base BERT Model
Trainable layers: 199
Non Trainable layers: 5
Constructing Base BERT architecture
Loading BERT Classifier CVE Model Weights


In [14]:
from sklearn.metrics import confusion_matrix, classification_report

# Threshold the sigmoid scores at 0.5 to obtain hard 0/1 predictions.
test_preds = [int(score > 0.5) for score in test_predictions.ravel()]

# Predictions were made on a leading slice of the test set, so compare against
# exactly as many labels as there are predictions instead of a hard-coded 100
# that has to be kept in sync with the inference cell by hand.
eval_labels = test_labels[:len(test_preds)]

print('Classification Report:')
print(classification_report(y_true=eval_labels, y_pred=test_preds))
print(confusion_matrix(y_true=eval_labels, y_pred=test_preds))

Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99        96
           1       1.00      0.75      0.86         4

    accuracy                           0.99       100
   macro avg       0.99      0.88      0.93       100
weighted avg       0.99      0.99      0.99       100

[[96  0]
 [ 1  3]]


# CPU Inference Pipeline

In [11]:
with tf.device('cpu:0'):

    # One session is shared by the tokenizer lookup and by Keras.
    sess = tf.Session()
    initialize_vars(sess)

    # Text -> BERT features for the first 100 held-out texts.
    btp = BertTextProcessor(
        tf_session=sess,
        bert_model_path=BERT_PATH,
        max_seq_length=MAX_SEQ_LENGTH,
    )
    btp.create_bert_tokenizer()
    btp.convert_text_to_input_examples(test_text[:100])
    btp.convert_examples_to_features()

    # Build the classifier and restore the fine-tuned weights.
    bc = BERTClassifier(
        bert_model_path=BERT_PATH,
        max_seq_length=MAX_SEQ_LENGTH,
    )
    bc.build_model_architecture()
    bc.load_model_weights(model_weights_path=BERT_CVE_MODEL_PATH)

    # Score the examples; the model is built at this point, so get_model() returns it.
    test_predictions = bc.get_model().predict(
        x=[btp.input_ids, btp.input_masks, btp.segment_ids],
        batch_size=512,
        verbose=1,
    )

Loading Base BERT Model


Converting text to examples: 100it [00:00, 165325.34it/s]
Converting examples to features:   0%|          | 0/100 [00:00<?, ?it/s]

Loading BERT WordPiece Tokenizer
Creating Input Examples from data
Creating BERT Input Features from Input Examples


Converting examples to features: 100%|██████████| 100/100 [00:02<00:00, 44.46it/s]


Build BERT Classifier CVE Model Architecture
Loading Base BERT Model
Trainable layers: 199
Non Trainable layers: 5
Constructing Base BERT architecture
Loading BERT Classifier CVE Model Weights


In [13]:
from sklearn.metrics import confusion_matrix, classification_report

# Threshold the sigmoid scores at 0.5 to obtain hard 0/1 predictions.
test_preds = [int(score > 0.5) for score in test_predictions.ravel()]

# Predictions were made on a leading slice of the test set, so compare against
# exactly as many labels as there are predictions instead of a hard-coded 100
# that has to be kept in sync with the inference cell by hand.
eval_labels = test_labels[:len(test_preds)]

print('Classification Report:')
print(classification_report(y_true=eval_labels, y_pred=test_preds))
print(confusion_matrix(y_true=eval_labels, y_pred=test_preds))

Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99        96
           1       1.00      0.75      0.86         4

    accuracy                           0.99       100
   macro avg       0.99      0.88      0.93       100
weighted avg       0.99      0.99      0.99       100

[[96  0]
 [ 1  3]]
