# Load required package

In [1]:
import tensorflow_datasets as tfds
from transformers import TFBertForSequenceClassification, BertTokenizer
import tensorflow as tf
import numpy as np
import pandas as pd 
import sklearn
from sklearn import feature_extraction, linear_model, model_selection, preprocessing, metrics
from sklearn.metrics.classification import precision_score

import os
import sys



In [2]:
# Load the training data and the testing data

In [3]:
train_df = pd.read_csv("../data/train.csv")
test_df  = pd.read_csv("../data/test.csv")




In [4]:
class transformer():

    def __init__(self, X_data, y_data): 
        #use pre-trained tokenizers
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True) 
        self.max_length = 512
        self.batch_size = 6
        self.X_train, self.X_test, self.y_train, self.y_test = model_selection.train_test_split(X_data, y_data, test_size = 0.2, random_state = 1)
        self.train_encoded = None
        self.test_encoded  = None 
        self.model = None

    def convert_example_to_feature(self, review):
        # combine step for tokenization, WordPiece vector mapping, adding special tokens as well as truncating reviews longer than the max length
        return self.tokenizer.encode_plus(review, 
                                 add_special_tokens = True, # add [CLS], [SEP]
                                 max_length = self.max_length, # max length of the text that can go to BERT
                                 pad_to_max_length = True, # add [PAD] tokens
                                 return_attention_mask = True, # add attention mask to not focus on pad tokens
              )

    def map_example_to_dict(self, input_ids, attention_masks, token_type_ids, label):
        # map to the expected input to TFBertForSequenceClassification
        return {
        "input_ids": input_ids,
        "token_type_ids": token_type_ids,
        "attention_mask": attention_masks,
        } , label

    def encode_examples(self, X_train, y_train):
        # prepare list, so that we can build up final TensorFlow dataset from slices.
        input_ids_list = []
        token_type_ids_list = []
        attention_mask_list = []
        label_list = []
    
        for i in X_train.index: 
            bert_input = self.convert_example_to_feature(X_train[i])
            input_ids_list.append(bert_input['input_ids'])
            token_type_ids_list.append(bert_input['token_type_ids'])
            attention_mask_list.append(bert_input['attention_mask'])
            label_list.append([y_train[i]])
            
    
        return tf.data.Dataset.from_tensor_slices((input_ids_list, attention_mask_list, token_type_ids_list, label_list)).map(self.map_example_to_dict)

    def encoder(self):

        self.train_encoded = self.encode_examples(self.X_train, self.y_train).batch(self.batch_size) 
        self.test_encoded  = self.encode_examples(self.X_test,  self.y_test ).batch(self.batch_size) 

    #use already prepared TensorFlow models from transformers models
    def decoder(self): 

        ## model initialization 

        # recommended learning rate for Adam 5e-5, 3e-5, 2e-5
        learning_rate = 2e-2

        # we will do just 1 epoch for illustration, though multiple epochs might be better as                           long as we will not overfit the model
        number_of_epochs = 1


        # model initialization
        self.model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')

        # choosing Adam optimizer
        optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=1e-08)

        # we do not have one-hot vectors, we can use sparce categorical cross entropy and accuracy
        loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
        metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

        self.model.compile(optimizer=optimizer, loss=loss, metrics=[metric])


        ## fine tuning 
        bert_history = self.model.fit(self.train_encoded, epochs=number_of_epochs, validation_data=self.test_encoded)
        print("a")
        return bert_history
        

## set up Encoder and decoder

In [5]:
Tran = transformer(train_df["text"], train_df["target"])
Tran.encoder()


Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [6]:
Tran.decoder()

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertForSequenceClassification: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier', 'dropout_37']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
a


<tensorflow.python.keras.callbacks.History at 0x7fbfe6c77290>

In [7]:
Tran.model

<transformers.modeling_tf_bert.TFBertForSequenceClassification at 0x7fbfffafab90>