# Fine Tuning For BERT Model

## Import Library

In [43]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [44]:
df = pd.read_pickle('clean_train.pkl')
df['clean_text_lemmas'] = [' '.join(text) for text in df['lemmas']]
df['clean_text_lemmas'] = df['clean_text_lemmas'].astype('str')

In [45]:
X = df['cleaned_text'].values
y = df['target'].values

## Tokenization

### Fixing token counts of all documents

In [46]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-large-uncased")

max_len = 0

# For every sentence...
max_token = []
for sent in X:

    # Tokenize the text and add `[CLS]` and `[SEP]` tokens.
    
    input_ids = tokenizer.encode(sent.lower(), add_special_tokens=True)
    max_token.append(len(input_ids))

    # Update the maximum sentence length.
    max_len = max(max_len, len(input_ids))

print('Max sentence length: ', max_len)

Max sentence length:  38


In [47]:
np.array(max_token).mean()

16.395901747011692

In [48]:
sum(np.array(max_token) < 78) / len(max_token)

1.0

In [49]:
def transformation(X):
  # set array dimensions
  seq_len = 38
  num_samples = len(X)

  # initialize empty zero arrays
  Xids = np.zeros((num_samples, seq_len))
  Xmask = np.zeros((num_samples, seq_len))

    
  for i, phrase in enumerate(X):
      tokens = tokenizer.encode_plus(phrase.lower(), max_length=seq_len, truncation=True,
                                      padding='max_length', add_special_tokens=True) 
      # assign tokenized outputs to respective rows in numpy arrays
      Xids[i, :] = tokens['input_ids']
      Xmask[i, :] = tokens['attention_mask']
  return Xids, Xmask

In [50]:
Xids, Xmask = transformation(X)

In [51]:
Xids.shape

(7613, 38)

In [52]:
Xmask.shape

(7613, 38)

In [53]:
from tensorflow.keras.utils import to_categorical

l = np.array([4, 1, 2, 3, 0])
to_categorical(l, 5) 

array([[0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0.]], dtype=float32)

In [54]:
labels = y.reshape(-1,1)
labels

array([[1],
       [1],
       [1],
       ...,
       [1],
       [1],
       [1]], dtype=int64)

In [55]:
import tensorflow as tf

dataset = tf.data.Dataset.from_tensor_slices((Xids, Xmask, labels)) #tf.data.Dataset.from_tensors((Xids, Xmask, labels))
dataset

<TensorSliceDataset element_spec=(TensorSpec(shape=(38,), dtype=tf.float64, name=None), TensorSpec(shape=(38,), dtype=tf.float64, name=None), TensorSpec(shape=(1,), dtype=tf.int64, name=None))>

In [56]:
len(dataset)

7613

In [57]:
import tensorflow as tf
dataset = tf.data.Dataset.from_tensor_slices((Xids, Xmask, labels))

def map_func(Xids, Xmask, labels):
    # we convert our three-item tuple into a two-item tuple where the input item is a dictionary
    return {'input_ids': Xids, 'attention_mask': Xmask}, labels

# then we use the dataset map method to apply this transformation
dataset = dataset.map(map_func)

## Train Test Split

In [58]:
batch_size = 16

# shuffle and batch
# fit into a batch of 32
dataset = dataset.shuffle(7613, reshuffle_each_iteration=False).batch(batch_size, drop_remainder=True)

In [59]:
# set split size (90% training data) and calculate training set size
split = 0.9
size = int(len(dataset)*split) #int((Xids.shape[0]/batch_size)*split)

# get training and validation sets
train_ds = dataset.take(size)
val_ds = dataset.skip(size)

In [60]:
print(len(dataset))
print(len(dataset)*0.9)
print(int(len(dataset)*0.9))

475
427.5
427


In [61]:
#batch_size=16
int(Xids.shape[0]/batch_size)

475

## Creating Model

In [62]:
def create_model():
    from transformers import TFBertModel
    model = TFBertModel.from_pretrained("bert-large-uncased")
    input_ids = tf.keras.layers.Input(shape=(38,), name='input_ids', dtype='int32')
    attention_mask = tf.keras.layers.Input(shape=(38,), name='attention_mask', dtype='int32')

    embeddings = model.bert(input_ids=input_ids, attention_mask=attention_mask)["pooler_output"] #[1]

    x = tf.keras.layers.Dense(64, activation='relu')(embeddings)
    x = tf.keras.layers.Dropout(0.1, name="dropout")(x) #0.1
    y = tf.keras.layers.Dense(1, activation='sigmoid', name='outputs')(x)

    return tf.keras.Model(inputs=[input_ids, attention_mask], outputs=y)

In [63]:

optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5) #3e-5, 5e-5
loss = tf.keras.losses.BinaryCrossentropy()
recall = tf.keras.metrics.Recall()
model = create_model()
model.compile(optimizer=optimizer, loss=loss, metrics=[recall])

Some layers from the model checkpoint at bert-large-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-large-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [64]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 38)]         0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 38)]         0           []                               
                                                                                                  
 bert (TFBertMainLayer)         TFBaseModelOutputWi  335141888   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 38,                                          

In [65]:
history = model.fit(
    train_ds, validation_data= val_ds,
    epochs=1) #epoch= 2 or 3



## Model evaluation

In [75]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred = model.predict(val_ds) >= 0.55

y_test = []
for i in val_ds:
  for j in np.array(i[1]):
    y_test.append(j)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.59      1.00      0.74       452
           1       0.00      0.00      0.00       316

    accuracy                           0.59       768
   macro avg       0.29      0.50      0.37       768
weighted avg       0.35      0.59      0.44       768



In [67]:
# y_train_pred = model3.predict(train_ds) >= 0.5

# y_train = []
# for i in train_ds:
#   for j in np.array(i[1]):
#     y_train.append(j)

# print(classification_report(y_train, y_train_pred)) 

In [68]:
model.save("bertlarge_tweeterdisaster.h5")

In [69]:
# from tensorflow.keras.models import load_model


# model4 = load_model('/content/drive/MyDrive/sentiment_model_without_weighted.h5')

In [70]:
# initialize tokenizer from transformers
from transformers import BertTokenizer, TFBertModel

tokenizers = BertTokenizer.from_pretrained("bert-large-uncased")

def prep_data(text):
    # tokenize to get input IDs and attention mask tensors
    tokens = tokenizers.encode_plus(text.lower(), max_length=38,
                                   truncation=True, padding='max_length',
                                   add_special_tokens=True,
                                   return_tensors='tf')
  
    return {'input_ids': tokens['input_ids'],      #tf.cast(tokens['input_ids'], tf.int32)
            'attention_mask': tokens['attention_mask']} #tf.cast(tokens['attention_mask'], tf.int32)

In [71]:
df_test = pd.read_pickle('clean_test.pkl')
df_test['clean_text_lemmas'] = [' '.join(text) for text in df_test['lemmas']]
df_test['clean_text_lemmas'] = df_test['clean_text_lemmas'].astype('str')

In [72]:
probs = []
for i in df_test['cleaned_text'].values:

  in_tensor = prep_data(i)

  prob = model.predict(in_tensor)[0][0]

  probs.append(prob)



In [78]:
classes  = (np.array(probs) >= 0.6).astype("int")
my_dict = {'id': df_test['id'].values, "target":classes}
df_test = pd.DataFrame(my_dict)

In [77]:
df_test.to_csv('submission15.csv', index=False)