In [1]:
import pandas as pd
import numpy as np
import pickle
import os
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras

# One global seed for the whole notebook, applied through Keras' helper.
SEED = 42
keras.utils.set_random_seed(SEED)

In [2]:
# install transformer package
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 7.7 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 42.4 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 58.0 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.24.0


In [3]:
from transformers import BertTokenizer, TFBertModel

In [4]:
# List the GPUs attached to this runtime.
!nvidia-smi -L

GPU 0: Tesla T4 (UUID: GPU-3b2aa356-6893-ad49-3595-5a7a03a33d6f)


In [11]:
# Read the raw CSV directly from its URL. The file is parsed without a header
# row (columns get integer labels), decoded as latin-1, and rows pandas cannot
# parse are skipped rather than raising.
DATASET_URL = 'https://www.dropbox.com/s/flgofaf8hdaw4jr/training.1600000.processed.noemoticon.csv?dl=1'
read_options = {
    'on_bad_lines': 'skip',
    'encoding': 'latin-1',
    'header': None,
}
dataset = pd.read_csv(DATASET_URL, **read_options)

In [13]:
# Keep only the first and last columns, then shuffle the rows:
# sample(frac=1) returns every row, in random order.
first_and_last_columns = dataset.iloc[:, [0, -1]]
dataset = first_and_last_columns.sample(frac=1)

In [14]:
# Display (rows, columns) of the current `dataset` frame.
dataset.shape

(1600000, 2)

In [15]:
# try with a subset of all data: keep the first tenth of the rows, then use
# the first three quarters of that subset for training and the rest for testing.
# NOTE: `dataset` is overwritten with its own slice, so running this cell a
# second time shrinks the data again.
subset_fraction = 1/10
train_fraction = 3/4

num_data = dataset.shape[0] * subset_fraction
subset_end = int(num_data)
train_end = int(num_data*train_fraction)

dataset = dataset.iloc[:subset_end, :]
train = dataset.iloc[:train_end, :]
test = dataset.iloc[train_end:, :]

In [16]:
# Display (rows, columns) of the training split.
train.shape

(120000, 2)

In [17]:
# For each split, X is the last column and y is the first column.
train_X, train_y = train.iloc[:, -1], train.iloc[:, 0]
test_X, test_y = test.iloc[:, -1], test.iloc[:, 0]

In [18]:
# Load the tokenizer and the TensorFlow BERT model from the same pretrained
# checkpoint so the vocabulary and the weights match.
PRETRAINED_CHECKPOINT = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
bert = TFBertModel.from_pretrained(PRETRAINED_CHECKPOINT)

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/527M [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [21]:
# Fixed sequence length: used as the tokenizer's max_length (truncate / pad to
# this many tokens) and as the shape of the model's Input layers.
max_query_length = 100

In [22]:
# tokenize both the training set and the test set with identical settings:
# truncate or pad every text to max_query_length and return TensorFlow tensors
tokenizer_options = dict(
    max_length=max_query_length,
    return_tensors='tf',
    truncation=True,
    padding='max_length',
)
source_train = tokenizer(train_X.values.tolist(), **tokenizer_options)
source_test = tokenizer(test_X.values.tolist(), **tokenizer_options)

In [23]:
# Encode the class labels as contiguous integer ids.
#
# A TextVectorization layer reserves index 0 for padding and index 1 for
# out-of-vocabulary tokens, so its vocabulary_size() is two larger than the
# real number of classes and the softmax head ends up with two output units
# that can never be a target. StringLookup with no OOV bucket and no mask
# token maps the distinct labels onto exactly 0..num_labels-1 instead.
# The variable name is kept so other cells that reference it still work.
text_vectorization_label = keras.layers.StringLookup(num_oov_indices=0, mask_token=None)
text_vectorization_label.adapt(train_y.astype(str).to_numpy())
num_labels = text_vectorization_label.vocabulary_size()


def encode_labels(labels):
    """Map a Series of raw labels to an (N, 1) tensor of integer class ids.

    The labels are reshaped into a single column so the result keeps the
    (N, 1) layout a per-row, single-token vectorization produces. With no OOV
    bucket configured, a label that was not seen during adapt() raises instead
    of being silently mapped to a placeholder class.
    """
    as_column = labels.astype(str).to_numpy().reshape(-1, 1)
    return text_vectorization_label(as_column)


label_train = encode_labels(train_y)
label_test = encode_labels(test_y)

In [24]:
# Three integer inputs, one per tensor produced by the tokenizer, each of
# length max_query_length.
input_ids = keras.layers.Input(shape=(max_query_length,), name='input_ids', dtype='int32')
token_type_ids = keras.layers.Input(shape=(max_query_length,), name='token_type_ids', dtype='int32')
attention_mask = keras.layers.Input(shape=(max_query_length,), name='attention_mask', dtype='int32')

# Pass the mask and the segment ids by keyword. The BERT main layer's
# positional order is (input_ids, attention_mask, token_type_ids), so calling
# it positionally with (input_ids, token_type_ids, attention_mask) would feed
# the segment ids in as the attention mask and the mask in as segment ids.
embedding_layer = bert.bert(
    input_ids,
    attention_mask=attention_mask,
    token_type_ids=token_type_ids,
)

# Pooled sequence representation used as the classifier's input.
CLS = embedding_layer.pooler_output

# Classifier
outputs = keras.layers.Dense(num_labels, activation="softmax")(CLS)

# The order of the model's inputs is unchanged:
# [input_ids, token_type_ids, attention_mask].
model = keras.Model(inputs=[input_ids, token_type_ids, attention_mask], outputs=outputs)

# Fine-tune every layer, the BERT encoder included.
for layer in model.layers:
  layer.trainable = True

model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 100)]        0           []                               
                                                                                                  
 token_type_ids (InputLayer)    [(None, 100)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 100)]        0           []                               
                                                                                                  
 bert (TFBertMainLayer)         TFBaseModelOutputWi  108310272   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'token_type_ids[0][0]',     

In [25]:
# Create the optimizer with the fine-tuning learning rate up front. The string
# "adam" builds an optimizer at Keras' default rate (1e-3), which is only
# correct if a later cell lowers it; skipping or reordering that cell would
# silently train BERT at a rate far too high for fine-tuning.
model.compile(optimizer=keras.optimizers.Adam(learning_rate=5e-5),
              loss="sparse_categorical_crossentropy",
              metrics=["sparse_categorical_accuracy"])

In [26]:
# Show the GPU device name TensorFlow reports for this runtime.
tf.test.gpu_device_name()

'/device:GPU:0'

In [27]:
# Set the optimizer's learning rate for fine-tuning.
# Use the backend of the `keras` module already imported from tensorflow at the
# top of the notebook instead of importing the standalone `keras` package here:
# the model was built with tensorflow.keras, and mixing the two packages can
# break when their installed versions differ. `K` is kept as an alias.
K = keras.backend
FINE_TUNE_LEARNING_RATE = 0.00005
K.set_value(model.optimizer.learning_rate, FINE_TUNE_LEARNING_RATE)

In [28]:
BATCH_SIZE = 64
epochs = 1

# The model takes its three inputs in this order.
model_input_names = ['input_ids', 'token_type_ids', 'attention_mask']
train_inputs = [source_train[name] for name in model_input_names]
test_inputs = [source_test[name] for name in model_input_names]

# Fit; the test split is used as the validation data
history = model.fit(
    train_inputs,
    label_train,
    batch_size=BATCH_SIZE,
    validation_data=(test_inputs, label_test),
    epochs=epochs,
)



In [29]:
# Mount Google Drive in the Colab runtime so the trained model can be saved to it.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [30]:
# Write the fine-tuned model to a directory on the mounted Drive.
FINE_TUNED_MODEL_DIR = '/content/drive/MyDrive/ANLY-580/final_project/fine_tuned_bert_1'
model.save(FINE_TUNED_MODEL_DIR)



In [31]:
# Zip the saved-model directory into an archive next to it on Drive
# (-r recurses into its subdirectories).
!zip -r /content/drive/MyDrive/ANLY-580/final_project/fine_tuned_bert_1.zip /content/drive/MyDrive/ANLY-580/final_project/fine_tuned_bert_1

  adding: content/drive/MyDrive/ANLY-580/final_project/fine_tuned_bert_1/ (stored 0%)
  adding: content/drive/MyDrive/ANLY-580/final_project/fine_tuned_bert_1/variables/ (stored 0%)
  adding: content/drive/MyDrive/ANLY-580/final_project/fine_tuned_bert_1/variables/variables.data-00000-of-00001 (deflated 12%)
  adding: content/drive/MyDrive/ANLY-580/final_project/fine_tuned_bert_1/variables/variables.index (deflated 80%)
  adding: content/drive/MyDrive/ANLY-580/final_project/fine_tuned_bert_1/assets/ (stored 0%)
  adding: content/drive/MyDrive/ANLY-580/final_project/fine_tuned_bert_1/saved_model.pb (deflated 92%)
  adding: content/drive/MyDrive/ANLY-580/final_project/fine_tuned_bert_1/keras_metadata.pb (deflated 95%)
