In [4]:
# Data processing
import pandas as pd
import numpy as np

from ipywidgets import FloatProgress

# Train test split
from sklearn.model_selection import train_test_split

# Modeling
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

# Hugging Face Dataset
from datasets import Dataset

# Import accuracy_score to check performance
from sklearn.metrics import accuracy_score

In [6]:
# Load the tab-separated review file; column names are supplied explicitly
amz_review = pd.read_csv(
    'amazon_cells_labelled.txt',
    names=['review', 'label'],
    sep='\t',
)

# Preview the first few rows
amz_review.head()

Unnamed: 0,review,label
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [7]:
# Summarize the DataFrame: row count, column names, non-null counts and dtypes
amz_review.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   review  1000 non-null   object
 1   label   1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


In [8]:
# Count the reviews per label value to see whether the classes are balanced
amz_review['label'].value_counts()

0    500
1    500
Name: label, dtype: int64

In [9]:
# Hold out 20% of the reviews for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    amz_review['review'],
    amz_review['label'],
    test_size=0.20,
    random_state=42,
)

# Report the size of each split
print(f'The training dataset has {len(X_train)} records.')
print(f'The testing dataset has {len(X_test)} records.')

The training dataset has 800 records.
The testing dataset has 200 records.


* AutoTokenizer.from_pretrained("bert-base-cased") downloads vocabulary from the pretrained bert-base-cased model.  
* return_tensors="np" indicates that the return format is NumPy array. Besides np, return_tensors can take the value of tf or pt, where tf returns TensorFlow tf.constant object and pt returns PyTorch torch.tensor object. If not set, it returns a list of python integers.  
* padding means adding zeros to shorter reviews in the dataset. The padding argument controls how padding is implemented.  
* padding=True is the same as padding='longest'. It checks the longest sequence in the batch and pads zeros to that length. There is no padding if only one text document is provided.  
* padding='max_length' pads to max_length if it is specified, otherwise, it pads to the maximum acceptable input length for the model.  
* padding=False is the same as padding='do_not_pad'. It is the default, indicating that no padding is applied, so it can output a batch with sequences of different lengths.

In [10]:
# Tokenizer from the same pretrained checkpoint that is fine-tuned below
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# Tokenize the reviews.
# padding=True pads each call's reviews to the longest review in that call, so
# the train and test arrays may end up with different widths.
# truncation=True cuts reviews down to the model's maximum input length;
# without it an over-long review would produce a sequence the model cannot accept.
tokenized_data_train = tokenizer(X_train.to_list(), return_tensors="np", padding=True, truncation=True)
tokenized_data_test = tokenizer(X_test.to_list(), return_tensors="np", padding=True, truncation=True)

# Labels are one-dimensional NumPy arrays of integers
labels_train = np.array(y_train)
labels_test = np.array(y_test)

# Token ids of the first training review (padded)
print(tokenized_data_train["input_ids"][0])

Downloading (…)okenizer_config.json: 100%|██████████| 29.0/29.0 [00:00<00:00, 24.0kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 570/570 [00:00<00:00, 1.50MB/s]
Downloading (…)solve/main/vocab.txt: 100%|██████████| 213k/213k [00:00<00:00, 1.02MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 436k/436k [00:00<00:00, 1.37MB/s]


[  101 17554   112   189  2080  2965   119   102     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0]


* TFAutoModelForSequenceClassification loads the BERT model with a sequence classification head on top.  
* The method from_pretrained() loads the weights from the pretrained model into the new model, so the weights in the new model are not randomly initialized. Note that the new weights for the new sequence classification head are going to be randomly initialized.  
* bert-base-cased is the name of the pretrained model. We can change it to a different model based on the nature of the project.  
* num_labels indicates the number of classes. Our dataset has two classes, positive and negative, so num_labels=2.

In [11]:
# Load the pretrained bert-base-cased weights into a sequence classification model
# configured for 2 output classes
model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)

Downloading tf_model.h5: 100%|██████████| 527M/527M [00:44<00:00, 11.8MB/s] 
2023-06-05 13:54:09.228688: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


* SparseCategoricalCrossentropy is used as the loss function, but the Hugging Face documentation mentioned that Hugging Face models automatically choose a loss that is appropriate for their task and model architecture if the loss is not explicitly specified.  
* from_logits=True informs the loss function that the output values are logits before applying softmax, so the values do not represent probabilities.  
* We are using Adam as the optimizer and the number 5e-6 is the learning rate. A smaller learning rate corresponds to a more stable weights value update and a slower training process.  
* accuracy is used as the metric because we have a balanced dataset.

In [12]:
# The model emits raw logits, so the loss is configured to expect logits
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Compile with Adam at a small learning rate and track accuracy
model.compile(
    optimizer=Adam(learning_rate=5e-6),
    loss=loss,
    metrics=['accuracy'],
)

* batch_size=4 means that four reviews are processed for each weight and bias update.  
* epochs=2 means that the model fitting process will go through the training dataset 2 times.

In [None]:
# Fine-tune on the training split; the test split is evaluated after each epoch
model.fit(
    dict(tokenized_data_train),
    labels_train,
    epochs=2,
    batch_size=4,
    validation_data=(dict(tokenized_data_test), labels_test),
)

Epoch 1/2

* If we would like to keep the pretrained model weights as is and only update the weights and bias of the output layer, we can use model.layers[0].trainable = False to freeze the weights of the BERT model.  
* If we would like to keep the weights of some layers and update others, we can use model.bert.encoder.layer[i].trainable = False to freeze the weights of the corresponding layers.  
* In general, if the dataset for the transfer learning model is large, it is suggested to update all weights, and if the dataset for the transfer learning model is small, it is suggested to freeze the pretrained model weights. But we can always compare the model performance by adding the tunable pretrained model layers one by one.

In [None]:
# Run inference on the test split, then keep only the logits from the output
test_outputs = model.predict(dict(tokenized_data_test))
y_test_predict = test_outputs['logits']

# Show the logits of the first 5 test reviews
y_test_predict[:5]

In [None]:
# Turn logits into class probabilities with a softmax over the last axis
y_test_probabilities = tf.nn.softmax(y_test_predict, axis=-1)

# Show the probabilities of the first 5 test reviews
y_test_probabilities[:5]

In [None]:
# The predicted label is the index of the most probable class in each row
y_test_class_preds = np.asarray(y_test_probabilities).argmax(axis=1)

# Show the predicted labels of the first 5 test reviews
y_test_class_preds[:5]

In [None]:
# Accuracy on the test split.
# scikit-learn metrics take (y_true, y_pred); keywords make the order explicit so
# the call stays correct if it is copied for an asymmetric metric such as precision.
accuracy_score(y_true=y_test, y_pred=y_test_class_preds)

### Saving

In [None]:
# Save the tokenizer first, then the model, into the same directory
for artifact in (tokenizer, model):
    artifact.save_pretrained('./sentiment_transfer_learning_tensorflow/')

### Loading

In [None]:
# Restore the tokenizer from the saved directory
tokenizer = AutoTokenizer.from_pretrained(
    "./sentiment_transfer_learning_tensorflow/"
)

# Restore the saved model from the same directory
loaded_model = TFAutoModelForSequenceClassification.from_pretrained(
    './sentiment_transfer_learning_tensorflow/'
)

### Sentiment Model Using Transfer Learning on Large Dataset

* Firstly, the pandas DataFrame needs to be converted to a Hugging Face Arrow dataset using Dataset.from_pandas()  
* Then a tokenizer needs to be initiated  
* After that, the tokenizer is applied to the Hugging Face arrow dataset  
* The pretrained model is loaded using TFAutoModelForSequenceClassification.from_pretrained()  
* Finally, the dataset is loaded using prepare_tf_dataset()

In [None]:
# Wrap the pandas DataFrame in a Hugging Face Dataset
hg_amz_review = Dataset.from_pandas(amz_review)

# Start again from the pretrained bert-base-cased tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    "bert-base-cased"
)

# Function to tokenize data
def tokenize_dataset(data):
    """Tokenize the ``review`` field of ``data`` with the notebook-level ``tokenizer``.

    Args:
        data: A mapping with a ``"review"`` entry, as passed by ``Dataset.map``.

    Returns:
        Whatever ``tokenizer`` returns for the review text.
    """
    # truncation=True keeps over-long reviews within the model's maximum input length
    return tokenizer(data["review"], truncation=True)

# Tokenize every row of the dataset
dataset = hg_amz_review.map(tokenize_dataset)

# Load model; num_labels is stated explicitly to match the two classes in the
# data instead of relying on the library default
model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)

# Build a shuffled, batched tf.data.Dataset; the tokenizer is passed along with
# the dataset so prepare_tf_dataset can use it when assembling batches
tf_dataset = model.prepare_tf_dataset(dataset=dataset,
                                      batch_size=16,
                                      shuffle=True,
                                      tokenizer=tokenizer)

In [None]:
# The model emits raw logits, so the loss is configured to expect logits
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Compile with Adam at a small learning rate and track accuracy
model.compile(
    optimizer=Adam(learning_rate=5e-6),
    loss=loss,
    metrics=['accuracy'],
)

# Train for two passes over the batched dataset
model.fit(tf_dataset, epochs=2)