## Fine-Tuning the Pretrained DistilBERT on the Yelp Review Dataset for Sentiment Prediction

In [53]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Tue Jan 18 05:28:49 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.46       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P0    32W / 250W |  15853MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [64]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import transformers
import tensorflow as tf
import datasets
from datasets import Dataset, load_dataset
from transformers import DistilBertTokenizerFast
from transformers import TFDistilBertForSequenceClassification
from transformers import pipeline
from datasets import load_metric

In [None]:
# Load the raw review table from the working directory.
df = pd.read_csv("yelp.csv")

# Keep only the review text and its star rating. The explicit .copy() makes
# df_bert an independent frame, so the label column can be rewritten later
# without pandas treating the assignment as a write into a slice of `df`.
df_bert = df[["text", "stars"]].copy()

## Preprocess

In [None]:
# Binarize the star ratings: 4-5 stars -> 1 (positive), 1-2 stars -> 0 (negative).
# Every other rating (the neutral 3-star reviews) is removed.
#
# The frame is rebuilt from `df` each time, so re-running this cell always
# yields the same result. Remapping df_bert in place would turn the already
# binarized 0/1 labels into 0/3 on a second run and then drop rows.
df_bert = df.loc[df["stars"].isin([1, 2, 4, 5]), ["text", "stars"]].copy()
df_bert["stars"] = df_bert["stars"].isin([4, 5]).astype("int64")

In [None]:
# Inspect the labelled frame; "stars" now holds 1 (4-5 stars) or 0 (1-2 stars).
df_bert

Unnamed: 0,text,stars
0,My wife took me here on my birthday for breakf...,1
1,I have no idea why some people give bad review...,1
2,love the gyro plate. Rice is so good and I als...,1
3,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",1
4,General Manager Scott Petello is a good egg!!!...,1
...,...,...
9994,Let's see...what is there NOT to like about Su...,1
9996,Should be called house of deliciousness!\n\nI ...,1
9997,I recently visited Olive and Ivy for business ...,1
9998,My nephew just moved to Scottsdale recently so...,0


In [None]:
# (rows, columns) left after the neutral 3-star reviews were removed.
df_bert.shape

(8539, 2)

In [None]:
# Hold out 20% of the labelled reviews; the fixed random_state keeps the split
# reproducible.
# NOTE(review): `eval` shadows the Python builtin of the same name. A later cell
# reads this variable, so any rename has to be applied there as well.
train, eval = train_test_split(df_bert, test_size=0.2, random_state=123)

### ------------------------------------------------------------------------------------------------------------------------------

In [None]:
# Export the train / valid / test splits as CSV files in the working directory,
# which is where the following load_dataset cell looks for them.
#
# No `valid` or `test` frame is defined anywhere above this cell, so the splits
# are built here from the frames that do exist: the held-out `eval` frame is
# written as the validation file, and 20% of `train` is carved off (with a
# fixed seed) as the test file.
train_part, test_part = train_test_split(train, test_size=0.2, random_state=123)
csv_exports = {
    "train_ft.csv": train_part,
    "valid_ft.csv": eval,
    "test_ft.csv": test_part,
}
for csv_name, frame in csv_exports.items():
    frame.to_csv(csv_name, index=False, header=True)

In [None]:
# Read the exported train / valid / test CSV files back through the `datasets`
# CSV loader, keyed by split name.
csv_splits = {split: f"{split}_ft.csv" for split in ("train", "valid", "test")}
dataset = load_dataset("csv", data_files=csv_splits)


Using custom data configuration default-5945906af8db4695


Downloading and preparing dataset csv/default to /Users/alex/.cache/huggingface/datasets/csv/default-5945906af8db4695/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e...


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /Users/alex/.cache/huggingface/datasets/csv/default-5945906af8db4695/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

### ------------------------------------------------------------------------------------------------------------------------------

In [None]:
# Wrap the pandas train / eval frames as Hugging Face `Dataset` objects.
train_, eval_ = (Dataset.from_pandas(frame) for frame in (train, eval))

In [None]:
# Split the training data once more into "train" and "test" parts (80/20).
#
# The split is rebuilt from the `train` frame instead of from `train_` itself.
# The original rebinds `train_` to the object returned by the split, so running
# the cell a second time would call train_test_split on that result rather
# than on the dataset. The explicit seed makes the partition reproducible.
train_ = Dataset.from_pandas(train).train_test_split(test_size=0.2, seed=123)

In [None]:
# Show the result of the split and the size of its "train" and "test" parts.
train_

DatasetDict({
    train: Dataset({
        features: ['text', 'stars', '__index_level_0__'],
        num_rows: 5464
    })
    test: Dataset({
        features: ['text', 'stars', '__index_level_0__'],
        num_rows: 1367
    })
})

In [65]:
# Load the fast tokenizer that belongs to the pretrained
# "distilbert-base-uncased" checkpoint.

tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [66]:
# Tokenization helper that is mapped over every split below.

def tokenize_function(dataset):
    """Tokenize the "text" field of the given examples.

    The notebook-level `tokenizer` is called with padding="max_length" and
    truncation=True, and its output is returned unchanged.
    """
    texts = dataset["text"]
    encoded = tokenizer(texts, padding="max_length", truncation=True)
    return encoded

# Tokenize the "train" / "test" parts of the split and the held-out eval set,
# passing the examples to tokenize_function in batches.
train_token, test_token = (
    train_[part].map(tokenize_function, batched=True) for part in ("train", "test")
)
eval_token = eval_.map(tokenize_function, batched=True)

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [67]:
# Work on a small subset of every split: shuffle with a fixed seed, then keep
# the first 500 training rows and the first 50 test / eval rows.
def _take_subset(split, size):
    """Shuffle `split` with seed 123 and keep its first `size` rows."""
    return split.shuffle(seed=123).select(range(size))

train_sub = _take_subset(train_token, 500)
test_sub = _take_subset(test_token, 50)
eval_sub = _take_subset(eval_token, 50)

In [68]:
# Load the pretrained "distilbert-base-uncased" checkpoint into the
# sequence-classification model class, configured for two labels
# (the binarized star ratings).
model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

Downloading:   0%|          | 0.00/347M [00:00<?, ?B/s]

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['activation_13', 'vocab_transform', 'vocab_layer_norm', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['dropout_57', 'pre_classifier', 'classifier']
You should probably TRAIN this model on a down-stream task to be able to use i

In [69]:
# Drop the raw "text" column from each subset and switch its output format to
# "tensorflow".
train_tf, test_tf, eval_tf = (
    subset.remove_columns(["text"]).with_format("tensorflow")
    for subset in (train_sub, test_sub, eval_sub)
)

In [70]:
# Check which columns remain after dropping "text".
train_tf

Dataset({
    features: ['__index_level_0__', 'attention_mask', 'input_ids', 'stars'],
    num_rows: 500
})

In [71]:
# Turn each formatted subset into a batched tf.data.Dataset of
# (model inputs, label) pairs. Batch size is 8 throughout; only the training
# set is shuffled.
def _model_inputs(split):
    """Collect the "input_ids" and "attention_mask" columns of `split` in a dict."""
    return {key: split[key] for key in ("input_ids", "attention_mask")}

train_features = _model_inputs(train_tf)
test_features = _model_inputs(test_tf)
eval_features = _model_inputs(eval_tf)

train_tf_dataset = (
    tf.data.Dataset.from_tensor_slices((train_features, train_tf["stars"]))
    .shuffle(len(train_tf))
    .batch(8)
)
test_tf_dataset = (
    tf.data.Dataset.from_tensor_slices((test_features, test_tf["stars"]))
    .batch(8)
)
eval_tf_dataset = (
    tf.data.Dataset.from_tensor_slices((eval_features, eval_tf["stars"]))
    .batch(8)
)

In [72]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers.schedules import PolynomialDecay

# NOTE(review): batch_size is not read by any cell shown in this notebook; the
# datasets above are batched with the literal 8.
batch_size = 8
num_epochs = 3

# train_tf_dataset is already batched, so its length is the number of batches
# per epoch; multiplied by the epoch count it gives the total number of
# training steps.
num_train_steps = num_epochs * len(train_tf_dataset)

# The learning rate starts at 5e-5 and decays to 0.0 over all training steps.
lr_scheduler = PolynomialDecay(
    initial_learning_rate=5e-5,
    end_learning_rate=0.0,
    decay_steps=num_train_steps,
)
optimizer = Adam(learning_rate=lr_scheduler)

In [73]:
# Compile and fine-tune the model with Keras.
# `model` is the "distilbert-base-uncased" classifier loaded earlier. The loss
# is built with from_logits=True, and the eval dataset is passed as
# validation data during training.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
accuracy_metric = tf.metrics.SparseCategoricalAccuracy()
model.compile(optimizer=optimizer, loss=loss_fn, metrics=accuracy_metric)

model.fit(train_tf_dataset, validation_data=eval_tf_dataset, epochs=num_epochs)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fb8ef555050>

In [74]:
# Predict on the test batches (test_tf_dataset) and, for every example, pick
# the class with the largest logit.

pred = model.predict(test_tf_dataset)["logits"]
class_pred = pred.argmax(axis=1)

print(pred.shape, class_pred.shape)

(50, 2) (50,)


In [75]:
# Score the predicted classes against the true labels of the test subset.
# The GLUE "mrpc" metric configuration is loaded here; the result it computes
# for this run contains 'accuracy' and 'f1'.
metric = load_metric("glue", "mrpc")
metric.compute(predictions = class_pred, references = test_sub["stars"])

{'accuracy': 0.94, 'f1': 0.9647058823529412}