## Fine-Tuning the Pretrained DistilBERT on the Yelp Review Dataset for Sentiment Prediction

In [70]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
from sklearn.model_selection import train_test_split
import transformers
import tensorflow as tf
import datasets
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer
from transformers import TFAutoModelForSequenceClassification
from transformers import pipeline

In [2]:
# Load the raw Yelp reviews; only the review text and its star rating are kept.
df = pd.read_csv("yelp.csv")
# Take an explicit copy so later column assignments act on an independent frame
# rather than on a slice of `df` (avoids pandas' chained-assignment pitfalls).
df_bert = df[["text", "stars"]].copy()

## Preprocess

In [3]:
# Binarise the star ratings: 4-5 stars -> 1, 1-2 stars -> 0; every other rating
# (the 3-star reviews) is discarded.
# The frame is rebuilt from `df` each time, so re-running this cell yields the same
# result (the original remapped its own 0/1 output on a second run), and nothing is
# assigned into a slice or dropped in place.
positive_stars = [4, 5]
negative_stars = [1, 2]
has_sentiment = df["stars"].isin(positive_stars + negative_stars)
df_bert = df.loc[has_sentiment, ["text", "stars"]].assign(
    stars=lambda reviews: reviews["stars"].isin(positive_stars).astype(int)
)

In [4]:
# Display the labelled frame (review text plus the binarised `stars` column).
df_bert

Unnamed: 0,text,stars
0,My wife took me here on my birthday for breakf...,1
1,I have no idea why some people give bad review...,1
2,love the gyro plate. Rice is so good and I als...,1
3,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",1
4,General Manager Scott Petello is a good egg!!!...,1
...,...,...
9994,Let's see...what is there NOT to like about Su...,1
9996,Should be called house of deliciousness!\n\nI ...,1
9997,I recently visited Olive and Ivy for business ...,1
9998,My nephew just moved to Scottsdale recently so...,0


In [5]:
# (rows, columns) of the labelled frame.
df_bert.shape

(8539, 2)

In [55]:
# Hold out 20% of the labelled reviews; the fixed random_state keeps the split repeatable.
# Note: the name `eval` shadows Python's built-in `eval` from this cell onwards.
train, eval = train_test_split(df_bert, test_size=0.2, random_state=123)

### ------------------------------------------------------------------------------------------------------------------------------

In [12]:
# Export the splits as CSV files (header row, no index column).
# Relative file names are used so the files are written under the same names the
# `load_dataset` call reads, without depending on one user's home directory.
# NOTE(review): `valid` and `test` are not defined by any cell visible above
# (only `train` and `eval` are) - confirm where they are meant to come from.
train.to_csv("train_ft.csv", index=False, header=True)
valid.to_csv("valid_ft.csv", index=False, header=True)
test.to_csv("test_ft.csv", index=False, header=True)

In [15]:
# Read the three CSV splits back into a single object keyed by split name.
csv_splits = {
    "train": "train_ft.csv",
    "valid": "valid_ft.csv",
    "test": "test_ft.csv",
}
dataset = load_dataset("csv", data_files=csv_splits)


Using custom data configuration default-5945906af8db4695


Downloading and preparing dataset csv/default to /Users/alex/.cache/huggingface/datasets/csv/default-5945906af8db4695/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e...


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /Users/alex/.cache/huggingface/datasets/csv/default-5945906af8db4695/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

### ------------------------------------------------------------------------------------------------------------------------------

In [56]:
# Wrap both pandas splits as Hugging Face `Dataset` objects.
train_, eval_ = (Dataset.from_pandas(frame) for frame in (train, eval))

In [58]:
# Carve a validation split out of the training data: the "test" entry of the
# resulting DatasetDict is the set used for validation.
# The split is built straight from the `train` DataFrame, so re-running the cell
# starts from the same input each time (the original re-split whatever `train_`
# currently held), and it is seeded so the same rows land in each split every run.
train_ = Dataset.from_pandas(train).train_test_split(test_size=0.2, seed=123)

In [66]:
# Show the split DatasetDict (features and row counts per split).
train_

DatasetDict({
    train: Dataset({
        features: ['text', 'stars'],
        num_rows: 5464
    })
    test: Dataset({
        features: ['text', 'stars'],
        num_rows: 1367
    })
})

In [101]:
# Tokenizer for the `bert-base-cased` checkpoint.
# NOTE(review): the notebook title mentions DistilBERT, but this checkpoint name
# is plain BERT - confirm which one is intended.
tokenizer_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

In [102]:
# Tokenize every split; texts are padded to a fixed maximum length and
# truncated when they are longer.

def tokenize_function(dataset):
    """Tokenize the "text" entry of a batch with the notebook-level `tokenizer`.

    `dataset` is whatever `Dataset.map(..., batched=True)` hands over; only its
    "text" entry is read. The tokenizer's output is returned unchanged.
    """
    encoded = tokenizer(
        dataset["text"],
        truncation=True,
        padding="max_length",
    )
    return encoded

train_token = train_["train"].map(function=tokenize_function, batched=True)
test_token = train_["test"].map(function=tokenize_function, batched=True)
eval_token = eval_.map(function=tokenize_function, batched=True)

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [103]:
# Work on small, reproducibly shuffled subsets to keep fine-tuning fast.
# Each subset is drawn from the tokenized split of the same name. The original
# read `valid_token`, which no visible cell defines, and built `eval_sub` from
# `test_token`, leaving `eval_token` unused.
train_sub = train_token.shuffle(seed=123).select(range(1000))
test_sub = test_token.shuffle(seed=123).select(range(100))
eval_sub = eval_token.shuffle(seed=123).select(range(100))

In [99]:
# Load the pretrained `bert-base-cased` checkpoint as a two-label sequence classifier.
# NOTE(review): the notebook title mentions DistilBERT, but this checkpoint name
# is plain BERT - confirm which one is intended.
model_checkpoint = "bert-base-cased"
model = TFAutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=2,
)

Downloading:   0%|          | 0.00/502M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [104]:
# Drop the raw "text" column and switch each subset to the "tensorflow" format.

def _to_tensorflow_format(subset):
    """Return `subset` without its "text" column, formatted as "tensorflow"."""
    without_text = subset.remove_columns(["text"])
    return without_text.with_format("tensorflow")

train_tf = _to_tensorflow_format(train_sub)
test_tf = _to_tensorflow_format(test_sub)
eval_tf = _to_tensorflow_format(eval_sub)

In [123]:
# Turn each split into a batched `tf.data.Dataset` of (features, label) pairs.
# Batches hold 32 examples; only the training data is shuffled.
model_input_columns = ['input_ids', 'attention_mask']

def _select_model_inputs(split):
    """Return a dict holding the `model_input_columns` entries of `split`."""
    return {column: split[column] for column in model_input_columns}

train_features = _select_model_inputs(train_tf)
train_tf_dataset = (
    tf.data.Dataset.from_tensor_slices((train_features, train_tf["stars"]))
    .shuffle(len(train_tf))
    .batch(32)
)

test_features = _select_model_inputs(test_tf)
test_tf_dataset = (
    tf.data.Dataset.from_tensor_slices((test_features, test_tf["stars"]))
    .batch(32)
)

eval_features = _select_model_inputs(eval_tf)
eval_tf_dataset = (
    tf.data.Dataset.from_tensor_slices((eval_features, eval_tf["stars"]))
    .batch(32)
)

In [124]:
# Compile and fine-tune the classifier with Keras.
# Fine-tuning a pretrained transformer needs a small step size: 5e-5 is used here
# because a learning rate of 1e-3 is large enough to destroy the pretrained
# weights instead of adapting them.
# The loss is configured for raw logits (`from_logits=True`) and integer labels.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[tf.metrics.SparseCategoricalAccuracy()],
)

# Train for two epochs, evaluating on `eval_tf_dataset` after each one.
model.fit(train_tf_dataset, validation_data=eval_tf_dataset, epochs=2)

Epoch 1/2

InvalidArgumentError:  indices[0,107] = 29387 is not in [0, 28996)
	 [[node tf_bert_for_sequence_classification/bert/embeddings/Gather
 (defined at /opt/anaconda3/lib/python3.7/site-packages/transformers/models/bert/modeling_tf_bert.py:191)
]] [Op:__inference_test_function_63749]

Errors may have originated from an input operation.
Input Source operations connected to node tf_bert_for_sequence_classification/bert/embeddings/Gather:
In[0] tf_bert_for_sequence_classification/bert/embeddings/Gather/resource:	
In[1] IteratorGetNext (defined at /opt/anaconda3/lib/python3.7/site-packages/keras/engine/training.py:1355)

Operation defined at: (most recent call last)
>>>   File "/opt/anaconda3/lib/python3.7/runpy.py", line 193, in _run_module_as_main
>>>     "__main__", mod_spec)
>>> 
>>>   File "/opt/anaconda3/lib/python3.7/runpy.py", line 85, in _run_code
>>>     exec(code, run_globals)
>>> 
>>>   File "/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py", line 16, in <module>
>>>     app.launch_new_instance()
>>> 
>>>   File "/opt/anaconda3/lib/python3.7/site-packages/traitlets/config/application.py", line 846, in launch_instance
>>>     app.start()
>>> 
>>>   File "/opt/anaconda3/lib/python3.7/site-packages/ipykernel/kernelapp.py", line 677, in start
>>>     self.io_loop.start()
>>> 
>>>   File "/opt/anaconda3/lib/python3.7/site-packages/tornado/platform/asyncio.py", line 199, in start
>>>     self.asyncio_loop.run_forever()
>>> 
>>>   File "/opt/anaconda3/lib/python3.7/asyncio/base_events.py", line 534, in run_forever
>>>     self._run_once()
>>> 
>>>   File "/opt/anaconda3/lib/python3.7/asyncio/base_events.py", line 1771, in _run_once
>>>     handle._run()
>>> 
>>>   File "/opt/anaconda3/lib/python3.7/asyncio/events.py", line 88, in _run
>>>     self._context.run(self._callback, *self._args)
>>> 
>>>   File "/opt/anaconda3/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 457, in dispatch_queue
>>>     await self.process_one()
>>> 
>>>   File "/opt/anaconda3/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 446, in process_one
>>>     await dispatch(*args)
>>> 
>>>   File "/opt/anaconda3/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 353, in dispatch_shell
>>>     await result
>>> 
>>>   File "/opt/anaconda3/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 648, in execute_request
>>>     reply_content = await reply_content
>>> 
>>>   File "/opt/anaconda3/lib/python3.7/site-packages/ipykernel/ipkernel.py", line 353, in do_execute
>>>     res = shell.run_cell(code, store_history=store_history, silent=silent)
>>> 
>>>   File "/opt/anaconda3/lib/python3.7/site-packages/ipykernel/zmqshell.py", line 533, in run_cell
>>>     return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
>>> 
>>>   File "/opt/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 2902, in run_cell
>>>     raw_cell, store_history, silent, shell_futures)
>>> 
>>>   File "/opt/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 2947, in _run_cell
>>>     return runner(coro)
>>> 
>>>   File "/opt/anaconda3/lib/python3.7/site-packages/IPython/core/async_helpers.py", line 68, in _pseudo_sync_runner
>>>     coro.send(None)
>>> 
>>>   File "/opt/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3173, in run_cell_async
>>>     interactivity=interactivity, compiler=compiler, result=result)
>>> 
>>>   File "/opt/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3364, in run_ast_nodes
>>>     if (await self.run_code(code, result,  async_=asy)):
>>> 
>>>   File "/opt/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3444, in run_code
>>>     exec(code_obj, self.user_global_ns, self.user_ns)
>>> 
>>>   File "/var/folders/nx/cr2rq5412hv8xlv04b3g1qn00000gn/T/ipykernel_41213/2020694759.py", line 9, in <module>
>>>     model.fit(train_tf_dataset, validation_data=eval_tf_dataset, epochs=2)
>>> 
>>>   File "/opt/anaconda3/lib/python3.7/site-packages/keras/utils/traceback_utils.py", line 64, in error_handler
>>>     return fn(*args, **kwargs)
>>> 
>>>   File "/opt/anaconda3/lib/python3.7/site-packages/keras/engine/training.py", line 1263, in fit
>>>     _use_cached_eval_dataset=True)
>>> 
>>>   File "/opt/anaconda3/lib/python3.7/site-packages/keras/utils/traceback_utils.py", line 64, in error_handler
>>>     return fn(*args, **kwargs)
>>> 
>>>   File "/opt/anaconda3/lib/python3.7/site-packages/keras/engine/training.py", line 1537, in evaluate
>>>     tmp_logs = self.test_function(iterator)
>>> 
>>>   File "/opt/anaconda3/lib/python3.7/site-packages/keras/engine/training.py", line 1366, in test_function
>>>     return step_function(self, iterator)
>>> 
>>>   File "/opt/anaconda3/lib/python3.7/site-packages/keras/engine/training.py", line 1356, in step_function
>>>     outputs = model.distribute_strategy.run(run_step, args=(data,))
>>> 
>>>   File "/opt/anaconda3/lib/python3.7/site-packages/keras/engine/training.py", line 1349, in run_step
>>>     outputs = model.test_step(data)
>>> 
>>>   File "/opt/anaconda3/lib/python3.7/site-packages/transformers/modeling_tf_utils.py", line 909, in test_step
>>>     y_pred = self(x, training=False)
>>> 
>>>   File "/opt/anaconda3/lib/python3.7/site-packages/keras/utils/traceback_utils.py", line 64, in error_handler
>>>     return fn(*args, **kwargs)
>>> 
>>>   File "/opt/anaconda3/lib/python3.7/site-packages/keras/engine/base_layer.py", line 1083, in __call__
>>>     outputs = call_fn(inputs, *args, **kwargs)
>>> 
>>>   File "/opt/anaconda3/lib/python3.7/site-packages/keras/utils/traceback_utils.py", line 92, in error_handler
>>>     return fn(*args, **kwargs)
>>> 
>>>   File "/opt/anaconda3/lib/python3.7/site-packages/transformers/models/bert/modeling_tf_bert.py", line 1749, in call
>>>     outputs = self.bert(
>>> 
>>>   File "/opt/anaconda3/lib/python3.7/site-packages/keras/utils/traceback_utils.py", line 64, in error_handler
>>>     return fn(*args, **kwargs)
>>> 
>>>   File "/opt/anaconda3/lib/python3.7/site-packages/keras/engine/base_layer.py", line 1083, in __call__
>>>     outputs = call_fn(inputs, *args, **kwargs)
>>> 
>>>   File "/opt/anaconda3/lib/python3.7/site-packages/keras/utils/traceback_utils.py", line 92, in error_handler
>>>     return fn(*args, **kwargs)
>>> 
>>>   File "/opt/anaconda3/lib/python3.7/site-packages/transformers/models/bert/modeling_tf_bert.py", line 789, in call
>>>     embedding_output = self.embeddings(
>>> 
>>>   File "/opt/anaconda3/lib/python3.7/site-packages/keras/utils/traceback_utils.py", line 64, in error_handler
>>>     return fn(*args, **kwargs)
>>> 
>>>   File "/opt/anaconda3/lib/python3.7/site-packages/keras/engine/base_layer.py", line 1083, in __call__
>>>     outputs = call_fn(inputs, *args, **kwargs)
>>> 
>>>   File "/opt/anaconda3/lib/python3.7/site-packages/keras/utils/traceback_utils.py", line 92, in error_handler
>>>     return fn(*args, **kwargs)
>>> 
>>>   File "/opt/anaconda3/lib/python3.7/site-packages/transformers/models/bert/modeling_tf_bert.py", line 190, in call
>>>     if input_ids is not None:
>>> 
>>>   File "/opt/anaconda3/lib/python3.7/site-packages/transformers/models/bert/modeling_tf_bert.py", line 191, in call
>>>     inputs_embeds = tf.gather(params=self.weight, indices=input_ids)
>>> 