In [1]:
import numpy as np
import pandas as pd

import tensorflow as tf
import transformers, datasets

In [2]:
tf.keras.backend.clear_session()

physical_devices = tf.config.list_physical_devices('GPU') 
for device in physical_devices:
    tf.config.experimental.set_memory_growth(device, True)

In the previous notebook, we used the Hugging Face libraries to load the BERTweet transformer model and train it
on the tweet_eval sentiment dataset. Now we'll take that model and train it on a more specialized datasets. Here that
will be a small-ish labeled dataset consisting of tweets related to Apple (APPL). The labels again come in three classes
indicating positive, negative, and neutral sentiment.

Rather than importing the data directly from Hugging Face, it will be taken from a csv file, so we'll have to take care
to load it into the same object type.

In [3]:
from datasets import Features, Value, ClassLabel, load_dataset

features = Features({
    'text': Value(dtype='string'),
    'label': ClassLabel(num_classes=3, names=['negative', 'neutral', 'positive'])})

aapl_dataset = load_dataset(path='csv', data_files=['./data/aapl.csv'], features=features)

aapl_dataset

Using custom data configuration default-06c6b7501b5e7d6e
Reusing dataset csv (C:\Users\Kaya\.cache\huggingface\datasets\csv\default-06c6b7501b5e7d6e\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 2980
    })
})

In [4]:
aapl_dataset['train'][50]

{'text': '@apple and @facebook I speak for all of humanity; We want to chose what contacts are added to our contacts book from #facebook #thanks',
 'label': 0}

As we can see, the load_dataset() function has created a DatasetDict object. Unlike the previous case, here the data
was not divided into splits. At the moment, aapl_dataset contains a single Dataset object called 'train'. Let's create
training, validation, and test splits.

In [5]:
# split data into a training set and a combined test+validation set
aapl_dataset_split_1 = aapl_dataset['train'].train_test_split(test_size=0.2, shuffle=True, seed=42) 

Loading cached split indices for dataset at C:\Users\Kaya\.cache\huggingface\datasets\csv\default-5610044f9daaa1b4\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519\cache-6b51b7dc9839905b.arrow and C:\Users\Kaya\.cache\huggingface\datasets\csv\default-5610044f9daaa1b4\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519\cache-ecec47d30393e3ed.arrow


In [6]:
# split test+validation data 
aapl_dataset_split_2 = aapl_dataset_split_1['test'].train_test_split(test_size=0.7, shuffle=True, seed=84)

Loading cached split indices for dataset at C:\Users\Kaya\.cache\huggingface\datasets\csv\default-5610044f9daaa1b4\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519\cache-eb3f9e6e19fa32d2.arrow and C:\Users\Kaya\.cache\huggingface\datasets\csv\default-5610044f9daaa1b4\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519\cache-a0ae947f87b867e5.arrow


In [7]:
from datasets import DatasetDict

aapl_dataset_split_full = DatasetDict({
    'train':      aapl_dataset_split_1['train'],
    'test':       aapl_dataset_split_2['test'],
    'validation': aapl_dataset_split_2['train']})

aapl_dataset_split_full

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 2384
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 418
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 178
    })
})

In [8]:
aapl_dataset_split_full['train'][50]

{'text': 'Thank you @Apple for fixing the #Swift sourcekit crashes in #XCode :). Life is better now! ',
 'label': 2}

Now as in the previous notebook, the DatasetDict object consists of three splits as Dataset objects. We once again load
the tokenizer using AutoTokenizer.

In [9]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [10]:
print(tokenizer("The quick brown fox jumps over the lazy dog."))

{'input_ids': [0, 47, 1600, 3345, 9646, 13545, 141, 6, 2307, 10638, 4, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [11]:
aapl_dataset_split_full['train'].features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(num_classes=3, names=['negative', 'neutral', 'positive'], id=None)}

Now we tokenize the entire dataset (all three splits)

In [12]:
def tokenize_fn(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True)

tokenized_aapl = aapl_dataset_split_full.map(tokenize_fn)



0ex [00:00, ?ex/s]

0ex [00:00, ?ex/s]

0ex [00:00, ?ex/s]

Here we convert the tokenized splits into TensorFlow objects.

In [13]:
from transformers.data.data_collator import tf_default_data_collator

data_collator = tf_default_data_collator

In [14]:
tf_train_dataset = tokenized_aapl['train'].to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["label"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)

tf_validation_dataset = tokenized_aapl['validation'].to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["label"],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=8,
)

tf_test_dataset = tokenized_aapl['test'].to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["label"],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=8,
)

Now we load the model we previously trained and proceed to train it on the AAPL data.

In [15]:
from transformers import TFAutoModelForSequenceClassification

bertweet_simple = TFAutoModelForSequenceClassification.from_pretrained("./models/bertweet_simple/", num_labels=3)

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at C:/Users/Kaya/Documents/capstone/models/bertweet_simple/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


In [16]:
from transformers import create_optimizer

batch_size = 8
num_epochs = 3
batches_per_epoch = len(tf_train_dataset) // batch_size
total_train_steps = int(batches_per_epoch * num_epochs)
optimizer, schedule = create_optimizer(init_lr=5e-6, num_warmup_steps=0, num_train_steps=total_train_steps)

In [17]:
bertweet_simple.compile(
    optimizer = optimizer,
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics = [tf.metrics.SparseCategoricalAccuracy(), tf.keras.metrics.MeanAbsoluteError()],
)

In [18]:
bertweet_simple.evaluate(tf_test_dataset)

Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.


[0.7434712052345276, 0.6770334839820862, 1.8227293491363525]

In [19]:
bertweet_aapl = bertweet_simple

bertweet_aapl.fit(tf_train_dataset, validation_data=tf_validation_dataset, epochs=num_epochs)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x252386ac9d0>

In [20]:
bertweet_aapl.evaluate(tf_test_dataset)



[0.5040907263755798, 0.7822966575622559, 1.7455029487609863]

The model from the first notebook (here called 'bertweet_simple' has an accuracy of 0.6770 on the test split, while the
newly trained 'bertweet_aapl' model has accuracy of 0.7823.

In [21]:
bertweet_aapl.save_pretrained("./models/bertweet_aapl/")