In [1]:
import numpy as np
import pandas as pd

import tensorflow as tf
import transformers, datasets

In [2]:
tf.keras.backend.clear_session()

physical_devices = tf.config.list_physical_devices('GPU') 
for device in physical_devices:
    tf.config.experimental.set_memory_growth(device, True)

In notebook 1, we trained the BERTweet transformer model on the tweet_eval sentiment dataset (saved as 'bertweet_simple'). 
In notebook 2, we further fine-tuned by re-training that model on a specialized dataset consisting of tweet data related to
Apple ('bertweet_aapl'). In this notebook, we'll repeat the process by re-training the simple model on an amalgamation of three
labeled datasets: the AAPL data as in notebook 2, plus the Sanders dataset and a collection of tweets dated to 2020, both related
to stock performance. (The training pipeline is parallel to the one used in notebook 2.)

In [3]:
from datasets import Features, Value, ClassLabel, load_dataset

features = Features({
    'text': Value(dtype='string'),
    'label': ClassLabel(num_classes=3, names=['negative', 'neutral', 'positive'])})

stock_tweet_dataset = load_dataset(
    path='csv', 
    data_files=['./data/aapl.csv', './data/sanders.csv', './data/tweets_labeled_20200904.csv'], 
    features=features)

stock_tweet_dataset

Using custom data configuration default-10917645a3750407


Downloading and preparing dataset csv/default to C:\Users\Kaya\.cache\huggingface\datasets\csv\default-10917645a3750407\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to C:\Users\Kaya\.cache\huggingface\datasets\csv\default-10917645a3750407\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 7704
    })
})

In [4]:
stock_tweet_dataset['train'][50]

{'text': '@apple and @facebook I speak for all of humanity; We want to chose what contacts are added to our contacts book from #facebook #thanks',
 'label': 0}

In [5]:
# split data into a training set and a combined test+validation set
stock_tweet_dataset_split_1 = stock_tweet_dataset['train'].train_test_split(test_size=0.2, shuffle=True, seed=42) 

In [6]:
# split test+validation data 
stock_tweet_dataset_split_2 = stock_tweet_dataset_split_1['test'].train_test_split(test_size=0.7, shuffle=True, seed=84)

In [7]:
from datasets import DatasetDict

stock_tweet_dataset_split_full = DatasetDict({
    'train':      stock_tweet_dataset_split_1['train'],
    'test':       stock_tweet_dataset_split_2['test'],
    'validation': stock_tweet_dataset_split_2['train']})

stock_tweet_dataset_split_full

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 6163
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1079
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 462
    })
})

In [8]:
stock_tweet_dataset_split_full['train'][50]

{'text': 'I am looking to #invest in #stocks and I want to invest in a #blackowned #tech or #sustainablefashion company. If you have any recommendations, please let me know 🙏🏾💕',
 'label': 2}

In [9]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [10]:
print(tokenizer("The quick brown fox jumps over the lazy dog."))

{'input_ids': [0, 47, 1600, 3345, 9646, 13545, 141, 6, 2307, 10638, 4, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [11]:
stock_tweet_dataset_split_full['train'].features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(num_classes=3, names=['negative', 'neutral', 'positive'], id=None)}

In [12]:
def tokenize_fn(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True)

tokenized_stock_tweet = stock_tweet_dataset_split_full.map(tokenize_fn)



0ex [00:00, ?ex/s]

0ex [00:00, ?ex/s]

0ex [00:00, ?ex/s]

In [13]:
from transformers.data.data_collator import tf_default_data_collator

data_collator = tf_default_data_collator

In [14]:
tf_train_dataset = tokenized_stock_tweet['train'].to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["label"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)

tf_validation_dataset = tokenized_stock_tweet['validation'].to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["label"],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=8,
)

tf_test_dataset = tokenized_stock_tweet['test'].to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["label"],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=8,
)

In [15]:
from transformers import TFAutoModelForSequenceClassification

bertweet_simple = TFAutoModelForSequenceClassification.from_pretrained("./models/bertweet_simple/", num_labels=3)

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at C:/Users/Kaya/Documents/capstone/models/bertweet_simple/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


In [16]:
from transformers import create_optimizer

batch_size = 8
num_epochs = 3
batches_per_epoch = len(tf_train_dataset) // batch_size
total_train_steps = int(batches_per_epoch * num_epochs)
optimizer, schedule = create_optimizer(init_lr=5e-6, num_warmup_steps=0, num_train_steps=total_train_steps)

In [17]:
bertweet_simple.compile(
    optimizer = optimizer,
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics = [tf.metrics.SparseCategoricalAccuracy()],
)

In [18]:
bertweet_simple.evaluate(tf_test_dataset)

Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.


[0.909382700920105, 0.6533827781677246]

In [19]:
bertweet_stock_tweet = bertweet_simple

bertweet_stock_tweet.fit(tf_train_dataset, validation_data=tf_validation_dataset, epochs=num_epochs)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x1acb97ee940>

In [20]:
bertweet_stock_tweet.evaluate(tf_test_dataset)



[0.6167556047439575, 0.7330861687660217]

The 'bertweet_simple' model achieved 0.6533 accuracy on the test split of the combined data, while our newly trained 
model 'bertweet_stock_tweet' performed at 0.7331. This is a similar jump in accuracy as we saw in the AAPL-specific
model on a larger and slightly more heterogeneous dataset. 

In [21]:
bertweet_stock_tweet.save_pretrained("./models/bertweet_stock_tweet/")

In [22]:
aapl_dataset = load_dataset(
    path='csv', 
    data_files=['./data/aapl.csv'], 
    features=features)

aapl_dataset

Using custom data configuration default-06c6b7501b5e7d6e


Downloading and preparing dataset csv/default to C:\Users\Kaya\.cache\huggingface\datasets\csv\default-06c6b7501b5e7d6e\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to C:\Users\Kaya\.cache\huggingface\datasets\csv\default-06c6b7501b5e7d6e\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 2980
    })
})

In [23]:
tokenized_aapl_dataset = aapl_dataset.map(tokenize_fn)

0ex [00:00, ?ex/s]

In [25]:
tf_aapl_dataset = tokenized_aapl_dataset['train'].to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["label"],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=8,
)

In [31]:
bertweet_stock_tweet.evaluate(tf_aapl_dataset)



[0.4825942814350128, 0.8050335645675659]