In [1]:
import pandas as pd
import numpy as np


In [2]:
dataset = pd.read_csv('..\Datasets\Text_emotion.csv')

In [3]:
dataset.head()

Unnamed: 0,tweet_id,sentiment,author,content
0,1956967341,empty,xoshayzers,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends SOON!
4,1956968416,neutral,xkilljoyx,@dannycastillo We want to trade with someone w...


In [4]:
seq_len = 512
num_samples = len(dataset)

xids = np.zeros((num_samples, seq_len))
xmask = np.zeros((num_samples, seq_len))

xids.shape

(40000, 512)

In [5]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

for i, phrase in enumerate(dataset.content):
    tokens = tokenizer.encode_plus(phrase, max_length = seq_len, truncation = True, 
    padding='max_length', add_special_tokens=True, return_tensors = 'tf')

    xids[i, :] = tokens['input_ids']
    xmask[i, :] = tokens['attention_mask']

In [6]:
xmask

array([[1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       ...,
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.]])

In [7]:
dataset['sentiment_coded'] = dataset.sentiment.map({'empty':0, 'sadness':1, 'enthusiasm':2, 'neutral':3, 'worry':4, 'surprise':5,
       'love':6, 'fun':7, 'hate':8, 'happiness':9, 'boredom':10, 'relief':11, 'anger':12})

In [8]:
arr = dataset.sentiment_coded.values

In [9]:
arr

array([0, 1, 1, ..., 6, 9, 6], dtype=int64)

In [10]:
labels = np.zeros((num_samples, arr.max() + 1))

In [11]:
labels[np.arange(num_samples), arr] = 1


In [12]:
import tensorflow as tf

In [13]:
data = tf.data.Dataset.from_tensor_slices((xids, xmask, labels))

In [14]:
data.take(1)

<TakeDataset element_spec=(TensorSpec(shape=(512,), dtype=tf.float64, name=None), TensorSpec(shape=(512,), dtype=tf.float64, name=None), TensorSpec(shape=(13,), dtype=tf.float64, name=None))>

In [15]:
labels[0,:]

array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [16]:
def map_func(inputs_ids, masks, labels):
    return {'input_ids': inputs_ids, 'attention_mask': masks}, labels

In [17]:
data = data.map(map_func)

In [18]:
data.take(1)

<TakeDataset element_spec=({'input_ids': TensorSpec(shape=(512,), dtype=tf.float64, name=None), 'attention_mask': TensorSpec(shape=(512,), dtype=tf.float64, name=None)}, TensorSpec(shape=(13,), dtype=tf.float64, name=None))>

In [19]:
batch_size = 16

data = data.shuffle(7000).batch(batch_size, drop_remainder=True)

data.take(1)

<TakeDataset element_spec=({'input_ids': TensorSpec(shape=(16, 512), dtype=tf.float64, name=None), 'attention_mask': TensorSpec(shape=(16, 512), dtype=tf.float64, name=None)}, TensorSpec(shape=(16, 13), dtype=tf.float64, name=None))>

In [20]:
split = 0.9

size = int((num_samples/batch_size)*split)

In [21]:
train_ds = data.take(size)
val_ds = data.skip(size)

In [22]:
del data

In [23]:
from transformers import TFAutoModel

bert = TFAutoModel.from_pretrained('bert-base-uncased')

bert.summary()

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "tf_bert_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
Total params: 109,482,240
Trainable params: 109,482,240
Non-trainable params: 0
_________________________________________________________________


In [24]:
input_ids = tf.keras.layers.Input(shape=(seq_len,), name='input_ids', dtype='int32')
mask = tf.keras.layers.Input(shape=(seq_len,), name='attention_mask', dtype='int32')

embeddings = bert.bert(input_ids, attention_mask = mask)[1]

x = tf.keras.layers.Dense(1024, activation = 'relu')(embeddings)
y = tf.keras.layers.Dense(arr.max() + 1, activation='softmax', name='outputs')(x)

In [25]:
model = tf.keras.Model(inputs=[input_ids, mask], outputs = y)

model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 512)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 512)]        0           []                               
                                                                                                  
 bert (TFBertMainLayer)         TFBaseModelOutputWi  109482240   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 512,                                           

In [26]:
optimizer = tf.keras.optimizers.Adam(lr=1e-5, decay= 1e-6)
loss = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')


  super(Adam, self).__init__(name, **kwargs)


In [27]:
model.compile(optimizer=optimizer, loss=loss, metrics=[acc])

In [28]:
history = model.fit(
                train_ds,
                validation_data = val_ds,
                epochs = 5
)

Epoch 1/5
  25/2250 [..............................] - ETA: 36:22:00 - loss: 2.3472 - accuracy: 0.2600

In [None]:
model.save('sentiment_model')