In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('train.tsv', sep='\t')
df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [3]:
df = df[['Phrase', 'Sentiment']].head(1000)
df.head()

Unnamed: 0,Phrase,Sentiment
0,A series of escapades demonstrating the adage ...,1
1,A series of escapades demonstrating the adage ...,2
2,A series,2
3,A,2
4,series,2


In [4]:
from transformers import BertTokenizer

In [5]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [6]:
def tokenize(sentence):
    tokens = tokenizer.encode_plus(sentence, max_length=512,
                                   truncation=True, padding='max_length',
                                   add_special_tokens=True, return_token_type_ids=False,
                                   return_tensors='tf')
    return tokens['input_ids'], tokens['attention_mask']

In [1]:
import numpy as np

In [8]:
Xids = np.zeros((len(df), 512))
Xmask = np.zeros((len(df), 512))

In [9]:
Xids.shape

(1000, 512)

In [10]:
Xids

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [11]:
Xmask

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [12]:
for i, sequence in enumerate(df['Phrase']):
    tokens = tokenize(sequence)
    Xids[i, :], Xmask[i, :] = tokens[0], tokens[1]

In [13]:
arr = df['Sentiment'].values

In [14]:
arr

array([1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3,
       3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 3, 1, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 3, 2,
       4, 3, 2, 3, 3, 3, 2, 2, 4, 2, 3, 4, 2, 2, 2, 1, 2, 2, 2, 3, 2, 2,
       2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 0, 2, 0, 2, 1, 1, 1, 2, 2,
       1, 2, 2, 2, 2, 2, 3, 4, 4, 3, 3, 3, 3, 4, 2, 2, 2, 2, 2, 2, 2, 1,
       2, 3, 2, 1, 2, 1, 1, 2, 1, 1, 2, 2, 2, 1, 2, 2, 1, 2, 3, 3, 3, 1,
       2, 2, 1, 0, 2, 0, 1, 2, 1, 1, 2, 2, 4, 3, 2, 2, 3, 2, 4, 2, 3, 2,
       4, 3, 3, 3, 4, 2, 4, 4, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2,
       1, 2, 1, 0, 2, 1, 2, 2, 2, 1, 0, 1, 0, 1, 1, 3, 2, 3, 2, 3, 2, 2,
       3, 3, 2, 2, 3, 2, 3, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 2, 1, 2, 2, 1, 1, 0, 2, 1,
       0, 0, 2, 2, 2, 2, 2, 1, 2, 1, 1, 1, 1, 2, 2, 2, 4, 3, 2, 2, 2, 1,
       2, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 1, 2, 3, 2,

In [15]:
labels = np.zeros((arr.size, arr.max()+1))

In [16]:
labels[np.arange(arr.size), arr] = 1

In [17]:
labels

array([[0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       ...,
       [0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.]])

In [18]:
with open('movie-xids.npy', 'wb') as f:
    np.save(f, Xids)
with open('movie-xmask.npy', 'wb') as f:
    np.save(f, Xmask)
with open('movie-labels.npy', 'wb') as f:
    np.save(f, labels)

In [19]:
del df, Xids, Xmask, labels

In [2]:
import tensorflow as tf

In [3]:
with open('movie-xids.npy', 'rb') as f:
    Xids = np.load(f, allow_pickle=True)
with open('movie-xmask.npy', 'rb') as f:
    Xmask = np.load(f, allow_pickle=True)
with open('movie-labels.npy', 'rb') as f:
    labels = np.load(f, allow_pickle=True)

In [4]:
#gpu_devices = tf.config.experimental.list_physical_devices('GPU')
#for device in gpu_devices: tf.config.experimental.set_memory_growth(device, True)  # required to avoid GPU LSTM Internal Error
tf.config.experimental.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [5]:
tf.__version__

'2.5.0-dev20201110'

In [6]:
data = tf.data.Dataset.from_tensor_slices((Xids, Xmask, labels))  # [750000:850000]

In [7]:
SHUFFLE = 100000
BATCH_SIZE = 16

In [8]:
def map_func(input_ids, masks, labels):
    return {'input_ids': input_ids, 'attention_mask': masks}, labels

In [9]:
data = data.map(map_func)

In [10]:
data = data.shuffle(SHUFFLE).batch(BATCH_SIZE) #, drop_remainder=True)

In [11]:
SIZE = Xids.shape[0]/BATCH_SIZE
SIZE

62.5

In [12]:
SPLIT = 0.9

train = data.take(int(SIZE*SPLIT))
val = data.skip(int(SIZE*SPLIT))

del data

---

# Model Setup

In [13]:
from transformers import TFAutoModel

In [14]:
bert = TFAutoModel.from_pretrained('bert-base-cased')  #, output_hidden_states=False

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [15]:
bert.summary()

Model: "tf_bert_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  108310272 
Total params: 108,310,272
Trainable params: 108,310,272
Non-trainable params: 0
_________________________________________________________________


In [16]:
input_ids = tf.keras.layers.Input(shape=(512,), name='input_ids', dtype='int32')
mask = tf.keras.layers.Input(shape=(512,), name='attention_mask', dtype='int32')

embeddings = bert.bert(input_ids, attention_mask=mask)[0]  # we access the transformer model within our bert object using the bert attribute (eg bert.bert instead of bert)

x = tf.keras.layers.Dropout(0.1)(embeddings)
x = tf.keras.layers.GlobalMaxPool1D()(x)
y = tf.keras.layers.Dense(5, activation='softmax', name='outputs')(x)

model = tf.keras.Model(inputs=[input_ids, mask], outputs=y)

model.layers[2].trainable = False

Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.


In [17]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 512)]        0                                            
__________________________________________________________________________________________________
attention_mask (InputLayer)     [(None, 512)]        0                                            
__________________________________________________________________________________________________
bert (TFBertMainLayer)          TFBaseModelOutputWit 108310272   input_ids[0][0]                  
                                                                 attention_mask[0][0]             
__________________________________________________________________________________________________
dropout_37 (Dropout)            (None, 512, 768)     0           bert[0][0]                   

In [18]:
optimizer = tf.keras.optimizers.Adam(lr=0.01, decay=1e-6)
loss = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')

model.compile(optimizer=optimizer, loss=loss, metrics=[acc])

In [19]:
# 800K
history = model.fit(
    train,
    validation_data=val,
    epochs=2)

Epoch 1/2
Epoch 2/2


### Save

In [None]:
model.get_config()

In [20]:
model.save('sentiment_model')



INFO:tensorflow:Assets written to: sentiment_model\assets


INFO:tensorflow:Assets written to: sentiment_model\assets


In [21]:
del model

---

### Load

In [22]:
model = tf.keras.models.load_model('sentiment_model')
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 512)]        0                                            
__________________________________________________________________________________________________
attention_mask (InputLayer)     [(None, 512)]        0                                            
__________________________________________________________________________________________________
bert (TFBertMainLayer)          TFBaseModelOutputWit 108310272   input_ids[0][0]                  
                                                                 attention_mask[0][0]             
__________________________________________________________________________________________________
dropout_37 (Dropout)            (None, 512, 768)     0           bert[0][0]                   

### Test

In [23]:
loss, acc = model.evaluate(val)



In [25]:
model.predict(val.take(1))

array([[7.61600770e-03, 1.61399581e-02, 8.81537974e-01, 9.45726037e-02,
        1.33427122e-04],
       [7.02638784e-03, 2.99034566e-01, 5.73699236e-01, 1.20218106e-01,
        2.17534525e-05],
       [2.13267319e-02, 2.18926340e-01, 6.04214907e-01, 1.55486241e-01,
        4.58343056e-05],
       [6.09720917e-03, 5.33939958e-01, 1.81329250e-01, 2.78631181e-01,
        2.43848808e-06],
       [4.22458537e-03, 2.40804236e-02, 9.37473118e-01, 3.41925099e-02,
        2.94180991e-05],
       [4.67320643e-02, 1.65817395e-01, 2.94325203e-01, 4.93072838e-01,
        5.25151663e-05],
       [2.95730587e-02, 1.96691960e-01, 6.20805681e-01, 1.52830005e-01,
        9.93432404e-05],
       [2.04604883e-02, 9.57593694e-02, 5.34271896e-01, 3.49503189e-01,
        5.10916652e-06],
       [1.22610498e-02, 5.12559175e-01, 9.00671557e-02, 3.85110855e-01,
        1.68314602e-06],
       [1.17646297e-02, 7.94910938e-02, 7.08183110e-01, 2.00488031e-01,
        7.31378386e-05],
       [3.86042823e-03, 1.8145

In [24]:
val.take(1)

<TakeDataset shapes: ({input_ids: (None, 512), attention_mask: (None, 512)}, (None, 5)), types: ({input_ids: tf.float64, attention_mask: tf.float64}, tf.float64)>