## Packages

In [1]:
# Load packages
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

from pathlib import Path
from transformers import BertTokenizer, TFBertModel
from urllib.request import urlretrieve

from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import SparseCategoricalAccuracy
from tensorflow.keras.optimizers import Adam

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
SNIPS_DATA_BASE_URL = (
    "https://github.com/ogrisel/slot_filling_and_intent_detection_of_SLU/blob/"
    "master/data/snips/"
)
for filename in ["train", "valid", "test", "vocab.intent", "vocab.slot"]:
    path = Path(filename)
    if not path.exists():
        print(f"Downloading {filename}...")
        urlretrieve(SNIPS_DATA_BASE_URL + filename + "?raw=true", path)

Downloading train...
Downloading valid...
Downloading test...
Downloading vocab.intent...
Downloading vocab.slot...


In [3]:
lines_train = Path('train').read_text('utf-8').strip().splitlines()
print(f'First line of training set: {lines_train[0]}.')

First line of training set: Add:O Don:B-entity_name and:I-entity_name Sherri:I-entity_name to:O my:B-playlist_owner Meditate:B-playlist to:I-playlist Sounds:I-playlist of:I-playlist Nature:I-playlist playlist:O <=> AddToPlaylist.


In [4]:
def parse_line(line):
    utterance_data, intent_label = line.split(" <=> ")
    items = utterance_data.split()
    words = [item.rsplit(':', 1)[0] for item in items]
    word_labels = [item.rsplit(':', 1)[1] for item in items]
    return {
        'intent_label': intent_label,
        'words': " ".join(words),
        'words_label': " ".join(word_labels),
        'length': len(words)
    }

parse_line(lines_train[0])

{'intent_label': 'AddToPlaylist',
 'words': 'Add Don and Sherri to my Meditate to Sounds of Nature playlist',
 'words_label': 'O B-entity_name I-entity_name I-entity_name O B-playlist_owner B-playlist I-playlist I-playlist I-playlist I-playlist O',
 'length': 12}

In [5]:
print(Path('vocab.intent').read_text('utf-8'))
print(Path('vocab.slot').read_text('utf-8'))

AddToPlaylist
BookRestaurant
GetWeather
PlayMusic
RateBook
SearchCreativeWork
SearchScreeningEvent

B-album
B-artist
B-best_rating
B-city
B-condition_description
B-condition_temperature
B-country
B-cuisine
B-current_location
B-entity_name
B-facility
B-genre
B-geographic_poi
B-location_name
B-movie_name
B-movie_type
B-music_item
B-object_location_type
B-object_name
B-object_part_of_series_type
B-object_select
B-object_type
B-party_size_description
B-party_size_number
B-playlist
B-playlist_owner
B-poi
B-rating_unit
B-rating_value
B-restaurant_name
B-restaurant_type
B-served_dish
B-service
B-sort
B-spatial_relation
B-state
B-timeRange
B-track
B-year
I-album
I-artist
I-city
I-country
I-cuisine
I-current_location
I-entity_name
I-facility
I-genre
I-geographic_poi
I-location_name
I-movie_name
I-movie_type
I-music_item
I-object_location_type
I-object_name
I-object_part_of_series_type
I-object_select
I-object_type
I-party_size_description
I-playlist
I-playlist_owner
I-poi
I-restaurant_name
I-re

In [6]:
parsed = [parse_line(line) for line in lines_train]
df_train = pd.DataFrame([p for p in parsed if p is not None])
# Print some lines of the training set
df_train.head(5)

Unnamed: 0,intent_label,words,words_label,length
0,AddToPlaylist,Add Don and Sherri to my Meditate to Sounds of...,O B-entity_name I-entity_name I-entity_name O ...,12
1,AddToPlaylist,put United Abominations onto my rare groove pl...,O B-entity_name I-entity_name O B-playlist_own...,8
2,AddToPlaylist,add the tune by misato watanabe to the Trapeo ...,O O B-music_item O B-artist I-artist O O B-pla...,10
3,AddToPlaylist,add this artist to my this is miguel bosé play...,O O B-music_item O B-playlist_owner B-playlist...,10
4,AddToPlaylist,add heresy and the hotel choir to the evening ...,O B-entity_name I-entity_name I-entity_name I-...,11


In [7]:
# Count the number of lines by intent label
df_train.intent_label.value_counts()

intent_label
GetWeather              1900
PlayMusic               1900
BookRestaurant          1873
SearchScreeningEvent    1859
RateBook                1856
SearchCreativeWork      1854
AddToPlaylist           1842
Name: count, dtype: int64

In [8]:
# Get validation and test set
lines_validation = Path('valid').read_text('utf-8').strip().splitlines()
lines_test = Path('test').read_text('utf-8').strip().splitlines()

df_validation = pd.DataFrame([parse_line(line) for line in lines_validation])
df_test = pd.DataFrame([parse_line(line) for line in lines_test])

In [9]:
model_name = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(model_name)

In [10]:
first_sentence = df_train.iloc[0]['words']
print(first_sentence)

Add Don and Sherri to my Meditate to Sounds of Nature playlist


In [11]:
tokenizer.tokenize(first_sentence)

['Ad',
 '##d',
 'Don',
 'and',
 'She',
 '##rri',
 'to',
 'my',
 'Me',
 '##dit',
 '##ate',
 'to',
 'Sounds',
 'of',
 'Nature',
 'play',
 '##list']

In [12]:
# Encode sentence to id
tokenizer.encode(first_sentence)

[101,
 24930,
 1181,
 1790,
 1105,
 1153,
 14791,
 1106,
 1139,
 2508,
 17903,
 2193,
 1106,
 10560,
 1104,
 7009,
 1505,
 7276,
 102]

In [13]:
# Do the inverse operation
tokenizer.decode(tokenizer.encode(first_sentence))

'[CLS] Add Don and Sherri to my Meditate to Sounds of Nature playlist [SEP]'

In [14]:
# Get the items in BERT
bert_vocab_items = list(tokenizer.vocab.items())
# Print some examples of items
bert_vocab_items[250:260]

[('ä', 250),
 ('å', 251),
 ('æ', 252),
 ('ç', 253),
 ('è', 254),
 ('é', 255),
 ('ê', 256),
 ('ë', 257),
 ('ì', 258),
 ('í', 259)]

In [22]:
def encode_dataset(tokenizer, text_sequences, max_length):
    token_ids = np.zeros(shape=(len(text_sequences), max_length),
                         dtype=np.int32)
    for i, text_sequence in enumerate(text_sequences):
        encoded = tokenizer.encode(text_sequence)
        token_ids[i, 0:len(encoded)] = encoded
    attention_masks = (token_ids != 0).astype(np.int32)
    
    return {'input_ids': token_ids, 'attention_masks': attention_masks}

In [23]:
encoded_train = encode_dataset(tokenizer, df_train['words'], 45)
encoded_validation = encode_dataset(tokenizer, df_validation['words'], 45)
encoded_test = encode_dataset(tokenizer, df_test['words'], 45)

In [24]:
intent_names = Path('vocab.intent').read_text('utf-8').split()
intent_map = dict((label, idx) for idx, label in enumerate(intent_names))

In [25]:
intent_train = df_train['intent_label'].map(intent_map).values
intent_validation = df_validation['intent_label'].map(intent_map).values
intent_test = df_test['intent_label'].map(intent_map).values

In [26]:
base_bert_model = TFBertModel.from_pretrained('bert-base-cased')
base_bert_model.summary()

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Model: "tf_bert_model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  108310272 
                                                                 
Total params: 108310272 (413.17 MB)
Trainable params: 108310272 (413.17 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [27]:
outputs = base_bert_model(encoded_validation)
print(f'Shape of the first output of the BERT model: {outputs[0].shape}.')
print(f'Shape of the second output of the BERT model: {outputs[1].shape}.')

ValueError: Exception encountered when calling layer 'encoder' (type TFBertEncoder).

The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

Call arguments received by layer 'encoder' (type TFBertEncoder):
  • hidden_states=tf.Tensor(shape=(700, 45, 768), dtype=float32)
  • attention_mask=tf.Tensor(shape=(700, 1, 1, 45), dtype=float32)
  • head_mask=['None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None']
  • encoder_hidden_states=None
  • encoder_attention_mask=None
  • past_key_values=['None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None']
  • use_cache=False
  • output_attentions=tf.Tensor(shape=(700, 45), dtype=int32)
  • output_hidden_states=False
  • return_dict=True
  • training=False