<a href="https://colab.research.google.com/github/jakubglinka/google.colab/blob/master/BertSequenceClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text Classification using BERT

 - training smaller Transformer from scratch
 - using pre-trained bert model
 - distil knowledge to smaller transformer

## Configure environment

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [2]:
from __future__ import absolute_import, division, print_function, unicode_literals

try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
print(tf.__version__)

TensorFlow 2.x selected.
Num GPUs Available:  1
2.1.0-rc1


In [3]:
try:
  cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
  tf.config.experimental_connect_to_cluster(cluster_resolver)
  tf.tpu.experimental.initialize_tpu_system(cluster_resolver)
  strategy = tf.distribute.experimental.TPUStrategy(cluster_resolver)
except ValueError as error:
  print(error)
  print("No TPU available. Switching to single device strategy.")
  strategy = tf.distribute.OneDeviceStrategy(device="/gpu")

Please provide a TPU Name to connect to.
No TPU available. Switching to single device strategy.


In [0]:
!pip install transformers
!pip install git+https://jbglin:botrx56jtlp6p2cbsthvt3bkslgeo3pzc5c7iuu4irxscjmmc6xa@dev.azure.com/eyDataScienceTeam/_git/nlp-ey-assets@develop

## Data Preparation

### PolEmo data

In [0]:
import pandas as pd
import pathlib
import re
import tqdm
POLEMO_PATH = "./drive/My Drive/sentiment/"
from typing import List
import numpy as np

In [40]:
# read PolEmo data:
def read_polemo_data(path) -> pd.DataFrame:
  res = []
  with path.open("r") as f:
    for line in f:
      rec = line.strip("\n").split("__label__")
      rec[0] = rec[0].strip()
      res.append(rec)

  return pd.DataFrame(res, columns=["text", "label"])

df_train = read_polemo_data(pathlib.Path(POLEMO_PATH) / "all.sentence.train.txt")
print(f"Read {df_train.shape[0]} train examples.")

df_dev = read_polemo_data(pathlib.Path(POLEMO_PATH) / "all.sentence.dev.txt")
print(f"Read {df_dev.shape[0]} dev examples.")

df_test = read_polemo_data(pathlib.Path(POLEMO_PATH) / "all.sentence.test.txt")
print(f"Read {df_test.shape[0]} test examples.")

Read 45974 train examples.
Read 5747 dev examples.
Read 5745 test examples.


In [56]:
df_train.iloc[5745, :]

text        Niestety mam podobne odczucia jak poprzedniczka .
label                                               z_minus_m
n_tokens                                                    7
Name: 5745, dtype: object

In [0]:
from sklearn.preprocessing import LabelEncoder
enc = LabelEncoder().fit(df_train.label.values)

# add encoded labels:
df_train["label_enc"] = enc.transform(df_train["label"])
df_dev["label_enc"] = enc.transform(df_dev["label"])
df_test["label_enc"] = enc.transform(df_test["label"])

In [163]:
# check for number of tokens ditribution
df_train["n_tokens"] = df_train.text.apply(lambda x: len(x.split()))
np.quantile(df_train.n_tokens, q=[.9, .99, .999])

array([ 33.   ,  68.   , 128.027])

### Prepare `BertTokenizer`

In [0]:
from nlp.tokenizers import SentencePieceTokenizer
from transformers import BertTokenizer
from absl import logging
logging.set_verbosity(logging.INFO)

SP_TO_LOWER = False
SP_MAX_VOCAB_SIZE = 2000

In [165]:
# initialize tokenizer:
tokenizer = SentencePieceTokenizer(lower=SP_TO_LOWER, strip_accents=SP_TO_LOWER)

# prepare segments:
segments = list(df_train.text) + list(df_dev.text) + list(df_test.text)

# fit tokenizer:
tokenizer.fit_on_segments(segments=segments, max_vocab_size=SP_MAX_VOCAB_SIZE)

100%|██████████████████████████████████████████████████████████████████████████| 57466/57466 [00:05<00:00, 10660.15it/s]


In [166]:
# save vocabulary to disk in the format compatible with BertTokenizer:
BERT_SPECIAL_TOKENS = ["[PAD]", "[UNK]", "[SEP]", "[CLS]", "[MASK]"]
vocab = tokenizer.get_vocabulary()

with pathlib.Path("./sentencepiece.vocab").open("w") as f:
  for token in BERT_SPECIAL_TOKENS:
    f.write(token + "\n")
  for idx, (word, __) in enumerate(vocab.items()):
    if idx > 3:
      if word.startswith("▁"):
        word = word.replace("▁", "")
      else:
        word = "##" + word
      f.write(word + "\n")

!head -n 7 ./sentencepiece.vocab

[PAD]
[UNK]
[SEP]
[CLS]
[MASK]
.
##a


In [167]:
# create new instance of BertTokenizer
bert_tokenizer = BertTokenizer("./sentencepiece.vocab", do_lower_case=SP_TO_LOWER, do_basic_tokenize=True)
print(bert_tokenizer.tokenize("Ala ma kota!"))

# encode plus:
bert_tokenizer.encode_plus("Ala ma kota!", max_length=10, pad_to_max_length=True)

['A', '##la', 'ma', 'ko', '##ta', '!']


{'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
 'input_ids': [3, 152, 86, 83, 253, 125, 59, 2, 0, 0],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}

In [0]:
# from transformers import BertTokenizer

# vocab = ["byli", "##śmy", "Byli", "##smy", "."]
# with open("/tmp/test.vocab", "w") as f:
#   for wordpiece in vocab:
#     f.write(wordpiece + "\n")

# bert_tokenizer = BertTokenizer("/tmp/test.vocab", do_basic_tokenize=True, do_lower_case=True)
# bert_tokenizer.tokenize("Byliśmy.")

### TFRecords

#### Prepare single Example

In [153]:
bert_tokenizer.tokenize("za późno")

['za', 'późn', '##o']

In [226]:
from typing import Dict
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# prepare data:
def _prepare_single_example(x: str, max_seq_len: int = None, label: int = 0) -> Dict:
  
  inputs = bert_tokenizer.encode_plus(x, max_length=max_seq_len, pad_to_max_length=False, return_tensors="tf")
  inputs["attention_mask"] = tf.squeeze(inputs["attention_mask"], axis=0)
  inputs["input_ids"] = tf.squeeze(inputs["input_ids"], axis=0)
  inputs["token_type_ids"] = tf.squeeze(inputs["token_type_ids"], axis=0)
  
  inputs["label"] = tf.convert_to_tensor(label)
  inputs["raw_text"] = tf.convert_to_tensor(x)

  return inputs

# serialize:
def _serialize_example(example):
  
  # attention mask
  example["attention_mask"] = tf.io.serialize_tensor(example["attention_mask"]).numpy()
  example["attention_mask"] = tf.train.Feature(bytes_list = tf.train.BytesList(value=[example["attention_mask"]]))
  
  # input ids
  example["input_ids"] = tf.io.serialize_tensor(example["input_ids"]).numpy()
  example["input_ids"] = tf.train.Feature(bytes_list = tf.train.BytesList(value=[example["input_ids"]]))

  # token type ids
  example["token_type_ids"] = tf.io.serialize_tensor(example["token_type_ids"]).numpy()
  example["token_type_ids"] = tf.train.Feature(bytes_list = tf.train.BytesList(value=[example["token_type_ids"]]))

  example["raw_text"] = tf.train.Feature(bytes_list = tf.train.BytesList(value=[example["raw_text"].numpy()]))
  example["label"] = tf.train.Feature(int64_list = tf.train.Int64List(value=[example["label"].numpy()]))

  ex = tf.train.Example(features=tf.train.Features(feature=example))
  return ex.SerializeToString()  

example = _prepare_single_example("Ala ma kota!", 64, 1)
example
tf.train.Example.FromString(_serialize_example(example))

features {
  feature {
    key: "attention_mask"
    value {
      bytes_list {
        value: "\010\003\022\004\022\002\010\010\" \001\000\000\000\001\000\000\000\001\000\000\000\001\000\000\000\001\000\000\000\001\000\000\000\001\000\000\000\001\000\000\000"
      }
    }
  }
  feature {
    key: "input_ids"
    value {
      bytes_list {
        value: "\010\003\022\004\022\002\010\010\" \003\000\000\000\230\000\000\000V\000\000\000S\000\000\000\375\000\000\000}\000\000\000;\000\000\000\002\000\000\000"
      }
    }
  }
  feature {
    key: "label"
    value {
      int64_list {
        value: 1
      }
    }
  }
  feature {
    key: "raw_text"
    value {
      bytes_list {
        value: "Ala ma kota!"
      }
    }
  }
  feature {
    key: "token_type_ids"
    value {
      bytes_list {
        value: "\010\003\022\004\022\002\010\010\" \000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000"
      }
    }


#### Prepare TFRecords

In [0]:
!rm ./train.TFRecord ./dev.TFRecord ./test.TFRecord

In [228]:
# Write the `tf.Example` observations to the file.

with tf.io.TFRecordWriter("./train.TFRecord") as writer:
  for __, row in tqdm.tqdm(df_train.iterrows()):
    example = _prepare_single_example(row.text, None, row.label_enc)
    writer.write(_serialize_example(example))

with tf.io.TFRecordWriter("./dev.TFRecord") as writer:
  for __, row in tqdm.tqdm(df_dev.iterrows()):
    example = _prepare_single_example(row.text, None, row.label_enc)
    writer.write(_serialize_example(example))

with tf.io.TFRecordWriter("./test.TFRecord") as writer:
  for __, row in tqdm.tqdm(df_test.iterrows()):
    example = _prepare_single_example(row.text, None, row.label_enc)
    writer.write(_serialize_example(example))


45974it [00:53, 854.15it/s]
5747it [00:06, 821.78it/s]
5745it [00:07, 819.88it/s]


In [229]:
!ls -la -h | grep TF

-rw-r--r-- 1 root root 3.9M Jan  3 20:39 dev.TFRecord
-rw-r--r-- 1 root root 3.9M Jan  3 20:39 test.TFRecord
-rw-r--r-- 1 root root  31M Jan  3 20:39 train.TFRecord


#### Copy to GC Bucket

In [230]:
from google.colab import auth
auth.authenticate_user()

# https://cloud.google.com/resource-manager/docs/creating-managing-projects
project_id = 'southern-shard-211411'
!gcloud config set project {project_id}

Updated property [core/project].


In [231]:
# Upload the files to a given Google Cloud Storage bucket.
!gsutil cp ./train.TFRecord gs://tf_experiments_records/PolEmo/train.TFRecord
!gsutil cp ./dev.TFRecord gs://tf_experiments_records/PolEmo/dev.TFRecord
!gsutil cp ./test.TFRecord gs://tf_experiments_records/PolEmo/test.TFRecord

Copying file://./train.TFRecord [Content-Type=application/octet-stream]...
-
Operation completed over 1 objects/30.6 MiB.                                     
Copying file://./dev.TFRecord [Content-Type=application/octet-stream]...
-
Operation completed over 1 objects/3.9 MiB.                                      
Copying file://./test.TFRecord [Content-Type=application/octet-stream]...
/ [1 files][  3.8 MiB/  3.8 MiB]                                                
Operation completed over 1 objects/3.8 MiB.                                      


### Read from GC Bucket

In [5]:
from google.colab import auth
auth.authenticate_user()

# https://cloud.google.com/resource-manager/docs/creating-managing-projects
project_id = 'southern-shard-211411'
!gcloud config set project {project_id}

Updated property [core/project].


To take a quick anonymous survey, run:
  $ gcloud survey



In [8]:
# Set access for the TPU pod:
!gsutil acl ch -u service-495559152420@cloud-tpu.iam.gserviceaccount.com:READER gs://tf_experiments_records/PolEmo/train.TFRecord
!gsutil acl ch -u service-495559152420@cloud-tpu.iam.gserviceaccount.com:READER gs://tf_experiments_records/PolEmo/dev.TFRecord
!gsutil acl ch -u service-495559152420@cloud-tpu.iam.gserviceaccount.com:READER gs://tf_experiments_records/PolEmo/test.TFRecord

Updated ACL on gs://tf_experiments_records/PolEmo/train.TFRecord
Updated ACL on gs://tf_experiments_records/PolEmo/dev.TFRecord
Updated ACL on gs://tf_experiments_records/PolEmo/test.TFRecord


In [6]:
# Create a dictionary describing the features.
_feature_description = {
    'attention_mask': tf.io.FixedLenFeature([], tf.string),
    'input_ids': tf.io.FixedLenFeature([], tf.string),
    'token_type_ids': tf.io.FixedLenFeature([], tf.string),
    'label': tf.io.FixedLenFeature([], tf.int64)
}

def _parse_data(example_proto, max_seq_len: int = 128):
  # Parse the input tf.Example proto using the dictionary above.
  rec = tf.io.parse_single_example(example_proto, _feature_description)
  
  # attention_mask:
  rec["attention_mask"] = tf.io.parse_tensor(rec["attention_mask"], out_type=tf.int32)
  rec["attention_mask"] = rec["attention_mask"][:max_seq_len]
  n_tokens = tf.shape(rec["attention_mask"])[0]
  padding = max_seq_len - n_tokens
  rec["attention_mask"] = tf.pad(rec["attention_mask"], paddings=[[0, padding]])

  # input_ids:
  rec["input_ids"] = tf.io.parse_tensor(rec["input_ids"], out_type=tf.int32)
  rec["input_ids"] = rec["input_ids"][:max_seq_len]
  rec["input_ids"] = tf.pad(rec["input_ids"], paddings=[[0, padding]])
  
  # token_type_ids
  rec["token_type_ids"] = tf.io.parse_tensor(rec["token_type_ids"], out_type=tf.int32)
  rec["token_type_ids"] = rec["token_type_ids"][:max_seq_len]
  rec["token_type_ids"] = tf.pad(rec["token_type_ids"], paddings=[[0, padding]])
 
  # shape bug?
  # rec["text/embedding"] = tf.reshape(rec["text/embedding"], [64, 256])
  rec["attention_mask"] = tf.reshape(rec["attention_mask"], [max_seq_len, ])
  rec["input_ids"] = tf.reshape(rec["input_ids"], [max_seq_len, ])
  rec["token_type_ids"] = tf.reshape(rec["token_type_ids"], [max_seq_len, ])
 
  labels = tf.one_hot(rec["label"], depth=4)
  inputs = rec
  inputs.pop("label")

  return inputs, labels

train_raw = tf.data.TFRecordDataset("gs://tf_experiments_records/PolEmo/train.TFRecord", num_parallel_reads=4)
example_proto = next(iter(train_raw))
_parse_data(example_proto, max_seq_len=12)

({'attention_mask': <tf.Tensor: shape=(12,), dtype=int32, numpy=array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)>,
  'input_ids': <tf.Tensor: shape=(12,), dtype=int32, numpy=
  array([  3, 778,  31, 232,  50,  73,  26,   8, 796, 224,  16, 165],
        dtype=int32)>,
  'token_type_ids': <tf.Tensor: shape=(12,), dtype=int32, numpy=array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)>},
 <tf.Tensor: shape=(4,), dtype=float32, numpy=array([0., 0., 0., 1.], dtype=float32)>)

In [7]:
BATCH_SIZE = 32

train_raw = tf.data.TFRecordDataset("gs://tf_experiments_records/PolEmo/train.TFRecord", num_parallel_reads=4)
train_parsed = train_raw.map(_parse_data).batch(BATCH_SIZE, drop_remainder=True).shuffle(1024)
train_parsed = train_parsed.prefetch(-1)

inputs, labels = next(iter(train_parsed))

dev_raw = tf.data.TFRecordDataset("gs://tf_experiments_records/PolEmo/dev.TFRecord", num_parallel_reads=4)
dev_parsed = dev_raw.map(_parse_data).batch(BATCH_SIZE, drop_remainder=True)
dev_parsed = dev_parsed.prefetch(-1)

test_raw = tf.data.TFRecordDataset("gs://tf_experiments_records/PolEmo/test.TFRecord", num_parallel_reads=4)
test_parsed = test_raw.map(_parse_data).batch(BATCH_SIZE, drop_remainder=True)
test_parsed = test_parsed.prefetch(-1)

inputs

{'attention_mask': <tf.Tensor: shape=(32, 128), dtype=int32, numpy=
 array([[1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        ...,
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0]], dtype=int32)>,
 'input_ids': <tf.Tensor: shape=(32, 128), dtype=int32, numpy=
 array([[   3,  837,   84, ...,    0,    0,    0],
        [   3,  129, 1510, ...,    0,    0,    0],
        [   3,  115,   33, ...,    0,    0,    0],
        ...,
        [   3, 1361,    9, ...,    0,    0,    0],
        [   3,   63,  208, ...,    0,    0,    0],
        [   3,  153,    9, ...,    0,    0,    0]], dtype=int32)>,
 'token_type_ids': <tf.Tensor: shape=(32, 128), dtype=int32, numpy=
 array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int32)>}

## Small Transformer

### Architecture

![alt text](https://miro.medium.com/max/3740/1*hC4zIxPPK9KGDu-OYUfnCQ.png)

### Settings

*   vocab_size: 4000
*   hidden_size: 256
*   num_hidden_layers: 4
*   num_attention_heads: 4
*   intermediate_size: 1024
*   max_position_embeddings: 128

### `BertConfig`

In [0]:
from transformers import BertConfig
from transformers import TFBertForSequenceClassification

# new config
config = BertConfig(vocab_size=2000, 
                    hidden_size=128, 
                    num_attention_heads=4, 
                    num_hidden_layers=4, 
                    intermediate_size=256, 
                    max_position_embeddings=128,
                    num_labels=4)

In [0]:
with strategy.scope():
  tf.random.set_seed(1234)
  model = TFBertForSequenceClassification(config)

  initial_learning_rate = 1e-3
  lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate,
    decay_steps=100,
    decay_rate=0.9,
    staircase=True)

  model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr_schedule), 
                loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True), 
                metrics=[tf.keras.metrics.CategoricalAccuracy()])

In [0]:
model.fit(train_parsed, validation_data=dev_parsed, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3
 136/1436 [=>............................] - ETA: 57s - loss: 0.7348 - categorical_accuracy: 0.7273

In [14]:
model.summary()
!nvidia-smi

Model: "tf_bert_for_sequence_classification_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  819328    
_________________________________________________________________
dropout_27 (Dropout)         multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  516       
Total params: 819,844
Trainable params: 819,844
Non-trainable params: 0
_________________________________________________________________


### Model accuracy

In [0]:
from sklearn.metrics import classification_report
import numpy as np

# y_pred
y_pred = model.predict(test_parsed)
y_pred = np.argmax(y_pred, axis=1)
y_pred

# y_true
y_true = []  
for __, labels in test_parsed:
  y_true += list(np.argmax(labels.numpy(), axis=1))

In [40]:
print(classification_report(y_true=y_true, y_pred=y_pred, digits=3))

              precision    recall  f1-score   support

           0      0.582     0.334     0.425       679
           1      0.648     0.802     0.717      2119
           2      0.717     0.679     0.697      1515
           3      0.719     0.650     0.683      1415

    accuracy                          0.677      5728
   macro avg      0.666     0.616     0.630      5728
weighted avg      0.676     0.677     0.669      5728



In [0]:
# prediction examples:


## MultiHead Attention

## With SP Tokenisation 

## Finetune BERT

In [0]:
from transformers.modeling_tf_bert import TFBertForSequenceClassification
from transformers import BertTokenizer, TFBertForSequenceClassification

In [0]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

In [0]:
with strategy.scope():
  model = TFBertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=4)

  initial_learning_rate = 1e-4
  lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate,
    decay_steps=1000,
    decay_rate=0.9,
    staircase=True)

  model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr_schedule), 
                loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True), 
                metrics=[tf.keras.metrics.CategoricalAccuracy()])


In [0]:
from sklearn.preprocessing import OneHotEncoder
TRAINING_EXAMPLES = 1000

train_texts = df_train.text.values
train_labels = df_train.label_enc

# create tensorflow dataset:
train_texts_encoded = [tokenizer.encode_plus(text, max_length=128, pad_to_max_length=True) 
                       for text in train_texts[:TRAINING_EXAMPLES]]

train_text_dict = pd.DataFrame(train_texts_encoded)
train_text_dict = {"input_ids": np.vstack(train_text_dict.input_ids.apply(np.array).values), 
                   "atttention_mask": np.vstack(train_text_dict.attention_mask.apply(np.array).values)}

y = np.array(train_labels[:TRAINING_EXAMPLES])
y = y.reshape(-1, 1)
enc = OneHotEncoder().fit(y)



In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [0]:
dataset = tf.data.Dataset.from_tensor_slices((train_text_dict, enc.transform(X=y).todense())).shuffle(1024).batch(64, drop_remainder=True)

In [0]:
model.fit(dataset, steps_per_epoch=625, epochs=5)

Train for 625 steps
Epoch 1/5
  1/625 [..............................] - ETA: 6:50:31

UnavailableError: ignored