<a href="https://colab.research.google.com/github/jakubglinka/google.colab/blob/master/NLP/supervised/SequenceClassificationWithAttention.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sequence classification using simple Attention 
 - Positional encoding with additional [CLS] token
 - Pre-trained SP embeddings
 - Small multihead attention model

## Configure environment

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
from __future__ import absolute_import, division, print_function, unicode_literals

try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
print(tf.__version__)

TensorFlow 2.x selected.
Num GPUs Available:  0
2.1.0-rc1


In [3]:
try:
  cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
  tf.config.experimental_connect_to_cluster(cluster_resolver)
  tf.tpu.experimental.initialize_tpu_system(cluster_resolver)
  strategy = tf.distribute.experimental.TPUStrategy(cluster_resolver)
except ValueError as error:
  print(error)
  print("No TPU available. Switching to single device strategy.")
  strategy = tf.distribute.OneDeviceStrategy(device="/cpu")

Please provide a TPU Name to connect to.
No TPU available. Switching to single device strategy.


In [0]:
!pip install transformers
!pip install git+https://jbglin:botrx56jtlp6p2cbsthvt3bkslgeo3pzc5c7iuu4irxscjmmc6xa@dev.azure.com/eyDataScienceTeam/_git/nlp-ey-assets@develop

## Data Preparation

### PolEmo data

In [0]:
import pandas as pd
import pathlib
import re
import tqdm
POLEMO_PATH = "./drive/My Drive/sentiment/"
from typing import List
import numpy as np

In [5]:
# read PolEmo data:
def read_polemo_data(path) -> pd.DataFrame:
  res = []
  with path.open("r") as f:
    for line in f:
      rec = line.strip("\n").split("__label__")
      rec[0] = rec[0].strip()
      res.append(rec)

  return pd.DataFrame(res, columns=["text", "label"])

df_train = read_polemo_data(pathlib.Path(POLEMO_PATH) / "all.sentence.train.txt")
print(f"Read {df_train.shape[0]} train examples.")

df_dev = read_polemo_data(pathlib.Path(POLEMO_PATH) / "all.sentence.dev.txt")
print(f"Read {df_dev.shape[0]} dev examples.")

df_test = read_polemo_data(pathlib.Path(POLEMO_PATH) / "all.sentence.test.txt")
print(f"Read {df_test.shape[0]} test examples.")

Read 45974 train examples.
Read 5747 dev examples.
Read 5745 test examples.


In [0]:
from sklearn.preprocessing import LabelEncoder
enc = LabelEncoder().fit(df_train.label.values)

# add encoded labels:
df_train["label_enc"] = enc.transform(df_train["label"])
df_dev["label_enc"] = enc.transform(df_dev["label"])
df_test["label_enc"] = enc.transform(df_test["label"])

## Multihead attention

### Architecture

![alt text](https://miro.medium.com/max/3740/1*hC4zIxPPK9KGDu-OYUfnCQ.png)

### Settings

 - number of sentence piece tokens: 32000
 - 

### Prepare Embedder

In [8]:
import numpy as np
from absl import logging
from nlp.models.embeddings import SentencePiece2Vec, load_embedder

# set verbosity:
logging.set_verbosity(logging.INFO)

# load embeder:
embedder = load_embedder("./drive/My Drive/embeddings/SP2VEC_UNIGRAM_VS=32k_ED=256_NS=5_CS=10_SP=TRUE_LANG=PL")

INFO:absl:Loading embedder from drive/My Drive/embeddings/SP2VEC_UNIGRAM_VS=32k_ED=256_NS=5_CS=10_SP=TRUE_LANG=PL
INFO:absl:Loading tokenizer...
INFO:absl:Loading embedder parameters...
INFO:absl:Loading words counter...
INFO:absl:Restoring Tensorflow model...
INFO:absl:Loading model weights from drive/My Drive/embeddings/SP2VEC_UNIGRAM_VS=32k_ED=256_NS=5_CS=10_SP=TRUE_LANG=PL/model.h5...
INFO:absl:41 out of 32000 SentencePieces had not been seen during training...


In [9]:
# test tokenizer:
embedder.tokenizer.tokenize("Ala ma kota!")

[['▁Ala'], ['▁ma'], ['▁kot', 'a'], ['▁!']]

### Prepare TFRecords

#### Prepare single Example

In [10]:
embedder.embed("Ala ma kota!")

array([[-0.55768859,  0.27311903, -0.33694631, ...,  0.08220041,
        -0.0613557 , -0.38406998],
       [-0.03268599, -0.08437294,  0.21085401, ...,  0.37722936,
        -0.03566481,  0.26155886],
       [ 0.16951726,  0.40455094,  0.0235016 , ..., -0.01163975,
        -0.25137361, -0.11694135],
       [ 0.11200984,  0.1492672 ,  0.4651545 , ...,  0.35221201,
         0.11847479,  0.43346739]])

In [11]:
from typing import Dict

# prepare data:
def _prepare_single_example(x: str, max_seq_len: int = None, label: int = 0) -> np.array:
  res = embedder.embed(text=x)
  res = np.array(res, dtype="float32")
  inputs = {}
  if max_seq_len is not None:
    res = res[:max_seq_len, :]
    n_tokens = res.shape[0]
    inputs["text/embedding"] = np.pad(res, ((0, max_seq_len - n_tokens),(0, 0)))
  else:
    inputs["text/embedding"] = res
    n_tokens = res.shape[0]
    
  inputs["text/embedding"] = tf.convert_to_tensor(inputs["text/embedding"], dtype=tf.float32)
  inputs["text/seq_len"] = tf.convert_to_tensor(n_tokens, dtype=tf.int32)
  inputs["text/label"] = tf.convert_to_tensor(label, dtype=tf.int32)
  return inputs

# serialize:
def _serialize_example(example):
  example["text/embedding"] = tf.io.serialize_tensor(example["text/embedding"]).numpy()
  example["text/embedding"] = tf.train.Feature(bytes_list = tf.train.BytesList(value=[example["text/embedding"]]))

  example["text/seq_len"] = tf.train.Feature(int64_list = tf.train.Int64List(value=[example["text/seq_len"].numpy()]))
  example["text/label"] = tf.train.Feature(int64_list = tf.train.Int64List(value=[example["text/label"].numpy()]))

  ex = tf.train.Example(features=tf.train.Features(feature=example))
  return ex.SerializeToString()  

example = _prepare_single_example("Ala ma kota!", 64, 1)
tf.train.Example.FromString(_serialize_example(example))

features {
  feature {
    key: "text/embedding"
    value {
      bytes_list {
        value: "\010\001\022\t\022\002\010@\022\003\010\200\002\"\200\200\004\256\304\016\277B\326\213>:\204\254\276\203?\017\276\300\213L?/\350y>\354\301\360=\270V\030\277\355\354\235\276w\274O\2749\016\202>IR\232\276\203\335\377=\003\320:\276|\005\007>a\263\377\274Zqd>v\364i>\277\355\267=\263\370\252\276\013\236\220>\357\231:\276G!\032\277^\031\351=\340o\273\275!\340\215=F\360E\276\202\314\177=)m\216\273\210\001k\276[^\363\275\241\262\030\277\34054>X\361\347\276\362U\224\276W\t7?tf\363=\362\366\307\276\360\007\342=\000\367\024\277D\3250?\273\252|=\222\205\363\276\365f\3239`\001\003?[\372\210\276y\374\253>\251\035B>\3115\250>\263\240P\275\235B\332;\342%m>\212\022j?1GM\276\361\013\334=\210\372\031>\301\237\013>pj*>\262\240\'?\255\342\324\276\r\366\317\276\020\236\013\277.\322\014\277\252D\001?\0300\362>\274\350<\276\300>\306=\033\333\277>\327\017\204>\266q\277=\374\204\320\276\273\261\373\276\253\0065\277-\

#### Save as TFRecords

In [12]:
!rm ./train.TFRecord ./dev.TFRecord ./test.TFRecord

rm: cannot remove './train.TFRecord': No such file or directory
rm: cannot remove './dev.TFRecord': No such file or directory
rm: cannot remove './test.TFRecord': No such file or directory


In [13]:
# Write the `tf.Example` observations to the file.

with tf.io.TFRecordWriter("./train.TFRecord") as writer:
  for __, row in tqdm.tqdm(df_train.iterrows()):
    example = _prepare_single_example(row.text, None, row.label_enc)
    writer.write(_serialize_example(example))

with tf.io.TFRecordWriter("./dev.TFRecord") as writer:
  for __, row in tqdm.tqdm(df_dev.iterrows()):
    example = _prepare_single_example(row.text, None, row.label_enc)
    writer.write(_serialize_example(example))

with tf.io.TFRecordWriter("./test.TFRecord") as writer:
  for __, row in tqdm.tqdm(df_test.iterrows()):
    example = _prepare_single_example(row.text, None, row.label_enc)
    writer.write(_serialize_example(example))


45974it [00:42, 1089.81it/s]
5747it [00:05, 1067.52it/s]
5745it [00:05, 1083.02it/s]


In [14]:
!ls -la -h | grep TF

-rw-r--r-- 1 root root 103M Jan  6 19:04 dev.TFRecord
-rw-r--r-- 1 root root 102M Jan  6 19:04 test.TFRecord
-rw-r--r-- 1 root root 808M Jan  6 19:04 train.TFRecord


#### Copy to GC Bucket

In [15]:
from google.colab import auth
auth.authenticate_user()

# https://cloud.google.com/resource-manager/docs/creating-managing-projects
project_id = 'southern-shard-211411'
!gcloud config set project {project_id}

Updated property [core/project].


To take a quick anonymous survey, run:
  $ gcloud survey



In [16]:
# Upload the files to a given Google Cloud Storage bucket.
!gsutil cp ./train.TFRecord gs://tf_experiments_records/PolEmo/embedded/train.TFRecord
!gsutil cp ./dev.TFRecord gs://tf_experiments_records/PolEmo/embedded/dev.TFRecord
!gsutil cp ./test.TFRecord gs://tf_experiments_records/PolEmo/embedded/test.TFRecord


Copying file://./train.TFRecord [Content-Type=application/octet-stream]...
/ [0 files][    0.0 B/808.0 MiB]                                                ==> NOTE: You are uploading one or more large file(s), which would run
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

|
Operation completed over 1 objects/808.0 MiB.                                    
Copying file://./dev.TFRecord [Content-Type=application/octet-stream]...
\
Operation

### Prepare Dataset

In [0]:
from google.colab import auth
auth.authenticate_user()

# https://cloud.google.com/resource-manager/docs/creating-managing-projects
project_id = 'southern-shard-211411'
!gcloud config set project {project_id}

In [0]:
# give access for TPU Pod
!gsutil acl ch -u service-495559152420@cloud-tpu.iam.gserviceaccount.com:READER gs://tf_experiments_records/PolEmo/embedded/train.TFRecord
!gsutil acl ch -u service-495559152420@cloud-tpu.iam.gserviceaccount.com:READER gs://tf_experiments_records/PolEmo/embedded/dev.TFRecord
!gsutil acl ch -u service-495559152420@cloud-tpu.iam.gserviceaccount.com:READER gs://tf_experiments_records/PolEmo/embedded/test.TFRecord

In [59]:
# Create a dictionary describing the features.
_feature_description = {
    'text/embedding': tf.io.FixedLenFeature([], tf.string),
    'text/label': tf.io.FixedLenFeature([], tf.int64),
    'text/seq_len': tf.io.FixedLenFeature([], tf.int64)
}

def _parse_data(example_proto, max_seq_len: int = 128):
  # Parse the input tf.Example proto using the dictionary above.
  rec = tf.io.parse_single_example(example_proto, _feature_description)
  
  # attention_mask:
  rec["text/embedding"] = tf.io.parse_tensor(rec["text/embedding"], out_type=tf.float32)
  rec["text/embedding"] = rec["text/embedding"][:max_seq_len, :]
  n_tokens = tf.shape(rec["text/embedding"])[0]
  padding = max_seq_len - n_tokens
  rec["text/embedding"] = tf.pad(rec["text/embedding"], paddings=[[0, padding], [0, 0]])

  # shape bug?
  rec["text/embedding"] = tf.reshape(rec["text/embedding"], [max_seq_len, 256])
  
  labels = tf.one_hot(rec["text/label"], depth=4)
  inputs = rec
  inputs.pop("text/label")

  return inputs, labels

train_raw = tf.data.TFRecordDataset("gs://tf_experiments_records/PolEmo/embedded/train.TFRecord", num_parallel_reads=4)
example_proto = next(iter(train_raw))
_parse_data(example_proto, max_seq_len=12)

({'text/embedding': <tf.Tensor: shape=(12, 256), dtype=float32, numpy=
  array([[-2.1275625e-01,  2.1668257e-01,  4.2006724e-02, ...,
           2.8739952e-02,  5.7239354e-01,  5.1114595e-01],
         [ 1.4975270e+00, -2.6114643e-01, -8.2025570e-01, ...,
          -1.4366919e-01,  2.5677538e-01,  1.0759610e+00],
         [-1.2762208e-01,  2.6893950e-01, -4.1812029e-01, ...,
           3.6373354e-02, -2.8747786e-02,  2.8844136e-01],
         ...,
         [ 2.2510141e-01,  2.5288799e-01,  1.7103653e-01, ...,
           6.7523316e-02,  2.5973466e-01,  3.4373689e-01],
         [ 8.5347988e-02, -1.4168697e-02,  2.0092191e-02, ...,
          -4.9609572e-02, -2.6992057e-06,  2.7695408e-03],
         [-1.3251345e-01,  1.5071502e-01, -8.8039011e-02, ...,
           2.5849009e-01, -7.6398574e-02,  4.7453195e-01]], dtype=float32)>,
  'text/seq_len': <tf.Tensor: shape=(), dtype=int64, numpy=18>},
 <tf.Tensor: shape=(4,), dtype=float32, numpy=array([0., 0., 0., 1.], dtype=float32)>)

In [60]:
BATCH_SIZE = 8*64
STEPS_PER_EPOCH = int(np.floor(45000 / BATCH_SIZE))
VALIDATION_STEPS = int(np.floor(5700 / BATCH_SIZE))

print(f"STEPS_PER_EPOCH: {STEPS_PER_EPOCH}")
print(f"VALIDATION_STEPS: {VALIDATION_STEPS}")

train_raw = tf.data.TFRecordDataset("gs://tf_experiments_records/PolEmo/embedded/train.TFRecord", num_parallel_reads=1)
train_parsed = train_raw.map(_parse_data).batch(BATCH_SIZE, drop_remainder=True).repeat(100)
train_parsed = train_parsed.prefetch(-1)

inputs, labels = next(iter(train_parsed))

dev_raw = tf.data.TFRecordDataset("gs://tf_experiments_records/PolEmo/embedded/dev.TFRecord", num_parallel_reads=1)
dev_parsed = dev_raw.map(_parse_data).batch(BATCH_SIZE, drop_remainder=True)
dev_parsed = dev_parsed.prefetch(-1)

test_raw = tf.data.TFRecordDataset("gs://tf_experiments_records/PolEmo/embedded/test.TFRecord", num_parallel_reads=1)
test_parsed = test_raw.map(_parse_data).batch(BATCH_SIZE, drop_remainder=True)
test_parsed = test_parsed.prefetch(-1)

# inputs

STEPS_PER_EPOCH: 87
VALIDATION_STEPS: 11


### Create model

#### ModelConfig

In [0]:
from transformers import BertConfig

class ModelConfig(BertConfig):

  def __init__(self, seed, **kwargs):
    super(ModelConfig, self).__init__(**kwargs)
    self.seed = seed
    
# new config
config = ModelConfig(vocab_size=10000, output_hidden_states=False,
                    hidden_size=256, 
                    num_attention_heads=4, 
                    num_hidden_layers=4, 
                    intermediate_size=512, 
                    max_position_embeddings=128,
                    num_labels=4, 
                    hidden_dropout_prob=0.1, 
                    attention_probs_dropout_prob=0.1, seed=1234)

# config

#### Embedding layer

In [63]:
# https://github.com/huggingface/transformers/blob/645713e2cb8307e41febb2b7c9f6036f6645efce/transformers/modeling_tf_bert.py#L93

class PositionalEmbedding(tf.keras.layers.Layer):
  """Enrich embeddings with positional encoding."""

  def __init__(self, config, **kwargs):
    super(PositionalEmbedding, self).__init__(name="PositonalEmbedding", **kwargs)
    self.embedding_dim = config.hidden_size
    self.positional_embeddings = tf.keras.layers.Embedding(config.max_position_embeddings, 
                                                         config.hidden_size,
                                                         embeddings_initializer=tf.keras.initializers.GlorotUniform(seed=config.seed),
                                                         name="weights")
    self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
    self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob, name="dropout")

  def build(self, input_shape):
    with tf.name_scope("positional_embeddings"):
      self.cls_embedding = self.add_weight("cls_embedding", 
                                           shape=[1, 1, config.hidden_size], 
                                           initializer=tf.keras.initializers.GlorotUniform(seed=config.seed+1))
      self.positional_embeddings.build(input_shape)
    super(PositionalEmbedding, self).build(input_shape)

  def call(self, inputs, training=False):

    position_ids = tf.range(tf.shape(inputs["text/embedding"])[1])
    emb = inputs["text/embedding"] + self.positional_embeddings(position_ids)
    emb = tf.concat([tf.repeat(self.cls_embedding, tf.shape(emb)[0],axis=0), emb], axis=1)
    emb = self.layer_norm(emb)
    emb = self.dropout(emb, training=training)
    
    return emb

pe = PositionalEmbedding(config)
pe.build([None, 128, 256])
emb = pe.call(inputs)
emb.shape

TensorShape([512, 129, 256])

#### Multihead attention

In [77]:
from transformers.modeling_tf_bert import TFBertEncoder, shape_list
encoder = TFBertEncoder(config)

# head mask:
head_mask = [None] * config.num_hidden_layers
head_mask

# attention mask:
attention_mask = tf.sequence_mask(lengths=inputs["text/seq_len"]+1, maxlen=129, dtype=tf.float32)
extended_attention_mask = attention_mask[:, tf.newaxis, tf.newaxis, :]
extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
extended_attention_mask

# encoder:
encoder([emb, extended_attention_mask, head_mask])



(<tf.Tensor: shape=(512, 129, 256), dtype=float32, numpy=
 array([[[-1.6703742e+00,  1.3827596e+00,  1.6873264e+00, ...,
          -5.5346417e-01,  2.2017308e-01,  1.0798774e+00],
         [-3.9335343e-01,  7.3320627e-01,  1.1309557e-02, ...,
           1.8624063e-01,  1.2231289e+00,  1.2263955e+00],
         [ 2.6031475e+00, -4.1109642e-01, -1.0421672e+00, ...,
          -1.3593070e-01,  3.2828045e-01,  1.8283193e+00],
         ...,
         [-1.2796075e+00,  1.2156439e+00, -1.2634727e+00, ...,
           1.2903150e+00,  8.9674455e-01, -2.9596120e-01],
         [-7.0081168e-01, -3.7575454e-01,  6.2811577e-01, ...,
           1.6203120e+00,  1.3069722e-01, -3.6707290e-02],
         [ 1.0926259e+00, -1.5183010e+00, -1.3988708e+00, ...,
          -8.0997896e-01, -1.1907557e+00, -1.3450123e+00]],
 
        [[-1.6151129e+00,  1.3785655e+00,  1.6752311e+00, ...,
          -4.6181589e-01,  3.6049491e-01,  1.0645559e+00],
         [ 5.6881356e-01, -5.2409679e-01, -4.1481832e-01, ...,
        