<a href="https://colab.research.google.com/github/iamksseo/NeuralLog/blob/main/test/west_colab_gpu_Transformer_based_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!nvidia-smi

Mon Aug 29 01:33:20 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   51C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
pip install -q tf-models-official

[K     |████████████████████████████████| 2.1 MB 8.2 MB/s 
[K     |████████████████████████████████| 99 kB 7.7 MB/s 
[K     |████████████████████████████████| 4.6 MB 59.1 MB/s 
[K     |████████████████████████████████| 636 kB 70.7 MB/s 
[K     |████████████████████████████████| 238 kB 72.6 MB/s 
[K     |████████████████████████████████| 511.7 MB 5.5 kB/s 
[K     |████████████████████████████████| 1.3 MB 27.7 MB/s 
[K     |████████████████████████████████| 43 kB 2.4 MB/s 
[K     |████████████████████████████████| 1.1 MB 57.5 MB/s 
[K     |████████████████████████████████| 352 kB 69.9 MB/s 
[K     |████████████████████████████████| 116 kB 72.6 MB/s 
[K     |████████████████████████████████| 5.8 MB 51.4 MB/s 
[K     |████████████████████████████████| 438 kB 62.2 MB/s 
[K     |████████████████████████████████| 1.6 MB 60.9 MB/s 
[?25h  Building wheel for py-cpuinfo (setup.py) ... [?25l[?25hdone
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [3]:
import pickle

In [4]:
import numpy as np
from tensorflow.keras.utils import Sequence
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from official.nlp import optimization

In [5]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [6]:
from sklearn.utils import shuffle
from sklearn.metrics import classification_report

In [7]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# II. Transformer

In [8]:
def get_angles(pos, i, d_model):
    angle_rates = 1 / np.power(10000, (2 * (i//2)) / np.float32(d_model))
    return pos * angle_rates

def positional_encoding(position, d_model):
    angle_rads = get_angles(np.arange(position)[:, np.newaxis],
                            np.arange(d_model)[np.newaxis, :],
                            d_model)

    # apply sin to even indices in the array; 2i
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])

    # apply cos to odd indices in the array; 2i+1
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])

    pos_encoding = angle_rads[np.newaxis, ...]

    return tf.cast(pos_encoding, dtype=tf.float32)

In [9]:
class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)
 
    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

In [10]:
class PositionEmbedding(layers.Layer):
    def __init__(self, max_len, vocab_size, embed_dim):
        super(PositionEmbedding, self).__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        # self.pos_emb = layers.Embedding(input_dim=max_len, output_dim=embed_dim)
        self.pos_encoding = positional_encoding(max_len,
                                                embed_dim)
 
    def call(self, x):
        # x = self.token_emb(x)
        seq_len = tf.shape(x)[1]
        # print(maxlen)
        x += self.pos_encoding[:, :seq_len, :]
        # positions = tf.range(start=0, limit=maxlen, delta=1)
        # positions = self.pos_emb(positions)
        # print(x.shape, positions.shape)
        # x = self.token_emb(x)
        return x

In [11]:
embed_dim = 768  # Embedding size for each token
num_heads = 12  # Number of attention heads
ff_dim = 2048  # Hidden layer size in feed forward network inside transformer
max_len = 75
num_layers = 1

In [12]:
def transformer_classifer(input_size, loss_object, optimizer, dropout=0.1):
    inputs = layers.Input(shape=(max_len, embed_dim))
    transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
    embedding_layer = PositionEmbedding(100, 2000, embed_dim)
    # print(inputs.shape)
    x = embedding_layer(inputs)
    # print(x.shape)
    x = transformer_block(x)
    x = layers.GlobalAveragePooling1D()(x)
    x = layers.Dropout(dropout)(x)
    x = layers.Dense(32, activation="relu")(x)
    x = layers.Dropout(dropout)(x)
    outputs = layers.Dense(2, activation="softmax")(x)
    model = keras.Model(inputs=inputs, outputs=outputs)
    model.compile(loss=loss_object, metrics=['accuracy'],
                  optimizer=optimizer)
    return model

# Training/Testing

In [13]:
class BatchGenerator(Sequence):
 
    def __init__(self, X, Y, batch_size):
        self.X, self.Y = X, Y
        self.batch_size = batch_size
 
    def __len__(self):
        return int(np.ceil(len(self.X) / float(self.batch_size)))
 
    def __getitem__(self, idx):
        # print(self.batch_size)
        dummy = np.zeros(shape=(embed_dim,))
        x = self.X[idx * self.batch_size:min((idx + 1) * self.batch_size, len(self.X))]
        X = np.zeros((len(x), max_len, embed_dim))
        Y = np.zeros((len(x), 2))
        item_count = 0
        for i in range(idx * self.batch_size, min((idx + 1) * self.batch_size, len(self.X))):
            x = self.X[i]
            if len(x) > max_len:
                x = x[-max_len:]
            x = np.pad(np.array(x), pad_width=((max_len - len(x), 0), (0, 0)), mode='constant',
                       constant_values=0)
            X[item_count] = np.reshape(x, [max_len, embed_dim])
            Y[item_count] = self.Y[i]
            item_count += 1
        return X[:], Y[:, 0]

In [14]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()
 
        self.d_model = d_model
        self.d_model = tf.cast(self.d_model, tf.float32)
 
        self.warmup_steps = warmup_steps
 
    def __call__(self, step):
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)
 
        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

In [15]:
def train_generator(training_generator, validate_generator, num_train_samples, num_val_samples, batch_size,
                      epoch_num, model_name=None):
  
    # learning_rate = CustomSchedule(768)
 
    # optim = tf.keras.optimizers.Adam(learning_rate)
    
    optim = Adam()
    epochs = epoch_num
    steps_per_epoch = num_train_samples 
    num_train_steps = steps_per_epoch * epochs
    num_warmup_steps = int(0.1*num_train_steps)
 
    init_lr = 3e-4
    optimizer = optimization.create_optimizer(init_lr=init_lr,
                                              num_train_steps=num_train_steps,
                                              num_warmup_steps=num_warmup_steps,
                                              optimizer_type='adamw')
    
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy()
 
    model = transformer_classifer(768, loss_object, optimizer)
 
    # model.load_weights("hdfs_transformer.hdf5")
 
    print(model.summary())
 
    # checkpoint
    filepath = model_name
    checkpoint = ModelCheckpoint(filepath,
                                 monitor='val_accuracy',
                                 verbose=1,
                                 save_best_only=True,
                                 mode='max',
                                 save_weights_only=True)
    early_stop = EarlyStopping(
        monitor='val_loss', min_delta=0, patience=5, verbose=0, mode='auto',
        baseline=None, restore_best_weights=True
    )
    callbacks_list = [checkpoint, early_stop]
    
    # class_weight = {0: 245., 1: 1.}
 
    model.fit_generator(generator=training_generator,
                        steps_per_epoch=int(num_train_samples / batch_size),
                        epochs=epoch_num,
                        verbose=1,
                        validation_data=validate_generator,
                        validation_steps=int(num_val_samples / batch_size),
                        workers=16,
                        max_queue_size=32,
                        callbacks=callbacks_list,
                        shuffle=True
                        # class_weight=class_weight
                        )
    return model

In [16]:
def train(X, Y, epoch_num, batch_size, tx, ty, model_file=None):
    X, Y = shuffle(X, Y)
    n_samples = len(X)
    train_x, train_y = X[:int(n_samples * 90 / 100)], Y[:int(n_samples * 90 / 100)]
    val_x, val_y = X[int(n_samples * 90 / 100):], Y[int(n_samples * 90 / 100):]
    
    training_generator, num_train_samples = BatchGenerator(train_x, train_y, batch_size), len(train_x)
    validate_generator, num_val_samples = BatchGenerator(val_x, val_y, batch_size), len(val_x)
 
    print("Number of training samples: {0} - Number of validating samples: {1}".format(num_train_samples,
                                                                                       num_val_samples))
 
    model = train_generator(training_generator, validate_generator, num_train_samples, num_val_samples, batch_size,
                              epoch_num, model_name=model_file)
    test_model(model, tx, ty, batch_size)
 
 
def test_model(model, x, y, batch_size):
    x, y = shuffle(x, y)
    x, y = x[: len(x) // batch_size * batch_size], y[: len(y) // batch_size * batch_size]
    test_loader = BatchGenerator(x, y, batch_size)
    prediction = model.predict_generator(test_loader, steps=(len(x) // batch_size), workers=16, max_queue_size=32,
                                         verbose=1)
    prediction = np.argmax(prediction, axis=1)
    y = y[:len(prediction)]
    report = classification_report(np.array(y), prediction)
    print(report)

In [17]:
from collections import Counter

In [18]:
with open("neural-train.pkl", mode="rb") as f:
    (x_tr, y_tr) = pickle.load(f)
x_tr, y_tr = shuffle(x_tr, y_tr)
print(Counter(y_tr))
with open("neural-test.pkl", mode="rb") as f:
    (x_te, y_te) = pickle.load(f)
print(Counter(y_te))
print("Data loaded")

Counter({0: 446559, 1: 13489})
Counter({0: 111664, 1: 3349})
Data loaded


In [None]:
train(x_tr, y_tr, 20, 64, x_te, y_te, "west_hdfs_transformer.hdf5")

Number of training samples: 414043 - Number of validating samples: 46005
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 75, 768)]         0         
                                                                 
 position_embedding (Positio  (None, 75, 768)          0         
 nEmbedding)                                                     
                                                                 
 transformer_block (Transfor  (None, 75, 768)          31491584  
 merBlock)                                                       
                                                                 
 global_average_pooling1d (G  (None, 768)              0         
 lobalAveragePooling1D)                                          
                                                                 
 dropout_2 (Dropout)         (None, 768)              



Epoch 1/20
Epoch 1: val_accuracy improved from -inf to 0.98810, saving model to west_hdfs_transformer.hdf5
Epoch 2/20
Epoch 2: val_accuracy improved from 0.98810 to 0.99513, saving model to west_hdfs_transformer.hdf5
Epoch 3/20
Epoch 3: val_accuracy improved from 0.99513 to 0.99824, saving model to west_hdfs_transformer.hdf5
Epoch 4/20
Epoch 4: val_accuracy improved from 0.99824 to 0.99913, saving model to west_hdfs_transformer.hdf5
Epoch 5/20
Epoch 5: val_accuracy improved from 0.99913 to 0.99954, saving model to west_hdfs_transformer.hdf5
Epoch 6/20
Epoch 6: val_accuracy improved from 0.99954 to 0.99970, saving model to west_hdfs_transformer.hdf5
Epoch 7/20