In [1]:
import os
from tqdm import tqdm
import pandas as pd
import numpy as np
import re
import tensorflow as tf
import keras as ks
import transformers as optimus
from keras.utils import to_categorical
from sklearn.metrics import classification_report
from keras.layers import Input, Dropout, InputSpec, Conv1D, Add
from sklearn.preprocessing import OneHotEncoder
import keras.backend as K
import torch
import torch.nn as nn
tf.config.run_functions_eagerly(True)




In [2]:
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

def sparse_categorical_crossentropy(y_true, y_pred):
    loss = ks.losses.categorical_crossentropy(y_true, y_pred)
    return tf.reduce_mean(loss)

In [3]:
py_any = K.any
ndim = K.ndim
bert_path = "uncased_L-12_H-768_A-12"
bert_ckpt_file = os.path.join(bert_path, 'bert_model.ckpt')
bert_config_file = os.path.join(bert_path, 'bert_config.json')
vocab_file = os.path.join(bert_path, 'vocab.txt')
class IntentDetectionData:
  DATA_COLUMN = "text"
  LABEL_COLUMN = "intent"

  def __init__(self, train, test, tokenizer: optimus.BertTokenizer, classes, max_seq_len=192):
    self.tokenizer = tokenizer
    self.max_seq_len = 0
    self.classes = classes
    
    train, test = map(lambda df: df.reindex(df[IntentDetectionData.DATA_COLUMN].str.len().sort_values().index), [train, test])
    
    ((self.train_x, self.train_y), (self.test_x, self.test_y)) = map(self._prepare, [train, test])

    print("max seq_len", self.max_seq_len)
    self.max_seq_len = min(self.max_seq_len, max_seq_len)
    self.train_x, self.test_x = map(self._pad, [self.train_x, self.test_x])

  def _prepare(self, df):
    x, y = [], []
    
    for _, row in tqdm(df.iterrows()):
      text, label = row[IntentDetectionData.DATA_COLUMN], row[IntentDetectionData.LABEL_COLUMN]
      tokens = self.tokenizer.tokenize(text)
      tokens = ["[CLS]"] + tokens + ["[SEP]"]
      token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
      self.max_seq_len = max(self.max_seq_len, len(token_ids))
      x.append(token_ids)
      y.append(self.classes.index(label))

    return np.asarray(x, dtype = "object"), np.array(y, dtype = "object")

  def _pad(self, ids):
    x = []
    for input_ids in ids:
      input_ids = input_ids[:min(len(input_ids), self.max_seq_len - 2)]
      input_ids = input_ids + [0] * (self.max_seq_len - len(input_ids))
      x.append(np.asarray(input_ids, dtype = "object"))
    return np.asarray(x, dtype = "object")
def batch_dot(x, y, axes=None):
    if isinstance(axes, int):
        axes = (axes, axes)
    x_ndim = ndim(x)
    y_ndim = ndim(y)
    if axes is None:
        axes = [x_ndim - 1, y_ndim - 2]
    if py_any([isinstance(a, (list, tuple)) for a in axes]):
        raise ValueError('Multiple target dimensions are not supported. ' +
                         'Expected: None, int, (int, int), ' +
                         'Provided: ' + str(axes))
    if x_ndim > y_ndim:
        diff = x_ndim - y_ndim
        y = tf.reshape(y, tf.concat([tf.shape(y), [1] * (diff)], axis=0))
    elif y_ndim > x_ndim:
        diff = y_ndim - x_ndim
        x = tf.reshape(x, tf.concat([tf.shape(x), [1] * (diff)], axis=0))
    else:
        diff = 0
    if ndim(x) == 2 and ndim(y) == 2:
        if axes[0] == axes[1]:
            out = tf.reduce_sum(tf.multiply(x, y), axes[0])
        else:
            out = tf.reduce_sum(tf.multiply(tf.transpose(x, [1, 0]), y), axes[1])
    else:
        if axes is not None:
            adj_x = None if axes[0] == ndim(x) - 1 else True
            adj_y = True if axes[1] == ndim(y) - 1 else None
        else:
            adj_x = None
            adj_y = None
        out = tf.matmul(x, y, adjoint_a=adj_x, adjoint_b=adj_y)
    if diff:
        if x_ndim > y_ndim:
            idx = x_ndim + y_ndim - 3
        else:
            idx = x_ndim - 1
        out = tf.squeeze(out, list(range(idx, idx + diff)))
    if ndim(out) == 1:
        out = tf.expand_dims(out, 1)
    return out
def shape_list(x):
    if K.backend() != 'theano':
        tmp = K.int_shape(x)
    else:
        tmp = x.shape
    tmp = list(tmp)
    tmp[0] = -1
    return tmp


def split_heads(x, n: int, k: bool = False): 
    x_shape = shape_list(x)
    m = x_shape[-1]
    new_x_shape = x_shape[:-1] + [n, m // n]
    new_x = K.reshape(x, new_x_shape)
    return K.permute_dimensions(new_x, [0, 2, 3, 1] if k else [0, 2, 1, 3])


def merge_heads(x):
    new_x = K.permute_dimensions(x, [0, 2, 1, 3])
    x_shape = shape_list(new_x)
    new_x_shape = x_shape[:-2] + [np.prod(x_shape[-2:])]
    return K.reshape(new_x, new_x_shape)


def scaled_dot_product_attention_tf(q, k, v, attn_mask, attention_dropout: float, neg_inf: float):
    w = batch_dot(q, k)  
    w = w / K.sqrt(K.cast(shape_list(v)[-1], K.floatx()))
    if attn_mask is not None:
        w = attn_mask * w + (1.0 - attn_mask) * neg_inf
    w = K.softmax(w)
    w = Dropout(attention_dropout)(w)
    return batch_dot(w, v)  


def scaled_dot_product_attention_th(q, k, v, attn_mask, attention_dropout: float, neg_inf: float):
    w = theano_matmul(q, k)
    w = w / K.sqrt(K.cast(shape_list(v)[-1], K.floatx()))
    if attn_mask is not None:
        attn_mask = K.repeat_elements(attn_mask, shape_list(v)[1], 1)
        w = attn_mask * w + (1.0 - attn_mask) * neg_inf
    w = K.T.exp(w - w.max()) / K.T.exp(w - w.max()).sum(axis=-1, keepdims=True)
    w = Dropout(attention_dropout)(w)
    return theano_matmul(w, v)


def multihead_attention(x, attn_mask, n_head: int, n_state: int, attention_dropout: float, neg_inf: float):
    _q, _k, _v = x[:, :, :n_state], x[:, :, n_state:2 * n_state], x[:, :, -n_state:]
    q = split_heads(_q, n_head)  
    k = split_heads(_k, n_head, k=True)  
    v = split_heads(_v, n_head)  
    if K.backend() == 'tensorflow':
        a = scaled_dot_product_attention_tf(q, k, v, attn_mask, attention_dropout, neg_inf)
    else:
        a = scaled_dot_product_attention_th(q, k, v, attn_mask, attention_dropout, neg_inf)
    return merge_heads(a)


def gelu(x):
    return 0.5 * x * (1 + K.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * K.pow(x, 3))))

def theano_matmul(a, b, _left=False):
    assert a.ndim == b.ndim
    ndim = a.ndim
    assert ndim >= 2
    if _left:
        b, a = a, b
    if ndim == 2:
        return K.T.dot(a, b)
    else:
        if a.broadcastable[0] and not b.broadcastable[0]:
            output, _ = K.theano.scan(theano_matmul, sequences=[b], non_sequences=[a[0], 1])
        elif b.broadcastable[0] and not a.broadcastable[0]:
            output, _ = K.theano.scan(theano_matmul, sequences=[a], non_sequences=[b[0]])
        else:
            output, _ = K.theano.scan(theano_matmul, sequences=[a, b])
        return output
class MultiHeadAttention(ks.Layer):
    def __init__(self, n_head: int, n_state: int, attention_dropout: float, use_attn_mask: bool, neg_inf: float,
                 **kwargs) -> None:
        super().__init__(**kwargs)
        self.n_head = n_head
        self.n_state = n_state
        self.attention_dropout = attention_dropout
        self.use_attn_mask = use_attn_mask
        self.neg_inf = neg_inf

    def compute_output_shape(self, input_shape):
        x = input_shape[0] if self.use_attn_mask else input_shape
        return x[0], x[1], x[2] // 3

    def call(self, inputs, **kwargs):
        x = inputs[0] if self.use_attn_mask else inputs
        attn_mask = inputs[1] if self.use_attn_mask else None
        return multihead_attention(x, attn_mask, self.n_head, self.n_state, self.attention_dropout, self.neg_inf)

    def get_config(self):
        config = {
            'n_head': self.n_head,
            'n_state': self.n_state,
            'attention_dropout': self.attention_dropout,
            'use_attn_mask': self.use_attn_mask,
            'neg_inf': self.neg_inf,
        }
        base_config = super().get_config()
        return dict(list(base_config.items()) + list(config.items()))


class LayerNormalization(ks.Layer):
    def __init__(self, eps: float = 1e-5, **kwargs) -> None:
        self.eps = eps
        super().__init__(**kwargs)

    def build(self, input_shape):
        self.gamma = self.add_weight(name='gamma', shape=input_shape[-1:], initializer=Ones(), trainable=True)
        self.beta = self.add_weight(name='beta', shape=input_shape[-1:], initializer=Zeros(), trainable=True)
        super().build(input_shape)

    def call(self, x, **kwargs):
        u = K.mean(x, axis=-1, keepdims=True)
        s = K.mean(K.square(x - u), axis=-1, keepdims=True)
        z = (x - u) / K.sqrt(s + self.eps)
        return self.gamma * z + self.beta

    def compute_output_shape(self, input_shape):
        return input_shape

    def get_config(self):
        config = {
            'eps': self.eps,
        }
        base_config = super().get_config()
        return dict(list(base_config.items()) + list(config.items()))


class Gelu(ks.Layer):
    def __init__(self, accurate: bool = False, **kwargs):
        super().__init__(**kwargs)
        self.accurate = accurate

    def call(self, inputs, **kwargs):
        if not self.accurate:
            return gelu(inputs)
        if K.backend() == 'tensorflow':
            erf = K.tf.erf
        else:
            erf = K.T.erf
        return inputs * 0.5 * (1.0 + erf(inputs / np.sqrt(2.0)))

    def compute_output_shape(self, input_shape):
        return input_shape

    def get_config(self):
        config = {
            'accurate': self.accurate,
        }
        base_config = super().get_config()
        return dict(list(base_config.items()) + list(config.items()))
def _get_pos_encoding_matrix(max_len: int, d_emb: int) -> np.array:
    pos_enc = np.array(
        [[pos / np.power(10000, 2 * (j // 2) / d_emb) for j in range(d_emb)] if pos != 0 else np.zeros(d_emb) for pos in
         range(max_len)], dtype=np.float32)
    pos_enc[1:, 0::2] = np.sin(pos_enc[1:, 0::2])  # dim 2i
    pos_enc[1:, 1::2] = np.cos(pos_enc[1:, 1::2])  # dim 2i+1
    return pos_enc


class BertEmbedding(ks.layers.Layer):
    def __init__(self, output_dim: int = 768, dropout: float = 0.1, vocab_size: int = 30000,
                 max_len: int = 512, trainable_pos_embedding: bool = True, use_one_dropout: bool = False,
                 use_embedding_layer_norm: bool = False, layer_norm_epsilon: float = 1e-5, **kwargs):
        super().__init__(**kwargs)
        self.max_len = max_len
        self.use_one_dropout = use_one_dropout
        self.output_dim = output_dim
        self.dropout = dropout
        self.vocab_size = vocab_size

        # Bert ks uses two segments for next-sentence classification task
        self.segment_emb = ks.layers.Embedding(2, output_dim, input_length=max_len,
                                                  name='SegmentEmbedding')

        self.trainable_pos_embedding = trainable_pos_embedding
        if not trainable_pos_embedding:
            self.pos_emb = ks.layers.Embedding(max_len, output_dim, trainable=False, input_length=max_len,
                                                  name='PositionEmbedding',
                                                  weights=[_get_pos_encoding_matrix(max_len, output_dim)])
        else:
            self.pos_emb = ks.layers.Embedding(max_len, output_dim, input_length=max_len, name='PositionEmbedding')

        self.token_emb = ks.layers.Embedding(vocab_size, output_dim, input_length=max_len, name='TokenEmbedding')
        self.embedding_dropout = ks.layers.Dropout(dropout, name='EmbeddingDropOut')
        self.add_embeddings = ks.layers.Add(name='AddEmbeddings')
        self.use_embedding_layer_norm = use_embedding_layer_norm
        if self.use_embedding_layer_norm:
            self.embedding_layer_norm = LayerNormalization(layer_norm_epsilon)
        else:
            self.embedding_layer_norm = None
        self.layer_norm_epsilon = layer_norm_epsilon

    def compute_output_shape(self, input_shape):
        return input_shape[0][0], input_shape[0][1], self.output_dim

    def get_config(self):
        config = {
            'max_len': self.max_len,
            'use_one_dropout': self.use_one_dropout,
            'output_dim': self.output_dim,
            'dropout': self.dropout,
            'vocab_size': self.vocab_size,
            'trainable_pos_embedding': self.trainable_pos_embedding,
            'embedding_layer_norm': self.use_embedding_layer_norm,
            'layer_norm_epsilon': self.layer_norm_epsilon
        }
        base_config = super().get_config()
        return dict(list(base_config.items()) + list(config.items()))

    def __call__(self, inputs, **kwargs):
        tokens, segment_ids, pos_ids = inputs
        segment_embedding = self.segment_emb(segment_ids)
        pos_embedding = self.pos_emb(pos_ids)
        token_embedding = self.token_emb(tokens)
        if self.use_one_dropout:
            summation = self.add_embeddings([segment_embedding, pos_embedding, token_embedding])
            if self.embedding_layer_norm:
                summation = self.embedding_layer_norm(summation)
            return self.embedding_dropout(summation)
        summation = self.add_embeddings(
            [self.embedding_dropout(segment_embedding), self.embedding_dropout(pos_embedding),
             self.embedding_dropout(token_embedding)])
        if self.embedding_layer_norm:
            summation = self.embedding_layer_norm(summation)
        return summation
class MultiHeadSelfAttention:
    def __init__(self, n_state: int, n_head: int, attention_dropout: float,
                 use_attn_mask: bool, layer_id: int, neg_inf: float) -> None:
        assert n_state % n_head == 0
        self.c_attn = Conv1D(3 * n_state, 1, name='layer_{}/c_attn'.format(layer_id))
        self.attn = MultiHeadAttention(n_head, n_state, attention_dropout, use_attn_mask,
                                       neg_inf, name='layer_{}/self_attention'.format(layer_id))
        self.c_attn_proj = Conv1D(n_state, 1, name='layer_{}/c_attn_proj'.format(layer_id))

    def __call__(self, x, mask):
        output = self.c_attn(x)
        output = self.attn(output) if mask is None else self.attn([output, mask])
        return self.c_attn_proj(output)


class PositionWiseFF:
    def __init__(self, n_state: int, d_hid: int, layer_id: int, accurate_gelu: bool) -> None:
        self.c_fc = Conv1D(d_hid, 1, name='layer_{}/c_fc'.format(layer_id))
        self.activation = Gelu(accurate=accurate_gelu, name='layer_{}/gelu'.format(layer_id))
        self.c_ffn_proj = Conv1D(n_state, 1, name='layer_{}/c_ffn_proj'.format(layer_id))

    def __call__(self, x):
        output = self.activation(self.c_fc(x))
        return self.c_ffn_proj(output)


class EncoderLayer:
    def __init__(self, n_state: int, n_head: int, d_hid: int, residual_dropout: float, attention_dropout: float,
                 use_attn_mask: bool, layer_id: int, neg_inf: float, ln_epsilon: float, accurate_gelu: bool) -> None:
        self.Inputspec = InputSpec(ndim=3)
        self.attention = MultiHeadSelfAttention(n_state, n_head, attention_dropout, use_attn_mask, layer_id, neg_inf)
        self.drop1 = Dropout(residual_dropout, name='layer_{}/ln_1_drop'.format(layer_id))
        self.add1 = Add(name='layer_{}/ln_1_add'.format(layer_id))
        self.ln1 = LayerNormalization(ln_epsilon, name='layer_{}/ln_1'.format(layer_id))
        self.ffn = PositionWiseFF(n_state, d_hid, layer_id, accurate_gelu)
        self.drop2 = Dropout(residual_dropout, name='layer_{}/ln_2_drop'.format(layer_id))
        self.add2 = Add(name='layer_{}/ln_2_add'.format(layer_id))
        self.ln2 = LayerNormalization(ln_epsilon, name='layer_{}/ln_2'.format(layer_id))
        print("problem 1")

    def __call__(self, x, mask):
        print("problem 2")
        print(x)
        a = self.attention(x, mask)
        print("problem 3")
        n = self.ln1(self.add1([x, self.drop1(a)]))
        print("problem 4")
        f = self.ffn(n)
        return self.ln2(self.add2([n, self.drop2(f)]))


def create_transformer(embedding_dim: int = 768, embedding_dropout: float = 0.1, vocab_size: int = 30000,
                       max_len: int = 512, trainable_pos_embedding: bool = True, num_heads: int = 12,
                       num_layers: int = 12, attention_dropout: float = 0.1, use_one_embedding_dropout: bool = False,
                       d_hid: int = 768 * 4, residual_dropout: float = 0.1, use_attn_mask: bool = True,
                       embedding_layer_norm: bool = False, neg_inf: float = -1e9, layer_norm_epsilon: float = 1e-5,
                       accurate_gelu: bool = False) -> ks.Model:
    tokens = Input(batch_shape=(None, max_len), name='token_input', dtype='int32')
    segment_ids = Input(batch_shape=(None, max_len), name='segment_input', dtype='int32')
    pos_ids = Input(batch_shape=(None, max_len), name='position_input', dtype='int32')
    attn_mask = Input(batch_shape=(None, 1, max_len, max_len), name='attention_mask_input',
                      dtype=K.floatx()) if use_attn_mask else None
    inputs = [tokens, segment_ids, pos_ids]
    embedding_layer = BertEmbedding(embedding_dim, embedding_dropout, vocab_size, max_len, trainable_pos_embedding,
                                    use_one_embedding_dropout, embedding_layer_norm, layer_norm_epsilon)
    x = embedding_layer(inputs)
    for i in range(num_layers):
        x = EncoderLayer(embedding_dim, num_heads, d_hid, residual_dropout,
                         attention_dropout, use_attn_mask, i, neg_inf, layer_norm_epsilon, accurate_gelu)(x, attn_mask)
    if use_attn_mask:
        inputs.append(attn_mask)
    return ks.Model(inputs=inputs, outputs=[x], name='Transformer')
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import copy
import json
import six
import tensorflow as tf


class BertConfig(object):

  def __init__(self,
               vocab_size,
               hidden_size=768,
               num_hidden_layers=12,
               num_attention_heads=12,
               intermediate_size=3072,
               hidden_act="gelu",
               hidden_dropout_prob=0.1,
               attention_probs_dropout_prob=0.1,
               max_position_embeddings=512,
               type_vocab_size=16,
               initializer_range=0.02):
    self.vocab_size = vocab_size
    self.hidden_size = hidden_size
    self.num_hidden_layers = num_hidden_layers
    self.num_attention_heads = num_attention_heads
    self.hidden_act = hidden_act
    self.intermediate_size = intermediate_size
    self.hidden_dropout_prob = hidden_dropout_prob
    self.attention_probs_dropout_prob = attention_probs_dropout_prob
    self.max_position_embeddings = max_position_embeddings
    self.type_vocab_size = type_vocab_size
    self.initializer_range = initializer_range

  @classmethod
  def from_dict(cls, json_object):
    config = BertConfig(vocab_size=None)
    for (key, value) in six.iteritems(json_object):
      config.__dict__[key] = value
    return config

  @classmethod
  def from_json_file(cls, json_file):
    with tf.io.gfile.GFile(json_file, "r") as reader:
      text = reader.read()
    return cls.from_dict(json.loads(text))

  def to_dict(self):
    output = copy.deepcopy(self.__dict__)
    return output

  def to_json_string(self):
    return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
def get_bert_weights_for_keras_model(check_point, max_len, model, tf_var_names):
    keras_weights = [np.zeros(w.shape) for w in model.weights]
    keras_weights_set = []

    for var_name, _ in tf_var_names:
        qkv, unsqueeze, w_id = _get_tf2keras_weights_name_mapping(var_name)
        if w_id is None:
            print('not mapped: ', var_name)  # TODO pooler, cls/predictions, cls/seq_relationship
        else:
            print(var_name, ' -> ', model.weights[w_id].name)
            keras_weights_set.append(w_id)
            keras_weight = keras_weights[w_id]
            tensorflow_weight = check_point.get_tensor(var_name)
            keras_weights[w_id] = _set_keras_weight_from_tf_weight(max_len, tensorflow_weight, keras_weight, qkv, unsqueeze, w_id)

    keras_layer_not_set = set(list(range(len(keras_weights)))) - set(keras_weights_set)
    assert len(keras_layer_not_set) == 0, 'Some weights were not set!'

    return keras_weights


def _set_keras_weight_from_tf_weight(max_len, tensorflow_weight, keras_weight, qkv, unsqueeze, w_id):
    if qkv is None:
        if w_id == 1:  # pos embedding
            keras_weight[:max_len, :] = tensorflow_weight[:max_len, :] if not unsqueeze else tensorflow_weight[None, :max_len, :]

        elif w_id == 2:  # word embedding
            keras_weight = tensorflow_weight
        else:
            keras_weight[:] = tensorflow_weight if not unsqueeze else tensorflow_weight[None, ...]
    else:
        p = {'q': 0, 'k': 1, 'v': 2}[qkv]
        if keras_weight.ndim == 3:
            dim_size = keras_weight.shape[1]
            keras_weight[0, :, p * dim_size:(p + 1) * dim_size] = tensorflow_weight if not unsqueeze else tensorflow_weight[None, ...]
        else:
            dim_size = keras_weight.shape[0] // 3
            keras_weight[p * dim_size:(p + 1) * dim_size] = tensorflow_weight

    return keras_weight


def _get_tf2keras_weights_name_mapping(var_name):
    w_id = None
    qkv = None
    unsqueeze = False

    var_name_splitted = var_name.split('/')
    if var_name_splitted[1] == 'embeddings':
        w_id = _get_embeddings_name(var_name_splitted)

    elif var_name_splitted[2].startswith('layer_'):
        qkv, unsqueeze, w_id = _get_layers_name(var_name_splitted)

    return qkv, unsqueeze, w_id


def _get_layers_name(var_name_splitted):
    first_vars_size = 5
    w_id = None
    qkv = None
    unsqueeze = False

    layer_number = int(var_name_splitted[2][len('layer_'):])
    if var_name_splitted[3] == 'attention':
        if var_name_splitted[-1] == 'beta':
            w_id = first_vars_size + layer_number * 12 + 5
        elif var_name_splitted[-1] == 'gamma':
            w_id = first_vars_size + layer_number * 12 + 4
        elif var_name_splitted[-2] == 'dense':
            if var_name_splitted[-1] == 'bias':
                w_id = first_vars_size + layer_number * 12 + 3
            elif var_name_splitted[-1] == 'kernel':
                w_id = first_vars_size + layer_number * 12 + 2
                unsqueeze = True
            else:
                raise ValueError()
        elif var_name_splitted[-2] == 'key' or var_name_splitted[-2] == 'query' or var_name_splitted[-2] == 'value':
            w_id = first_vars_size + layer_number * 12 + (0 if var_name_splitted[-1] == 'kernel' else 1)
            unsqueeze = var_name_splitted[-1] == 'kernel'
            qkv = var_name_splitted[-2][0]
        else:
            raise ValueError()
    elif var_name_splitted[3] == 'intermediate':
        if var_name_splitted[-1] == 'bias':
            w_id = first_vars_size + layer_number * 12 + 7
        elif var_name_splitted[-1] == 'kernel':
            w_id = first_vars_size + layer_number * 12 + 6
            unsqueeze = True
        else:
            raise ValueError()
    elif var_name_splitted[3] == 'output':
        if var_name_splitted[-1] == 'beta':
            w_id = first_vars_size + layer_number * 12 + 11
        elif var_name_splitted[-1] == 'gamma':
            w_id = first_vars_size + layer_number * 12 + 10
        elif var_name_splitted[-1] == 'bias':
            w_id = first_vars_size + layer_number * 12 + 9
        elif var_name_splitted[-1] == 'kernel':
            w_id = first_vars_size + layer_number * 12 + 8
            unsqueeze = True
        else:
            raise ValueError()
    return qkv, unsqueeze, w_id


def _get_embeddings_name(parts):
    n = parts[-1]
    if n == 'token_type_embeddings':
        w_id = 0
    elif n == 'position_embeddings':
        w_id = 1
    elif n == 'word_embeddings':
        w_id = 2
    elif n == 'gamma':
        w_id = 3
    elif n == 'beta':
        w_id = 4
    else:
        raise ValueError()
    return w_id
def create_model(base_location: str = '../uncased_L-12_H-768_A-12',
                     use_attn_mask: bool = True, max_len: int = 512) -> ks.Model:
    bert_config = BertConfig.from_json_file(base_location + '/bert_config.json')
    print(bert_config.__dict__)
    init_checkpoint = base_location + '/bert_model.ckpt'
    var_names = tf.train.list_variables(init_checkpoint)
    check_point = tf.train.load_checkpoint(init_checkpoint)
    model = create_transformer(embedding_layer_norm=True, neg_inf=-10000.0, use_attn_mask=use_attn_mask,
                               vocab_size=bert_config.vocab_size, accurate_gelu=True, layer_norm_epsilon=1e-12, max_len=max_len,
                               use_one_embedding_dropout=True, d_hid=bert_config.intermediate_size,
                               embedding_dim=bert_config.hidden_size, num_layers=bert_config.num_hidden_layers,
                               num_heads=bert_config.num_attention_heads,
                               residual_dropout=bert_config.hidden_dropout_prob,
                               attention_dropout=bert_config.attention_probs_dropout_prob)
    weights = get_bert_weights_for_keras_model(check_point, max_len, model, var_names)
    model.set_weights(weights)
    return model

AttributeError: module 'keras' has no attribute 'Layer'

In [5]:
train = pd.read_csv('./data/Book1.csv')
# valid = pd.read_csv('./data/valid.csv')
# test = pd.read_csv('./data/test.csv')

FileNotFoundError: [Errno 2] No such file or directory: './data/Book1.csv'

In [6]:
class Attention(nn.Module):

    def __init__(self, dimensions):
        super(Attention, self).__init__()
        self.dimensions = dimensions
        self.linear_out = nn.Linear(dimensions * 2, dimensions, bias=False)
        self.softmax = nn.Softmax(dim=1)
        self.tanh = nn.Tanh()

    def forward(self, query, context, attention_mask):
        attention_scores = torch.bmm(query, context.transpose(1, 2).contiguous())
        if attention_mask is not None:
            attention_mask = torch.unsqueeze(attention_mask, 2)
            attention_scores.masked_fill_(attention_mask == 0, -np.inf)
        attention_weights = self.softmax(attention_scores)
        mix = torch.bmm(attention_weights, context)
        combined = torch.cat((mix, query), dim=2)
        output = self.linear_out(combined)
        output = self.tanh(output)
        return output, attention_weights

In [7]:
slots = [
    "B-sheet Name",
    "B-Cell Reference",
    "B-Range",
    "B-Value",
    "B-Formula",
    "B-Condition",
    "B-Data Type",
    "B-Column Reference",
    "B-Font",
    "B-Alignment",
    "B-Border Style",
    "B-Background Color",
    "B-Fill Color",
    "B-Font Color",
    "B-Number Format",
    "B-Chart Type",
    "B-Chart Axis",
    "B-Chart Series",
    "B-Chart Title",
    "B-Chart Legend",
    "B-Chart Data Labels",
    "B-Chart Data Table",
    "B-Chart Data Source",
    "B-Pivot Table",
    "B-Column_name",
    "B-Row_number"
    "B-Verb",
    "B-Date",
    "B-Time",
    "B-Number",
    "I-sheet Name",
    "I-Cell Reference",
    "I-Range",
    "I-Value",
    "I-Formula",
    "I-Condition",
    "I-Data Type",
    "I-Column Reference",
    "I-Font",
    "I-Alignment",
    "I-Border Style",
    "I-Background Color",
    "I-Fill Color",
    "I-Font Color",
    "I-Number Format",
    "I-Chart Type",
    "I-Chart Axis",
    "I-Chart Series",
    "I-Chart Title",
    "I-Chart Legend",
    "I-Chart Data Labels",
    "I-Chart Data Table",
    "I-Chart Data Source",
    "I-Pivot Table",
    "I-Verb",
    "I-Date",
    "I-Time",
    "I-Number",
    "I-sheet_location",
    "I-Column_name",
    "I-Row_number"
    "O"
]

<h1>Intents available</h1>
<ul>
  {% for intent in intents %}
    <li>{{ intent }}</li>
  {% endfor %}
</ul>

<h1>Slots available</h1>
<ul>
  {% for slot in slots %}
    <li>{{ slot }}</li> 
    {% endfor %}
</ul>

In [8]:
class classifier:
    def __init__(self, 
                 model_name: str,
                 max_seq_len: int,
                 classes: list[str],
                 model_path: str = None,
                 dropout: float = 0.1,
                 tokenizer: str = None,
                 extended: bool = False,
                 slot_classes: list[str] = None,
                 multi: bool = False
                ):
        self.model_name = model_name
        self.max_seq_len = max_seq_len
        self.classes = classes
        self.model_path = model_path
        self.dropout = dropout
        self.extended = extended
        self.tokenizer = tokenizer
        self.slot_classes = slot_classes
        self.multi = multi 
        
    def build(self):
        if not self.tokenizer:
            self.tokenizer = optimus.BertTokenizer.from_pretrained(self.model_path)
        elif self.tokenizer.lower() == "bert":
            self.tokenizer = optimus.BertTokenizer.from_pretrained(self.model_path)
        elif self.tokenizer.lower() == "albert":
            self.tokenizer = optimus.AlbertTokenizer.from_pretrained()
        elif self.tokenizer.lower() == "roberta":
            self.tokenizer = optimus.RobertaTokenizer.from_pretrained()
            
        if not self.extended:
            bert = optimus.TFBertModel.from_pretrained(self.model_path)
            input_ids = ks.layers.Input(shape=(self.max_seq_len,), dtype='int32')
            reshape_layer = ks.layers.Lambda(lambda x: x[:,0,:])
            dropout_layer_1 = ks.layers.Dropout(self.dropout)
            hidden_out = ks.layers.Dense(768, activation='softmax', kernel_initializer="random_normal")
            dropout_layer_2 = ks.layers.Dropout(self.dropout)
            class_out = ks.layers.Dense(len(self.classes), activation='softmax', kernel_initializer="random_normal")
            # class_reshape = ks.layers.Reshape((7,))
            bert_out = bert(input_ids)
            reshape_lambda = reshape_layer(bert_out[0])
            dropout_1 = dropout_layer_1(reshape_lambda)
            hidden_out = hidden_out(dropout_1)
            dropout_2 = dropout_layer_2(hidden_out)
            class_out = class_out(dropout_2)
            # class_out = class_reshape(class_out)
            self.model = ks.models.Model(inputs = input_ids, outputs = class_out)
            self.model.build(self.max_seq_len)
            print(self.model.output_shape)
        else: 
            
            bert = optimus.BertModel.from_pretrained(self.model_name)
            input_ids = ks.layers.Input(shape=(self.max_seq_len,), dtype='int32')
            reshape_layer = ks.layers.Lambda(lambda x: x[:,0,:])
            dropout_layer_1 = ks.layers.Dropout(self.dropout)
            hidden_out = ks.layers.Dense(768, activation='softmax', kernel_initializer="random_normal")
            dropout_layer_2 = ks.layers.Dropout(self.dropout)
            intent_classifier = ks.layers.Dense(len(self.classes), activation='softmax', kernel_initializer="random_normal")
            slot_classifier = ks.layers.Dense(len(self.slot_classes), activation='softmax', kernel_initializer="random_normal")
            
            bert_out = bert(input_ids)
            reshape_lambda = reshape_layer(bert_out[0])
            dropout_1 = dropout_layer_1(reshape_lambda)
            hidden_out = hidden_out(dropout_1)
            dropout_2 = dropout_layer_2(hidden_out)
            intent_out = intent_classifier(dropout_2)
            slot_out = slot_classifier(class_out)
            self.model = ks.models.Model(inputs = input_ids, outputs = [intent_out, slot_out])
            self.model.build(self.max_seq_len)
            
            
    
        
    def model_metrics(self, optimizer: str = "adam" , learning_rate: float = 1e-5, loss: str = "sparse" , metrics: list[str] = ["accuracy"]):
        if optimizer.lower() == "adam":
            optimizer = ks.optimizers.Adam(learning_rate=learning_rate)
        elif optimizer.lower() == "sgd":
            optimizer = ks.optimizers.SGD(learning_rate=learning_rate)
        else:
            optimizer = ks.optimizers.RMSprop(learning_rate=learning_rate)
            
        if loss.lower() == "categories":
            loss = ks.losses.CategoricalCrossentropy(from_logits = False)
        elif loss.lower() == "binary":
            loss = ks.losses.BinaryCrossentropy(from_logits = False)
        elif loss.lower() == "sparse":
            loss = ks.losses.SparseCategoricalCrossentropy(from_logits = False)
        
        for metric in metrics:
            if metric.lower() == "accuracy":
                metric = ks.metrics.Accuracy(name = "accuracy")
            elif metric.lower() == "precision":
                metric = ks.metrics.Precision(name = "precision")
            elif metric.lower() == "recall":
                metric = ks.metrics.Recall(name = "recall")
            elif metric.lower() == "f1":
                metric = ks.metrics.F1Score(name = "f1")
            elif metric.lower() == "mse":
                metrics = ks.metrics.MeanSquaredError(name = "mse")
            elif metric.lower() == "mae":
                metric = ks.metrics.MeanAbsoluteError(name = "mae")
            elif metric.lower() == "sparse":
                metrics = ks.metrics.SparseCategoricalAccuracy(name = "sparse")
        if not self.extended:
            self.optim = optimizer
            self.loss = loss
            self.metrics = metrics
            self.model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
        else:
            self.optim = optimizer
            self.loss = loss
            self.metrics = metrics
            self.model.compile(
                optimizer = optimizer,
                loss={'intent_output': loss,
                      'slot_output': loss},
                metrics={'intent_out': metrics,
                         'slot_out': metrics})
            self.model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
                   
    def model_summary(self):
        self.model.summary()
        
    def encode_input(self, data: np.ndarray, labels: np.ndarray):
        encoded_input = []
        encoded_output = []
        for utterance in tqdm(data):
            utterance = self.tokenizer.tokenize(utterance)
            utterance = ["<CLS>"] + utterance + ["<SEP>"]
            token_ids = self.tokenizer.convert_tokens_to_ids(utterance)
            if len(token_ids) < self.max_seq_len:
                padding = np.zeros((self.max_seq_len - len(token_ids),))
                token_ids = np.concatenate((token_ids, padding), axis=0)
            elif len(token_ids) > self.max_seq_len:
                token_ids = token_ids[:self.max_seq_len]
            encoded_input.append(token_ids)
        for label in labels:
            encoded_output.append(np.array(to_categorical(self.classes.index(label) , len(self.classes), dtype = "int32")))
            
        encoded_input = np.array(encoded_input)   
        encoded_output = np.array(encoded_output)
        # for output in encoded_output:
        #     output = np.array([output])
        #     print(output.shape)
        return encoded_input, encoded_output
            
    def train(self, train_data: pd.DataFrame, epochs: int = 10, batch_size: int = 32, verbose: int = 1, custom: bool = True):
        train_data = train_data.dropna()
        x_train = train_data['prompt'].to_numpy()
        y_train = train_data['intent'].to_numpy()
        x_train, y_train = self.encode_input(x_train, y_train)
        # y_train = OneHotEncoder(sparse=False).fit_transform(y_train.reshape(0, 1))
        if not self.extended:
            if not custom:
                y_train = np.expand_dims(y_train, axis=1)
                self.model.fit(x_train, y_train.T, epochs = epochs, batch_size = batch_size, verbose = verbose)
            else:
                # y_train = np.expand_dims(y_train, axis=1)
                train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
                train_dataset = train_dataset.shuffle(buffer_size=1024).batch(batch_size)
                for epoch in range(epochs):
                    print(f"\nStart of epoch {epoch}")
                    for step, (x_batch_train, y_batch_train) in tqdm(enumerate(train_dataset)):
                        with tf.GradientTape() as tape:
                            logits = self.model(x_batch_train, training=True)
                            loss_value = sparse_categorical_crossentropy(y_batch_train, logits)
                        grads = tape.gradient(loss_value, self.model.trainable_weights)
                        self.optim.apply_gradients(zip(grads, self.model.trainable_variables))

                        # Log every 100 batches.
                        if step % 100 == 0:
                            print(
                                f"Training loss (for 1 batch) at step {step}: {float(loss_value):.4f}"
                            )
                            print(f"Seen so far: {(step + 1) * batch_size} samples")    
        else:
            if not custom:
                y_train = np.expand_dims(y_train, axis=1)
                self.model.fit(x_train, y_train, epochs = epochs, batch_size = batch_size, verbose = verbose)
            else:
                # y_train = np.expand_dims(y_train, axis=1)
                train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
                train_dataset = train_dataset.shuffle(buffer_size=1024).batch(batch_size)

                for epoch in range(epochs):
                    print(f"\nStart of epoch {epoch}")
                    # Separate losses for each output (assuming categorical labels)
                    total_intent_loss = 0
                    total_slot_loss = 0
                    num_batches = 0
                    for step, (x_batch_train, (y_intent_train, y_slot_train)) in tqdm(enumerate(train_dataset)):
                        with tf.GradientTape() as tape:
                            logits = self.model(x_batch_train, training=True)
                            intent_output, slot_output = logits  
                            intent_loss = self.loss(y_intent_train, intent_output)
                            slot_loss = self.loss(y_slot_train, slot_output)
                            loss_value = intent_loss + slot_loss 
                        grads = tape.gradient(loss_value, self.model.trainable_weights)
                        self.optim.apply_gradients(zip(grads, self.model.trainable_variables))
                        total_intent_loss += float(intent_loss)
                        total_slot_loss += float(slot_loss)
                        num_batches += 1
                        if step % 100 == 0:
                            avg_intent_loss = total_intent_loss / num_batches
                            avg_slot_loss = total_slot_loss / num_batches
                            print(f"Training loss (for 1 batch) at step {step}:")
                            print(f"  - Intent Loss: {avg_intent_loss:.4f}")
                            print(f"  - Slot Loss: {avg_slot_loss:.4f}")
                            print(f"Seen so far: {(step + 1) * batch_size} samples")
                            total_intent_loss = 0
                            total_slot_loss = 0
                            num_batches = 0
                        
    def predict(self, utterances: str):
        utterance_intent_pairs = []
        tokens = map(self.tokenizer.tokenize, utterances)
        tokens = map(lambda x: ["[CLS]"] + x + ["[SEP]"], tokens)
        token_ids = list(map(self.tokenizer.convert_tokens_to_ids, tokens))
        token_ids = map(lambda tids: tids + [0] * (self.max_seq_len - len(tids)), token_ids)
        token_ids = np.asarray(list(token_ids), dtype = "int32")
        predictions = self.model.predict(token_ids).argmax(axis= -1)
        for utterance, label in zip(utterances, predictions):
            utterance_intent_pairs.append((utterance, self.classes[label]))
        return utterance_intent_pairs
    
    
        
    def save_model(self, model_name: str):
        self.model_name = model_name
        self.model.save(self.model_name)
        
    def load_model(self, model_name: str):
        self.model_name = model_name
        self.model = ks.models.load_model(self.model_name)
        
    def evaluate(self, test_data: pd.DataFrame, batch_size: int = 32, verbose: int = 1):
        test_data = test_data.dropna()
        test_x, test_y = self.encode_input(test_data['text'].to_numpy(), test_data['intent'].to_numpy())
        y_pred = self.model.predict(test_x, batch_size = batch_size, verbose = verbose)
        return classification_report(test_y, y_pred)
    
        
        
    

In [15]:
classes = train.intent.unique().tolist()
len(classes)

93

In [9]:
intent_classifier_model = classifier("bert", 38, classes, "bert-base-uncased", 0.1, None, False)

NameError: name 'classes' is not defined

In [17]:
intent_classifier_model.build()

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

(None, 93)


In [18]:
intent_classifier_model.model_summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 38)]              0         
                                                                 
 tf_bert_model_1 (TFBertMod  TFBaseModelOutputWithPo   109482240 
 el)                         olingAndCrossAttentions             
                             (last_hidden_state=(Non             
                             e, 38, 768),                        
                              pooler_output=(None, 7             
                             68),                                
                              past_key_values=None,              
                             hidden_states=None, att             
                             entions=None, cross_att             
                             entions=None)                       
                                                           

In [21]:
intent_classifier_model.model_metrics()

In [22]:
intent_classifier_model.predict(["book a plane"])



[('book a plane', 'Create Pivot Table')]

In [23]:
intent_classifier_model.train(train, epochs = 20, batch_size = 8, verbose = 1)

100%|██████████| 9699/9699 [00:02<00:00, 3623.89it/s]



Start of epoch 0


0it [00:00, ?it/s]



1it [00:04,  4.53s/it]

Training loss (for 1 batch) at step 0: 4.5317
Seen so far: 8 samples


2it [00:07,  3.55s/it]



3it [00:10,  3.19s/it]



4it [00:12,  3.02s/it]



5it [00:15,  2.94s/it]



6it [00:18,  2.90s/it]



7it [00:21,  2.84s/it]



8it [00:24,  2.83s/it]



9it [00:26,  2.83s/it]



10it [00:29,  2.89s/it]



11it [00:32,  2.91s/it]



12it [00:35,  2.94s/it]



13it [00:38,  2.93s/it]



14it [00:41,  2.90s/it]



15it [00:44,  2.86s/it]



16it [00:47,  2.82s/it]



17it [00:49,  2.79s/it]



18it [00:52,  2.77s/it]



19it [00:55,  2.82s/it]



20it [00:58,  2.86s/it]



21it [01:01,  2.82s/it]



22it [01:03,  2.78s/it]



23it [01:06,  2.83s/it]



24it [01:09,  2.81s/it]



25it [01:12,  2.87s/it]



26it [01:15,  2.90s/it]



27it [01:18,  2.86s/it]



28it [01:21,  2.82s/it]



29it [01:24,  2.90s/it]



30it [01:27,  2.97s/it]



31it [01:30,  2.92s/it]



32it [01:32,  2.87s/it]



33it [01:35,  2.86s/it]



34it [01:38,  2.90s/it]



35it [01:41,  2.89s/it]



36it [01:44,  2.88s/it]



37it [01:47,  2.87s/it]



38it [01:50,  2.90s/it]



39it [01:53,  2.89s/it]



40it [01:56,  2.93s/it]



41it [01:59,  2.93s/it]



42it [02:02,  2.98s/it]



43it [02:04,  2.95s/it]



44it [02:07,  2.93s/it]



45it [02:10,  2.98s/it]



46it [02:13,  2.92s/it]



47it [02:16,  2.86s/it]



48it [02:19,  2.92s/it]



49it [02:22,  3.00s/it]



50it [02:25,  3.04s/it]



51it [02:29,  3.08s/it]



52it [02:32,  3.11s/it]



53it [02:35,  3.16s/it]



54it [02:38,  3.11s/it]



55it [02:41,  3.02s/it]



56it [02:44,  2.97s/it]



57it [02:47,  2.93s/it]



58it [02:49,  2.90s/it]



59it [02:52,  2.94s/it]



60it [02:55,  2.96s/it]



61it [02:58,  3.01s/it]



62it [03:01,  2.98s/it]



63it [03:04,  2.91s/it]



64it [03:07,  2.90s/it]



65it [03:10,  2.86s/it]



66it [03:13,  2.83s/it]



67it [03:15,  2.83s/it]



68it [03:18,  2.86s/it]



69it [03:21,  2.94s/it]



70it [03:24,  2.91s/it]



71it [03:27,  2.90s/it]



72it [03:30,  2.89s/it]



73it [03:33,  2.90s/it]



74it [03:36,  2.91s/it]



75it [03:39,  2.91s/it]



76it [03:42,  2.87s/it]



77it [03:44,  2.86s/it]



78it [03:47,  2.84s/it]



79it [03:50,  2.83s/it]



80it [03:53,  2.83s/it]



81it [03:56,  2.83s/it]



82it [03:58,  2.82s/it]



83it [04:01,  2.81s/it]



84it [04:04,  2.81s/it]



85it [04:07,  2.85s/it]



86it [04:10,  2.83s/it]



87it [04:13,  2.81s/it]



88it [04:15,  2.81s/it]



89it [04:18,  2.80s/it]



90it [04:21,  2.79s/it]



91it [04:24,  2.78s/it]



92it [04:26,  2.78s/it]



93it [04:29,  2.78s/it]



94it [04:32,  2.77s/it]



95it [04:35,  2.77s/it]



96it [04:38,  2.77s/it]



97it [04:40,  2.77s/it]



98it [04:43,  2.79s/it]



99it [04:46,  2.80s/it]



100it [04:49,  2.84s/it]



101it [04:52,  2.86s/it]

Training loss (for 1 batch) at step 100: 4.5184
Seen so far: 808 samples


102it [04:55,  2.90s/it]



103it [04:58,  2.92s/it]



104it [05:01,  2.91s/it]



105it [05:03,  2.87s/it]



106it [05:06,  2.87s/it]



107it [05:09,  2.92s/it]



108it [05:12,  2.89s/it]



109it [05:15,  2.89s/it]



110it [05:18,  2.92s/it]



111it [05:21,  2.92s/it]



112it [05:24,  2.91s/it]



113it [05:27,  2.91s/it]



114it [05:30,  2.88s/it]



115it [05:33,  2.91s/it]



116it [05:36,  2.93s/it]



117it [05:38,  2.89s/it]



118it [05:41,  2.86s/it]



119it [05:44,  2.83s/it]



120it [05:47,  2.82s/it]



121it [05:49,  2.83s/it]



122it [05:52,  2.81s/it]



123it [05:55,  2.81s/it]



124it [05:58,  2.80s/it]



125it [06:01,  2.81s/it]



126it [06:04,  2.82s/it]



127it [06:06,  2.83s/it]



128it [06:09,  2.88s/it]



129it [06:12,  2.89s/it]



130it [06:15,  2.85s/it]



131it [06:18,  2.85s/it]



132it [06:21,  2.88s/it]



133it [06:24,  2.85s/it]



134it [06:26,  2.83s/it]



135it [06:29,  2.81s/it]



136it [06:32,  2.80s/it]



137it [06:35,  2.79s/it]



138it [06:38,  2.85s/it]



139it [06:41,  2.91s/it]



140it [06:44,  2.94s/it]



141it [06:47,  2.91s/it]



142it [06:50,  2.95s/it]



143it [06:53,  2.93s/it]



144it [06:55,  2.91s/it]



145it [06:58,  2.88s/it]



146it [07:01,  2.86s/it]



147it [07:04,  2.86s/it]



148it [07:07,  2.83s/it]



149it [07:10,  2.85s/it]



150it [07:13,  2.88s/it]



151it [07:15,  2.85s/it]



152it [07:18,  2.87s/it]



153it [07:21,  2.87s/it]



154it [07:24,  2.87s/it]



155it [07:27,  2.90s/it]



156it [07:30,  2.88s/it]



157it [07:33,  2.85s/it]



158it [07:35,  2.83s/it]



159it [07:38,  2.91s/it]



160it [07:41,  2.95s/it]



161it [07:44,  2.94s/it]



162it [07:47,  2.92s/it]



163it [07:50,  2.90s/it]



164it [07:53,  2.95s/it]



165it [07:56,  2.89s/it]



166it [07:59,  3.01s/it]



167it [08:02,  2.99s/it]



168it [08:05,  2.93s/it]



169it [08:08,  2.91s/it]



170it [08:11,  2.91s/it]



171it [08:14,  2.90s/it]



172it [08:16,  2.90s/it]



173it [08:19,  2.92s/it]



174it [08:22,  2.88s/it]



175it [08:25,  2.90s/it]



176it [08:28,  2.92s/it]



177it [08:31,  2.88s/it]



178it [08:34,  2.90s/it]



179it [08:37,  2.96s/it]



180it [08:40,  2.93s/it]



181it [08:43,  2.98s/it]



182it [08:46,  2.95s/it]



183it [08:49,  2.99s/it]



184it [08:52,  2.92s/it]



185it [08:54,  2.88s/it]



186it [08:57,  2.87s/it]



187it [09:00,  2.90s/it]



188it [09:03,  2.91s/it]



189it [09:06,  2.90s/it]



190it [09:09,  2.92s/it]



191it [09:12,  2.88s/it]



192it [09:15,  2.86s/it]



193it [09:18,  2.89s/it]



194it [09:20,  2.89s/it]



195it [09:23,  2.88s/it]



196it [09:26,  2.92s/it]



197it [09:29,  2.91s/it]



198it [09:32,  2.89s/it]



199it [09:35,  2.93s/it]



200it [09:38,  2.92s/it]



201it [09:41,  2.91s/it]

Training loss (for 1 batch) at step 200: 4.5444
Seen so far: 1608 samples


202it [09:44,  2.87s/it]



203it [09:47,  2.86s/it]



204it [09:49,  2.86s/it]



205it [09:52,  2.89s/it]



206it [09:55,  2.86s/it]



207it [09:58,  2.88s/it]



208it [10:01,  2.89s/it]



209it [10:04,  2.86s/it]



210it [10:07,  2.84s/it]



211it [10:09,  2.85s/it]



212it [10:13,  2.97s/it]



213it [10:16,  3.01s/it]



214it [10:19,  3.02s/it]



215it [10:22,  3.05s/it]



216it [10:25,  3.01s/it]



217it [10:28,  2.94s/it]



218it [10:30,  2.87s/it]



219it [10:33,  2.83s/it]



220it [10:36,  2.81s/it]



221it [10:39,  2.79s/it]



222it [10:42,  2.85s/it]



223it [10:45,  2.88s/it]



224it [10:47,  2.86s/it]



225it [10:50,  2.85s/it]



226it [10:53,  2.86s/it]



227it [10:56,  2.89s/it]



228it [10:59,  2.92s/it]



229it [11:02,  2.88s/it]



230it [11:05,  2.86s/it]



231it [11:07,  2.84s/it]



232it [11:10,  2.87s/it]



233it [11:13,  2.85s/it]



234it [11:16,  2.85s/it]



235it [11:19,  2.86s/it]



236it [11:22,  2.91s/it]



237it [11:25,  2.94s/it]



238it [11:28,  2.94s/it]



239it [11:31,  2.95s/it]



240it [11:34,  2.99s/it]



241it [11:37,  3.03s/it]



242it [11:40,  3.06s/it]



243it [11:43,  3.09s/it]



244it [11:46,  3.06s/it]



245it [11:49,  3.10s/it]



246it [11:52,  3.02s/it]



247it [11:55,  3.03s/it]



248it [11:59,  3.18s/it]



249it [12:02,  3.11s/it]



250it [12:05,  3.10s/it]



251it [12:08,  3.05s/it]



252it [12:11,  2.98s/it]



253it [12:13,  2.93s/it]



254it [12:16,  2.87s/it]



255it [12:19,  2.83s/it]



256it [12:22,  2.81s/it]



257it [12:24,  2.78s/it]



258it [12:27,  2.78s/it]



259it [12:30,  2.76s/it]



260it [12:33,  2.75s/it]



261it [12:35,  2.74s/it]



262it [12:38,  2.73s/it]



263it [12:41,  2.74s/it]



264it [12:44,  2.74s/it]



265it [12:46,  2.75s/it]



266it [12:49,  2.73s/it]



267it [12:52,  2.73s/it]



268it [12:55,  2.74s/it]



269it [12:57,  2.73s/it]



270it [13:00,  2.74s/it]



271it [13:03,  2.74s/it]



272it [13:06,  2.75s/it]



273it [13:08,  2.74s/it]



274it [13:11,  2.73s/it]



275it [13:14,  2.73s/it]



276it [13:16,  2.73s/it]



277it [13:19,  2.73s/it]



278it [13:22,  2.73s/it]



279it [13:25,  2.74s/it]



280it [13:27,  2.73s/it]



281it [13:30,  2.74s/it]



282it [13:33,  2.74s/it]



283it [13:36,  2.75s/it]



284it [13:38,  2.74s/it]



285it [13:41,  2.73s/it]



286it [13:44,  2.74s/it]



287it [13:47,  2.77s/it]



288it [13:49,  2.77s/it]



289it [13:52,  2.80s/it]



290it [13:55,  2.80s/it]



291it [13:58,  2.81s/it]



292it [14:01,  2.80s/it]



293it [14:04,  2.81s/it]



294it [14:06,  2.79s/it]



295it [14:09,  2.81s/it]



296it [14:12,  2.80s/it]



297it [14:15,  2.81s/it]



298it [14:18,  2.81s/it]



299it [14:21,  2.86s/it]



300it [14:23,  2.85s/it]



301it [14:26,  2.81s/it]

Training loss (for 1 batch) at step 300: 4.5087
Seen so far: 2408 samples


302it [14:29,  2.83s/it]



303it [14:32,  2.81s/it]



304it [14:35,  2.82s/it]



305it [14:37,  2.80s/it]



306it [14:40,  2.80s/it]



307it [14:43,  2.81s/it]



308it [14:46,  2.82s/it]



309it [14:49,  2.82s/it]



310it [14:52,  2.85s/it]



311it [14:55,  2.91s/it]



312it [14:58,  2.93s/it]



313it [15:01,  2.93s/it]



314it [15:04,  2.97s/it]



315it [15:06,  2.95s/it]



316it [15:09,  2.92s/it]



317it [15:12,  2.93s/it]



318it [15:15,  2.93s/it]



319it [15:18,  2.93s/it]



320it [15:21,  2.89s/it]



321it [15:24,  2.89s/it]



322it [15:27,  2.91s/it]



323it [15:30,  2.99s/it]



324it [15:33,  2.95s/it]



325it [15:36,  2.97s/it]



326it [15:39,  3.09s/it]



327it [15:42,  3.14s/it]



328it [15:45,  3.10s/it]



329it [15:49,  3.09s/it]



330it [15:51,  3.00s/it]



331it [15:54,  2.96s/it]



332it [15:57,  3.00s/it]



333it [16:00,  3.00s/it]



334it [16:03,  2.98s/it]



335it [16:06,  3.00s/it]



336it [16:09,  3.06s/it]



337it [16:13,  3.12s/it]



338it [16:16,  3.17s/it]



339it [16:19,  3.15s/it]



340it [16:22,  3.13s/it]



341it [16:26,  3.21s/it]



342it [16:29,  3.24s/it]



343it [16:32,  3.18s/it]



344it [16:35,  3.15s/it]



345it [16:38,  3.21s/it]



346it [16:42,  3.20s/it]



347it [16:45,  3.23s/it]



348it [16:48,  3.23s/it]



349it [16:51,  3.07s/it]



350it [16:54,  3.00s/it]



351it [16:56,  2.94s/it]



352it [16:59,  2.95s/it]



353it [17:02,  2.94s/it]



354it [17:05,  2.92s/it]



355it [17:08,  2.94s/it]



356it [17:11,  2.94s/it]



357it [17:14,  2.89s/it]



358it [17:17,  2.88s/it]



359it [17:20,  2.91s/it]



360it [17:22,  2.86s/it]



361it [17:25,  2.84s/it]



362it [17:28,  2.84s/it]



363it [17:31,  2.84s/it]



364it [17:34,  2.82s/it]



365it [17:36,  2.79s/it]



366it [17:39,  2.83s/it]



367it [17:42,  2.88s/it]



368it [17:45,  2.84s/it]



369it [17:48,  2.83s/it]



370it [17:51,  2.81s/it]



371it [17:54,  2.82s/it]



372it [17:56,  2.82s/it]



373it [17:59,  2.85s/it]



374it [18:02,  2.87s/it]



375it [18:05,  2.86s/it]



376it [18:08,  2.85s/it]



377it [18:11,  2.83s/it]



378it [18:14,  2.88s/it]



379it [18:16,  2.88s/it]



380it [18:19,  2.84s/it]



381it [18:22,  2.84s/it]



382it [18:25,  2.89s/it]



383it [18:28,  2.90s/it]



384it [18:31,  2.92s/it]



385it [18:34,  2.96s/it]



386it [18:37,  2.94s/it]



387it [18:40,  2.90s/it]



388it [18:42,  2.86s/it]



389it [18:45,  2.84s/it]



390it [18:48,  2.88s/it]



391it [18:51,  2.89s/it]



392it [18:54,  2.87s/it]



393it [18:57,  2.93s/it]



394it [19:00,  2.96s/it]



395it [19:03,  2.99s/it]



396it [19:06,  2.93s/it]



397it [19:09,  2.88s/it]



398it [19:11,  2.85s/it]



399it [19:14,  2.83s/it]



400it [19:17,  2.79s/it]



401it [19:20,  2.77s/it]

Training loss (for 1 batch) at step 400: 4.5512
Seen so far: 3208 samples


402it [19:22,  2.77s/it]



403it [19:25,  2.76s/it]



404it [19:28,  2.75s/it]



405it [19:31,  2.74s/it]



406it [19:33,  2.75s/it]



407it [19:36,  2.75s/it]



408it [19:39,  2.75s/it]



409it [19:42,  2.75s/it]



410it [19:44,  2.76s/it]



411it [19:47,  2.77s/it]



412it [19:50,  2.76s/it]



413it [19:53,  2.78s/it]



414it [19:56,  2.79s/it]



415it [19:58,  2.77s/it]



416it [20:01,  2.77s/it]



417it [20:04,  2.78s/it]



418it [20:07,  2.79s/it]



419it [20:09,  2.78s/it]



420it [20:12,  2.79s/it]



421it [20:15,  2.77s/it]



422it [20:18,  2.78s/it]



423it [20:21,  2.77s/it]



424it [20:23,  2.78s/it]



425it [20:26,  2.77s/it]



426it [20:29,  2.78s/it]



427it [20:32,  2.78s/it]



428it [20:34,  2.77s/it]



429it [20:37,  2.78s/it]



430it [20:40,  2.78s/it]



431it [20:43,  2.77s/it]



432it [20:46,  2.77s/it]



433it [20:48,  2.78s/it]



434it [20:51,  2.78s/it]



435it [20:54,  2.78s/it]



436it [20:57,  2.78s/it]



437it [20:59,  2.78s/it]



438it [21:02,  2.78s/it]



439it [21:05,  2.80s/it]



440it [21:08,  2.78s/it]



441it [21:11,  2.79s/it]



442it [21:13,  2.79s/it]



443it [21:16,  2.79s/it]



444it [21:19,  2.78s/it]



445it [21:22,  2.80s/it]



446it [21:25,  2.78s/it]



447it [21:27,  2.79s/it]



448it [21:30,  2.79s/it]



449it [21:33,  2.80s/it]



450it [21:36,  2.80s/it]



451it [21:39,  2.79s/it]



452it [21:41,  2.79s/it]



453it [21:44,  2.79s/it]



454it [21:47,  2.78s/it]



455it [21:50,  2.79s/it]



456it [21:53,  2.80s/it]



457it [21:55,  2.79s/it]



458it [21:58,  2.87s/it]



459it [22:01,  2.85s/it]



460it [22:04,  2.84s/it]



461it [22:07,  2.83s/it]



462it [22:10,  2.83s/it]



463it [22:12,  2.81s/it]



464it [22:15,  2.81s/it]



465it [22:18,  2.80s/it]



466it [22:21,  2.79s/it]



467it [22:23,  2.79s/it]



468it [22:26,  2.78s/it]



469it [22:29,  2.78s/it]



470it [22:32,  2.79s/it]



471it [22:35,  2.79s/it]



472it [22:37,  2.79s/it]



473it [22:40,  2.80s/it]



474it [22:43,  2.80s/it]



475it [22:46,  2.79s/it]



476it [22:49,  2.79s/it]



477it [22:51,  2.79s/it]



478it [22:54,  2.80s/it]



479it [22:57,  2.81s/it]



480it [23:00,  2.80s/it]



481it [23:03,  2.81s/it]



482it [23:05,  2.80s/it]



483it [23:08,  2.81s/it]



484it [23:11,  2.81s/it]



485it [23:14,  2.80s/it]



486it [23:17,  2.81s/it]



487it [23:19,  2.80s/it]



488it [23:22,  2.80s/it]



489it [23:25,  2.80s/it]



490it [23:28,  2.78s/it]



491it [23:31,  2.78s/it]



492it [23:33,  2.79s/it]



493it [23:36,  2.81s/it]



494it [23:39,  2.81s/it]



495it [23:42,  2.82s/it]



496it [23:45,  2.82s/it]



497it [23:48,  2.82s/it]



498it [23:50,  2.82s/it]



499it [23:53,  2.83s/it]



500it [23:56,  2.83s/it]



501it [23:59,  2.83s/it]

Training loss (for 1 batch) at step 500: 4.4781
Seen so far: 4008 samples


502it [24:02,  2.84s/it]



503it [24:05,  2.83s/it]



504it [24:07,  2.83s/it]



505it [24:10,  2.83s/it]



506it [24:13,  2.81s/it]



507it [24:16,  2.81s/it]



508it [24:19,  2.80s/it]



509it [24:21,  2.80s/it]



510it [24:24,  2.81s/it]



511it [24:27,  2.80s/it]



512it [24:30,  2.79s/it]



513it [24:33,  2.80s/it]



514it [24:35,  2.82s/it]



515it [24:38,  2.85s/it]



516it [24:41,  2.84s/it]



517it [24:44,  2.85s/it]



518it [24:47,  2.84s/it]



519it [24:50,  2.87s/it]



520it [24:53,  2.85s/it]



521it [24:56,  2.87s/it]



522it [24:58,  2.85s/it]



523it [25:01,  2.84s/it]



524it [25:04,  2.84s/it]



525it [25:07,  2.83s/it]



526it [25:10,  2.83s/it]



527it [25:12,  2.81s/it]



528it [25:15,  2.80s/it]



529it [25:18,  2.82s/it]



530it [25:21,  2.81s/it]



531it [25:24,  2.80s/it]



532it [25:26,  2.83s/it]



533it [25:29,  2.81s/it]



534it [25:32,  2.82s/it]



535it [25:35,  2.82s/it]



536it [25:38,  2.82s/it]



537it [25:40,  2.81s/it]



538it [25:43,  2.82s/it]



539it [25:46,  2.83s/it]



540it [25:49,  2.82s/it]



541it [25:52,  2.82s/it]



542it [25:55,  2.82s/it]



543it [25:57,  2.82s/it]



544it [26:00,  2.82s/it]



545it [26:03,  2.82s/it]



546it [26:06,  2.81s/it]



547it [26:09,  2.80s/it]



548it [26:11,  2.81s/it]



549it [26:14,  2.81s/it]



550it [26:17,  2.83s/it]



551it [26:20,  2.84s/it]



552it [26:23,  2.82s/it]



553it [26:26,  2.82s/it]



554it [26:28,  2.82s/it]



555it [26:31,  2.83s/it]



556it [26:34,  2.83s/it]



557it [26:37,  2.83s/it]



558it [26:40,  2.83s/it]



559it [26:43,  2.83s/it]



560it [26:45,  2.83s/it]



561it [26:48,  2.82s/it]



562it [26:51,  2.82s/it]



563it [26:54,  2.84s/it]



564it [26:57,  2.85s/it]



565it [27:00,  2.85s/it]



566it [27:02,  2.83s/it]



567it [27:05,  2.83s/it]



568it [27:08,  2.82s/it]



569it [27:11,  2.82s/it]



570it [27:14,  2.83s/it]



571it [27:17,  2.83s/it]



572it [27:19,  2.83s/it]



573it [27:22,  2.82s/it]



574it [27:25,  2.83s/it]



575it [27:28,  2.83s/it]



576it [27:31,  2.83s/it]



577it [27:33,  2.81s/it]



578it [27:36,  2.81s/it]



579it [27:39,  2.82s/it]



580it [27:42,  2.82s/it]



581it [27:45,  2.82s/it]



582it [27:48,  2.82s/it]



583it [27:50,  2.82s/it]



584it [27:53,  2.82s/it]



585it [27:56,  2.84s/it]



586it [27:59,  2.82s/it]



587it [28:02,  2.83s/it]



588it [28:05,  2.82s/it]



589it [28:07,  2.83s/it]



590it [28:10,  2.83s/it]



591it [28:13,  2.84s/it]



592it [28:16,  2.84s/it]



593it [28:19,  2.83s/it]



594it [28:22,  2.84s/it]



595it [28:24,  2.83s/it]



596it [28:27,  2.85s/it]



597it [28:30,  2.84s/it]



598it [28:33,  2.84s/it]



599it [28:36,  2.85s/it]



600it [28:39,  2.86s/it]



601it [28:42,  2.89s/it]

Training loss (for 1 batch) at step 600: 4.5560
Seen so far: 4808 samples


602it [28:45,  2.90s/it]



603it [28:47,  2.88s/it]



604it [28:50,  2.87s/it]



605it [28:53,  2.86s/it]



606it [28:56,  2.85s/it]



607it [28:59,  2.86s/it]



608it [29:02,  2.85s/it]



609it [29:04,  2.84s/it]



610it [29:07,  2.84s/it]



611it [29:10,  2.84s/it]



612it [29:13,  2.84s/it]



613it [29:16,  2.85s/it]



614it [29:19,  2.84s/it]



615it [29:22,  2.85s/it]



616it [29:24,  2.85s/it]



617it [29:27,  2.86s/it]



618it [29:30,  2.86s/it]



619it [29:33,  2.87s/it]



620it [29:36,  2.88s/it]



621it [29:39,  2.86s/it]



622it [29:42,  2.86s/it]



623it [29:44,  2.84s/it]



624it [29:47,  2.84s/it]



625it [29:50,  2.84s/it]



626it [29:53,  2.84s/it]



627it [29:56,  2.84s/it]



628it [29:59,  2.87s/it]



629it [30:02,  2.87s/it]



630it [30:04,  2.87s/it]



631it [30:07,  2.86s/it]



632it [30:10,  2.86s/it]



633it [30:13,  2.85s/it]



634it [30:16,  2.86s/it]



635it [30:19,  2.86s/it]



636it [30:22,  2.86s/it]



637it [30:24,  2.85s/it]



638it [30:27,  2.86s/it]



639it [30:30,  2.86s/it]



640it [30:33,  2.86s/it]



641it [30:36,  2.86s/it]



642it [30:39,  2.86s/it]



643it [30:42,  2.87s/it]



644it [30:44,  2.86s/it]



645it [30:47,  2.86s/it]



646it [30:50,  2.86s/it]



647it [30:53,  2.86s/it]



648it [30:56,  2.86s/it]



649it [30:59,  2.87s/it]



650it [31:02,  2.88s/it]



651it [31:04,  2.87s/it]



652it [31:07,  2.87s/it]



653it [31:10,  2.86s/it]



654it [31:13,  2.86s/it]



655it [31:16,  2.86s/it]



656it [31:19,  2.87s/it]



657it [31:22,  2.87s/it]



658it [31:25,  2.87s/it]



659it [31:27,  2.87s/it]



660it [31:30,  2.87s/it]



661it [31:33,  2.88s/it]



662it [31:36,  2.87s/it]



663it [31:39,  2.87s/it]



664it [31:42,  2.86s/it]



665it [31:45,  2.86s/it]



666it [31:47,  2.86s/it]



667it [31:50,  2.86s/it]



668it [31:53,  2.86s/it]



669it [31:56,  2.87s/it]



670it [31:59,  2.86s/it]



671it [32:02,  2.88s/it]



672it [32:05,  2.87s/it]



673it [32:08,  2.88s/it]



674it [32:10,  2.88s/it]



675it [32:13,  2.87s/it]



676it [32:16,  2.87s/it]



677it [32:19,  2.88s/it]



678it [32:22,  2.88s/it]



679it [32:25,  2.87s/it]



680it [32:28,  2.88s/it]



681it [32:31,  2.88s/it]



682it [32:34,  2.89s/it]



683it [32:36,  2.89s/it]



684it [32:39,  2.88s/it]



685it [32:42,  2.89s/it]



686it [32:45,  2.88s/it]



687it [32:48,  2.88s/it]



688it [32:51,  2.89s/it]



689it [32:54,  2.87s/it]



690it [32:57,  2.87s/it]



691it [32:59,  2.88s/it]



692it [33:02,  2.89s/it]



693it [33:05,  2.90s/it]



694it [33:08,  2.89s/it]



695it [33:11,  2.89s/it]



696it [33:14,  2.89s/it]



697it [33:17,  2.87s/it]



698it [33:20,  2.88s/it]



699it [33:23,  2.89s/it]



700it [33:25,  2.88s/it]



701it [33:28,  2.88s/it]

Training loss (for 1 batch) at step 700: 4.5354
Seen so far: 5608 samples


702it [33:31,  2.89s/it]



703it [33:34,  2.89s/it]



704it [33:37,  2.91s/it]



705it [33:40,  2.91s/it]



706it [33:43,  2.91s/it]



707it [33:46,  2.89s/it]



708it [33:49,  2.90s/it]



709it [33:51,  2.89s/it]



710it [33:54,  2.89s/it]



711it [33:57,  2.90s/it]



712it [34:00,  2.90s/it]



713it [34:03,  2.89s/it]



714it [34:06,  2.88s/it]



715it [34:09,  2.88s/it]



716it [34:12,  2.88s/it]



717it [34:15,  2.89s/it]



718it [34:17,  2.89s/it]



719it [34:20,  2.90s/it]



720it [34:23,  2.90s/it]



721it [34:26,  2.90s/it]



722it [34:29,  2.90s/it]



723it [34:32,  2.91s/it]



724it [34:35,  2.90s/it]



725it [34:38,  2.89s/it]



726it [34:41,  2.89s/it]



727it [34:44,  2.88s/it]



728it [34:46,  2.89s/it]



729it [34:49,  2.89s/it]



730it [34:52,  2.91s/it]



731it [34:55,  2.91s/it]



732it [34:58,  2.91s/it]



733it [35:01,  2.92s/it]



734it [35:04,  2.91s/it]



735it [35:07,  2.91s/it]



736it [35:10,  2.89s/it]



737it [35:13,  2.89s/it]



738it [35:15,  2.88s/it]



739it [35:18,  2.89s/it]



740it [35:21,  2.89s/it]



741it [35:24,  2.89s/it]



742it [35:27,  2.90s/it]



743it [35:30,  2.90s/it]



744it [35:33,  2.89s/it]



745it [35:36,  2.88s/it]



746it [35:39,  2.88s/it]



747it [35:42,  2.89s/it]



748it [35:44,  2.88s/it]



749it [35:47,  2.89s/it]



750it [35:50,  2.90s/it]



751it [35:53,  2.92s/it]



752it [35:56,  2.92s/it]



753it [35:59,  2.91s/it]



754it [36:02,  2.93s/it]



755it [36:05,  2.91s/it]



756it [36:08,  2.93s/it]



757it [36:11,  2.93s/it]



758it [36:14,  2.90s/it]



759it [36:16,  2.91s/it]



760it [36:19,  2.93s/it]



761it [36:22,  2.95s/it]



762it [36:25,  2.95s/it]



763it [36:28,  2.96s/it]



764it [36:31,  2.97s/it]



765it [36:34,  2.97s/it]



766it [36:37,  2.97s/it]



767it [36:40,  2.96s/it]



768it [36:43,  2.95s/it]



769it [36:46,  2.94s/it]



770it [36:49,  2.96s/it]



771it [36:52,  2.96s/it]



772it [36:55,  2.98s/it]



773it [36:58,  2.99s/it]



774it [37:01,  2.97s/it]



775it [37:04,  3.00s/it]



776it [37:07,  2.99s/it]



777it [37:10,  2.99s/it]



778it [37:13,  2.99s/it]



779it [37:16,  2.99s/it]



780it [37:19,  2.98s/it]



781it [37:22,  2.99s/it]



782it [37:25,  2.98s/it]



783it [37:28,  2.98s/it]



784it [37:31,  3.00s/it]



785it [37:34,  3.02s/it]



786it [37:37,  3.02s/it]



787it [37:40,  3.11s/it]



788it [37:44,  3.17s/it]



789it [37:47,  3.20s/it]



790it [37:50,  3.17s/it]



791it [37:53,  3.17s/it]



792it [37:57,  3.20s/it]



793it [38:00,  3.23s/it]



794it [38:03,  3.32s/it]



795it [38:07,  3.34s/it]



796it [38:10,  3.31s/it]



797it [38:13,  3.24s/it]



798it [38:16,  3.21s/it]



799it [38:19,  3.20s/it]



800it [38:23,  3.20s/it]



801it [38:26,  3.25s/it]

Training loss (for 1 batch) at step 800: 4.4901
Seen so far: 6408 samples


802it [38:29,  3.23s/it]



803it [38:32,  3.24s/it]



804it [38:36,  3.25s/it]



805it [38:39,  3.28s/it]



806it [38:42,  3.26s/it]



807it [38:45,  3.25s/it]



808it [38:49,  3.28s/it]



809it [38:52,  3.26s/it]



810it [38:55,  3.23s/it]



811it [38:58,  3.19s/it]



812it [39:01,  3.19s/it]



813it [39:05,  3.23s/it]



814it [39:08,  3.30s/it]



815it [39:11,  3.26s/it]



816it [39:15,  3.26s/it]



817it [39:18,  3.26s/it]



818it [39:21,  3.23s/it]



819it [39:24,  3.22s/it]



820it [39:27,  3.20s/it]



821it [39:31,  3.22s/it]



822it [39:34,  3.22s/it]



823it [39:37,  3.21s/it]



824it [39:40,  3.24s/it]



825it [39:43,  3.19s/it]



826it [39:47,  3.15s/it]



827it [39:50,  3.12s/it]



828it [39:53,  3.20s/it]



829it [39:56,  3.22s/it]



830it [39:59,  3.18s/it]



831it [40:03,  3.21s/it]



832it [40:06,  3.24s/it]



833it [40:09,  3.22s/it]



834it [40:12,  3.22s/it]



835it [40:16,  3.22s/it]



836it [40:19,  3.21s/it]



837it [40:22,  3.25s/it]



838it [40:25,  3.22s/it]



839it [40:28,  3.22s/it]



840it [40:32,  3.23s/it]



841it [40:35,  3.25s/it]



842it [40:38,  3.23s/it]



843it [40:41,  3.23s/it]



844it [40:45,  3.24s/it]



845it [40:48,  3.27s/it]



846it [40:51,  3.26s/it]



847it [40:54,  3.25s/it]



848it [40:58,  3.24s/it]



849it [41:01,  3.22s/it]



850it [41:04,  3.21s/it]



851it [41:08,  3.29s/it]



852it [41:11,  3.32s/it]



853it [41:14,  3.28s/it]



854it [41:18,  3.32s/it]



855it [41:21,  3.30s/it]



856it [41:24,  3.25s/it]



857it [41:27,  3.32s/it]



858it [41:31,  3.30s/it]



859it [41:34,  3.27s/it]



860it [41:37,  3.31s/it]



861it [41:41,  3.30s/it]



862it [41:44,  3.28s/it]



863it [41:47,  3.29s/it]



864it [41:50,  3.32s/it]



865it [41:54,  3.33s/it]



866it [41:57,  3.33s/it]



867it [42:00,  3.34s/it]



868it [42:04,  3.30s/it]



869it [42:07,  3.27s/it]



870it [42:10,  3.31s/it]



871it [42:14,  3.34s/it]



872it [42:17,  3.28s/it]



873it [42:20,  3.27s/it]



874it [42:23,  3.20s/it]



875it [42:26,  3.17s/it]



876it [42:30,  3.26s/it]



877it [42:33,  3.24s/it]



878it [42:36,  3.32s/it]



879it [42:40,  3.26s/it]



880it [42:43,  3.27s/it]



881it [42:46,  3.24s/it]



882it [42:49,  3.30s/it]



883it [42:53,  3.31s/it]



884it [42:56,  3.27s/it]



885it [42:59,  3.34s/it]



886it [43:03,  3.36s/it]



887it [43:06,  3.36s/it]



888it [43:10,  3.41s/it]



889it [43:13,  3.45s/it]



890it [43:17,  3.42s/it]



891it [43:20,  3.42s/it]



892it [43:23,  3.43s/it]



893it [43:27,  3.39s/it]



894it [43:30,  3.40s/it]



895it [43:34,  3.42s/it]



896it [43:37,  3.46s/it]



897it [43:41,  3.44s/it]



898it [43:44,  3.42s/it]



899it [43:47,  3.43s/it]



900it [43:51,  3.39s/it]



901it [43:54,  3.35s/it]

Training loss (for 1 batch) at step 900: 4.5509
Seen so far: 7208 samples


902it [43:57,  3.35s/it]



903it [44:01,  3.30s/it]



904it [44:04,  3.27s/it]



905it [44:07,  3.25s/it]



906it [44:10,  3.25s/it]



907it [44:13,  3.23s/it]



908it [44:17,  3.23s/it]



909it [44:20,  3.23s/it]



910it [44:23,  3.27s/it]



911it [44:27,  3.29s/it]



912it [44:30,  3.26s/it]



913it [44:33,  3.27s/it]



914it [44:36,  3.25s/it]



915it [44:39,  3.22s/it]



916it [44:43,  3.22s/it]



917it [44:46,  3.21s/it]



918it [44:49,  3.19s/it]



919it [44:52,  3.21s/it]



920it [44:55,  3.23s/it]



921it [44:59,  3.20s/it]



922it [45:02,  3.21s/it]



923it [45:05,  3.27s/it]



924it [45:09,  3.34s/it]



925it [45:12,  3.32s/it]



926it [45:15,  3.27s/it]



927it [45:18,  3.25s/it]



928it [45:22,  3.26s/it]



929it [45:25,  3.25s/it]



930it [45:28,  3.29s/it]



931it [45:31,  3.26s/it]



932it [45:35,  3.23s/it]



933it [45:38,  3.22s/it]



934it [45:41,  3.25s/it]



935it [45:44,  3.27s/it]



936it [45:48,  3.24s/it]



937it [45:51,  3.33s/it]



938it [45:55,  3.35s/it]



939it [45:58,  3.31s/it]



940it [46:01,  3.34s/it]



941it [46:04,  3.27s/it]



942it [46:08,  3.27s/it]



943it [46:11,  3.22s/it]



944it [46:14,  3.18s/it]



945it [46:17,  3.19s/it]



946it [46:20,  3.16s/it]



947it [46:23,  3.15s/it]



948it [46:26,  3.14s/it]



949it [46:30,  3.17s/it]



950it [46:33,  3.22s/it]



951it [46:36,  3.24s/it]



952it [46:39,  3.26s/it]



953it [46:43,  3.27s/it]



954it [46:46,  3.28s/it]



955it [46:50,  3.33s/it]



956it [46:53,  3.38s/it]



957it [46:56,  3.41s/it]



958it [47:00,  3.43s/it]



959it [47:03,  3.44s/it]



960it [47:07,  3.40s/it]



961it [47:10,  3.38s/it]



962it [47:13,  3.40s/it]



963it [47:17,  3.41s/it]



964it [47:20,  3.35s/it]



965it [47:24,  3.36s/it]



966it [47:27,  3.32s/it]



967it [47:30,  3.35s/it]



968it [47:34,  3.40s/it]



969it [47:37,  3.38s/it]



970it [47:40,  3.32s/it]



971it [47:43,  3.25s/it]



972it [47:47,  3.27s/it]



973it [47:50,  3.23s/it]



974it [47:53,  3.23s/it]



975it [47:56,  3.20s/it]



976it [47:59,  3.19s/it]



977it [48:02,  3.17s/it]



978it [48:06,  3.20s/it]



979it [48:09,  3.24s/it]



980it [48:12,  3.24s/it]



981it [48:16,  3.25s/it]



982it [48:19,  3.23s/it]



983it [48:22,  3.22s/it]



984it [48:25,  3.22s/it]



985it [48:28,  3.23s/it]



986it [48:32,  3.22s/it]



987it [48:35,  3.25s/it]



988it [48:38,  3.27s/it]



989it [48:41,  3.26s/it]



990it [48:45,  3.22s/it]



991it [48:48,  3.21s/it]



992it [48:51,  3.19s/it]



993it [48:54,  3.18s/it]



994it [48:57,  3.17s/it]



995it [49:00,  3.20s/it]



996it [49:04,  3.19s/it]



997it [49:07,  3.18s/it]



998it [49:10,  3.18s/it]



999it [49:13,  3.18s/it]



1000it [49:16,  3.17s/it]



1001it [49:20,  3.19s/it]

Training loss (for 1 batch) at step 1000: 4.5194
Seen so far: 8008 samples


1002it [49:23,  3.19s/it]



1003it [49:26,  3.20s/it]



1004it [49:29,  3.19s/it]



1005it [49:32,  3.19s/it]



1006it [49:36,  3.22s/it]



1007it [49:39,  3.22s/it]



1008it [49:42,  3.21s/it]



1009it [49:45,  3.21s/it]



1010it [49:48,  3.21s/it]



1011it [49:52,  3.23s/it]



1012it [49:55,  3.23s/it]



1013it [49:58,  3.23s/it]



1014it [50:01,  3.22s/it]



1015it [50:05,  3.24s/it]



1016it [50:08,  3.25s/it]



1017it [50:11,  3.26s/it]



1018it [50:15,  3.28s/it]



1019it [50:18,  3.28s/it]



1020it [50:21,  3.27s/it]



1021it [50:24,  3.32s/it]



1022it [50:28,  3.33s/it]



1023it [50:31,  3.30s/it]



1024it [50:34,  3.32s/it]



1025it [50:38,  3.30s/it]



1026it [50:41,  3.30s/it]



1027it [50:44,  3.28s/it]



1028it [50:48,  3.28s/it]



1029it [50:51,  3.33s/it]



1030it [50:54,  3.34s/it]



1031it [50:58,  3.36s/it]



1032it [51:01,  3.37s/it]



1033it [51:04,  3.35s/it]



1034it [51:08,  3.35s/it]



1035it [51:11,  3.38s/it]



1036it [51:15,  3.37s/it]



1037it [51:18,  3.37s/it]



1038it [51:21,  3.35s/it]



1039it [51:25,  3.33s/it]



1040it [51:28,  3.36s/it]



1041it [51:31,  3.36s/it]



1042it [51:35,  3.34s/it]



1043it [51:38,  3.38s/it]



1044it [51:42,  3.39s/it]



1045it [51:45,  3.42s/it]



1046it [51:49,  3.48s/it]



1047it [51:52,  3.47s/it]



1048it [51:56,  3.52s/it]



1049it [51:59,  3.51s/it]



1050it [52:03,  3.46s/it]



1051it [52:06,  3.42s/it]



1052it [52:09,  3.42s/it]



1053it [52:13,  3.46s/it]



1054it [52:16,  3.45s/it]



1055it [52:20,  3.41s/it]



1056it [52:23,  3.42s/it]



1057it [52:27,  3.45s/it]



1058it [52:30,  3.58s/it]



1059it [52:34,  3.53s/it]



1060it [52:37,  3.49s/it]



1061it [52:41,  3.45s/it]



1062it [52:44,  3.42s/it]



1063it [52:47,  3.38s/it]



1064it [52:51,  3.38s/it]



1065it [52:54,  3.37s/it]



1066it [52:57,  3.37s/it]



1067it [53:01,  3.39s/it]



1068it [53:04,  3.37s/it]



1069it [53:07,  3.35s/it]



1070it [53:11,  3.37s/it]



1071it [53:14,  3.35s/it]



1072it [53:18,  3.39s/it]



1073it [53:21,  3.40s/it]



1074it [53:24,  3.38s/it]



1075it [53:28,  3.39s/it]



1076it [53:31,  3.37s/it]



1077it [53:34,  3.36s/it]



1078it [53:38,  3.39s/it]



1079it [53:41,  3.39s/it]



1080it [53:45,  3.37s/it]



1081it [53:48,  3.40s/it]



1082it [53:52,  3.43s/it]



1083it [53:55,  3.48s/it]



1084it [53:59,  3.54s/it]



1085it [54:03,  3.57s/it]



1085it [54:06,  2.99s/it]


KeyboardInterrupt: 

In [9]:
class Classifier_torch:
  def __init__(self, 
                model_name: str,
                max_seq_len: int,
                classes: list[str],
                model_path: str = None,
                dropout: float = 0.1,
                tokenizer: str = None,
                extended: bool = False,
                slot_classes: list[str] = None,
                multi: bool = False
                ):
    self.model_name = model_name
    self.max_seq_len = max_seq_len
    self.classes = classes
    self.model_path = model_path
    self.dropout = dropout
    self.extended = extended
    self.tokenizer = tokenizer
    self.slot_classes = slot_classes
    self.multi = multi

    if not self.tokenizer:
      self.tokenizer = optimus.BertTokenizer.from_pretrained(model_path)
    else:
      self.tokenizer = optimus.BertTokenizer.from_pretrained(self.tokenizer.lower())

    if not self.extended:
      self.bert = optimus.BertModel.from_pretrained(model_name)
      print(self.bert.config.hidden_size)
      self.input_ids = torch.nn.Embedding(len(self.tokenizer), self.bert.config.hidden_size, padding_idx=0)
      self.reshape_layer = torch.nn.functional.adaptive_avgpool1d(dim=1, output_size=1)
      self.dropout_1 = torch.nn.Dropout(dropout)
      self.hidden_out = torch.nn.Linear(self.bert.config.hidden_size, 768)
      self.dropout_2 = torch.nn.Dropout(dropout)
      self.class_out = torch.nn.Linear(768, len(self.classes))
      self.model = torch.nn.Sequential(self.input_ids, self.bert, self.reshape_layer, self.dropout_1, self.hidden_out, self.dropout_2, self.class_out)
    else:
      self.bert = optimus.BertModel.from_pretrained(model_name)
      self.input_ids = torch.nn.Embedding(len(self.tokenizer), self.bert.config.hidden_size, padding_idx=0)
      self.reshape_layer = torch.nn.functional.adaptive_avgpool1d(dim=1, output_size=1)
      self.dropout_1 = torch.nn.Dropout(dropout)
      self.hidden_out = torch.nn.Linear(self.bert.config.hidden_size, 768)
      self.dropout_2 = torch.nn.Dropout(dropout)
      self.intent_classifier = torch.nn.Linear(768, len(self.classes))
      self.slot_classifier = torch.nn.Linear(768, len(self.slot_classes))
      self.model = torch.nn.Sequential(self.input_ids, self.bert, self.reshape_layer, self.dropout_1, self.hidden_out, self.dropout_2)
      self.out_layers = torch.nn.ModuleList([self.intent_classifier, self.slot_classifier])

  def model_metrics(self, optimizer: str = "adam", learning_rate: float = 1e-5, loss: str = "sparse_categorical_crossentropy", metrics: list[str] = ["accuracy"]):
    self.optimizer = getattr(torch.optim, optimizer.lower())(self.model.parameters(), lr=learning_rate)
    self.loss_fn = getattr(torch.nn, loss.lower())()
    self.metrics = [getattr(torch.nn, metric.lower())() for metric in metrics]
    if not self.extended:
      self.model.compile(optimizer=self.optimizer, loss=self.loss_fn, metrics=self.metrics)
    else:
      self.model.compile(optimizer=self.optimizer, loss=self.loss_fn, metrics=self.metrics)
      self.out_loss_fns = [self.loss_fn for _ in range(len(self.out_layers))]
      self.out_metrics = [[metric() for metric in self.metrics] for _ in range(len(self.out_layers))]

  def model_summary(self):
    print(self.model)
    
  def predict(self, text: str):
    encoded_text = self.tokenizer(text, add_special_tokens=True, truncation=True, max_length=self.max_seq_len, return_tensors='pt')
    output = self.model(**encoded_text)
    predictions = torch.nn.functional.softmax(output, dim=1)
    predicted_class = torch.argmax(predictions, dim=1).item()
    return self.classes[predicted_class]

In [5]:
train = pd.concat((train,valid)).reset_index(drop=True)
classes = train.intent.unique().tolist()
len(classes)

7

In [11]:
intent_classifier_model = Classifier_torch("bert", 38, classes, "bert-base-uncased", 0.1, None, False)

OSError: bert is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [None]:
intent_classifier_model.model_metrics()

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

(None, 7)


In [None]:
intent_classifier_model.model_summary()

Model: "model_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_9 (InputLayer)        [(None, 38)]              0         
                                                                 
 tf_bert_model_8 (TFBertMod  TFBaseModelOutputWithPo   109482240 
 el)                         olingAndCrossAttentions             
                             (last_hidden_state=(Non             
                             e, 38, 768),                        
                              pooler_output=(None, 7             
                             68),                                
                              past_key_values=None,              
                             hidden_states=None, att             
                             entions=None, cross_att             
                             entions=None)                       
                                                           