https://www.kaggle.com/code/cdeotte/tensorflow-roberta-0-705/notebook

In [76]:
import pandas as pd
import tensorflow as tf
import transformers
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


Roberta details
<pre>
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "type_vocab_size": 1,
  "vocab_size": 50265
 </pre>


 Example vocab:
 <pre>
 {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3, ".": 4, "Ġthe": 5, ",": 6, "Ġto": 7, "Ġand": 8, "Ġof": 9, "Ġa": 10, "Ġin": 11, "-": 12, "Ġfor": 13, "Ġthat": 14, "Ġon": 15, "Ġis": 16, "âĢ": 17, "'s": 18, "Ġwith": 19, "ĠThe": 20, "Ġwas": 21, "Ġ\"": 22, "Ġat": 23, "Ġit": 24, "Ġas": 25, "Ġsaid": 26, "Ļ": 27, "Ġbe": 28, "s": 29, "Ġby": 30, "Ġfrom": 31, "Ġare": 32, "Ġhave": 33, "Ġhas": 34, ":": 35, "Ġ(": 36, "Ġhe": 37, "ĠI": 38, "Ġhis": 39, "Ġwill": 40, "Ġan": 41, "Ġthis": 42, ")": 43, "ĠâĢ": 44, "Ġnot": 45, "Ŀ": 46, "Ġyou": 47, "ľ": 48, "Ġtheir": 49, "Ġor": 50, "Ġthey": 51, "Ġwe": 52, "Ġbut": 53, "Ġwho": 54, "Ġmore": 55, "Ġhad": 56
 </pre>

In [None]:
Roberta special tokens
# - ['<s>', '</s>', '<unk>', '<pad>', '<mask>']

special token ids
[0, 2, 3, 1, 50264]

In [15]:
tokenizer.build_inputs_with_special_tokens(['x'], ['y'])

[0, 'x', 2, 2, 'y', 2]

In [8]:
tokenizer.all_special_tokens

['<s>', '</s>', '<unk>', '<pad>', '<mask>']

In [10]:
tokenizer.all_special_ids

[0, 2, 3, 1, 50264]

In [7]:
from transformers import RobertaTokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

Downloading (…)olve/main/vocab.json: 100%|█| 899k/899k [00:0
Downloading (…)olve/main/merges.txt: 100%|█| 456k/456k [00:0
Downloading (…)lve/main/config.json: 100%|█| 481/481 [00:00<


In [52]:
tokenizer('positive')

{'input_ids': [0, 22173, 2], 'attention_mask': [1, 1, 1]}

For some reason, roberta tokenizer from pretrained is different from directly calling ByteLevelBPETokenizer
eg, ID for 'positive' is 22173 in autotokenizer, but 1313 in ByteLevelBPETokenizer

In [77]:
import tokenizers
# does not work
# tokenizers.ByteLevelBPETokenizer.from_pretrained('roberta-base')
ROBERTA_PATH = "/Users/kianyewngieng/.cache/huggingface/hub/models--roberta-base/snapshots/bc2764f8af2e92b6eb5679868df33e224075ca68"
TOKENIZER = tokenizers.ByteLevelBPETokenizer(
    vocab=f"{ROBERTA_PATH}/vocab.json", 
    merges=f"{ROBERTA_PATH}/merges.txt", 
    lowercase=True,
    add_prefix_space=True
)
tokenizer = TOKENIZER

sentiment_id = {'positive': 1313, 'negative':2430, 'neutral': 7974}

train = pd.read_csv('../data/tweet-sentiment-extraction/train.csv').fillna('')

In [65]:
train.shape # 24k sample size for text

(27481, 4)

In [269]:
train.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


##  Tokenize train set

In [78]:
MAX_LEN = 96

ct = train.shape[0]
input_ids = np.ones((ct, MAX_LEN), dtype='int32')
attention_mask = np.zeros((ct, MAX_LEN), dtype='int32')
token_type_ids = np.zeros((ct, MAX_LEN), dtype='int32')
start_tokens = np.zeros((ct, MAX_LEN), dtype='int32')
end_tokens = np.zeros((ct, MAX_LEN), dtype='int32')

for k in range(train.shape[0]):
    # adds a space in front
    text1  = " " + " ".join(train.loc[k, 'text'].split())
    text2 = " ".join(train.loc[k, 'selected_text'].split())
    # return start index for the substring
    idx = text1.find(text2)
    chars = np.zeros((len(text1)))
    
    # token_ids for the sample
    chars[idx:idx+len(text2)] = 1
    # fill up the 
    if text1[idx-1]==' ':
        chars[idx-1] = 1
    
    enc = tokenizer.encode(text1)
    
    # id_offsets
    # For each sub-token returned by the tokenizer, 
    # the offset mapping gives us a tuple indicating the 
    # sub-token’s start position and end position relative to
    # the original token it was split from. 
    # That means that if the first position in the tuple is 
    # anything other than 0, we will set its corresponding label to -100. 
    # While we’re at it, we can also set labels to -100 
    # if the second position of the offset mapping is 0, 
    # since this means it must be a special token like [PAD] or [CLS].
    offsets = []
    idx = 0
    for t in enc.ids:
        w = tokenizer.decode([t])
        offsets.append([idx, idx+len(w)])
        idx += len(w)
        
    # START END TOKENS
    toks = []
    for i, (a,b) in enumerate(offsets):
        sm = np.sum(chars[a:b])
        if sm>0: 
            toks.append(i)
            
    s_tok = sentiment_id[train.loc[k, 'sentiment']]
    input_ids[k, :len(enc.ids)+5] = [0] + enc.ids + [2,2] + [s_tok] + [2]
    attention_mask[k, :len(enc.ids)+5] = 1
    if len(toks)>0:
        start_tokens[k, toks[0]+1] = 1
        end_tokens[k, toks[-1] + 1] = 1
    
#     if k == 2:
#         break

In [132]:
enc.offsets

[(0, 3), (3, 8), (8, 11), (11, 20), (20, 23), (23, 26)]

In [104]:
text1

' my boss is bullying me...'

In [103]:
enc.ids

[127, 3504, 16, 11902, 162, 734]

In [109]:
text1

' my boss is bullying me...'

In [134]:
text2

'bullying me'

##### tokenizer does not split on beginning of sentence " "?

In [108]:
tokenizer.encode(text1).ids

[127, 3504, 16, 11902, 162, 734]

In [112]:
text1_v2 = text1[1:]
text1_v2

'my boss is bullying me...'

In [111]:
tokenizer.encode(text1_v2).ids

[127, 3504, 16, 11902, 162, 734]

In [128]:
tokenizer.decode([127])

' my'

/end

In [131]:
input_ids[2,:]

array([    0,   127,  3504,    16, 11902,   162,   734,     2,     2,
        2430,     2,     1,     1,     1,     1,     1,     1,     1,
           1,     1,     1,     1,     1,     1,     1,     1,     1,
           1,     1,     1,     1,     1,     1,     1,     1,     1,
           1,     1,     1,     1,     1,     1,     1,     1,     1,
           1,     1,     1,     1,     1,     1,     1,     1,     1,
           1,     1,     1,     1,     1,     1,     1,     1,     1,
           1,     1,     1,     1,     1,     1,     1,     1,     1,
           1,     1,     1,     1,     1,     1,     1,     1,     1,
           1,     1,     1,     1,     1,     1,     1,     1,     1,
           1,     1,     1,     1,     1,     1], dtype=int32)

In [130]:
toks

[3, 4]

In [101]:
offsets

[[0, 3], [3, 8], [8, 11], [11, 20], [20, 23], [23, 26]]

In [102]:
chars

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 0., 0., 0.])

In [133]:
start_tokens[2, :]

array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [122]:
end_tokens[2]

array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [None]:
   s_tok = sentiment_id[train.loc[k,'sentiment']]
    input_ids[k,:len(enc.ids)+5] = [0] + enc.ids + [2,2] + [s_tok] + [2]
    attention_mask[k,:len(enc.ids)+5] = 1
    if len(toks)>0:
        start_tokens[k,toks[0]+1] = 1
        end_tokens[k,toks[-1]+1] = 1

Now we arrive at a common obstacle with using pre-trained models for token-level classification: many of the tokens in the W-NUT corpus are not in DistilBert’s vocabulary. Bert and many models like it use a method called WordPiece Tokenization, meaning that single words are split into multiple tokens such that each token is likely to be in the vocabulary. For example, DistilBert’s tokenizer would split the Twitter handle @huggingface into the tokens ['@', 'hugging', '##face']. This is a problem for us because we have exactly one tag per token. If the tokenizer splits a token into multiple sub-tokens, then we will end up with a mismatch between our tokens and our labels.

One way to handle this is to only train on the tag labels for the first subtoken of a split token. We can do this in 🤗 Transformers by setting the labels we wish to ignore to -100. In the example above, if the label for @HuggingFace is 3 (indexing B-corporation), we would set the labels of ['@', 'hugging', '##face'] to [3, -100, -100].

### tokenize test set

In [79]:
test = pd.read_csv('../data/tweet-sentiment-extraction/test.csv').fillna('')
ct = test.shape[0]
input_ids = np.ones((ct, MAX_LEN), dtype='int32')
attention_mask_t = np.zeros((ct, MAX_LEN), dtype='int32')
token_type_ids_t = np.zeros((ct, MAX_LEN), dtype='int32')

for k in range(test.shape[0]):
    text1 = " " + " ".join(test.loc[k, 'text'].split())
    enc = tokenizer.encode(text1)
    s_tok = sentiment_id[test.loc[k,'sentiment']]
    input_ids[k, :len(enc.ids)+5] = [0] + enc.ids + [2,2] + [s_tok] + [2]
    attention_mask_t[k, len(enc.ids)+5] = 1

In [7]:
test.shape

(3534, 3)

In [80]:
from transformers import AutoModel
bert_model = AutoModel.from_pretrained('roberta-base')

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [81]:
import torch.nn as nn
import torch
input_ids = torch.from_numpy(input_ids) # , dtype=torch.int64)
attention_mask = torch.from_numpy(attention_mask_t) # , dtype=torch.int64)
token_type_ids = torch.from_numpy(token_type_ids_t) # , dtype=torch.int64)

In [82]:
input_ids.shape

torch.Size([3534, 96])

In [83]:
attention_mask.shape

torch.Size([3534, 96])

In [84]:
token_type_ids.shape

torch.Size([3534, 96])

In [None]:
sample = 2

In [87]:
input_ids[2].shape

torch.Size([96])

In [172]:
x = bert_model(input_ids=input_ids[[2]], 
           attention_mask=attention_mask[[2]],
           token_type_ids=token_type_ids[[2]] )

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.0518,  0.0734, -0.0339,  ..., -0.1223, -0.0407, -0.0231],
         [-0.0561,  0.0678, -0.0304,  ..., -0.1155, -0.0411, -0.0226],
         [-0.0516,  0.0701, -0.0300,  ..., -0.1202, -0.0383, -0.0211],
         ...,
         [-0.0671,  0.0620, -0.0265,  ..., -0.1117, -0.0480, -0.0200],
         [-0.0671,  0.0620, -0.0265,  ..., -0.1117, -0.0480, -0.0200],
         [-0.0671,  0.0620, -0.0265,  ..., -0.1117, -0.0480, -0.0200]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[ 0.0123, -0.2211, -0.2488, -0.0714,  0.1332,  0.1888,  0.2768, -0.1189,
         -0.0821, -0.1937,  0.2648,  0.0115, -0.1351,  0.0860, -0.1485,  0.4940,
          0.2344, -0.4982,  0.0320, -0.0089, -0.2283,  0.0748,  0.4725,  0.3132,
          0.1108,  0.0729, -0.1433, -0.0355,  0.1917,  0.2477,  0.2759,  0.0387,
          0.1500,  0.2116, -0.2434,  0.0748, -0.3380, -0.0159,  0.2932, -0.1931,
         -0.0731,  0.1764,  0.22

# END

### Comparing Conv1d in pytorch and tensorflow

In [173]:
lhs = x.last_hidden_state
import torch.nn as nn

# weihghts does not match the timestep dimension
x1 = nn.Dropout(0.2)(lhs)
convx1 = nn.Conv1d(in_channels = 768, 
               out_channels=1,
               kernel_size=1,
               stride=1)
x1 = convx1(x1)

In [174]:
convx1.weight.shape

torch.Size([1, 768, 1])

In [120]:
lhs.shape

torch.Size([1, 96, 768])

In [127]:
x = np.random.randn(4, 10, 128)
tf_conv = tf.keras.layers.Conv1D(1,1)
tf_out = tf_conv(x)

In [131]:
nn_conv.weight.shape

torch.Size([1, 128, 1])

In [159]:
nn_conv = nn.Conv1d(128, 1, 1)
nn_conv.weight = nn.Parameter(torch.tensor(tf_conv.get_weights()[0], dtype=torch.float))
nn_out = nn_conv(torch.tensor(x, dtype=torch.float).transpose(-2,-1)).transpose(-2,-1)

In [160]:
nn_out.shape

torch.Size([4, 10, 1])

In [161]:
tf_out.shape

(4, 10, 1)

In [165]:
pt_out = nn_out.detach().numpy()
tf_out = tf_out

In [167]:
pt_out[0]

array([[ 2.1552157 ],
       [-2.577402  ],
       [-0.85579425],
       [-1.0808017 ],
       [-0.09038931],
       [ 1.447653  ],
       [ 0.4062286 ],
       [ 0.38171047],
       [-0.02897149],
       [-0.73734987]], dtype=float32)

In [168]:
tf_out[0]

array([[ 2.2065475 ],
       [-2.5260706 ],
       [-0.8044631 ],
       [-1.0294701 ],
       [-0.0390582 ],
       [ 1.4989845 ],
       [ 0.45756042],
       [ 0.43304208],
       [ 0.02236023],
       [-0.6860184 ]], dtype=float32)

In [135]:
nn_out.shape

torch.Size([4, 1, 128])

In [136]:
tf_out.shape

TensorShape([4, 10, 1])

In [121]:
import tensorflow as tf
input_shape = (4, 10, 128)
x = tf.random.normal(input_shape)
conv = tf.keras.layers.Conv1D(1,1)
out = conv(x)
out.shape


TensorShape([4, 10, 1])

In [125]:
conv.get_weights()[0].shape

(1, 128, 1)

In [45]:
x.shape

TensorShape([4, 10, 128])

In [51]:
conv.get_weights()[0].shape

(1, 128, 1)

In [66]:
x.shape

TensorShape([4, 10, 128])

In [60]:
x[0,0, :].shape

TensorShape([128])

In [68]:
out[0][0]

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.3941005], dtype=float32)>

In [74]:
conv.get_weights()[0].squeeze().shape

(128,)

In [72]:
x[0,0,:].shape

TensorShape([128])

In [75]:
(conv.get_weights()[0].squeeze() * x[0,0, :]).numpy().sum()

0.39410055

In [24]:
x[0,0,:].numpy().sum()

-12.271212

In [28]:
x[0,0,:].numpy().mean()

-0.09586884

In [30]:
x[0,:, 0].numpy().mean()

-0.08947609

In [None]:
x[0,:, 0].numpy().sum()

### Build roberta model (Tensorflow) HAS BUG regarding tensorflow latest versioning

In [184]:
from keras.engine import data_adapter

In [185]:
import tensorflow.python.keras as keras
from keras.engine import data_adapter

In [203]:
ids = tf.keras.layers.Input((MAX_LEN, ), dtype=tf.int32)
att = tf.keras.layers.Input((MAX_LEN, ), dtype=tf.int32)
tok = tf.keras.layers.Input((MAX_LEN, ), dtype=tf.int32)

from transformers import TFRobertaModel
config = RobertaConfig.from_pretrained('config-roberta-base.json')
bert_model = TFRobertaModel.from_pretrained('pretrained-roberta-base.h5',config=config)
x = bert_model(ids,attention_mask=att,token_type_ids=tok)

### References

1. Tokenizer details from HF (refer to this if you want to train own tokenizer)
    - https://huggingface.co/docs/tokenizers/pipeline
    
    
2. difference between autotokenizer.from_pretrained and specifictokenizer.from_pretrained
    - gs: huggingface autotokenizer different from tokenizer
    - https://github.com/huggingface/transformers/issues/5587
    

3. Why do we use offsets?
    - gs: huggingface tokenizer offsets
    -
https://huggingface.co/transformers/v4.2.2/custom_datasets.html#:~:text=For%20each%20sub%2Dtoken%20returned,its%20corresponding%20label%20to%20%2D100%20.


4.Difference between token ids and attention mask
- [ref](https://jaketae.github.io/category/common-sense/#:~:text=Input%20IDs%20are%20obvious%3A%20these,where%20two%20sentences%20are%20given.)

- input IDs are obvious: these are simply mappings between tokens and their respective IDs. The attention mask is to prevent the model from looking at padding tokens. The token type IDs are used typically in a next sentence prediction tasks, where two sentences are given.


5. Error from using TFRobertaModel
- No module named 'keras.engine'
- Due to most latest version of tensorflow ... 
- downloaded the tf_model.h5 and saved in current directory, but does work

6. What doe conv1d(filters=1,kernel_size=1) mean
- This means that we are using 1 filters (output channel), kernel_size=1 means specifies the length of the 1D convolution window.
- Convolution is applied on the second dimension. eg (4, 10, 128) -> convolution is applied on dim=1 (i.e shape=(10,)). 
    - To run sample, need to use the convolution weights

7. When to use crossentropy loss and when to use binary cross entropy loss
- gs: (pytorch cross entropy loss vs binarycrossentropy loss)
- https://medium.com/dejunhuang/learning-day-57-practical-5-loss-function-crossentropyloss-vs-bceloss-in-pytorch-softmax-vs-bd866c8a0d23
    - BinaryCrossEntropy() loss should be used for binary, crossentropy loss should be used for multi-class classification
    - Reason:
       - crossentropy loss will take as input a target y that can take on values (0,C), and expects the predictions to be of shape (batch_size, num_classes)
        - BinaryCrossEntropy loss will take in as input a target y that can take on values (0,1), and expects the predictions to be of shape (batch_size, 1)
        - CrossEntropy can be used for binary classification. 
           - Using sigmoid
               - Final output is shape (batch_size, 2) because crossentropy loss needs to take in predictions with (bsize, num_classes)
               - This means that we use one sigmoid for each 0 and 1 prediction before feeding it into CrossEntropy loss. 
               - But output probabilities will be meaningless. eg σ([-2.34, 3.45])=[8.79%, 96.9%] does not make sense
           - Using softmax
               Final output is also of shape (batch_size, 2) because crossentropy loss requires predictions with shape (bsize,num_classes)
           - However, for binary classification where there are only two classes, the output from softmax tends to always be close to 0 and close to one. Eg. softmax([-2,34, 3,45])=[0.3%, 99.7%]
           - So softmax is only suitable for multi-class classification.
           
- When to use binarycrossentropy, crossentroppy loss
    - TLDR: BCE: multilabel, binary classification, CE: multiclass
- https://stackoverflow.com/questions/59336899/which-loss-function-and-metrics-to-use-for-multi-label-classification-with-very
    - gs: multi label classification binarycross entorpy
- https://towardsdatascience.com/cross-entropy-for-classification-d98e7f974451
    - gs: crossentropy loss for multi label classification
- https://discuss.pytorch.org/t/what-kind-of-loss-is-better-to-use-in-multilabel-classification/32203
    - gs: loss function for multi label classification pytorch

In [None]:
()