In [176]:
import tensorflow as tf
from transformers import AutoTokenizer,TFAutoModel,pipeline,TFAutoModelForSequenceClassification

In [177]:
tokenizer=AutoTokenizer.from_pretrained('bert-base-cased')

In [178]:
tokens=tokenizer.tokenize('Using a Transformer network is simple')
print(f'tokens : {tokens}')
input_ids=tokenizer.convert_tokens_to_ids(tokens)
print(f'input ids : {input_ids}')
final_inputs=tokenizer.prepare_for_model(input_ids)
print(f"final input ids :{final_inputs['input_ids']}")

tokens : ['Using', 'a', 'Trans', '##former', 'network', 'is', 'simple']
input ids : [7993, 170, 13809, 23763, 2443, 1110, 3014]
final input ids :[101, 7993, 170, 13809, 23763, 2443, 1110, 3014, 102]


In [179]:
#to show how the tokenizer transformed sentence into tokens
decoded_string=tokenizer.decode(final_inputs['input_ids'])

In [180]:
decoded_string

'[CLS] Using a Transformer network is simple [SEP]'

In [181]:
##Experiment with Auto Model and custom toknization, attention, tensors

In [182]:
checkpoint='distilbert-base-uncased-finetuned-sst-2-english'
distbert_tokenizer=AutoTokenizer.from_pretrained(checkpoint)
distbert_model=TFAutoModelForSequenceClassification.from_pretrained(checkpoint)

Some layers from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english were not used when initializing TFDistilBertForSequenceClassification: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english and are newly initialized: ['dropout_138']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [183]:
sents=["I've been waiting for a HuggingFace course my whole life.", "I hate this so much!"]

In [184]:
tokens=distbert_tokenizer(sents,padding=True,truncation=True,return_tensors='tf')

In [185]:
tokens

{'input_ids': <tf.Tensor: shape=(2, 16), dtype=int32, numpy=
array([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662,
        12172,  2607,  2026,  2878,  2166,  1012,   102],
       [  101,  1045,  5223,  2023,  2061,  2172,   999,   102,     0,
            0,     0,     0,     0,     0,     0,     0]])>, 'attention_mask': <tf.Tensor: shape=(2, 16), dtype=int32, numpy=
array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])>}

In [186]:
outputs_logits=distbert_model(tokens).logits

In [187]:
outputs_logits

<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[-1.5606962,  1.6122811],
       [ 4.1692314, -3.3464477]], dtype=float32)>

In [188]:
tf.argmax(tf.math.softmax(outputs_logits))

<tf.Tensor: shape=(2,), dtype=int64, numpy=array([1, 0], dtype=int64)>

In [189]:
distbert_model.config.id2label


{0: 'NEGATIVE', 1: 'POSITIVE'}

### Using Tokenization, attention mask,

In [190]:
tokenizer2=AutoTokenizer.from_pretrained(checkpoint)

In [191]:
model2=TFAutoModelForSequenceClassification.from_pretrained(checkpoint)

Some layers from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english were not used when initializing TFDistilBertForSequenceClassification: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english and are newly initialized: ['dropout_158']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [192]:
for sent in sents:
    print(sent)

I've been waiting for a HuggingFace course my whole life.
I hate this so much!


In [193]:
tokens=[tokenizer2.tokenize(sent) for sent in sents]
input_ids=[tokenizer.convert_tokens_to_ids(token) for token in tokens]

In [194]:
print(tokens,input_ids)

[['i', "'", 've', 'been', 'waiting', 'for', 'a', 'hugging', '##face', 'course', 'my', 'whole', 'life', '.'], ['i', 'hate', 'this', 'so', 'much', '!']] [[178, 112, 1396, 1151, 2613, 1111, 170, 19558, 10931, 1736, 1139, 2006, 1297, 119], [178, 4819, 1142, 1177, 1277, 106]]


In [195]:
for input_id in input_ids:
    print(len(input_id),input_id)

14 [178, 112, 1396, 1151, 2613, 1111, 170, 19558, 10931, 1736, 1139, 2006, 1297, 119]
6 [178, 4819, 1142, 1177, 1277, 106]


In [196]:
input_ids[1]=input_ids[1]+[0,0,0,0,0,0,0,0]


In [197]:
for input_id in input_ids:
    print(len(input_id),input_id)

14 [178, 112, 1396, 1151, 2613, 1111, 170, 19558, 10931, 1736, 1139, 2006, 1297, 119]
14 [178, 4819, 1142, 1177, 1277, 106, 0, 0, 0, 0, 0, 0, 0, 0]


In [198]:
input_ids=tf.constant(input_ids)

In [199]:
input_ids

<tf.Tensor: shape=(2, 14), dtype=int32, numpy=
array([[  178,   112,  1396,  1151,  2613,  1111,   170, 19558, 10931,
         1736,  1139,  2006,  1297,   119],
       [  178,  4819,  1142,  1177,  1277,   106,     0,     0,     0,
            0,     0,     0,     0,     0]])>

In [200]:
output_logits=model2(input_ids).logits ## see output logits and this output logits are different, this is since model giving attentions 0s as separate features. ideally it should be 

In [204]:
output_logits

<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[ 1.6735691 , -1.4508451 ],
       [ 1.0350972 , -0.92355907]], dtype=float32)>

In [201]:
#lets give attension_mask
attention_mask1=tf.constant(tf.ones((1,14),dtype=tf.int64))

In [202]:
attention_mask2=tf.concat([tf.ones((1,7),dtype=tf.int64),tf.zeros((1,7),dtype=tf.int64)],axis=1)

In [203]:
attention_mask=tf.concat([attention_mask1,attention_mask2],axis=0)

In [205]:
attention_mask

<tf.Tensor: shape=(2, 14), dtype=int64, numpy=
array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)>

In [206]:
output_logits=model2(input_ids,attention_mask=attention_mask).logits

In [207]:
output_logits

<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[ 1.6735691, -1.4508451],
       [ 1.3126377, -1.1695657]], dtype=float32)>

In [208]:
predictions=tf.math.softmax(output_logits)

In [209]:
predictions

<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[0.9578886 , 0.04211135],
       [0.92288476, 0.07711524]], dtype=float32)>

In [210]:
tf.argmax(predictions)

<tf.Tensor: shape=(2,), dtype=int64, numpy=array([0, 1], dtype=int64)>

In [211]:
model2.config.id2label

{0: 'NEGATIVE', 1: 'POSITIVE'}