<a href="https://colab.research.google.com/github/jhuarancca/Applied-AI-Building-NLP-Apps-with-Hugging-Face-Transformers/blob/main/Code_04_XX_Building_a_custom_Model_with_Transfer_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import transformers

# Only surface messages at ERROR level from the transformers library,
# which hides its warning messages in the cells below.
transformers.logging.set_verbosity(transformers.logging.ERROR)


In [3]:

!pip install datasets


Collecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.22.2-py3-none-any.

## 04.02. Loading a Hugging Face Dataset

In [5]:
from datasets import load_dataset

# Checkpoint name of the pretrained model on the Hugging Face Hub
model_name = "distilbert-base-uncased"
# Name of the pre-labeled dataset on the Hugging Face Hub
dataset_name = "poem_sentiment"

poem_sentiments = load_dataset(dataset_name)

# Apache Arrow format: show the loaded object, then rows 20-24 of the test split
print(poem_sentiments)
test_split = poem_sentiments["test"]
print(test_split[20:25])

# Names of the sentiment classes, read from the "label" feature of the train split
label_feature = poem_sentiments["train"].features.get("label")
print("\nSentiment Labels used", label_feature.names)

Downloading data:   0%|          | 0.00/35.6k [00:00<?, ?B/s]

TypeError: http_get() got an unexpected keyword argument 'displayed_filename'

## 04.03. Encoding and pre-processing the dataset

In [6]:
#Encoding text

from transformers import DistilBertTokenizer

db_tokenizer = DistilBertTokenizer.from_pretrained(model_name)

def tokenize(batch):
    """Encode the ``verse_text`` entry of ``batch`` with ``db_tokenizer``.

    Args:
        batch: Mapping with a ``"verse_text"`` key holding the text(s) to encode.

    Returns:
        The result of calling ``db_tokenizer`` on those texts with padding
        and truncation both enabled.
    """
    verses = batch["verse_text"]
    encoded = db_tokenizer(verses, truncation=True, padding=True)
    return encoded

# Apply `tokenize` to every split; batch_size=None is passed together with
# batched=True, exactly as in the rest of this notebook's encoding calls.
enc_poem_sentiment = poem_sentiments.map(tokenize, batched=True, batch_size=None)

# Preview the first five encoded training rows
encoded_train = enc_poem_sentiment["train"]
print(encoded_train[:5])


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

NameError: name 'poem_sentiments' is not defined

In [None]:
# Inspect the input IDs and attention mask of one encoded training row

second_row = enc_poem_sentiment["train"][1]
row_input_ids = second_row.get("input_ids")
row_attention_mask = second_row.get("attention_mask")

print("Text :", second_row.get("verse_text"))
print("\nInput Map :", row_input_ids)
print("\nAttention Mask :", row_attention_mask)

# Compare the padded length with the number of positive ids / mask entries
print("\nTotal tokens: ", len(row_input_ids))
print("Non Zero tokens: ",
      sum(1 for token_id in row_input_ids if token_id > 0))
print("Attention = 1: ",
      sum(1 for mask_bit in row_attention_mask if mask_bit > 0))

Text : it flows so long as falls the rain,

Input Map : [101, 2009, 6223, 2061, 2146, 2004, 4212, 1996, 4542, 1010, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

Attention Mask : [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

Total tokens:  28
Non Zero tokens:  11
Attention = 1:  11


In [None]:
# Pick the encoded train and validation splits
training_dataset, validation_dataset = (
    enc_poem_sentiment[split_name] for split_name in ("train", "validation")
)

print("\nColumn Names : ", training_dataset.column_names)
print("\nFeatures : ", training_dataset.features)

# The "label" feature carries the class names; their count sizes the classifier
labels = training_dataset.features.get("label")
num_labels = len(labels.names)



Column Names :  ['id', 'verse_text', 'label', 'input_ids', 'attention_mask']

Features :  {'id': Value(dtype='int32', id=None), 'verse_text': Value(dtype='string', id=None), 'label': ClassLabel(names=['negative', 'positive', 'no_impact', 'mixed'], id=None), 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}


## 04.04. Creating the Model Architecture

In [None]:
from transformers import TFAutoModelForSequenceClassification

# Load the checkpoint named by `model_name` from Hugging Face as a sequence
# classifier, passing the number of dataset labels as `num_labels`.
sentiment_model = TFAutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
)

# Last expression of the cell: display the model configuration
sentiment_model.get_config()


{'vocab_size': 30522,
 'max_position_embeddings': 512,
 'sinusoidal_pos_embds': False,
 'n_layers': 6,
 'n_heads': 12,
 'dim': 768,
 'hidden_dim': 3072,
 'dropout': 0.1,
 'attention_dropout': 0.1,
 'activation': 'gelu',
 'initializer_range': 0.02,
 'qa_dropout': 0.1,
 'seq_classif_dropout': 0.2,
 'return_dict': True,
 'output_hidden_states': False,
 'output_attentions': False,
 'torchscript': False,
 'torch_dtype': None,
 'use_bfloat16': False,
 'tf_legacy_loss': False,
 'pruned_heads': {},
 'tie_word_embeddings': True,
 'is_encoder_decoder': False,
 'is_decoder': False,
 'cross_attention_hidden_size': None,
 'add_cross_attention': False,
 'tie_encoder_decoder': False,
 'max_length': 20,
 'min_length': 0,
 'do_sample': False,
 'early_stopping': False,
 'num_beams': 1,
 'num_beam_groups': 1,
 'diversity_penalty': 0.0,
 'temperature': 1.0,
 'top_k': 50,
 'top_p': 1.0,
 'typical_p': 1.0,
 'repetition_penalty': 1.0,
 'length_penalty': 1.0,
 'no_repeat_ngram_size': 0,
 'encoder_no_repeat_ng

In [None]:
#Control whether the first layer is fine-tuned.
#True keeps it trainable (the recorded summary shows all params as trainable);
#set this to False to freeze the first layer if needed.
sentiment_model.layers[0].trainable = True

#Add/remove layers if needed.
#sentiment_model.layers [append()/insert()/remove()]

#summary() prints the table itself and returns None, so it is not wrapped in
#print() -- doing so appended a stray "None" line to the cell output.
sentiment_model.summary()



Model: "tf_distil_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 distilbert (TFDistilBertMai  multiple                 66362880  
 nLayer)                                                         
                                                                 
 pre_classifier (Dense)      multiple                  590592    
                                                                 
 classifier (Dense)          multiple                  3076      
                                                                 
 dropout_19 (Dropout)        multiple                  0         
                                                                 
Total params: 66,956,548
Trainable params: 66,956,548
Non-trainable params: 0
_________________________________________________________________
None


## 04.05. Training the Sentiment Model

In [None]:
# Fine-tune the pretrained model on the encoded poem dataset

import tensorflow as tf

batch_size = 64
# Column names the tokenizer reports as model inputs
tokenizer_columns = db_tokenizer.model_input_names

# Options shared by both splits when converting to TF datasets
tf_dataset_options = dict(
    columns=tokenizer_columns,
    label_cols=["label"],
    batch_size=batch_size,
)

# Only the training split is shuffled
train_dataset = training_dataset.to_tf_dataset(shuffle=True, **tf_dataset_options)
val_dataset = validation_dataset.to_tf_dataset(shuffle=False, **tf_dataset_options)

adam_optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
# from_logits=True: the loss is told the model outputs are logits
logit_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

sentiment_model.compile(
    optimizer=adam_optimizer,
    loss=logit_loss,
    metrics=tf.metrics.SparseCategoricalAccuracy())

sentiment_model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=5,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f8119af8160>

## 04.06. Predicting Sentiment with the Custom Model

In [None]:
from datasets import Dataset,DatasetDict

#Input data for inference to predict sentiment
# the "label" array is not needed for inference, but added to provide true labels for comparison
infer_data = {'id':[0,1],
             'verse_text':['and be glad in the summer morning when the kindred ride on their way',
                           'how hearts were answering to his own'],
             'label':[1,0]}

infer_dataset = Dataset.from_dict(infer_data)

#Wrap the single dataset in a DatasetDict under the "infer" key
ds_dict=DatasetDict()
ds_dict["infer"] = infer_dataset

print(ds_dict)

#Encode the dataset, similar to training
enc_dataset=ds_dict.map(tokenize, batched=True, batch_size=None)

#Convert to Tensors
#shuffle=False keeps the rows in the same order as infer_data; the predictions
#are later matched back to the verses and true labels by index, so shuffling
#here could pair a prediction with the wrong verse.
infer_final_dataset = enc_dataset["infer"].to_tf_dataset(
    columns=tokenizer_columns,  shuffle=False,
    batch_size=batch_size)

print(infer_final_dataset)

#Predict with the model
predictions=sentiment_model.predict(infer_final_dataset)

DatasetDict({
    infer: Dataset({
        features: ['id', 'verse_text', 'label'],
        num_rows: 2
    })
})


100%|████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 189.46ba/s]


<PrefetchDataset element_spec={'input_ids': TensorSpec(shape=(None, 17), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(None, 17), dtype=tf.int64, name=None)}>


In [None]:
# Display the logits attribute of the prediction output for the inference batch
predictions.logits

array([[-2.3346035,  4.137719 , -1.7129751, -1.0897871],
       [-2.544504 ,  3.4136968, -0.5179995, -1.3775116]], dtype=float32)

In [None]:
import numpy as np

# Index of the largest logit in each row is the predicted label id
pred_label_ids = np.argmax(predictions.logits, axis=1)

# Print each verse with its predicted and true label names
for row_idx, predicted_id in enumerate(pred_label_ids):
    verse = infer_data["verse_text"][row_idx]
    true_id = infer_data["label"][row_idx]
    print("Poem =", verse,
          " Predicted=", labels.names[predicted_id],
          " True-Label=", labels.names[true_id])

Poem = and be glad in the summer morning when the kindred ride on their way  Predicted= positive  True-Label= positive
Poem = how hearts were answering to his own  Predicted= positive  True-Label= negative
