# FINETUNING SENTIMENT CLASSIFICATION MODEL
A notebook showing how to fine-tune a pretrained Hugging Face model (DistilBERT) for sentiment classification on the GLUE SST-2 dataset.

In [25]:
import json
from pprint import pprint

import numpy as np
import torch
from datasets import load_dataset
from datasets import load_metric
from torchinfo import summary
from transformers import pipeline,AutoTokenizer,TrainingArguments,AutoModelForSequenceClassification,Trainer


In [2]:
# Load the SST-2 configuration of the GLUE benchmark. The result is a DatasetDict
# with train / validation / test splits (see the output below).
raw_datasets = load_dataset("glue","sst2")
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [4]:
# Exploration only: list every attribute and method exposed by the train split.
dir(raw_datasets["train"])


['_TF_DATASET_REFS',
 '__class__',
 '__del__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__enter__',
 '__eq__',
 '__exit__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getitems__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_build_local_temp_path',
 '_check_index_is_initialized',
 '_data',
 '_estimate_nbytes',
 '_fingerprint',
 '_format_columns',
 '_format_kwargs',
 '_format_type',
 '_generate_tables_from_cache_file',
 '_generate_tables_from_shards',
 '_get_cache_file_path',
 '_get_output_signature',
 '_getitem',
 '_indexes',
 '_indices',
 '_info',
 '_map_single',
 '_new_dataset_with_indices',
 '_output_all_columns',
 '_push_parquet_shards_to_hub',
 '_save_to_disk_single',
 '_select_contiguous',
 '_select_with_indices_mappin

In [5]:
# Check which concrete class backs a single split.
type(raw_datasets["train"])

datasets.arrow_dataset.Dataset

In [6]:
# Peek at the table behind the split: its column schema plus a preview of the values.
raw_datasets['train'].data

MemoryMappedTable
sentence: string
label: int64
idx: int32
----
sentence: [["hide new secretions from the parental units ","contains no wit , only labored gags ","that loves its characters and communicates something rather beautiful about human nature ","remains utterly satisfied to remain the same throughout ","on the worst revenge-of-the-nerds clichés the filmmakers could dredge up ",...,"you wish you were at home watching that movie instead of in the theater watching this one ","'s no point in extracting the bare bones of byatt 's plot for purposes of bland hollywood romance ","underdeveloped ","the jokes are flat ","a heartening tale of small victories "],["suspense , intriguing characters and bizarre bank robberies , ","a gritty police thriller with all the dysfunctional family dynamics one could wish for ","with a wonderful ensemble cast of characters that bring the routine day to day struggles of the working class to life ","nonetheless appreciates the art and reveals a music sc

In [8]:
# Inspect the column schema. `label` is a ClassLabel whose `names` list gives the
# id -> class mapping (0 = negative, 1 = positive).
raw_datasets['train'].features

{'sentence': Value(dtype='string', id=None),
 'label': ClassLabel(names=['negative', 'positive'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [9]:
# Name of the pretrained checkpoint; it is reused below when loading the model.
checkpoint = "distilbert-base-uncased"
# Load the tokenizer that belongs to this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [10]:
# Example: tokenize the first three training sentences and inspect the encoding.
# The result holds `input_ids` and `attention_mask`; the sequences have different
# lengths because no padding is requested here.
tokenized_sentences = tokenizer(raw_datasets["train"][0:3]["sentence"])
pprint(tokenized_sentences)

{'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
                    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
                    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
 'input_ids': [[101, 5342, 2047, 3595, 8496, 2013, 1996, 18643, 3197, 102],
               [101,
                3397,
                2053,
                15966,
                1010,
                2069,
                4450,
                2098,
                18201,
                2015,
                102],
               [101,
                2008,
                7459,
                2049,
                3494,
                1998,
                10639,
                2015,
                2242,
                2738,
                3376,
                2055,
                2529,
                3267,
                102]]}


In [11]:
# Tokenize every split of the dataset

def tokenize_fn(batch):
    """Tokenize the `sentence` column of a batch with truncation enabled.

    Args:
        batch: mapping that provides the sentences under the 'sentence' key.

    Returns:
        The tokenizer output for those sentences.
    """
    sentences = batch['sentence']
    return tokenizer(sentences, truncation=True)

# batched=True hands tokenize_fn many rows per call instead of one row at a time.
tokenized_datasets = raw_datasets.map(tokenize_fn, batched=True)

In [13]:
# Training configuration: output directory, evaluation / checkpoint cadence and number of epochs

training_args = TrainingArguments(
    output_dir='trainer',         # where checkpoints and other run outputs are written
    num_train_epochs=1,
    evaluation_strategy='epoch',  # evaluate once per epoch
    save_strategy='epoch',        # save a checkpoint once per epoch
)

In [14]:
# Load the pretrained checkpoint for sequence classification with two output labels.
# The classification-head weights are newly initialised (see the warning in the output),
# which is why the model has to be fine-tuned before use.

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
model

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [16]:
# Show a per-layer parameter count for the model using torchinfo's `summary`.
summary(model)

Layer (type:depth-idx)                                  Param #
DistilBertForSequenceClassification                     --
├─DistilBertModel: 1-1                                  --
│    └─Embeddings: 2-1                                  --
│    │    └─Embedding: 3-1                              23,440,896
│    │    └─Embedding: 3-2                              393,216
│    │    └─LayerNorm: 3-3                              1,536
│    │    └─Dropout: 3-4                                --
│    └─Transformer: 2-2                                 --
│    │    └─ModuleList: 3-5                             42,527,232
├─Linear: 1-2                                           590,592
├─Linear: 1-3                                           1,538
├─Dropout: 1-4                                          --
Total params: 66,955,010
Trainable params: 66,955,010
Non-trainable params: 0

In [17]:
# Load the evaluation metric associated with GLUE / SST-2 (accuracy, as the example below shows).
# NOTE(review): the output echoes this source line, which looks like a deprecation
# warning for `load_metric` — consider migrating to the `evaluate` package; confirm.
metric = load_metric("glue","sst2")
# Other metrics can be loaded by name instead, e.g.
# metric = load_metric("f1")

  metric = load_metric("glue","sst2")


Downloading builder script:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

In [18]:
# Sanity check: two of the three predictions match the references, so accuracy is 2/3.
metric.compute(predictions=[1,0,1],references=[1,0,0])

{'accuracy': 0.6666666666666666}

In [19]:
# Metric callback that is handed to the Trainer:

def compute_metrics(logits_and_labels):
    """Convert model logits into class predictions and score them.

    Args:
        logits_and_labels: a pair that unpacks into (logits, labels).

    Returns:
        The result of `metric.compute` for the predicted classes against the labels.
    """
    logits, labels = logits_and_labels
    # The predicted class is the index of the largest logit in each row.
    return metric.compute(
        predictions=np.argmax(logits, axis=1),
        references=labels,
    )

In [20]:
# Wire the model, training configuration, data splits, tokenizer and metric callback into a Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [21]:
# Run fine-tuning for the single epoch configured in `training_args`.
# The returned TrainOutput carries the final training loss and runtime metrics.
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mharelya[0m ([33mdrumming_project[0m). Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/8419 [00:00<?, ?it/s]

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 0.4172, 'learning_rate': 4.7030526190759e-05, 'epoch': 0.06}
{'loss': 0.3371, 'learning_rate': 4.4061052381518e-05, 'epoch': 0.12}
{'loss': 0.3385, 'learning_rate': 4.109157857227699e-05, 'epoch': 0.18}
{'loss': 0.3009, 'learning_rate': 3.812210476303599e-05, 'epoch': 0.24}
{'loss': 0.2979, 'learning_rate': 3.515263095379499e-05, 'epoch': 0.3}
{'loss': 0.2767, 'learning_rate': 3.218315714455399e-05, 'epoch': 0.36}
{'loss': 0.2637, 'learning_rate': 2.9213683335312986e-05, 'epoch': 0.42}
{'loss': 0.2559, 'learning_rate': 2.6244209526071984e-05, 'epoch': 0.48}
{'loss': 0.2336, 'learning_rate': 2.3274735716830978e-05, 'epoch': 0.53}
{'loss': 0.2396, 'learning_rate': 2.0305261907589976e-05, 'epoch': 0.59}
{'loss': 0.2301, 'learning_rate': 1.7335788098348973e-05, 'epoch': 0.65}
{'loss': 0.2145, 'learning_rate': 1.4366314289107971e-05, 'epoch': 0.71}
{'loss': 0.2208, 'learning_rate': 1.1396840479866969e-05, 'epoch': 0.77}
{'loss': 0.2123, 'learning_rate': 8.427366670625965e-06, 'epoc

  0%|          | 0/109 [00:00<?, ?it/s]

{'eval_loss': 0.34477877616882324, 'eval_accuracy': 0.9002293577981652, 'eval_runtime': 2.1795, 'eval_samples_per_second': 400.085, 'eval_steps_per_second': 50.011, 'epoch': 1.0}
{'train_runtime': 900.9988, 'train_samples_per_second': 74.749, 'train_steps_per_second': 9.344, 'train_loss': 0.26330541394812884, 'epoch': 1.0}


TrainOutput(global_step=8419, training_loss=0.26330541394812884, metrics={'train_runtime': 900.9988, 'train_samples_per_second': 74.749, 'train_steps_per_second': 9.344, 'train_loss': 0.26330541394812884, 'epoch': 1.0})

In [22]:
# Save the fine-tuned model to the local directory "my_trained_model".
trainer.save_model("my_trained_model")

In [27]:
# Load the fine-tuned model from disk into a text-classification pipeline.
# Use the first GPU when CUDA is available and fall back to CPU (-1) otherwise,
# so the cell also runs on machines without a GPU.
newmodel = pipeline(
    'text-classification',
    model="my_trained_model",
    device=0 if torch.cuda.is_available() else -1,
)

In [34]:
# Run inference with the fine-tuned model. The label comes back as the generic
# LABEL_0 because the saved config does not contain readable class names yet.
newmodel("I crave Food")

[{'label': 'LABEL_0', 'score': 0.9705724120140076}]

In [35]:
# Replace the generic label names in the saved config with readable class names.
# The id order follows the dataset's ClassLabel: 0 = negative, 1 = positive.
config_path = 'my_trained_model/config.json'
id2label = {0: 'negative', 1: 'positive'}

with open(config_path, encoding='utf-8') as f:
    config = json.load(f)

config['id2label'] = id2label
# Keep the reverse mapping consistent with id2label; updating only one side
# leaves the config with two label vocabularies that disagree.
config['label2id'] = {name: idx for idx, name in id2label.items()}

# json.dump writes the integer ids in id2label as string keys, as JSON requires.
with open(config_path, 'w', encoding='utf-8') as f:
    json.dump(config, f, indent=2)


In [36]:
# Reload the pipeline so it picks up the edited config, then repeat the same query.
# The prediction should now carry a readable class name instead of LABEL_0.
# The device falls back to CPU (-1) when CUDA is unavailable, so the cell also
# runs on machines without a GPU.
newmodel = pipeline(
    'text-classification',
    model="my_trained_model",
    device=0 if torch.cuda.is_available() else -1,
)
newmodel("I crave Food")

[{'label': 'negative', 'score': 0.9705724120140076}]

wandb: Network error (ConnectionError), entering retry loop.
