<a href="https://colab.research.google.com/github/ilirsheraj/Transformers/blob/main/Sentiment_Fine_Tunning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers datasets



In [2]:
from datasets import load_dataset
import numpy as np

In [None]:
# We can use this dataset but its too large. Try on a separate notebook
# https://huggingface.co/datasets/amazon_polarity
# dataset = load_dataset("amazon_polarity")

In [3]:
raw_datasets = load_dataset("glue", "sst2")

In [4]:
# Dictionary showing the structure of the dataset
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [5]:
# Have a look at the training dataset
raw_datasets["train"]

Dataset({
    features: ['sentence', 'label', 'idx'],
    num_rows: 67349
})

In [6]:
# See the methods and attributes of the object
dir(raw_datasets["train"])

['_TF_DATASET_REFS',
 '__class__',
 '__del__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__enter__',
 '__eq__',
 '__exit__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getitems__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_build_local_temp_path',
 '_check_index_is_initialized',
 '_data',
 '_estimate_nbytes',
 '_fingerprint',
 '_format_columns',
 '_format_kwargs',
 '_format_type',
 '_generate_tables_from_cache_file',
 '_generate_tables_from_shards',
 '_get_cache_file_path',
 '_get_output_signature',
 '_getitem',
 '_indexes',
 '_indices',
 '_info',
 '_map_single',
 '_new_dataset_with_indices',
 '_output_all_columns',
 '_push_parquet_shards_to_hub',
 '_save_to_disk_single',
 '_select_contiguous',
 '_select_with_indices_mappin

In [7]:
# check the type of the object at hand: Good for documentation
type(raw_datasets["train"])

datasets.arrow_dataset.Dataset

In [8]:
raw_datasets["train"].data

MemoryMappedTable
sentence: string
label: int64
idx: int32
----
sentence: [["hide new secretions from the parental units ","contains no wit , only labored gags ","that loves its characters and communicates something rather beautiful about human nature ","remains utterly satisfied to remain the same throughout ","on the worst revenge-of-the-nerds clichés the filmmakers could dredge up ",...,"you wish you were at home watching that movie instead of in the theater watching this one ","'s no point in extracting the bare bones of byatt 's plot for purposes of bland hollywood romance ","underdeveloped ","the jokes are flat ","a heartening tale of small victories "],["suspense , intriguing characters and bizarre bank robberies , ","a gritty police thriller with all the dysfunctional family dynamics one could wish for ","with a wonderful ensemble cast of characters that bring the routine day to day struggles of the working class to life ","nonetheless appreciates the art and reveals a music sc

In [9]:
# We can index it like a list
raw_datasets["train"][0]

{'sentence': 'hide new secretions from the parental units ',
 'label': 0,
 'idx': 0}

In [10]:
raw_datasets["train"][50000:50003]

{'sentence': ['glow ',
  'a classical dramatic animated feature ',
  'best espionage picture '],
 'label': [1, 1, 1],
 'idx': [50000, 50001, 50002]}

In [11]:
# Have a look at the features
raw_datasets["train"].features

{'sentence': Value(dtype='string', id=None),
 'label': ClassLabel(names=['negative', 'positive'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [12]:
from transformers import AutoTokenizer

In [13]:
# Define the model: stilbert trains faster
# checkpoint = "bert-base-uncased"
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [14]:
tokenized_sentences = tokenizer(raw_datasets["train"][0:3]["sentence"])

from pprint import pprint
pprint(tokenized_sentences)

{'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
                    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
                    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
 'input_ids': [[101, 5342, 2047, 3595, 8496, 2013, 1996, 18643, 3197, 102],
               [101,
                3397,
                2053,
                15966,
                1010,
                2069,
                4450,
                2098,
                18201,
                2015,
                102],
               [101,
                2008,
                7459,
                2049,
                3494,
                1998,
                10639,
                2015,
                2242,
                2738,
                3376,
                2055,
                2529,
                3267,
                102]]}


In [18]:
def tokenize_fn(batch):
    return tokenizer(batch["sentence"], truncation=True)

In [19]:
# Pass the argument 'truncation=True'
tokenized_datasets = raw_datasets.map(tokenize_fn, batched=True)

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

In [20]:
from transformers import TrainingArguments

In [21]:
training_args = TrainingArguments("my_trainer",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=1)

In [22]:
from transformers import AutoModelForSequenceClassification

In [23]:
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2
)

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
type(model)

transformers.models.distilbert.modeling_distilbert.DistilBertForSequenceClassification

In [25]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [26]:
!pip install torchinfo

Collecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl (23 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.8.0


In [27]:
from torchinfo import summary
summary(model)

Layer (type:depth-idx)                                  Param #
DistilBertForSequenceClassification                     --
├─DistilBertModel: 1-1                                  --
│    └─Embeddings: 2-1                                  --
│    │    └─Embedding: 3-1                              23,440,896
│    │    └─Embedding: 3-2                              393,216
│    │    └─LayerNorm: 3-3                              1,536
│    │    └─Dropout: 3-4                                --
│    └─Transformer: 2-2                                 --
│    │    └─ModuleList: 3-5                             42,527,232
├─Linear: 1-2                                           590,592
├─Linear: 1-3                                           1,538
├─Dropout: 1-4                                          --
Total params: 66,955,010
Trainable params: 66,955,010
Non-trainable params: 0

In [28]:
# Parameters before training
# This is a sanity check to make sure we have trained the whole model
params_before = []
for name, p in model.named_parameters():
    params_before.append(p.detach().cpu().numpy())

In [30]:
params_before[:2]

[array([[-0.01664949, -0.06661227, -0.01632868, ..., -0.01999032,
         -0.05139988, -0.0263568 ],
        [-0.01319846, -0.06733431, -0.01605646, ..., -0.0226614 ,
         -0.05537301, -0.02600443],
        [-0.01759106, -0.07094341, -0.01443494, ..., -0.02457913,
         -0.05956192, -0.0231829 ],
        ...,
        [-0.0231029 , -0.05878259, -0.01048967, ..., -0.01945743,
         -0.02615411, -0.02118432],
        [-0.0490171 , -0.05614787, -0.00465348, ..., -0.01065376,
         -0.01797333, -0.02187675],
        [-0.00646111, -0.0914881 , -0.00254872, ..., -0.01505679,
         -0.05040044,  0.04597744]], dtype=float32),
 array([[ 1.7505383e-02, -2.5631009e-02, -3.6641564e-02, ...,
          3.3437202e-05,  6.8312453e-04,  1.5441139e-02],
        [ 7.7580423e-03,  2.2613001e-03, -1.9444324e-02, ...,
          2.8909724e-02,  2.9752752e-02, -5.3246655e-03],
        [-1.1287465e-02, -1.9644140e-03, -1.1572698e-02, ...,
          1.4907907e-02,  1.8740905e-02, -7.3139993e-03]

In [31]:
# Import the Trainer class
from transformers import Trainer
from datasets import load_metric

In [32]:
# Call the metric function
metric = load_metric("glue", "sst2")

  metric = load_metric("glue", "sst2")


Downloading builder script:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

In [33]:
# expect 66.6 accuracy: 2/3
metric.compute(predictions=[1,0,1], references=[1,0,0])

{'accuracy': 0.6666666666666666}

In [43]:
# define a compute metric function
def compute_metrics(logits_and_labels):
    # metric = load_metric("glue", "sst2")
    logits, labels = logits_and_labels
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [44]:
trainer = Trainer(
    model,
    training_args,
    train_dataset = tokenized_datasets["train"],
    eval_dataset = tokenized_datasets["validation"],
    tokenizer = tokenizer,
    compute_metrics = compute_metrics,
)

In [45]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.1807,0.447572,0.894495


TrainOutput(global_step=8419, training_loss=0.0975096365351733, metrics={'train_runtime': 477.8114, 'train_samples_per_second': 140.953, 'train_steps_per_second': 17.62, 'total_flos': 518596929468840.0, 'train_loss': 0.0975096365351733, 'epoch': 1.0})

In [46]:
trainer.save_model("my_saved_model")

In [47]:
# Look for the model
!ls

my_saved_model	my_trainer  sample_data


In [48]:
!ls my_saved_model

added_tokens.json  pytorch_model.bin	    tokenizer_config.json  training_args.bin
config.json	   special_tokens_map.json  tokenizer.json	   vocab.txt


In [49]:
from transformers import pipeline

In [51]:
# Use GPU
newmodel = pipeline("text-classification", model="my_saved_model", device=0)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [52]:
newmodel("This movie is great!")

[{'label': 'LABEL_1', 'score': 0.9993317723274231}]

In [53]:
newmodel("This movie sucks!")

[{'label': 'LABEL_0', 'score': 0.9956342577934265}]

In [54]:
# Apparently label 1 is positive and 0 is negative
!cat my_saved_model/config.json

{
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.34.0",
  "vocab_size": 30522
}


In [55]:
import json

In [56]:
# Add labels as positive (1) and negative (0)
config_path = "my_saved_model/config.json"
with open(config_path) as f:
    j = json.load(f)

j["id2label"] = {0: "negative", 1: "positive"}

with open(config_path, "w") as f:
    json.dump(j, f, indent=2)

In [57]:
# Check the labels
!cat my_saved_model/config.json

{
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.34.0",
  "vocab_size": 30522,
  "id2label": {
    "0": "negative",
    "1": "positive"
  }
}

In [58]:
# Run again and see
newmodel = pipeline("text-classification", model="my_saved_model", device=0)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [59]:
newmodel("This movie is great!")

[{'label': 'positive', 'score': 0.9993317723274231}]

In [60]:
# Get all parameters after training
params_after = []
for name, p in model.named_parameters():
    params_after.append(p.detach().cpu().numpy())

In [61]:
# We expect parameters to be non-zero
for p1, p2 in zip(params_before, params_after):
    print(np.sum(np.abs(p1 - p2)))

28990.592
160.2184
2.7722688
1.9779114
2208.0889
2.575203
2218.5125
0.004108212
2014.4374
1.8077519
1961.0247
1.3654768
2.880487
1.3526096
8428.852
8.864708
7866.233
1.1336356
2.6692069
1.3467124
2141.4019
2.5275674
2159.4102
0.0046377545
1973.3915
1.4730713
1906.6742
1.1739159
2.8097363
1.1918786
8366.727
8.530121
7774.637
1.1204177
2.5001109
1.3019423
2138.9343
2.682076
2139.5369
0.003972491
1930.8479
1.22681
1912.898
1.161828
2.7285771
1.2786924
8391.866
8.897813
7651.4307
1.1144354
2.4846454
1.0973437
2120.2808
2.364664
2158.109
0.004271179
1979.1726
1.1683881
1903.6853
1.146492
2.5626626
1.1801085
8301.223
9.305192
7386.43
1.2599852
2.3492184
0.97808784
2077.1038
2.5775783
2045.8901
0.0031077345
1778.1763
1.1015389
1754.7693
1.5312967
2.356783
1.5475953
7935.4023
9.263231
6814.6616
1.6033849
2.2983162
1.7476771
1942.5835
2.385489
1963.5518
0.0018186525
1622.7856
2.0937417
1646.8353
2.689726
2.511653
2.7619553
6618.7944
9.328484
6691.805
2.2844505
4.0378685
1.333447
1527.9478
1.822

In [64]:
from google.colab import files
files.download("my_saved_model")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>