<a href="https://colab.research.google.com/github/hamednasr/transformers/blob/main/fite_tuning_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip -q install transformers datasets

In [2]:
from datasets import load_dataset, load_metric
from transformers import (AutoTokenizer,
                          TrainingArguments,
                          Trainer,
                          AutoModelForSequenceClassification,
                          pipeline,
                          AutoConfig)
import numpy as np

# with a hugging face dataset

In [3]:
raw_datasets = load_dataset('glue','sst2')
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [4]:
raw_datasets['test']

Dataset({
    features: ['sentence', 'label', 'idx'],
    num_rows: 1821
})

In [5]:
pprint()

Pretty printing has been turned OFF


In [6]:
dir(raw_datasets['test'])

['_TF_DATASET_REFS', '__class__', '__del__', '__delattr__', '__dict__', '__dir__', '__doc__', '__enter__', '__eq__', '__exit__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getitems__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_build_local_temp_path', '_check_index_is_initialized', '_data', '_estimate_nbytes', '_fingerprint', '_format_columns', '_format_kwargs', '_format_type', '_generate_tables_from_cache_file', '_generate_tables_from_shards', '_get_cache_file_path', '_get_output_signature', '_getitem', '_indexes', '_indices', '_info', '_map_single', '_new_dataset_with_indices', '_output_all_columns', '_push_parquet_shards_to_hub', '_save_to_disk_single', '_select_contiguous', '_select_with_indices_mapping', '_split', 'add_column', 'add_elasticsearch_index', '

In [7]:
raw_datasets['test']['sentence'][:3]

['uneasy mishmash of styles and genres .', "this film 's relationship to actual tension is the same as what christmas-tree flocking in a spray can is to actual snow : a poor -- if durable -- imitation .", 'by the end of no such thing the audience , like beatrice , has a watchful affection for the monster .']

In [8]:
raw_datasets['test']['label'][:3]

[-1, -1, -1]

In [9]:
type(raw_datasets['test'])

<class 'datasets.arrow_dataset.Dataset'>

In [10]:
raw_datasets['test'].data

MemoryMappedTable
sentence: string
label: int64
idx: int32
----
sentence: [["uneasy mishmash of styles and genres .","this film 's relationship to actual tension is the same as what christmas-tree flocking in a spray can is to actual snow : a poor -- if durable -- imitation .","by the end of no such thing the audience , like beatrice , has a watchful affection for the monster .","director rob marshall went out gunning to make a great one .","lathan and diggs have considerable personal charm , and their screen rapport makes the old story seem new .",...,"what it lacks in originality it makes up for in intelligence and b-grade stylishness .","the acting alone is worth the price of admission .","it 's one of the saddest films i have ever seen that still manages to be uplifting but not overly sentimental .","the only thing that could possibly make them less interesting than they already are is for them to get full montied into a scrappy , jovial team .","... a big , baggy , sprawling carni

In [11]:
raw_datasets['train'].data

MemoryMappedTable
sentence: string
label: int64
idx: int32
----
sentence: [["hide new secretions from the parental units ","contains no wit , only labored gags ","that loves its characters and communicates something rather beautiful about human nature ","remains utterly satisfied to remain the same throughout ","on the worst revenge-of-the-nerds clichés the filmmakers could dredge up ",...,"you wish you were at home watching that movie instead of in the theater watching this one ","'s no point in extracting the bare bones of byatt 's plot for purposes of bland hollywood romance ","underdeveloped ","the jokes are flat ","a heartening tale of small victories "],["suspense , intriguing characters and bizarre bank robberies , ","a gritty police thriller with all the dysfunctional family dynamics one could wish for ","with a wonderful ensemble cast of characters that bring the routine day to day struggles of the working class to life ","nonetheless appreciates the art and reveals a music sc

In [12]:
set(raw_datasets['train']['label'])

{0, 1}

In [13]:
raw_datasets['test'][0]

{'sentence': 'uneasy mishmash of styles and genres .', 'label': -1, 'idx': 0}

In [14]:
raw_datasets['test'].features

{'sentence': Value(dtype='string', id=None), 'label': ClassLabel(names=['negative', 'positive'], id=None), 'idx': Value(dtype='int32', id=None)}

In [15]:
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

In [16]:
raw_datasets['test'][:2]['sentence']

['uneasy mishmash of styles and genres .', "this film 's relationship to actual tension is the same as what christmas-tree flocking in a spray can is to actual snow : a poor -- if durable -- imitation ."]

In [17]:
tokenizer(raw_datasets['test'][0:2]['sentence'])

{'input_ids': [[101, 15491, 28616, 22444, 4095, 1997, 6782, 1998, 11541, 1012, 102], [101, 2023, 2143, 1005, 1055, 3276, 2000, 5025, 6980, 2003, 1996, 2168, 2004, 2054, 4234, 1011, 3392, 19311, 2075, 1999, 1037, 12509, 2064, 2003, 2000, 5025, 4586, 1024, 1037, 3532, 1011, 1011, 2065, 25634, 1011, 1011, 20017, 1012, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [18]:
def tokenize_fn(batch):
  return tokenizer(batch['sentence'], truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_fn, batched=True)

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

In [19]:
training_args = TrainingArguments(
                                  output_dir = 'my_trainer',
                                  evaluation_strategy = 'epoch',
                                  save_strategy = 'epoch',
                                  num_train_epochs=1
                              )

In [None]:
# pip install accelerate -U

In [21]:
metric = load_metric('glue','sst2')

  metric = load_metric('glue','sst2')


In [22]:
config = AutoConfig.from_pretrained('distilbert-base-uncased')
config

DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.33.3",
  "vocab_size": 30522
}


In [None]:
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased',
                                                           num_labels=2,
                                                           config = config)

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [None]:
type(model)

<class 'transformers.models.distilbert.modeling_distilbert.DistilBertForSequenceClassification'>

In [None]:
!pip -q install torchinfo

In [None]:
from torchinfo import summary

In [None]:
summary(model)

Layer (type:depth-idx)                                  Param #
DistilBertForSequenceClassification                     --
├─DistilBertModel: 1-1                                  --
│    └─Embeddings: 2-1                                  --
│    │    └─Embedding: 3-1                              23,440,896
│    │    └─Embedding: 3-2                              393,216
│    │    └─LayerNorm: 3-3                              1,536
│    │    └─Dropout: 3-4                                --
│    └─Transformer: 2-2                                 --
│    │    └─ModuleList: 3-5                             42,527,232
├─Linear: 1-2                                           590,592
├─Linear: 1-3                                           1,538
├─Dropout: 1-4                                          --
Total params: 66,955,010
Trainable params: 66,955,010
Non-trainable params: 0

In [None]:
#dummy code:
metric.compute(predictions=[1,1,1], references=[1,0,0])

{'accuracy': 0.3333333333333333}

In [None]:
def compute_metrics(logits_labels):
  logits, labels = logits_labels
  predictions = np.argmax(logits, axis = -1)
  return metric.compute(predictions=predictions, references=labels)

In [None]:
trainer = Trainer(
    model,
    training_args,
    train_dataset = tokenized_datasets['train'],
    eval_dataset = tokenized_datasets['validation'],
    compute_metrics = compute_metrics,
    tokenizer = tokenizer
)

In [None]:
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2193,0.343674,0.908257


TrainOutput(global_step=8419, training_loss=0.26746820426494156, metrics={'train_runtime': 514.7267, 'train_samples_per_second': 130.844, 'train_steps_per_second': 16.356, 'total_flos': 518596929468840.0, 'train_loss': 0.26746820426494156, 'epoch': 1.0})

In [None]:
trainer.save_model('my_model')

In [None]:
!ls

my_model  my_trainer  sample_data


In [None]:
from transformers import pipeline

In [None]:
newmodel = pipeline('text-classification', model = 'my_model', device=0)

In [None]:
newmodel('this was  a very good movie!')

[{'label': 'LABEL_1', 'score': 0.9992859959602356}]

In [None]:
newmodel('this was a terrible movie!')

[{'label': 'LABEL_0', 'score': 0.9977071285247803}]

In [None]:
newmodel('this was not a very good movie!')

[{'label': 'LABEL_0', 'score': 0.998497486114502}]

# with a custom dataset

In [20]:
!wget -nc https://lazyprogrammer.me/course_files/AirlineTweets.csv

--2023-09-30 13:17:49--  https://lazyprogrammer.me/course_files/AirlineTweets.csv
Resolving lazyprogrammer.me (lazyprogrammer.me)... 172.67.213.166, 104.21.23.210, 2606:4700:3031::6815:17d2, ...
Connecting to lazyprogrammer.me (lazyprogrammer.me)|172.67.213.166|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3421431 (3.3M) [text/csv]
Saving to: ‘AirlineTweets.csv’


2023-09-30 13:17:50 (5.87 MB/s) - ‘AirlineTweets.csv’ saved [3421431/3421431]



In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, f1_score

In [22]:
df = pd.read_csv('AirlineTweets.csv')
df

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0000,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0000,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0000,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0000,Can't Tell,1.0000,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14635,569587686496825344,positive,0.3487,,0.0000,American,,KristenReenders,,0,@AmericanAir thank you we got on a different f...,,2015-02-22 12:01:01 -0800,,
14636,569587371693355008,negative,1.0000,Customer Service Issue,1.0000,American,,itsropes,,0,@AmericanAir leaving over 20 minutes Late Flig...,,2015-02-22 11:59:46 -0800,Texas,
14637,569587242672398336,neutral,1.0000,,,American,,sanyabun,,0,@AmericanAir Please bring American Airlines to...,,2015-02-22 11:59:15 -0800,"Nigeria,lagos",
14638,569587188687634433,negative,1.0000,Customer Service Issue,0.6659,American,,SraJackson,,0,"@AmericanAir you have my money, you change my ...",,2015-02-22 11:59:02 -0800,New Jersey,Eastern Time (US & Canada)


In [23]:
df = df[['airline_sentiment','text']]
df

Unnamed: 0,airline_sentiment,text
0,neutral,@VirginAmerica What @dhepburn said.
1,positive,@VirginAmerica plus you've added commercials t...
2,neutral,@VirginAmerica I didn't today... Must mean I n...
3,negative,@VirginAmerica it's really aggressive to blast...
4,negative,@VirginAmerica and it's a really big bad thing...
...,...,...
14635,positive,@AmericanAir thank you we got on a different f...
14636,negative,@AmericanAir leaving over 20 minutes Late Flig...
14637,neutral,@AmericanAir Please bring American Airlines to...
14638,negative,"@AmericanAir you have my money, you change my ..."


In [24]:
df['target'] = df['airline_sentiment'].map({'positive':1, 'negative':-1, 'neutral':0})
df = df[['text','target']]
# the column name should be label
df.columns = ['sentence','label']
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['target'] = df['airline_sentiment'].map({'positive':1, 'negative':-1, 'neutral':0})


Unnamed: 0,sentence,label
0,@VirginAmerica What @dhepburn said.,0
1,@VirginAmerica plus you've added commercials t...,1
2,@VirginAmerica I didn't today... Must mean I n...,0
3,@VirginAmerica it's really aggressive to blast...,-1
4,@VirginAmerica and it's a really big bad thing...,-1
...,...,...
14635,@AmericanAir thank you we got on a different f...,1
14636,@AmericanAir leaving over 20 minutes Late Flig...,-1
14637,@AmericanAir Please bring American Airlines to...,0
14638,"@AmericanAir you have my money, you change my ...",-1


In [25]:
df.to_csv('data.csv', index=None)

In [26]:
pd.read_csv('data.csv')

Unnamed: 0,sentence,label
0,@VirginAmerica What @dhepburn said.,0
1,@VirginAmerica plus you've added commercials t...,1
2,@VirginAmerica I didn't today... Must mean I n...,0
3,@VirginAmerica it's really aggressive to blast...,-1
4,@VirginAmerica and it's a really big bad thing...,-1
...,...,...
14635,@AmericanAir thank you we got on a different f...,1
14636,@AmericanAir leaving over 20 minutes Late Flig...,-1
14637,@AmericanAir Please bring American Airlines to...,0
14638,"@AmericanAir you have my money, you change my ...",-1


In [28]:
#if have multiple csv files: load_dataset('csv',data_files=['data.csv','data2.csv'])
#if already have train test split: load_dataset('csv',data_files={'train':['data.csv','data2.csv'],
                                                          #          'test':'test.csv'})

raw_dataset = load_dataset('csv',data_files='data.csv')
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label'],
        num_rows: 14640
    })
})

In [29]:
split = raw_dataset['train'].train_test_split(test_size=0.2, seed=12)
split

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label'],
        num_rows: 11712
    })
    test: Dataset({
        features: ['sentence', 'label'],
        num_rows: 2928
    })
})

In [30]:
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-cased')
tokenizer

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

DistilBertTokenizerFast(name_or_path='distilbert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [31]:
def tokenize_fn(batch):
  return tokenizer(batch['sentence'], truncation=True)

In [32]:
tokenized_dataset = split.map(tokenize_fn, batched= True)

Map:   0%|          | 0/11712 [00:00<?, ? examples/s]

Map:   0%|          | 0/2928 [00:00<?, ? examples/s]

In [33]:
training_args = TrainingArguments(
    output_dir = 'training_dir',
    evaluation_strategy = 'epoch',
    save_strategy = 'epoch',
    num_train_epochs = 1
)

In [34]:
metric = load_metric('glue','sst2')

  metric = load_metric('glue','sst2')


Downloading builder script:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

In [58]:
config  = AutoConfig.from_pretrained('distilbert-base-cased')
config

DistilBertConfig {
  "_name_or_path": "distilbert-base-cased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.33.3",
  "vocab_size": 28996
}


In [59]:
config.id2label

{0: 'LABEL_0', 1: 'LABEL_1'}

In [60]:
config.label2id

{'LABEL_0': 0, 'LABEL_1': 1}

In [61]:
target_map = {'positive':1, 'negative':-1, 'neutral':0}

In [62]:
config.id2label = {v:k for k,v in target_map.items()}
config.label2id = target_map

In [63]:
config.label2id

{'positive': 1, 'negative': -1, 'neutral': 0}

In [64]:
config.id2label

{1: 'positive', -1: 'negative', 0: 'neutral'}

In [None]:
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', config = config)

In [36]:
def compute_metrics(logits_labels):
  logits, labels = logits_labels
  predictions = np.argmax(logits, axis = -1)
  accuracy = np.mean(predictions == labels)
  f1 = f1_score(labels, predictions)
  return {'accuracy':accuracy, 'f1-score':f1}

In [37]:
trainer = Trainer(
    model,
    training_args,
    train_dataset = tokenized_datasets['train'],
    eval_dataset = tokenized_datasets['validation'],
    compute_metrics = compute_metrics,
    tokenizer = tokenizer
)

In [38]:
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1-score
1,0.2159,0.333232,0.909404,0.911732


TrainOutput(global_step=8419, training_loss=0.2690318286921579, metrics={'train_runtime': 459.8532, 'train_samples_per_second': 146.458, 'train_steps_per_second': 18.308, 'total_flos': 518606177908860.0, 'train_loss': 0.2690318286921579, 'epoch': 1.0})

In [39]:
trainer.save_model('my_model')

In [40]:
newmodel = pipeline('text-classification', model = 'my_model', device=0)

In [41]:
newmodel('that is not my cup of tea!')

[{'label': 'LABEL_0', 'score': 0.9679570198059082}]