In [34]:
! pip install datasets transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [35]:
import transformers

# Record the installed transformers version alongside the results of this run.
print(transformers.__version__)

4.28.1


In [36]:
# `task` picks the GLUE metric configuration and the column mapping in
# `task_to_keys`, and is embedded in the output directory name. The training
# data itself is tweet_eval/sentiment (loaded below), not GLUE SST-2.
task = "sst2"
# Checkpoint name handed to both the tokenizer and the model `from_pretrained` calls.
model_checkpoint = "distilbert-base-uncased"
# Used as the per-device batch size for training and for evaluation.
batch_size = 16

In [37]:
from datasets import load_dataset, load_metric

In [38]:
# `actual_task` is a plain alias of `task`; it only feeds the metric configuration below.
actual_task = task
# Tweet sentiment data with train/test/validation splits and integer labels.
# NOTE(review): presumably 0 = negative, 1 = neutral, 2 = positive — confirm against the dataset card.
dataset = load_dataset("tweet_eval", "sentiment")
# GLUE metric for the chosen configuration; for "sst2" it reports accuracy.
# NOTE(review): `load_metric` is deprecated in newer `datasets` releases in favour
# of the separate `evaluate` package — confirm before upgrading `datasets`.
metric = load_metric('glue', actual_task)



  0%|          | 0/3 [00:00<?, ?it/s]

In [39]:
# Inspect the available splits, their columns and their sizes.
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 45615
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 12284
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [40]:
# Inspect the training split on its own (before any filtering).
dataset["train"]

Dataset({
    features: ['text', 'label'],
    num_rows: 45615
})

In [41]:
# Drop every example whose label is 1 from the two splits used for training
# and evaluation. The test split is intentionally left as loaded here.
# NOTE: this cell is not idempotent across the notebook — the relabelling cell
# below turns label 2 into 1, so re-running this filter afterwards would also
# discard those relabelled rows. Run it exactly once per freshly loaded dataset.
for split_name in ("train", "validation"):
    dataset[split_name] = dataset[split_name].filter(lambda example: example['label'] != 1)



In [42]:
def change(example):
  """Relabel a single example in place so that label 2 becomes label 1.

  Examples carrying any other label are returned untouched. The same mapping
  object that was passed in is returned, which is the shape `Dataset.map`
  expects from a per-example function.
  """
  if example['label'] != 2:
    return example
  example['label'] = 1
  return example

# Apply the 2 -> 1 relabelling to the same two splits that were filtered above,
# leaving them with labels 0 and 1 only. The test split is not relabelled.
for split_name in ("train", "validation"):
    dataset[split_name] = dataset[split_name].map(change)



In [43]:
# Peek at the first five training rows to confirm the labels are now 0/1.
dataset["train"][0:5]

{'text': ['"QT @user In the original draft of the 7th book, Remus Lupin survived the Battle of Hogwarts. #HappyBirthdayRemusLupin"',
  '@user Alciato: Bee will invest 150 million in January, another 200 in the Summer and plans to bring Messi by 2017"',
  "@user LIT MY MUM 'Kerry the louboutins I wonder how many Willam owns!!! Look Kerry Warner Wednesday!'",
  '"\\"""" SOUL TRAIN\\"""" OCT 27 HALLOWEEN SPECIAL ft T.dot FINEST rocking the mic...CRAZY CACTUS NIGHT CLUB ..ADV ticket $10 wt out costume $15..."',
  'So disappointed in wwe summerslam! I want to see john cena wins his 16th title'],
 'label': [1, 1, 1, 1, 0]}

In [44]:
import datasets
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    """Display `num_examples` distinct, randomly chosen rows of `dataset` as an HTML table.

    Args:
        dataset: object supporting `len()` and indexing with a list of integer
            positions (the result is fed to `pd.DataFrame`).
        num_examples: how many distinct rows to show.

    Raises:
        AssertionError: if `num_examples` exceeds the number of rows.
        ValueError: if `num_examples` is negative (raised by `random.sample`).
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws without replacement, which replaces the manual
    # "redraw until unseen" loop and its repeated list membership checks.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    display(HTML(df.to_html()))

In [45]:
# Eyeball ten random validation rows after filtering and relabelling.
show_random_elements(dataset["validation"])

Unnamed: 0,text,label
0,@user Did you watch the 1st episode of American Horror Story yet so you can understand why I refuse to watch the rest of it ?!?!?,0
1,@user Do you watch Styled To Rock? Fucking lav it! Oh also let's do something on Saturday then I'm going back to Sheff :) x,1
2,"""It may cost more, but the new @user Moto G is still a damn fine smartphone. Full review:",1
3,I think I'm gonna wear leggings tomorrow with my Jurassic Park shirt pls send clothing advice,1
4,I just realized that I'm seeing Ed Sheeran in concert on Thursday. Yeah still don't believe it's that close @user,1
5,"""Zayn dedicated his 1st solo award to the boys &amp; the boys' to Zayn :"""") #5monthswithoutzayn #MTVHottest One Direction",1
6,But excited for the scramble at knollwood tomorrow morning. Going to tear the west course up,1
7,It is reality that ISIS are on the march in Turkey and Erdogan can't wait to receive them with open arms,0
8,It was a WILD night at @user Jazz at the Bistro. Amy Schumer &amp; cast dug our 2nd set &amp; had a comedy jam- Epic!,1
9,"""// So, I thought Ant-Man was out of the question for the cinema Saturday but my boyfriend said the films he's seen are 'awesome' and that -""",1


In [46]:
# Display the metric object to see its expected inputs and usage notes.
metric

Metric(name: "glue", features: {'predictions': Value(dtype='int64', id=None), 'references': Value(dtype='int64', id=None)}, usage: """
Compute GLUE evaluation metric associated to each GLUE dataset.
Args:
    predictions: list of predictions to score.
        Each translation should be tokenized into a list of tokens.
    references: list of lists of references for each translation.
        Each reference should be tokenized into a list of tokens.
Returns: depending on the GLUE subset, one or several of:
    "accuracy": Accuracy
    "f1": F1 score
    "pearson": Pearson Correlation
    "spearmanr": Spearman Correlation
    "matthews_correlation": Matthew Correlation
Examples:

    >>> glue_metric = datasets.load_metric('glue', 'sst2')  # 'sst2' or any of ["mnli", "mnli_mismatched", "mnli_matched", "qnli", "rte", "wnli", "hans"]
    >>> references = [0, 1]
    >>> predictions = [0, 1]
    >>> results = glue_metric.compute(predictions=predictions, references=references)
    >>> print(res

In [47]:
import numpy as np

# Sanity-check the metric call on random binary predictions and labels.
# A seeded, local Generator keeps the displayed score identical across runs
# without touching NumPy's global random state.
rng = np.random.default_rng(0)
fake_preds = rng.integers(0, 2, size=(64,))
fake_labels = rng.integers(0, 2, size=(64,))
metric.compute(predictions=fake_preds, references=fake_labels)

{'accuracy': 0.5625}

In [48]:
from transformers import AutoTokenizer
    
# Load the tokenizer matching the model checkpoint, requesting the fast implementation.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

In [49]:
# Maps a task name to its (first text column, optional second text column).
# A second entry of None marks a single-sentence task.
task_to_keys = dict(sst2=("text", None))

In [50]:
# Resolve which column(s) hold the text for the configured task, then print the
# first training sentence when the task is single-sentence.
sentence1_key, sentence2_key = task_to_keys[task]
if sentence2_key is None:
    first_example = dataset['train'][0]
    print(f"Sentence: {first_example[sentence1_key]}")

Sentence: "QT @user In the original draft of the 7th book, Remus Lupin survived the Battle of Hogwarts. #HappyBirthdayRemusLupin"


In [51]:
def preprocess_function(examples):
    """Tokenize the text column(s) of `examples` with truncation enabled.

    Reads the module-level `sentence1_key` / `sentence2_key`: the tokenizer is
    called with one text argument when `sentence2_key` is None, and with two
    otherwise. Returns whatever the tokenizer returns.
    """
    text_columns = [examples[sentence1_key]]
    if sentence2_key is not None:
        text_columns.append(examples[sentence2_key])
    return tokenizer(*text_columns, truncation=True)

In [52]:
# Smoke-test the tokenization on the first five training rows.
preprocess_function(dataset['train'][:5])

{'input_ids': [[101, 1000, 1053, 2102, 1030, 5310, 1999, 1996, 2434, 4433, 1997, 1996, 5504, 2338, 1010, 2128, 7606, 11320, 8091, 5175, 1996, 2645, 1997, 27589, 18367, 2015, 1012, 1001, 3407, 17706, 2705, 10259, 28578, 2271, 7630, 8091, 1000, 102], [101, 1030, 5310, 2632, 7405, 3406, 1024, 10506, 2097, 15697, 5018, 2454, 1999, 2254, 1010, 2178, 3263, 1999, 1996, 2621, 1998, 3488, 2000, 3288, 6752, 2072, 2011, 2418, 1000, 102], [101, 1030, 5310, 5507, 2026, 12954, 1005, 11260, 1996, 10223, 5092, 21823, 3619, 1045, 4687, 2129, 2116, 2097, 3286, 8617, 999, 999, 999, 2298, 11260, 6654, 9317, 999, 1005, 102], [101, 1000, 1032, 1000, 1000, 1000, 1000, 3969, 3345, 1032, 1000, 1000, 1000, 1000, 13323, 2676, 14414, 2569, 3027, 1056, 1012, 11089, 10418, 14934, 1996, 23025, 1012, 1012, 1012, 4689, 23265, 2305, 2252, 1012, 1012, 4748, 2615, 7281, 1002, 2184, 1059, 2102, 2041, 9427, 1002, 2321, 1012, 1012, 1012, 1000, 102], [101, 2061, 9364, 1999, 11700, 10945, 10278, 999, 1045, 2215, 2000, 2156, 2

In [53]:
# Tokenize every split in batches. This includes the test split, which the
# filter/relabel cells above did not modify, so its labels are still the
# original ones; only train and validation are used by the Trainer below.
encoded_dataset = dataset.map(preprocess_function, batched=True)



In [54]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Two output classes: the data was reduced to labels 0 and 1 above.
num_labels = 2
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'pre_classi

In [55]:
metric_name = "accuracy"
# Keep only the part after the last "/" so an "org/name" checkpoint still
# yields a plain directory name.
model_name = model_checkpoint.rsplit("/", 1)[-1]

args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-{task}",
    # Evaluate and checkpoint once per epoch so the best epoch (by
    # `metric_name`) can be reloaded when training finishes.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    # Optimisation settings.
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=6,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
)

In [56]:
def compute_metrics(eval_pred):
    """Turn the Trainer's (predictions, labels) pair into metric scores.

    The predictions are reduced to class indices by taking the argmax along
    axis 1, then scored against the labels with the module-level `metric`.
    """
    logits, references = eval_pred
    predicted_classes = np.argmax(logits, axis=1)
    return metric.compute(predictions=predicted_classes, references=references)

In [57]:
# Train on the filtered train split and evaluate on the filtered validation
# split; the tokenizer is passed along so batches can be padded.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [58]:
# Run fine-tuning; evaluation and checkpointing happen once per epoch (see `args`).
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2443,0.220306,0.909814
2,0.1457,0.237277,0.916888
3,0.1002,0.321613,0.917772
4,0.0465,0.463591,0.913351
5,0.0281,0.501209,0.915119
6,0.0125,0.536042,0.916004


TrainOutput(global_step=9354, training_loss=0.09990131936573896, metrics={'train_runtime': 779.9192, 'train_samples_per_second': 191.881, 'train_steps_per_second': 11.994, 'total_flos': 1774326809536152.0, 'train_loss': 0.09990131936573896, 'epoch': 6.0})

We can check with the `evaluate` method that our `Trainer` did reload the best model properly (if it was not the last one):

In [59]:
# Re-evaluate on the validation split with the model currently held by the Trainer.
trainer.evaluate()

{'eval_loss': 0.32161253690719604,
 'eval_accuracy': 0.9177718832891246,
 'eval_runtime': 1.6505,
 'eval_samples_per_second': 685.259,
 'eval_steps_per_second': 43.018,
 'epoch': 6.0}

In [90]:
# Tokenize one hand-written sentence to try the fine-tuned model on.
texts = ["i am happy "]
tokenized_texts = tokenizer(texts, padding=True)
# Display the encoding (input ids and attention mask).
tokenized_texts

{'input_ids': [[101, 1045, 2572, 3407, 102]], 'attention_mask': [[1, 1, 1, 1, 1]]}

In [91]:
class SimpleDataset:
    """Minimal indexable dataset over a column-oriented tokenizer output.

    `tokenized_texts` is a mapping from field name to a per-example sequence
    and must contain an "input_ids" entry, which defines the dataset length.
    """

    def __init__(self, tokenized_texts):
        # Stored by reference; nothing is copied or converted.
        self.tokenized_texts = tokenized_texts

    def __len__(self):
        input_ids = self.tokenized_texts["input_ids"]
        return len(input_ids)

    def __getitem__(self, idx):
        # Build one example by picking position `idx` out of every field.
        sample = {}
        for field, values in self.tokenized_texts.items():
            sample[field] = values[idx]
        return sample

# Wrap the encoding of the hand-written sentence so the Trainer can iterate over it.
test_dataset = SimpleDataset(tokenized_texts)

In [92]:
# Score the hand-written sentence and report which of the two outputs is larger.
predictions = trainer.predict(test_dataset)
pred = predictions.predictions[0]
print(pred)
# Index 0 is reported as negative, index 1 as positive; a tie counts as positive.
print('Negative' if pred[0] > pred[1] else 'Positive')

[-3.4880176  3.07349  ]
Positive


In [72]:
def prediction(text):
  """Classify one piece of text with the fine-tuned model.

  Args:
    text: the raw string to score.

  Returns:
    A string made of the two model outputs for the text followed by
    ' Negative' when the first output is strictly larger than the second,
    and ' Positive' otherwise.
  """
  # truncation=True mirrors preprocess_function: free-form input that is longer
  # than the tokenizer's maximum length is cut instead of being passed through
  # at full length to the model.
  tokenized_texts = tokenizer([text], padding=True, truncation=True)
  test_dataset = SimpleDataset(tokenized_texts)
  predictions = trainer.predict(test_dataset)
  # Blank line kept so the result is visually separated from predict()'s output.
  print()
  pred = predictions.predictions[0]
  label = 'Negative' if pred[0] > pred[1] else 'Positive'
  return str(pred) + ' ' + label

In [67]:
from IPython.display import display
# `IPython.html` is a deprecated compatibility shim; the widgets it forwards to
# live in the `ipywidgets` package, so import them from there directly.
import ipywidgets as widgets

In [73]:
# Free-text input. Size is set through `layout`: bare `width`/`height` keyword
# arguments are not Text traits (the widget repr shows only value and
# description), so they did not size the box.
text = widgets.Text(description='Text:', layout=widgets.Layout(width='1000px'))
display(text)

button = widgets.Button(description='Get Sentiment')
display(button)

# Everything printed by the click handler is captured here, so results appear
# under the button in front ends that do not forward callback prints to the cell.
output = widgets.Output()
display(output)

def on_button_click(b):
    """Button callback: classify the current text and print it with its sentiment.

    `b` is the clicked button instance supplied by ipywidgets; it is not used.
    """
    with output:
        answer = prediction(text.value)
        print(text.value+"\n")
        print('Sentiment:', answer)
        print()
        print()

button.on_click(on_button_click)

Text(value='', description='Text:')

Button(description='Get Sentiment', style=ButtonStyle())


I will sue you. I am Happy. I am sad

Sentiment: [-2.5688648  2.2280385] Positive





I will sue you. I am sad

Sentiment: [ 3.065746  -2.6928205] Negative


