Kernel: Python 3 (ipykernel)

fine-tune a pre-trained model from Hugging Face to perform sentiment analysis of restaurant reviews.

In [1]:
from datasets import load_dataset
 
dataset = load_dataset("yelp_polarity")
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 560000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 38000
    })
})


In [2]:
train_dataset = dataset['train']
print(train_dataset[0])

{'text': "Unfortunately, the frustration of being Dr. Goldberg's patient is a repeat of the experience I've had with so many other doctors in NYC -- good doctor, terrible staff.  It seems that his staff simply never answers the phone.  It usually takes 2 hours of repeated calling to get an answer.  Who has time for that or wants to deal with it?  I have run into this problem with many other doctors and I just don't get it.  You have office workers, you have patients with medical needs, why isn't anyone answering the phone?  It's incomprehensible and not work the aggravation.  It's with regret that I feel that I have to give Dr. Goldberg 2 stars.", 'label': 0}


filter the dataset to include only rows containing the word "restaurant"

In [3]:
train_dataset = dataset["train"]
test_dataset = dataset["test"]
 
restaurant_train_reviews = train_dataset.filter(
    lambda x: "restaurant" in x["text"].lower()
)
 
restaurant_test_reviews = test_dataset.filter(
    lambda x: "restaurant" in x["text"].lower()
)
 
number_of_reviews = 5000
subset_train_reviews = restaurant_train_reviews.shuffle(
    seed = 42).select(range(number_of_reviews))
subset_test_reviews = restaurant_test_reviews.shuffle(
    seed = 42).select(range(number_of_reviews))
 
subset_dataset = {
    "train": subset_train_reviews,
    "test": subset_test_reviews
}
 
from datasets import DatasetDict
yelp_restaurant_dataset = DatasetDict(subset_dataset)
 
print(yelp_restaurant_dataset)


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 5000
    })
})


In [4]:
yelp_restaurant_dataset['train'][0]

{'text': 'My girlfriend and I have been wanting to come here for awhile, we finally came & we had the worst experience ever. We asked our server for a few minutes to look over the menu & he never came back. 15 minutes later, someone finally came and took our order. We waited awhile and when they brought our food, they got the whole order wrong. My girlfriend ordered soup and it never came out. Worst service ever. Would not recommend this restaurant to anyone.',
 'label': 0}

In [5]:
from transformers import AutoTokenizer
 
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
 
def tokenize_function(examples):
    return tokenizer(examples["text"], 
                     padding = "max_length", 
                     truncation = True, 
                     max_length = 512)
 
tokenized_datasets = yelp_restaurant_dataset.map(
                         tokenize_function, 
                         batched=True)
tokenized_datasets

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 5000
    })
})

Load pre-trained model

Sequence classification tasks involve assigning a single label or category to an entire sequence of data, such as a sentence, paragraph, or even a longer sequence of tokens.

In [6]:
from transformers import AutoModelForSequenceClassification
import torch
 
model = AutoModelForSequenceClassification.from_pretrained(
            model_checkpoint, num_labels = 2)
 
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device(
        "cuda" if torch.cuda.is_available() else "cpu")
 
model.to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [7]:
# pip install transformers[torch]

In [8]:
# pip install 'accelerate>=0.26.0'

In [9]:
from transformers import Trainer, TrainingArguments
 
training_args = TrainingArguments(
    output_dir = "./results",
    eval_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate = 2e-5,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    num_train_epochs = 3,
    weight_decay = 0.01,
    logging_dir = "./logs",
    logging_steps = 10,
    save_steps = 500,
    load_best_model_at_end = True,
)
 
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_datasets["train"],
    eval_dataset = tokenized_datasets["test"],
)
                    
trainer.train()



Epoch,Training Loss,Validation Loss
1,0.125,0.164542
2,0.0265,0.190826
3,0.0477,0.210543




TrainOutput(global_step=939, training_loss=0.13487510114939957, metrics={'train_runtime': 470.7512, 'train_samples_per_second': 31.864, 'train_steps_per_second': 1.995, 'total_flos': 1987010979840000.0, 'train_loss': 0.13487510114939957, 'epoch': 3.0})

In [None]:
model.save_pretrained("./results/final_model")
tokenizer.save_pretrained("./results/final_tokenizer")
save the fine-tuned model and tokenizer