# Install libraries

In [None]:
!pip install -q transformers accelerate datasets



# load dataset

In [None]:
from datasets import load_dataset
# Load the "yelp_review_full" dataset; the display below shows it has
# "train" and "test" splits with 'label' and 'text' features.
dataset = load_dataset("yelp_review_full")

In [None]:
# Show the available splits, their features and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 650000
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 50000
    })
})

# Load the tokenizer and model, and prepare the dataset for training

In [None]:
from transformers import AutoTokenizer

# Tokenizer for the same "bert-base-cased" checkpoint that the classifier is built from.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [None]:
def tokenize_function(examples):
    """Tokenize the "text" column of a batch of examples.

    Relies on the `tokenizer` loaded in the previous cell (it is not reloaded
    here). Sequences are truncated and padded to the maximum length so every
    example has the same size.

    Args:
        examples: A batch of rows, as passed by `Dataset.map(..., batched=True)`;
            must contain a "text" entry.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(examples["text"], padding="max_length", truncation=True)


# NOTE: this tokenizes every split in full (see the progress bar below), even
# though only 1000-example subsets are selected for training and evaluation.
tokenized_datasets = dataset.map(tokenize_function, batched=True, batch_size=1000)

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [None]:
# Take the first 1000 examples of each split after a seeded shuffle.
subset_indices = range(1000)
train_split, test_split = tokenized_datasets["train"], tokenized_datasets["test"]
small_train_dataset = train_split.shuffle(seed=42).select(subset_indices)
small_eval_dataset = test_split.shuffle(seed=42).select(subset_indices)

In [None]:
# Class ids 0-4 map to the label strings "1 stars" ... "5 stars"; l2id is the inverse.
id2l = {class_id: f"{class_id + 1} stars" for class_id in range(5)}
l2id = {label: class_id for class_id, label in id2l.items()}

In [None]:
from transformers import AutoModelForSequenceClassification

# Build a 5-class sequence classifier on top of bert-base-cased and register
# the id <-> label mappings in the model config.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-cased",
    num_labels=5,
    id2label=id2l,
    label2id=l2id,
)

Downloading model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# accuracy metric

In [None]:
import numpy as np
from datasets import load_metric

# Load the "accuracy" metric object.
# NOTE(review): the line echoed in this cell's output looks like the tail of a
# deprecation warning for load_metric — confirm the function still exists in
# the installed `datasets` version.
metric = load_metric("accuracy")

  metric = load_metric("accuracy")


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

In [None]:
def compute_metrics(eval_pred):
    """Compute classification accuracy for the Trainer's evaluation step.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` holds one score per
            class along its last axis; ``labels`` holds the reference class ids.

    Returns:
        dict: ``{"accuracy": value}`` where ``value`` is the fraction of
        examples whose highest-scoring class equals the label.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Accuracy is computed directly with NumPy so evaluation does not depend on
    # a metric script fetched through the deprecated `load_metric` API.
    accuracy = np.mean(predictions == np.asarray(labels))
    return {"accuracy": float(accuracy)}

# train model

In [None]:
from transformers import TrainingArguments,Trainer

# Checkpoints go to "yelp-review-classification"; evaluation runs once per epoch.
training_args = TrainingArguments(
    output_dir="yelp-review-classification",
    evaluation_strategy="epoch",
)

In [None]:
# Wire the model, the 1000-example train/eval subsets and the metric function
# into a single Trainer instance.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=small_eval_dataset,
    train_dataset=small_train_dataset,
)

In [None]:
# Run fine-tuning; the per-epoch loss/accuracy table and the final TrainOutput
# are displayed below.
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.227781,0.455
2,No log,1.0134,0.579
3,No log,1.03498,0.598


TrainOutput(global_step=375, training_loss=1.02489013671875, metrics={'train_runtime': 375.2105, 'train_samples_per_second': 7.996, 'train_steps_per_second': 0.999, 'total_flos': 789354427392000.0, 'train_loss': 1.02489013671875, 'epoch': 3.0})

# Save the model and load it for inference

In [None]:
# Directory that receives both the fine-tuned model and its tokenizer, so it
# can be reloaded as one unit for inference.
path = "review-classifier"
trainer.save_model(path)
# Last expression of the cell: its return value (the written tokenizer files) is displayed.
tokenizer.save_pretrained(path)

('review-classifier/tokenizer_config.json',
 'review-classifier/special_tokens_map.json',
 'review-classifier/vocab.txt',
 'review-classifier/added_tokens.json',
 'review-classifier/tokenizer.json')

In [None]:
from transformers import pipeline

# Reload the saved model and tokenizer from `path`; top_k=5 asks for a score
# for each of the five star labels.
Pipe = pipeline(
    "sentiment-analysis",
    model=path,
    tokenizer=path,
    top_k=5,
)


Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [None]:
# Sample review text used to try out the inference pipeline.
review = '''
This place is great! Atmosphere is chill and cool but the staff is also really friendly.
They know what they’re doing and what they’re talking about,
and you can tell making the customers happy is their main priority.
Food is pretty good, some italian classics and some twists,
and for their prices it’s 100% worth it.

'''

In [None]:
# Classify the sample review; index 0 selects the list of label/score entries
# for this single input (displayed below, highest score first).
Pipe(review)[0]

[{'label': '4 stars', 'score': 0.5979881286621094},
 {'label': '5 stars', 'score': 0.36606311798095703},
 {'label': '3 stars', 'score': 0.02637379989027977},
 {'label': '2 stars', 'score': 0.006953672971576452},
 {'label': '1 stars', 'score': 0.002621283521875739}]