<a href="https://colab.research.google.com/github/jamesbaskerville/colabs/blob/main/YelpReviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
!pip install datasets evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer

In [3]:
# Load the 'yelp_review_full' dataset (all of its splits) by name.
dataset = load_dataset(path='yelp_review_full')

In [4]:
# Show the loaded DatasetDict: its splits, their feature names and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 650000
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 50000
    })
})

In [5]:
# Name of the pretrained checkpoint whose tokenizer is used to encode the reviews.
tokenizer_checkpoint = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=tokenizer_checkpoint,
)



In [6]:
def tokenizer_fn(ex):
  """Tokenize the "text" field of `ex` with the module-level `tokenizer`.

  Sequences are padded with padding="max_length" and truncated when too long,
  so every encoded example comes out with the same length.

  Args:
    ex: a mapping with a "text" entry (a batch of examples when used with
      `map(..., batched=True)`).

  Returns:
    Whatever `tokenizer(...)` returns for those texts.
  """
  texts = ex["text"]
  encoded = tokenizer(texts, truncation=True, padding="max_length")
  return encoded

# Apply tokenizer_fn to the whole DatasetDict; batched=True hands it
# batches of examples instead of one example at a time.
tokenized_datasets = dataset.map(function=tokenizer_fn, batched=True)

Map:   0%|          | 0/650000 [00:00<?, ? examples/s]

In [7]:
# prep dataset for training
# Each step is guarded by an explicit column check instead of a bare `except:`,
# so re-running this cell is a harmless no-op while genuine errors
# (typos, interrupted kernels, library failures) still surface.
split_columns = tokenized_datasets["train"].column_names
if "text" not in split_columns and "label" not in split_columns:
  print('Already complete')
if "text" in split_columns:
  # the raw review text is not a model input once it has been tokenized
  tokenized_datasets = tokenized_datasets.remove_columns(["text"])
if "label" in split_columns:
  # the training loop reads the loss from the model output, which needs a `labels` key
  tokenized_datasets = tokenized_datasets.rename_column('label', 'labels')
# return torch tensors when indexing; calling this again is safe
tokenized_datasets.set_format('torch')
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 650000
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 50000
    })
})

In [8]:
# Work on small, fixed-seed random subsets so fine-tuning and evaluation stay quick:
# shuffle each split with seed 99, then keep the first 1000 rows.
subset_rows = range(1000)
small_train_dataset = tokenized_datasets['train'].shuffle(seed=99).select(subset_rows)
small_test_dataset = tokenized_datasets['test'].shuffle(seed=99).select(subset_rows)
small_train_dataset, small_test_dataset

(Dataset({
     features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
     num_rows: 1000
 }),
 Dataset({
     features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
     num_rows: 1000
 }))

In [9]:
# Wrap the subsets in DataLoaders that yield batches of 8 examples.
# Only the training loader shuffles; the test loader keeps dataset order.
from torch.utils.data import DataLoader
train_dl = DataLoader(dataset=small_train_dataset, batch_size=8, shuffle=True)
test_dl = DataLoader(dataset=small_test_dataset, batch_size=8)
len(train_dl), len(test_dl)

(125, 125)

In [10]:
from transformers import AutoModelForSequenceClassification
# Load the model from the same checkpoint the tokenizer was built from
# (`tokenizer_checkpoint`) so the two cannot drift apart if the checkpoint changes.
# num_labels=5 sizes the classification head for five classes
# (presumably one per star rating in yelp_review_full; verify against the dataset features).
model = AutoModelForSequenceClassification.from_pretrained(tokenizer_checkpoint, num_labels=5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# AdamW optimizer over all model parameters, starting at a learning rate of 5e-5.
from torch.optim import AdamW
optimizer = AdamW(params=model.parameters(), lr=5e-5)

In [12]:
# Linear learning-rate schedule with no warmup, spanning every optimizer step
# of the run (batches per epoch times number of epochs).
from transformers import get_scheduler
num_epochs = 3
num_training_steps = len(train_dl) * num_epochs
lr_scheduler = get_scheduler(
    'linear',
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [13]:
import torch
# Use the GPU when one is available, otherwise fall back to the CPU.
# (To force CPU, replace the next line with: device = torch.device('cpu'))
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print(device)

cuda


In [14]:
# Print the selected device again (the previous cell already prints the same value).
print(device)

cuda


In [15]:
# Fine-tuning loop: one optimizer step and one scheduler step per batch,
# for num_epochs passes over train_dl.
from tqdm.auto import tqdm
progress_bar = tqdm(range(num_training_steps))

# switch the model to training mode
model.train()
for epoch in range(num_epochs):
  for batch in train_dl:
    # move every tensor of the batch onto the same device as the model
    batch = {name: tensor.to(device) for name, tensor in batch.items()}
    outputs = model(**batch)
    loss = outputs.loss
    loss.backward()

    # apply the gradients, advance the LR schedule, then clear gradients for the next batch
    optimizer.step()
    lr_scheduler.step()
    optimizer.zero_grad()
    progress_bar.update(1)

  0%|          | 0/375 [00:00<?, ?it/s]

In [19]:
import evaluate
# Accuracy only; an alternative that was considered:
# evaluate.combine(["accuracy", "f1", "precision", "recall"])
metric = evaluate.load('accuracy')

# switch the model to evaluation mode
model.eval()
for batch in test_dl:
  batch = {name: tensor.to(device) for name, tensor in batch.items()}
  # no gradients are needed for scoring
  with torch.no_grad():
    outputs = model(**batch)

  # predicted class = index of the largest logit along the last dimension
  logits = outputs.logits
  predictions = logits.argmax(dim=-1)
  metric.add_batch(predictions=predictions, references=batch['labels'])

metric.compute()



{'accuracy': 0.554}