### Run With GPU

In [1]:
# Import libraries
import pandas as pd
import os
import fire
import torch
import json
import pandas as pd
from glob import glob
from transformers import (
    BertForMaskedLM,
    BertTokenizerFast,
    BertTokenizer,
    BertweetTokenizer,
    BertForMaskedLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    AutoModelForSequenceClassification,
    DataCollatorForLanguageModeling,
    T5Tokenizer
)
from datasets import load_dataset, Dataset
import sentencepiece

  from .autonotebook import tqdm as notebook_tqdm


KeyboardInterrupt: 

In [None]:
# Fetch the Yelp full-review dataset
dataset = load_dataset("Yelp/yelp_review_full")

# Number of leading rows to keep from each split
num_entries = 10000
leading_rows = range(num_entries)

# Cut both splits down to their first `num_entries` rows (train first, then test)
train_dataset, test_dataset = (
    dataset[split_name].select(leading_rows) for split_name in ('train', 'test')
)

# Report the resulting sizes so the subsampling can be verified at a glance
print(f"Number of entries in the shrunken train dataset: {len(train_dataset)}")
print(f"Number of entries in the shrunken test dataset: {len(test_dataset)}")


Number of entries in the shrunken train dataset: 1000
Number of entries in the shrunken test dataset: 1000


In [None]:
# Load pretrained model and tokenizer.
# The classification head must have one output per class in the dataset.
# Yelp/yelp_review_full is a 5-class (1-5 star) task, so a hard-coded
# num_labels=2 makes the loss fail as soon as a label >= 2 is seen.
# Reading the count from the dataset's own label feature keeps the two in sync.
MODEL_NAME = 'distilbert-base-uncased'
num_labels = train_dataset.features['label'].num_classes

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_labels)

# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize the 'text' field of a batch of examples.

    Every sequence is padded to the tokenizer's maximum length and longer
    sequences are truncated, so all rows end up with the same length.
    """
    return tokenizer(examples['text'], padding='max_length', truncation=True)

tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

# Convert datasets to PyTorch format, exposing only the columns the model consumes
tokenized_train.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
tokenized_test.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

# Prepare data loaders.
# NOTE: these are not passed to the Trainer below (it is given the datasets
# directly); they are kept for any manual training/evaluation loop.
train_dataloader = torch.utils.data.DataLoader(tokenized_train, batch_size=8, shuffle=True)
eval_dataloader = torch.utils.data.DataLoader(tokenized_test, batch_size=8)

# Set up training arguments and train the model
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version.
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
)

trainer.train()


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1000/1000 [00:00<00:00, 4532.12 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 6408.36 examples/s]
  2%|▏         | 6/375 [00:58<1:07:59, 11.06s/it]

KeyboardInterrupt: 