In [22]:
!pip install transformers datasets scikit-learn pandas numpy matplotlib



In [34]:
!pip install --upgrade transformers



In [27]:
!pip install evaluate



In [23]:
from datasets import load_dataset

dataset = load_dataset("amazon_polarity", split="train[:20000]")
print(dataset)


Dataset({
    features: ['label', 'title', 'content'],
    num_rows: 20000
})


In [24]:
import re

def clean_text(text):
    text = str(text)
    text = text.replace("\n", " ").replace("\r", " ")
    text = re.sub(r"http\S+", " ", text)
    text = re.sub(r"<.*?>", " ", text)
    text = re.sub(r"[^A-Za-z0-9\s.,!?]", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip().lower()

In [25]:
df=dataset.to_pandas()[['content', 'label']]
df['clean_text'] = df['content'].apply(clean_text)
display(df.head(10))

Unnamed: 0,content,label,clean_text
0,This sound track was beautiful! It paints the ...,1,this sound track was beautiful! it paints the ...
1,I'm reading a lot of reviews saying that this ...,1,im reading a lot of reviews saying that this i...
2,This soundtrack is my favorite music of all ti...,1,this soundtrack is my favorite music of all ti...
3,I truly like this soundtrack and I enjoy video...,1,i truly like this soundtrack and i enjoy video...
4,"If you've played the game, you know how divine...",1,"if youve played the game, you know how divine ..."
5,I am quite sure any of you actually taking the...,1,i am quite sure any of you actually taking the...
6,"This is a self-published book, and if you want...",0,"this is a selfpublished book, and if you want ..."
7,I loved Whisper of the wicked saints. The stor...,1,i loved whisper of the wicked saints. the stor...
8,I just finished reading Whisper of the Wicked ...,1,i just finished reading whisper of the wicked ...
9,This was a easy to read book that made me want...,1,this was a easy to read book that made me want...


In [26]:
from transformers import AutoTokenizer
from datasets import Dataset

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

dataset_hf=Dataset.from_pandas(df[['clean_text', 'label']])

def tokenize_function(examples):
    return tokenizer(
        examples["clean_text"],
        padding="max_length",
        truncation=True,
        max_length=128
    )

tokenized_dataset=dataset_hf.map(tokenize_function, batched=True)

print(tokenized_dataset[0])

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

{'clean_text': 'this sound track was beautiful! it paints the senery in your mind so well i would recomend it even to people who hate vid. game music! i have played the game chrono cross but out of all of the games i have ever played it has the best music! it backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. it would impress anyone who cares to listen!', 'label': 1, 'input_ids': [101, 2023, 2614, 2650, 2001, 3376, 999, 2009, 23262, 1996, 12411, 7301, 1999, 2115, 2568, 2061, 2092, 1045, 2052, 28667, 8462, 4859, 2009, 2130, 2000, 2111, 2040, 5223, 6819, 2094, 1012, 2208, 2189, 999, 1045, 2031, 2209, 1996, 2208, 10381, 4948, 2080, 2892, 2021, 2041, 1997, 2035, 1997, 1996, 2399, 1045, 2031, 2412, 2209, 2009, 2038, 1996, 2190, 2189, 999, 2009, 10457, 2185, 2013, 13587, 9019, 2075, 1998, 3138, 1037, 4840, 2121, 3357, 2007, 24665, 3686, 7334, 1998, 3969, 3993, 19505, 1012, 2009, 2052, 17894, 3087, 2040, 14977, 2000, 4952, 999, 102, 0, 0, 0, 

In [1]:
import os
os.environ["WANDB_DISABLED"] = "true"
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoTokenizer
import numpy as np
import evaluate
from datasets import Dataset


tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=128)
def load_data_in_chunks():

    texts = ["Sample text " + str(i) for i in range(1000)]
    labels = [i % 2 for i in range(1000)]

    return Dataset.from_dict({"text": texts, "label": labels})


print("Loading data...")
dataset = load_data_in_chunks()


print("Tokenizing data...")
tokenized_dataset = dataset.map(tokenize_function, batched=True, batch_size=100)


split_dataset = tokenized_dataset.train_test_split(test_size=0.2, seed=42)


print("Loading model...")
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2
)


accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return accuracy.compute(predictions=predictions, references=labels)
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to=None,
    push_to_hub=False,
    dataloader_pin_memory=False,
    dataloader_num_workers=2,
    gradient_accumulation_steps=2,
)


import torch
torch.cuda.empty_cache() if torch.cuda.is_available() else None

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_dataset["train"],
    eval_dataset=split_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

print("Starting training with memory optimizations...")
trainer.train()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading data...
Tokenizing data...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Loading model...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Starting training with memory optimizations...


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.301249,0.925
2,No log,0.214881,0.915


TrainOutput(global_step=100, training_loss=0.42846538543701174, metrics={'train_runtime': 281.0276, 'train_samples_per_second': 5.693, 'train_steps_per_second': 0.356, 'total_flos': 4933332288000.0, 'train_loss': 0.42846538543701174, 'epoch': 2.0})