<a href="https://colab.research.google.com/github/kla55/transformer/blob/main/text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
! pip install transformers datasets torch



In [7]:
from datasets import load_dataset

# Load dataset
dataset = load_dataset("imdb")  # Example: IMDB movie reviews

# `tokenize_function` below needs a tokenizer at the moment it is mapped over
# the dataset. Load it in this cell so the cell runs on a freshly restarted
# kernel instead of depending on a `tokenizer` created by a later cell.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def tokenize_function(examples):
    """Tokenize the "text" field of a batch of examples.

    The texts are passed to the module-level ``tokenizer`` with
    ``padding="max_length"`` and ``truncation=True``; whatever the tokenizer
    returns is returned unchanged.
    """
    return tokenizer(examples["text"], padding="max_length", truncation=True)


# batched=True hands `tokenize_function` a batch of examples per call.
tokenized_dataset = dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [37]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the pre-trained DistilBERT (uncased) checkpoint.
# AutoTokenizer.from_pretrained picks the matching tokenizer from the checkpoint
# name, so the text can be converted into the numeric format the model expects.
tokenizer = AutoTokenizer.from_pretrained(
    "distilbert-base-uncased",
)

def tokenize_function(examples):
    """Tokenize the "text" field of a batch of examples.

    The texts are handed to the module-level ``tokenizer`` with
    ``padding="max_length"`` and ``truncation=True``. The tokenizer returns a
    dictionary with:

    - ``input_ids``: the list of token IDs representing the text.
    - ``attention_mask``: a binary mask marking meaningful tokens (1) versus
      padding tokens (0); the model uses it to ignore padding during
      computation.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, padding="max_length", truncation=True)
    return encoded

# Tokenize every split, then reshape the columns for the training code:
# drop the raw "text" column and rename "label" to "labels".
tokenized_datasets = (
    dataset.map(tokenize_function, batched=True)
    .remove_columns(["text"])
    .rename_column("label", "labels")
)
# Have the datasets hand back torch tensors when indexed.
tokenized_datasets.set_format("torch")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [39]:
from transformers import AutoModelForSequenceClassification

# Load the pre-trained DistilBERT checkpoint through
# AutoModelForSequenceClassification, the Transformers entry point for
# sequence classification tasks (e.g., sentiment analysis, spam detection,
# or topic classification). num_labels=2 sets up a two-class output.
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [40]:
from torch.utils.data import DataLoader

# Batch the tokenized splits for PyTorch (16 examples per batch).
# Only the training loader is asked to shuffle.
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    batch_size=16,
    shuffle=True,
)
eval_dataloader = DataLoader(
    tokenized_datasets["test"],
    batch_size=16,
)


In [43]:
from torch.optim import AdamW
import torch

# Cross-entropy over the two output logits is the classification loss.
loss_fn = torch.nn.CrossEntropyLoss()

# AdamW updates every model parameter with a learning rate of 5e-5.
optimizer = AdamW(
    model.parameters(),
    lr=5e-5,
)

In [44]:
from tqdm import tqdm

# Use the GPU when one is available, otherwise the CPU, and move the model
# there so it sits on the same device as the batches fed to it below.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Training loop
epochs = 3
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    model.train()  # training mode for the optimisation steps
    train_loss = 0

    # The per-step debug dump of the whole batch was removed: it printed
    # every tensor on every iteration and buried the progress bar and loss.
    for batch in tqdm(train_dataloader):
        # Move batch to device; "labels" is kept apart because the loss is
        # computed explicitly with loss_fn rather than inside the model.
        inputs = {key: val.to(device) for key, val in batch.items() if key != "labels"}
        labels = batch["labels"].to(device)

        # Forward pass
        outputs = model(**inputs)
        loss = loss_fn(outputs.logits, labels)

        # Backward pass: clear old gradients, backpropagate, update weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # .item() extracts a plain Python number for the running total
        train_loss += loss.item()

    # Mean loss per batch over the epoch
    avg_train_loss = train_loss / len(train_dataloader)
    print(f"Training Loss: {avg_train_loss:.4f}")


Epoch 1/3


100%|██████████| 1563/1563 [19:26<00:00,  1.34it/s]


Training Loss: 0.2700
Epoch 2/3


100%|██████████| 1563/1563 [19:28<00:00,  1.34it/s]


Training Loss: 0.1390
Epoch 3/3


100%|██████████| 1563/1563 [19:28<00:00,  1.34it/s]

Training Loss: 0.0717





In [47]:
from sklearn.metrics import accuracy_score

# Switch the model to evaluation mode, which disables dropout layers.
model.eval()
all_preds, all_labels = [], []

# Scoring needs no gradients, so the whole pass runs under no_grad.
with torch.no_grad():
    # One pass over the test data
    for batch in eval_dataloader:
        # Ground-truth labels for this batch, moved to the model's device.
        labels = batch["labels"].to(device)
        # Every other tensor in the batch (e.g., input_ids, attention_mask)
        # is a model input; send each to the same device as the model.
        inputs = {name: tensor.to(device) for name, tensor in batch.items() if name != "labels"}

        # Forward pass; the logits have shape (batch_size, num_classes).
        outputs = model(**inputs)
        # The predicted class is the index of the largest logit per sample.
        preds = outputs.logits.argmax(dim=1)

        # Bring predictions and labels back to the CPU and accumulate them.
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Fraction of test samples whose predicted class equals the label.
accuracy = accuracy_score(all_labels, all_preds)
print(f"Test Accuracy: {accuracy:.4f}")


Test Accuracy: 0.9204


In [50]:
def predict(text):
    """Classify ``text`` as "Positive" or "Negative" with the fine-tuned model.

    The text is tokenized (truncated to at most 512 tokens), moved to
    ``device`` and run through the module-level ``model``. The forward pass
    runs in evaluation mode with gradient tracking disabled, so the result
    does not depend on whether the model was last left in training mode.
    The model's previous train/eval mode is restored afterwards.

    Returns:
        "Positive" if the largest logit is at index 1, otherwise "Negative".
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    inputs = {key: val.to(device) for key, val in inputs.items()}

    # Remember the current mode so calling predict() mid-training does not
    # silently leave the model in eval mode.
    was_training = model.training
    model.eval()
    try:
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            outputs = model(**inputs)
    finally:
        model.train(was_training)

    prediction = torch.argmax(outputs.logits, dim=1).item()
    return "Positive" if prediction == 1 else "Negative"

# Example prediction
print(predict("The movie was fantastic!"))

Positive
