# Text Classification using Transformers (BERT/DistilBERT)
This notebook demonstrates text classification using a pre-trained transformer model.

Expected CSV format:
- `text` column: input text
- `label` column: class labels (integers or strings)

In [None]:
# Install required libraries (uncomment if not installed).
# `%pip` installs into the environment of the running kernel.
# %pip install transformers datasets torch scikit-learn matplotlib

In [None]:
# Import libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# AdamW comes from PyTorch: the copy formerly exported by `transformers` is deprecated upstream.
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification

## Load Dataset
Upload your CSV file with `text` and `label` columns. The upload cell below uses `google.colab.files`, so it must be run in Google Colab.

In [None]:
# Colab-only upload helper; kept in this cell because it does not exist outside Colab.
from google.colab import files

uploaded = files.upload()
if not uploaded:
    # Without this guard `df` would never be assigned and the next line would
    # fail with a confusing NameError.
    raise ValueError('No file was uploaded; please upload a CSV with `text` and `label` columns.')

# When several files are uploaded, the last one is used; say so explicitly
# instead of silently discarding the others.
csv_name = list(uploaded.keys())[-1]
if len(uploaded) > 1:
    print(f'Multiple files uploaded; using {csv_name!r}')

raw_df = pd.read_csv(csv_name)

# Fail early, with a clear message, if the expected schema is not present.
missing_columns = [col for col in ('text', 'label') if col not in raw_df.columns]
if missing_columns:
    raise ValueError(f'CSV is missing required column(s): {missing_columns}')

# Rows without text or label cannot be used for training: a missing text would
# otherwise be turned into the literal string 'nan' by the dataset class.
df = raw_df.dropna(subset=['text', 'label']).reset_index(drop=True)
if len(df) < len(raw_df):
    print(f'Dropped {len(raw_df) - len(df)} row(s) with missing text/label')

print('Dataset shape:', df.shape)
df.head()

In [None]:
# Turn the raw `label` column into integer class ids.
raw_labels = df['label'].values
le = LabelEncoder()
le.fit(raw_labels)
labels_encoded = le.transform(raw_labels)

# Count how many distinct class ids were produced.
num_classes = np.unique(labels_encoded).size
print('Number of classes:', num_classes)

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
texts = df['text'].values
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels_encoded,
    test_size=0.2,
    random_state=42,
)

## Tokenization using Transformers

In [None]:
# Sequence length that every example is padded/truncated to by the dataset.
max_length = 128

# Name of the pre-trained checkpoint; the tokenizer is loaded from it here.
model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Create PyTorch Dataset
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item.

    Each item is a dict with ``input_ids``, ``attention_mask`` and ``labels``
    tensors, ready to be batched by a ``DataLoader``.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        """Store the samples and tokenization settings.

        Args:
            texts: Sequence of input texts (list, NumPy array, pandas Series, ...).
            labels: Sequence of integer class ids, aligned with ``texts``.
            tokenizer: Callable tokenizer invoked with ``truncation``,
                ``padding``, ``max_length`` and ``return_tensors`` keywords.
            max_length: Length every sample is padded/truncated to.

        Raises:
            ValueError: If ``texts`` and ``labels`` differ in length.
        """
        # Materialize as lists so that ``self.texts[idx]`` is always positional.
        # A pandas Series with a shuffled index (e.g. straight out of
        # train_test_split) would otherwise be indexed by label, raising
        # KeyError or returning the wrong row.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f'texts and labels must have the same length, '
                f'got {len(self.texts)} and {len(self.labels)}'
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return its tensors.

        Returns:
            dict with 1-D ``input_ids`` and ``attention_mask`` tensors and a
            scalar ``labels`` tensor of dtype ``torch.long``.
        """
        # str() guards against non-string cells (e.g. numbers read from the CSV).
        text = str(self.texts[idx])
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        return {
            # The tokenizer is called on a single text, so drop the leading
            # batch dimension of size 1; the DataLoader adds its own.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [None]:
# Wrap each split in a TextDataset, then batch it with a DataLoader.
batch_size = 16

train_dataset = TextDataset(
    texts=X_train, labels=y_train, tokenizer=tokenizer, max_length=max_length
)
test_dataset = TextDataset(
    texts=X_test, labels=y_test, tokenizer=tokenizer, max_length=max_length
)

# Only the training batches are shuffled; the test loader keeps dataset order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

## Build Model

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Load the pre-trained sequence-classification model with `num_labels` set to
# the number of classes in the data, and move it to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_classes,
).to(device)

## Training Loop

In [None]:
# Optimizer and loss are taken straight from PyTorch: the AdamW class formerly
# exported by `transformers` is deprecated upstream, and referencing `torch.nn`
# directly avoids an import in the middle of the notebook.
# NOTE(review): torch's AdamW defaults (e.g. weight decay) may differ slightly
# from the deprecated class — confirm if exact parity with old runs matters.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
criterion = torch.nn.CrossEntropyLoss()

num_epochs = 3
train_losses = []       # mean per-sample training loss, one entry per epoch
train_accuracies = []   # fraction of correctly classified training samples, one entry per epoch

for epoch in range(num_epochs):
    model.train()
    loss_sum = 0.0   # sum of per-sample losses seen this epoch
    correct = 0      # number of correct predictions this epoch
    total = 0        # number of samples seen this epoch
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        logits = model(input_ids, attention_mask=attention_mask).logits
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        n_samples = labels.size(0)
        # `loss` is the mean over this batch; weight it by the batch size so a
        # smaller final batch does not count as much as a full one.
        loss_sum += loss.item() * n_samples
        correct += (torch.argmax(logits, dim=1) == labels).sum().item()
        total += n_samples

    if total == 0:
        # An empty loader would otherwise surface as a bare ZeroDivisionError.
        raise ValueError('train_loader yielded no samples; cannot train on an empty dataset.')

    epoch_loss = loss_sum / total
    epoch_accuracy = correct / total
    train_losses.append(epoch_loss)
    train_accuracies.append(epoch_accuracy)
    print(f'Epoch {epoch+1}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}')

## Evaluation

In [None]:
# Put the model in evaluation mode and collect predictions for every test batch.
model.eval()
all_preds, all_labels = [], []

# Gradients are not needed for scoring, so they are disabled for the whole pass.
with torch.no_grad():
    for batch in test_loader:
        batch_logits = model(
            batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        ).logits
        batch_preds = batch_logits.argmax(dim=1)
        # Bring results back to the CPU as NumPy values before accumulating.
        all_preds.extend(batch_preds.cpu().numpy())
        all_labels.extend(batch['labels'].cpu().numpy())

acc = accuracy_score(all_labels, all_preds)
print('Test Accuracy:', acc)

In [None]:
# Plot per-epoch training accuracy and loss on a shared axis.
# Epochs are numbered from 1 so the x-axis matches the "Epoch N" lines printed
# during training (plotting the bare lists would start the axis at 0).
epoch_numbers = list(range(1, len(train_accuracies) + 1))

fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(epoch_numbers, train_accuracies, marker='o', label='train_accuracy')
ax.plot(epoch_numbers, train_losses, marker='o', label='train_loss')
ax.set_xticks(epoch_numbers)  # whole-number ticks only; epochs are discrete
ax.set_xlabel('Epoch')
ax.set_ylabel('Value')
ax.set_title('Training Accuracy and Loss')
ax.legend()
plt.show()