<a href="https://colab.research.google.com/github/jubin0615/DL/blob/main/HW2_1_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets

Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-22.0.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Downloading pyarrow-22.0.0-cp311-cp311-manylinux_2_28_x86_64.whl (47.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m41.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: pyarrow
  Attempting uninstall: pyarrow
    Found existing installation: pyarrow 19.0.1
    Uninstalling pyarrow-19.0.1:
      Successfully uninstalled pyarrow-19.0.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.12.0 requires google-cloud-bigquery-storage<3.0.0,>=2.30.0, which is not installed.
pylibcudf-cu12 25.2.2 requires pyarrow<20.0.0a0,>=14.0.0; platform_machine == "x86_64", but you have pyarrow 22.0.0 which is incompatible.
cudf-cu12 25.2.2 requires pyarrow<20.0.0

In [None]:
from datasets import load_dataset

# Load the CoLA task of the GLUE benchmark.
cola_dataset = load_dataset("glue", "cola")

# The official validation split is used as the held-out test set.
# NOTE(review): presumably because the GLUE 'test' split has no usable labels —
# confirm before switching to cola_dataset['test'].
test_data = cola_dataset['validation']

# Carve a 10% development set out of the official training split; the fixed
# seed keeps the split identical across runs.
train_dev_split = cola_dataset['train'].train_test_split(test_size=0.1, seed=42)

# New training and development sets
train_data = train_dev_split['train']
dev_data = train_dev_split['test']



README.md: 0.00B [00:00, ?B/s]

cola/train-00000-of-00001.parquet:   0%|          | 0.00/251k [00:00<?, ?B/s]

cola/validation-00000-of-00001.parquet:   0%|          | 0.00/37.6k [00:00<?, ?B/s]

cola/test-00000-of-00001.parquet:   0%|          | 0.00/37.7k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8551 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1043 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1063 [00:00<?, ? examples/s]

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer
from torch.optim import AdamW
from tqdm import tqdm

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

# Name of the pretrained checkpoint; the matching tokenizer is loaded from it.
model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

class CoLADataset(Dataset):
    """Map-style dataset that tokenizes one example each time it is indexed.

    Args:
        data: Indexable collection whose items expose the keys ``'sentence'``
            and ``'label'``.
        tokenizer: Callable invoked as ``tokenizer(sentence, return_tensors='pt',
            max_length=..., padding='max_length', truncation=True)``; its result
            must provide ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples in the underlying collection."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its tensors.

        Returns:
            A dict with ``'input_ids'`` and ``'attention_mask'`` (the tokenizer
            outputs with their leading axis removed) and ``'labels'`` (the
            example's label as a ``torch.long`` tensor).
        """
        item = self.data[idx]
        sentence = item['sentence']
        label = item['label']

        encoding = self.tokenizer(
            sentence,
            return_tensors='pt',
            max_length=self.max_length,
            padding='max_length',
            truncation=True
        )

        # squeeze(0) removes only the leading (batch) axis. A bare squeeze()
        # would also collapse the sequence axis whenever its size is 1.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long)
        }

Using device: cuda


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
class BertClassifier(nn.Module):
    """Pretrained encoder followed by a single linear classification layer.

    Args:
        model_name: Checkpoint identifier handed to ``AutoModel.from_pretrained``.
        num_classes: Output width of the linear layer.
    """

    def __init__(self, model_name, num_classes=2):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        hidden_size = self.bert.config.hidden_size
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the linear layer's output for the encoder's ``pooler_output``."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.fc(encoded.pooler_output)

In [None]:
# Seed PyTorch's RNG before anything random happens, so the classifier-head
# initialisation and the training shuffle order are repeatable.
# NOTE(review): GPU kernels may still introduce small run-to-run differences.
SEED = 42
torch.manual_seed(SEED)

# Wrap each split so examples are tokenized on access.
# NOTE(review): CoLADataset pads every example to its default max_length (512);
# a smaller max_length would likely cut training time — confirm against the
# actual sentence lengths before changing it.
train_dataset = CoLADataset(train_data, tokenizer)
dev_dataset = CoLADataset(dev_data, tokenizer)
test_dataset = CoLADataset(test_data, tokenizer)

BATCH_SIZE = 64

# Only the training loader shuffles; evaluation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

model = BertClassifier(model_name)
model = nn.DataParallel(model)
model.to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()

def train(model, data_loader, optimizer, criterion, device):
    """Run one optimisation pass over ``data_loader`` and return the mean batch loss.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``; it is put
            into training mode first.
        data_loader: Iterable of batches, each a mapping with the keys
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'``.
        optimizer: Optimizer whose gradients are zeroed before, and which is
            stepped after, every batch.
        criterion: Callable ``criterion(logits, labels)`` returning a scalar
            loss tensor.
        device: Device every batch tensor is moved to before the forward pass.

    Returns:
        The unweighted mean of the per-batch losses as a float, or ``0.0`` when
        ``data_loader`` yields no batches.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0

    for batch in tqdm(data_loader, desc="Training"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        loss = criterion(logits, labels)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        # Count batches while iterating so the average needs no len() on the
        # loader and cannot divide by zero.
        num_batches += 1

    if num_batches == 0:
        return 0.0
    return total_loss / num_batches

def evaluate(model, data_loader, device):
    """Compute classification accuracy (in percent) of ``model`` over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` and
            returning per-class scores along dimension 1; it is put into
            evaluation mode first.
        data_loader: Iterable of batches, each a mapping with the keys
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'``.
        device: Device every batch tensor is moved to before the forward pass.

    Returns:
        ``100 * correct / total`` as a float, or ``0.0`` when no examples were
        seen.
    """
    model.eval()
    correct = 0
    total = 0

    # Gradients are not needed for scoring.
    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            logits = model(input_ids, attention_mask)
            # The predicted class is the index of the highest score.
            predicted = logits.argmax(dim=1)

            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    # An empty loader would otherwise raise ZeroDivisionError.
    if total == 0:
        return 0.0
    return 100 * correct / total

2025-12-02 07:56:28.967914: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764662189.189778      47 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764662189.254227      47 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
# Number of full passes over the training set.
EPOCHS = 3

for epoch in range(EPOCHS):
    print(f"\n[Epoch {epoch + 1}/{EPOCHS}]")

    train_loss = train(model, train_loader, optimizer, criterion, device)
    print(f"Training Loss: {train_loss:.4f}")

    # Score the development split after every epoch to track progress.
    dev_acc = evaluate(model, dev_loader, device)
    print(f"Dev Accuracy: {dev_acc:.2f}%")

# Final evaluation on the held-out split, run once after the last epoch.
print("\n" + "*"*40)
test_acc = evaluate(model, test_loader, device)
# The original f-string opened with a stray "{", which is a SyntaxError and
# prevented the whole cell from compiling.
print(f"Test Accuracy: {test_acc:.2f}%")
print("*"*40)


[Epoch 1/3]


Training: 100%|██████████| 121/121 [06:06<00:00,  3.03s/it]


Training Loss: 0.4978


Evaluating: 100%|██████████| 14/14 [00:14<00:00,  1.07s/it]


Dev Accuracy: 82.24%

[Epoch 2/3]


Training: 100%|██████████| 121/121 [06:15<00:00,  3.11s/it]


Training Loss: 0.3215


Evaluating: 100%|██████████| 14/14 [00:14<00:00,  1.07s/it]


Dev Accuracy: 84.46%

[Epoch 3/3]


Training: 100%|██████████| 121/121 [06:16<00:00,  3.11s/it]


Training Loss: 0.2001


Evaluating: 100%|██████████| 14/14 [00:14<00:00,  1.06s/it]


Dev Accuracy: 84.46%

****************************************


Evaluating: 100%|██████████| 17/17 [00:18<00:00,  1.07s/it]

2 epoch, Test Accuracy: 81.21%
****************************************



