In [None]:
from google.colab import drive
# Attach Google Drive to the Colab filesystem; the checkpoint files written
# and read by later cells live under this mount point.
drive.mount(
    '/content/drive',
)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import torch
import torch.nn as nn

class Transformer(nn.Module):
  """Small Transformer-encoder text classifier.

  Token ids are embedded, passed through a stack of two
  ``nn.TransformerEncoderLayer`` blocks, mean-pooled over the sequence
  dimension and projected to ``num_classes`` logits.

  No positional encoding is added to the embeddings, so the encoder itself
  has no information about token order.

  Args:
    vocab_size: Number of rows in the embedding table.
    embed_dim: Width of the token embeddings and of the encoder.
    nhead: Number of attention heads in each encoder layer.
    hidden_dim: Width of the feed-forward sub-layer of each encoder layer.
    num_classes: Number of output logits.
    padding_idx: Optional id of the padding token. When given, padded
      positions are hidden from attention and left out of the mean pooling,
      so the logits no longer depend on how much padding a sample carries.
      The default ``None`` keeps the original behaviour, in which every
      position (padding included) is attended to and averaged.
  """

  def __init__(self, vocab_size, embed_dim=64, nhead=4, hidden_dim=128, num_classes=2, padding_idx=None):
    super().__init__()
    self.padding_idx = padding_idx
    # padding_idx adds no parameters, so state_dict keys are the same with
    # or without it.
    self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=padding_idx)
    encoder_layer = nn.TransformerEncoderLayer(embed_dim, nhead, hidden_dim)
    self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=2)
    self.classifier = nn.Linear(embed_dim, num_classes)

  def forward(self, x):
    """Return class logits for a batch of token ids.

    Args:
      x: Integer tensor of token ids indexed as (batch, sequence).

    Returns:
      Float tensor of shape (batch, num_classes).
    """
    pad_mask = None
    if self.padding_idx is not None:
      # True where the token is padding; shape (batch, sequence).
      pad_mask = x == self.padding_idx

    h = self.embedding(x)
    # The encoder layers are built with the default batch_first=False, so
    # they expect (sequence, batch, embed_dim).
    h = h.permute(1, 0, 2)
    h = self.transformer(h, src_key_padding_mask=pad_mask)

    if pad_mask is None:
      pooled = h.mean(dim=0)
    else:
      # Average over real tokens only. masked_fill (rather than multiplying
      # by the mask) also discards any non-finite values at padded positions.
      is_pad = pad_mask.transpose(0, 1).unsqueeze(-1)  # (sequence, batch, 1)
      h = h.masked_fill(is_pad, 0.0)
      # clamp avoids a division by zero for a row made only of padding.
      n_tokens = (~is_pad).sum(dim=0).clamp(min=1)
      pooled = h.sum(dim=0) / n_tokens

    return self.classifier(pooled)

In [None]:
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

# Load the AG News dataset (not IMDb) together with the uncased DistilBERT
# tokenizer that converts raw text into token ids.
dataset = load_dataset("ag_news")
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize(example):
    """Tokenize the ``text`` field, padding and truncating to 128 tokens."""
    return tokenizer(
        example['text'],
        max_length=128,
        truncation=True,
        padding='max_length',
    )

# batched=True passes `tokenize` a batch of rows per call instead of one row.
encoded = dataset.map(tokenize, batched=True)
# Expose only the two columns the training loop reads, as torch tensors.
encoded.set_format(columns=['input_ids', 'label'], type='torch')


Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

In [None]:
import torch
from torch.utils.data import DataLoader
from torch.optim import Adam
from torch.nn import CrossEntropyLoss
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Seed torch's global RNG before the model and loaders are created, so weight
# initialisation, dropout masks and the shuffle order start from the same
# state on every Restart-and-Run-All. GPU kernels may still introduce small
# run-to-run differences.
SEED = 42
torch.manual_seed(SEED)

# Model initialisation
model = Transformer(
    vocab_size=tokenizer.vocab_size,
    embed_dim=64,
    nhead=4,
    hidden_dim=128,
    num_classes=4
).to(device)

# Hyperparameters
batch_size = 32
num_epochs = 7
learning_rate = 1e-4

# DataLoaders. The dataset's `test` split doubles as the validation set here.
train_loader = DataLoader(encoded['train'], batch_size=batch_size, shuffle=True)
val_loader = DataLoader(encoded['test'], batch_size=batch_size)

# Loss, Optimizer
criterion = CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=learning_rate)


def evaluate_accuracy(model, loader, device):
    """Return the fraction of correctly classified samples in ``loader``.

    Puts ``model`` in eval mode (and leaves it there) and runs it without
    gradient tracking. Each batch must provide ``'input_ids'`` and
    ``'label'``. Returns 0.0 for an empty loader instead of dividing by zero.
    """
    model.eval()
    hits = 0
    seen = 0
    with torch.no_grad():
        for batch in loader:
            input_ids = batch['input_ids'].to(device)
            labels = batch['label'].to(device)
            preds = torch.argmax(model(input_ids), dim=1)
            hits += (preds == labels).sum().item()
            seen += labels.size(0)
    return hits / max(seen, 1)


# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    correct = 0
    total = 0
    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}")

    for batch in loop:
        input_ids = batch['input_ids'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids)  # (batch_size, num_classes)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Running statistics; accuracy is measured with dropout active
        # because the model is in train mode.
        total_loss += loss.item()
        preds = torch.argmax(outputs, dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

        loop.set_postfix(loss=loss.item(), acc=correct/total)

    print(f"[Epoch {epoch+1}] Loss: {total_loss/len(train_loader):.4f} | Acc: {correct/total:.4f}")

# Validation: run once, after the final epoch.
val_accuracy = evaluate_accuracy(model, val_loader, device)

torch.save(model.state_dict(), "/content/drive/MyDrive/transformer_model.pt")
print(f"Validation Accuracy: {val_accuracy:.4f}")


cuda


Epoch 1: 100%|██████████| 3750/3750 [00:37<00:00, 100.25it/s, acc=0.686, loss=0.433]


[Epoch 1] Loss: 0.7906 | Acc: 0.6855


Epoch 2: 100%|██████████| 3750/3750 [00:35<00:00, 104.58it/s, acc=0.829, loss=0.656]


[Epoch 2] Loss: 0.4782 | Acc: 0.8290


Epoch 3: 100%|██████████| 3750/3750 [00:36<00:00, 103.63it/s, acc=0.861, loss=0.206]


[Epoch 3] Loss: 0.3952 | Acc: 0.8607


Epoch 4: 100%|██████████| 3750/3750 [00:36<00:00, 101.36it/s, acc=0.878, loss=0.317]


[Epoch 4] Loss: 0.3488 | Acc: 0.8783


Epoch 5: 100%|██████████| 3750/3750 [00:38<00:00, 98.63it/s, acc=0.889, loss=0.335] 


[Epoch 5] Loss: 0.3182 | Acc: 0.8894


Epoch 6: 100%|██████████| 3750/3750 [00:36<00:00, 102.58it/s, acc=0.898, loss=0.346]


[Epoch 6] Loss: 0.2950 | Acc: 0.8981


Epoch 7: 100%|██████████| 3750/3750 [00:36<00:00, 102.24it/s, acc=0.905, loss=0.381]


[Epoch 7] Loss: 0.2763 | Acc: 0.9051
Validation Accuracy: 0.8913


In [None]:
# Rebuild the architecture with the same hyperparameters used for training so
# the saved state_dict keys and shapes line up.
model = Transformer(
    vocab_size=tokenizer.vocab_size,
    embed_dim=64,
    nhead=4,
    hidden_dim=128,
    num_classes=4  # AG News has 4 classes
).to(device)

# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU runtime still loads on a CPU-only one.
model.load_state_dict(
    torch.load("/content/drive/MyDrive/transformer_model.pt", map_location=device)
)
model.eval()
print("✅ 모델 불러오기 완료")

✅ 모델 불러오기 완료


In [None]:
import torch.nn.utils.prune as prune

# Zero the 50% of classifier weights with the smallest absolute value.
prune.l1_unstructured(model.classifier, name="weight", amount=0.5)

# Make the pruning permanent. l1_unstructured only re-parametrises the layer
# (`weight_orig` + `weight_mask`, with `weight` recomputed by a forward
# pre-hook). Left attached, re-running this cell prunes half of the remaining
# weights again, and any later write to `classifier.weight` is overwritten on
# the next forward pass. remove() folds the mask into a plain `weight`
# parameter, so a re-run keeps sparsity at 50%.
prune.remove(model.classifier, "weight")

print(model.classifier.weight)

tensor([[-0.0000, -0.0000, -0.0000, -0.0000, -0.1475,  0.0000,  0.0000, -0.0000,
          0.0000,  0.0000,  0.0000, -0.0000, -0.0000, -0.0000,  0.0000, -0.1873,
         -0.0000, -0.0000,  0.0000,  0.1788,  0.0000, -0.0000,  0.0000, -0.0000,
          0.0000, -0.2164,  0.0000, -0.1700, -0.0000, -0.1540, -0.1743,  0.1565,
         -0.0000, -0.0000,  0.0000,  0.0000,  0.0000,  0.1520,  0.0000, -0.0000,
          0.0000,  0.1762, -0.0000,  0.0000,  0.1516, -0.0000,  0.0000,  0.0000,
         -0.1472,  0.1786, -0.0000,  0.0000, -0.0000, -0.1709,  0.0000, -0.0000,
          0.0000,  0.0000,  0.1917, -0.0000,  0.2122, -0.0000, -0.1528,  0.0000],
        [-0.2326, -0.1442,  0.1564, -0.0000,  0.0000,  0.0000, -0.0000,  0.0000,
          0.0000, -0.1620, -0.0000, -0.0000,  0.0000,  0.1588, -0.0000, -0.0000,
          0.1931,  0.0000,  0.0000,  0.0000,  0.2041, -0.0000, -0.1855, -0.0000,
         -0.1704,  0.2104, -0.0000,  0.1974, -0.0000,  0.0000,  0.1965, -0.0000,
         -0.0000,  0.0000, 

In [None]:
# Current classifier weight, shape (out_features, in_features).
# detach() gives a gradient-free view without going through the legacy `.data`.
dense_weight = model.classifier.weight.detach()

# Convert to a sparse COO tensor; coalesce() guarantees that indices() and
# values() may be read.
sparse_weight = dense_weight.to_sparse().coalesce()

# Sanity check: report the number of explicitly stored (non-zero) entries
# through the public API rather than the private `_nnz()`.
print(f"👉 Dense weight shape: {dense_weight.shape}")
print(f"👉 Sparse weight nnz (non-zero): {sparse_weight.values().numel()}")

# Build the dict to save.
# The four classifier keys are unchanged. `backbone_state_dict` additionally
# holds every non-classifier tensor, so the file is a complete checkpoint and
# its size can be compared like-for-like with the unpruned one. Restore it
# with `model.load_state_dict(state['backbone_state_dict'], strict=False)`.
sparse_model_state = {
    'sparse_weight_indices': sparse_weight.indices(),
    'sparse_weight_values': sparse_weight.values(),
    'sparse_weight_size': sparse_weight.size(),
    'classifier_bias': model.classifier.bias.detach(),  # bias is stored dense
    'backbone_state_dict': {
        key: tensor
        for key, tensor in model.state_dict().items()
        if not key.startswith('classifier.')
    },
}

# Save
torch.save(sparse_model_state, "/content/drive/MyDrive/pruned_transformer_model.pt")
print("✅ Sparse 모델 weight 저장 완료")

👉 Dense weight shape: torch.Size([4, 64])
👉 Sparse weight nnz (non-zero): 64
✅ Sparse 모델 weight 저장 완료


In [None]:
import os

# On-disk size of both checkpoint files, converted from bytes to megabytes
# (1 MB = 1e6 bytes here).
# NOTE(review): the comparison is only like-for-like if both files hold the
# same set of tensors; confirm what the cell that writes the pruned file stores.
dense_ckpt_bytes = os.path.getsize("/content/drive/MyDrive/transformer_model.pt")
pruned_ckpt_bytes = os.path.getsize("/content/drive/MyDrive/pruned_transformer_model.pt")
size_before = dense_ckpt_bytes / 1e6
size_after = pruned_ckpt_bytes / 1e6

print(f"💾 모델 크기 전: {size_before:.2f} MB")
print(f"💾 모델 크기 후: {size_after:.2f} MB")

💾 모델 크기 전: 8.09 MB
💾 모델 크기 후: 0.00 MB


In [None]:
# Load the saved tensors. map_location places them on the active device, so a
# file written on a GPU runtime also loads on a CPU-only one.
loaded = torch.load("/content/drive/MyDrive/pruned_transformer_model.pt", map_location=device)

# Rebuild the sparse COO weight from its stored parts.
sparse_tensor = torch.sparse_coo_tensor(
    indices=loaded['sparse_weight_indices'],
    values=loaded['sparse_weight_values'],
    size=loaded['sparse_weight_size']
)

# nn.Linear needs a dense weight, so densify before injecting it.
restored_weight = sparse_tensor.to_dense()

# While torch.nn.utils.prune's re-parametrisation is attached, `weight` is
# recomputed from `weight_orig * weight_mask` on every forward pass and a
# write to it would be silently discarded. Fold the mask in first so `weight`
# is an ordinary parameter again.
if hasattr(model.classifier, "weight_orig"):
    prune.remove(model.classifier, "weight")

# Fail loudly on a mismatched checkpoint instead of relying on broadcasting.
if restored_weight.shape != model.classifier.weight.shape:
    raise ValueError(
        f"Saved classifier weight has shape {tuple(restored_weight.shape)}, "
        f"expected {tuple(model.classifier.weight.shape)}"
    )

# Copy in place: the parameters keep their identity, device and dtype.
with torch.no_grad():
    model.classifier.weight.copy_(restored_weight)
    model.classifier.bias.copy_(loaded['classifier_bias'])

print("✅ Sparse 저장된 weight 로드 완료")

✅ Sparse 저장된 weight 로드 완료


In [None]:
# Score the current model on `val_loader`: count matching predictions over
# every batch, then report the overall fraction.
model.to(device)
model.eval()

correct, total = 0, 0

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids)
        # Predicted class = index of the largest logit in each row.
        preds = outputs.argmax(dim=1)

        correct += preds.eq(labels).sum().item()
        total += labels.shape[0]

accuracy = correct / total
print(f"✅ Pruned 모델 Validation Accuracy: {accuracy:.4f}")


✅ Pruned 모델 Validation Accuracy: 0.8833
