In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn import Transformer

# 모델 정의
class LLaMAModel(nn.Module):
    def __init__(self, vocab_size, d_model, nhead, num_encoder_layers, num_decoder_layers):
        super(LLaMAModel, self).__init__()
        self.transformer = Transformer(d_model, nhead, num_encoder_layers, num_decoder_layers)
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.fc = nn.Linear(d_model, vocab_size)

    def forward(self, src, tgt):
        src, tgt = self.embedding(src), self.embedding(tgt)
        output = self.transformer(src, tgt)
        return self.fc(output)

# 하이퍼파라미터 설정
vocab_size = 10000
d_model = 512
nhead = 8
num_encoder_layers = 6
num_decoder_layers = 6
learning_rate = 0.001
epochs = 10

# 모델 및 옵티마이저 초기화
model = LLaMAModel(vocab_size, d_model, nhead, num_encoder_layers, num_decoder_layers)
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

# 임의의 데이터 생성
src = torch.randint(0, vocab_size, (10, 32))  # 10 sequences of length 32
tgt = torch.randint(0, vocab_size, (10, 32))
tgt_output = torch.randint(0, vocab_size, (10, 32))

# 학습
for epoch in range(epochs):
    optimizer.zero_grad()
    output = model(src, tgt)
    loss = criterion(output.view(-1, vocab_size), tgt_output.view(-1))
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")

# 결과 출력
print("Sample output:", torch.argmax(output[0], dim=1))

Epoch 1/10, Loss: 9.3626
Epoch 2/10, Loss: 8.6779
Epoch 3/10, Loss: 7.9754
Epoch 4/10, Loss: 7.3647
Epoch 5/10, Loss: 6.7910
Epoch 6/10, Loss: 6.4936
Epoch 7/10, Loss: 6.1352
Epoch 8/10, Loss: 5.9113
Epoch 9/10, Loss: 5.7506
Epoch 10/10, Loss: 5.5866
Sample output: tensor([9052, 1197, 1197, 9721, 9721, 3964, 1548, 1982, 3964, 1982, 9538, 9538,
        1197, 1197, 1982, 1982, 4443, 1982, 1197, 9721, 9538, 9538, 9721, 9721,
        6934, 9538, 8393, 1982, 9538, 1197, 9721, 9538])
