# 02. PyTorch 훈련 루프 완전 정복

## 학습 목표
- Autograd (자동 미분) 이해
- Loss function과 Optimizer
- 완전한 훈련 루프 구현
- 검증 및 평가

---

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt

# Select the compute device: CUDA when PyTorch reports it as available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

## 1. Autograd - 자동 미분

**Autograd**: PyTorch의 핵심! Backpropagation을 자동으로 계산

### 1.1 기본 사용

In [None]:
# Basic autograd walkthrough: build z = sum(x^2), backpropagate, and compare
# the computed gradient with the analytic derivative 2x.

# requires_grad=True turns on gradient tracking for this tensor
x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
print(f"x: {x}")
print(f"x.requires_grad: {x.requires_grad}")

# Forward pass
y = x ** 2  # y = x^2 (element-wise)
z = y.sum()  # z = sum of x^2, a scalar

print(f"\ny: {y}")
print(f"z: {z}")

# Backward pass: computes dz/dx and stores it in x.grad
z.backward()

# Inspect the gradient
print(f"\ndz/dx = x.grad: {x.grad}")
# detach() is used instead of the legacy .data attribute: both return a tensor
# without gradient tracking, but .data hides in-place changes from autograd.
print(f"Expected (2x): {2 * x.detach()}")

### 1.2 Computational Graph

PyTorch는 연산의 **그래프**를 만들어 역전파합니다.

```
x → (square) → y → (sum) → z
          ↓ backward ↓
grad ← (2x) ← (1) ←
```

In [None]:
# Gradients of the scalar linear model y = w*x + b with respect to x, w and b.
# All three leaves are scalar tensors with gradient tracking enabled.
x, w, b = (torch.tensor(value, requires_grad=True) for value in (2.0, 3.0, 1.0))

# Forward pass through the linear expression
y = w * x + b
print(f"y = wx + b = {y}")

# Backpropagate from y; this fills in .grad on each leaf tensor
y.backward()

print(f"\ndy/dx = {x.grad}  (= w = {w.item()})")
print(f"dy/dw = {w.grad}  (= x = {x.item()})")
print(f"dy/db = {b.grad}  (= 1)")

### 1.3 Gradient Accumulation

**주의**: `.backward()`를 여러 번 호출하면 gradient가 **누적**됩니다!

In [None]:
# Shows that repeated backward() calls add into x.grad until it is reset.
x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)

# First backward pass: x.grad becomes 2x
y1 = torch.sum(x ** 2)
y1.backward()
print(f"After 1st backward: {x.grad}")

# Second backward pass on a fresh graph: the new gradient is added on top
y2 = torch.sum(x ** 2)
y2.backward()
print(f"After 2nd backward (accumulated): {x.grad}")

# Reset the stored gradient in place
x.grad.zero_()
print(f"After .zero_(): {x.grad}")

# Third backward pass starts from zero again
y3 = torch.sum(x ** 2)
y3.backward()
print(f"After 3rd backward (reset): {x.grad}")

## 2. 신경망 정의

**`nn.Module`**: 모든 신경망의 부모 클래스

In [None]:
class SimpleNN(nn.Module):
    """Two-layer fully connected network with a ReLU in between.

    Architecture:
        Input (input_dim) -> Hidden (hidden_dim) -> Output (output_dim)

    Args:
        input_dim: Number of features in each input sample.
        hidden_dim: Width of the hidden layer.
        output_dim: Number of output values per sample.
    """

    def __init__(self, input_dim: int, hidden_dim: int, output_dim: int) -> None:
        # The parent initializer must run first so nn.Module can register the
        # submodules assigned below.
        super().__init__()

        # nn.Linear stores its weight as (out_features, in_features):
        # fc1.weight is (hidden_dim, input_dim), fc1.bias is (hidden_dim,)
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()  # activation function
        # fc2.weight is (output_dim, hidden_dim), fc2.bias is (output_dim,)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run the forward pass.

        Args:
            x: Input of shape (batch_size, input_dim).

        Returns:
            Output of shape (batch_size, output_dim).
        """
        x = self.fc1(x)  # (batch, hidden_dim)
        x = self.relu(x)  # element-wise activation
        x = self.fc2(x)  # (batch, output_dim)
        return x

# Instantiate the network and print its module layout
model = SimpleNN(input_dim=10, hidden_dim=20, output_dim=5)
print(model)

# List every registered parameter by name together with its tensor shape
print("\n=== Parameters ===")
for name, param in model.named_parameters():
    print(f"{name}: {param.shape}")

# Smoke-test the forward pass on a random batch of 32 samples
x_test = torch.randn(32, 10)
output = model(x_test)
print(f"\nOutput shape: {output.shape}")

## 3. Loss Function

**Loss**: 예측과 실제 값의 차이를 측정

### 주요 Loss Functions
- **Classification**: `nn.CrossEntropyLoss()`, `nn.BCELoss()`
- **Regression**: `nn.MSELoss()`, `nn.L1Loss()`

In [None]:
# Classification: CrossEntropyLoss = LogSoftmax + NLLLoss, so it is fed raw
# logits (scores before softmax) together with integer class indices.
criterion = nn.CrossEntropyLoss()

targets = torch.tensor([0, 2, 1, 4])  # ground-truth class index per sample
logits = torch.randn(4, 5)  # batch of 4 samples, 5 classes

loss = criterion(logits, targets)
print(f"Logits:\n{logits}")
print(f"\nTargets: {targets}")
print(f"\nCross-Entropy Loss: {loss.item():.4f}")

# Regression: mean squared error between predictions and targets
predictions = torch.randn(4, 1)
targets_reg = torch.randn(4, 1)
mse_criterion = nn.MSELoss()
mse_loss = mse_criterion(predictions, targets_reg)
print(f"\nMSE Loss: {mse_loss.item():.4f}")

## 4. Optimizer

**Optimizer**: Gradient를 사용하여 파라미터를 업데이트

### 주요 Optimizers
- **SGD**: Stochastic Gradient Descent
- **Adam**: Adaptive Moment Estimation (가장 많이 사용)
- **AdamW**: Adam + Decoupled Weight Decay (weight decay를 gradient 기반 업데이트와 분리하여 적용)

In [None]:
# Build an Adam optimizer over every parameter of the current model
learning_rate = 0.001
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

# Report the optimizer configuration and the number of scalar weights it updates
print("=== Optimizer ===")
print(optimizer)
print(f"\nLearning rate: {learning_rate}")
print(f"Parameters to optimize: {sum(p.numel() for p in model.parameters())}")

# Alternative optimizers, kept for reference:
# optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
# optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.01)

## 5. 완전한 훈련 루프

### 훈련 루프의 구조 (epoch × mini-batch)
```python
for epoch in range(num_epochs):
    for batch in dataloader:
        # 1. Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        
        # 2. Backward pass
        optimizer.zero_grad()  # Gradient 초기화
        loss.backward()  # Gradient 계산
        
        # 3. Update weights
        optimizer.step()
```

In [None]:
# Build a small synthetic binary-classification dataset and place it on `device`
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Generate 1000 samples with 20 features and hold out 20% as the test split
X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features. The scaler is fitted on the training split only and
# then applied to the test split, so test statistics never influence training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Convert to tensors. torch.tensor with an explicit dtype and device replaces
# the legacy torch.FloatTensor / torch.LongTensor constructors plus .to(device):
# features become float32 and class labels int64, as before.
X_train_tensor = torch.tensor(X_train, dtype=torch.float32, device=device)
y_train_tensor = torch.tensor(y_train, dtype=torch.long, device=device)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32, device=device)
y_test_tensor = torch.tensor(y_test, dtype=torch.long, device=device)

print(f"Train: {X_train_tensor.shape}, Test: {X_test_tensor.shape}")

In [None]:
from tqdm import tqdm

# Fresh model, loss and optimizer for the 20-feature, 2-class dataset
model = SimpleNN(input_dim=20, hidden_dim=64, output_dim=2).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training configuration and per-epoch history
num_epochs = 100
batch_size = 32
train_losses = []
test_accuracies = []

for epoch in tqdm(range(num_epochs), desc="Training"):
    # Training mode (affects layers such as Dropout and BatchNorm)
    model.train()

    # Running totals for this epoch
    epoch_loss = 0
    n_batches = 0

    # NOTE: mini-batches are consecutive slices visited in the same order every
    # epoch; the data is not reshuffled between epochs.
    for n_batches, i in enumerate(range(0, len(X_train_tensor), batch_size), start=1):
        batch_slice = slice(i, i + batch_size)
        X_batch = X_train_tensor[batch_slice]
        y_batch = y_train_tensor[batch_slice]

        # 1. Forward pass and loss
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)

        # 2. Backward pass: clear stale gradients, then compute new ones
        optimizer.zero_grad()
        loss.backward()

        # 3. Parameter update
        optimizer.step()

        epoch_loss += loss.item()

    # Mean of the per-batch losses for this epoch
    avg_loss = epoch_loss / n_batches
    train_losses.append(avg_loss)

    # Evaluate on the test split: eval mode, and no gradient tracking
    model.eval()
    with torch.no_grad():
        test_outputs = model(X_test_tensor)
        # The predicted class is the index of the largest logit in each row
        predicted = torch.max(test_outputs, 1).indices
        is_correct = predicted == y_test_tensor
        accuracy = is_correct.float().mean().item()
    test_accuracies.append(accuracy)

    # Report progress every 10 epochs
    if (epoch + 1) % 10 == 0:
        print(f"\nEpoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}, Test Acc: {accuracy:.4f}")

print("\n훈련 완료!")

In [None]:
# Plot the training history: loss on the left, test accuracy on the right
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# One row per panel: (axis, series, legend label, y-axis label, title, extra plot kwargs)
panels = (
    (axes[0], train_losses, 'Train Loss', 'Loss', 'Training Loss', {}),
    (axes[1], test_accuracies, 'Test Accuracy', 'Accuracy', 'Test Accuracy', {'color': 'orange'}),
)
for ax, series, label, ylabel, title, plot_kwargs in panels:
    ax.plot(series, label=label, **plot_kwargs)
    ax.set_xlabel('Epoch')
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.legend()
    ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

# Accuracy recorded after the last epoch
print(f"\nFinal Test Accuracy: {test_accuracies[-1]:.4f}")

## 6. 모델 저장 및 로드

### State Dict
- 모델의 **파라미터**(및 buffer)만 저장 (가벼움)
- **권장 방법**

In [None]:
# Save only the model's state dict rather than the whole module object
torch.save(model.state_dict(), 'model.pth')
print("Model saved to model.pth")

# Rebuild the same architecture, then restore the saved tensors into it.
# map_location=device remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU machine still loads on a CPU-only one.
loaded_model = SimpleNN(input_dim=20, hidden_dim=64, output_dim=2).to(device)
loaded_model.load_state_dict(torch.load('model.pth', map_location=device))
loaded_model.eval()
print("Model loaded successfully")

# Check that the restored model scores the same as the one that was saved
with torch.no_grad():
    loaded_outputs = loaded_model(X_test_tensor)
    _, loaded_predicted = torch.max(loaded_outputs, 1)
    loaded_accuracy = (loaded_predicted == y_test_tensor).float().mean().item()

print(f"Loaded model accuracy: {loaded_accuracy:.4f}")
print(f"Original model accuracy: {test_accuracies[-1]:.4f}")

## 7. 요약

### 훈련 루프 체크리스트
1. ✅ 데이터 준비 (Tensor 변환)
2. ✅ 모델 정의 (`nn.Module` 상속)
3. ✅ Loss function 선택
4. ✅ Optimizer 선택
5. ✅ 훈련 루프:
   - `optimizer.zero_grad()` (`loss.backward()` 호출 전이라면 Forward pass 전후 어느 쪽이든 가능)
   - Forward pass
   - Loss 계산
   - `loss.backward()`
   - `optimizer.step()`
6. ✅ 평가 (`model.eval()`, `torch.no_grad()`)
7. ✅ 모델 저장/로드

### 주의사항
- **Gradient 초기화**: `optimizer.zero_grad()` 필수!
- **평가 모드**: `model.eval()` + `torch.no_grad()`
- **Device 일치**: 모델, 데이터 모두 같은 device

### 다음 단계
- `03_complete_project.ipynb` - 실전 프로젝트!