<a href="https://colab.research.google.com/github/hail-members/distributed-deep-learning/blob/main/distributed_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Data parallelism

In [None]:
import numpy as np
import time
from concurrent.futures import ThreadPoolExecutor

# Generate the benchmark input: 10,000,000 random samples from np.random.rand
# (uniform over [0, 1)).
data = np.random.rand(10_000_000)

# Incremental (running) mean; this is the unit of work given to each thread.
def calculate_mean_incremental(data_chunk):
    """Return ``(mean, count)`` for ``data_chunk`` using a running-mean update.

    The mean is refined one element at a time with
    ``mean += (x - mean) / k`` where ``k`` is the 1-based position of ``x``.
    An empty chunk yields ``(0, 0)``.

    Args:
        data_chunk: A sized iterable of numbers.

    Returns:
        A tuple of the running mean and ``len(data_chunk)``.
    """
    running = 0
    seen = 0
    for value in data_chunk:
        seen += 1
        # Move the current estimate towards the new value by 1/seen.
        running += (value - running) / seen
    return running, len(data_chunk)

# Multi-threaded mean: split the data, average each piece in a thread, combine.
def calculate_parallel_mean(data, num_threads=4):
    """Return the mean of ``data`` computed from per-chunk incremental means.

    ``data`` is cut into at most ``num_threads`` contiguous chunks whose sizes
    differ by at most one element, so every element is covered even when
    ``len(data)`` is not a multiple of ``num_threads``. Each chunk is handed to
    ``calculate_mean_incremental`` on a thread pool and the per-chunk means are
    combined as a count-weighted average.

    Args:
        data: A sliceable, sized sequence of numbers (list, NumPy array, ...).
        num_threads: Number of chunks / worker threads to use.

    Returns:
        The mean of all elements of ``data``.

    Raises:
        ZeroDivisionError: If ``data`` is empty or ``num_threads`` is 0.
        ValueError: If ``num_threads`` is negative (raised by the executor).
    """
    # The first `extra` chunks take one additional element so the remainder
    # of the division is distributed instead of being dropped.
    base, extra = divmod(len(data), num_threads)
    chunks = []
    start = 0
    for index in range(num_threads):
        stop = start + base + (1 if index < extra else 0)
        if stop > start:  # skip empty chunks when there are fewer items than threads
            chunks.append(data[start:stop])
        start = stop

    total_sum = 0
    total_count = 0
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        futures = [executor.submit(calculate_mean_incremental, chunk) for chunk in chunks]

        for future in futures:
            chunk_mean, chunk_len = future.result()
            # Weight each chunk mean by its element count to recover the sum.
            total_sum += chunk_mean * chunk_len
            total_count += chunk_len

    return total_sum / total_count

# Performance test
def performance_test(num_threads=8):
    """Time the sequential and the threaded mean over the module-level ``data``.

    Runs ``calculate_mean_incremental`` on the whole array, then
    ``calculate_parallel_mean`` with ``num_threads`` threads, and prints both
    results, both wall-clock durations and the resulting speedup.

    Args:
        num_threads: Thread count passed to ``calculate_parallel_mean`` and
            shown in the printed label.
    """
    # Incremental mean without any parallelism.
    # perf_counter is monotonic and high-resolution, which suits interval timing.
    start_time = time.perf_counter()
    mean_result, _ = calculate_mean_incremental(data)
    non_parallel_time = time.perf_counter() - start_time

    # Incremental mean split across threads.
    start_time = time.perf_counter()
    parallel_mean_result = calculate_parallel_mean(data, num_threads=num_threads)
    parallel_time = time.perf_counter() - start_time

    # Guard the ratio so a zero-length measurement cannot raise.
    speedup = non_parallel_time / parallel_time if parallel_time > 0 else float("inf")

    # Report results and timings; the label uses the thread count actually run.
    print(f"Without parallelism: Mean = {mean_result}, Time = {non_parallel_time:.4f} seconds")
    print(f"With parallelism ({num_threads} threads): Mean = {parallel_mean_result}, Time = {parallel_time:.4f} seconds")
    print(f"Speedup: {speedup:.2f}x")

# Run the performance test
performance_test()


Without parallelism: Mean = 0.5001536871694922, Time = 4.7696 seconds
With parallelism (4 threads): Mean = 0.5001536871695091, Time = 3.4092 seconds
Speedup: 1.40x


MLP training

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from concurrent.futures import ThreadPoolExecutor

# MLP model definition
class SimpleMLP(nn.Module):
    """Fully connected classifier: 784 -> 256 -> 128 -> 10 with ReLU in between."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(28 * 28, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 10)

    def forward(self, x):
        """Flatten ``x`` to rows of 784 values and return the 10 output scores per row."""
        flat = x.view(-1, 28 * 28)
        hidden = torch.relu(self.fc1(flat))
        hidden = torch.relu(self.fc2(hidden))
        return self.fc3(hidden)

# Load the MNIST training data.
# Images are converted to tensors and normalised with mean 0.1307 / std 0.3081.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
])

# download=True fetches the dataset into ./data when it is not already there.
train_dataset = datasets.MNIST(root='./data', train=True, transform=transform, download=True)
# Shuffled mini-batches of 64 samples.
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)

# Plain (single-threaded) training loop
def train_simple(model, device, train_loader, optimizer, epoch):
    """Train ``model`` for one pass over ``train_loader`` with cross-entropy loss.

    Args:
        model: The network to optimise; it is switched to training mode.
        device: Device each batch is moved to before the forward pass.
        train_loader: Iterable of ``(data, target)`` batches exposing ``.dataset``.
        optimizer: Optimiser already bound to ``model``'s parameters.
        epoch: Epoch number, used only in the progress message.

    A progress line with the current loss is printed every 100 batches.
    """
    loss_fn = nn.CrossEntropyLoss()
    model.train()
    for batch_idx, batch in enumerate(train_loader):
        data, target = batch
        data, target = data.to(device), target.to(device)

        # Standard step: clear old gradients, forward, backward, update.
        optimizer.zero_grad()
        loss = loss_fn(model(data), target)
        loss.backward()
        optimizer.step()

        if batch_idx % 100 != 0:
            continue
        print(f"Train Epoch: {epoch} [{batch_idx * len(data)}/{len(train_loader.dataset)}]\tLoss: {loss.item():.6f}")

# Train on the CPU
device = torch.device("cpu")

# Measure the wall-clock time of the plain training loop.
# NOTE: `time` is not imported in this cell; it relies on the `import time`
# from the first cell having been executed.
model_simple = SimpleMLP().to(device)
optimizer_simple = optim.SGD(model_simple.parameters(), lr=0.01, momentum=0.9)

start_time_simple = time.time()
train_simple(model_simple, device, train_loader, optimizer_simple, epoch=1)
end_time_simple = time.time()

simple_training_time = end_time_simple - start_time_simple


print(f"Simple training time: {simple_training_time:.4f} seconds")

Train Epoch: 1 [0/60000]	Loss: 2.296316
Train Epoch: 1 [6400/60000]	Loss: 0.475918
Train Epoch: 1 [12800/60000]	Loss: 0.208591
Train Epoch: 1 [19200/60000]	Loss: 0.395367
Train Epoch: 1 [25600/60000]	Loss: 0.143054
Train Epoch: 1 [32000/60000]	Loss: 0.423407
Train Epoch: 1 [38400/60000]	Loss: 0.155348
Train Epoch: 1 [44800/60000]	Loss: 0.100124
Train Epoch: 1 [51200/60000]	Loss: 0.195753
Train Epoch: 1 [57600/60000]	Loss: 0.160608
Simple training time: 19.5761 seconds


In [None]:
import copy
import threading
import time
from queue import Queue

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# MLP model definition
class SimpleMLP(nn.Module):
    """Three linear layers (784 -> 256 -> 128 -> 10); ReLU follows the first two."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(28 * 28, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 10)

    def forward(self, x):
        """Return the 10 output scores for each flattened 784-value input row."""
        out = x.view(-1, 28 * 28)  # flatten everything but the batch dimension
        for hidden_layer in (self.fc1, self.fc2):
            out = torch.relu(hidden_layer(out))
        return self.fc3(out)

# A3C-style worker thread
class Worker(threading.Thread):
    """Thread that computes gradients on a private model copy and applies them
    to a shared global model.

    For every batch the worker:
      1. copies the current global weights into its local model,
      2. runs forward/backward on the local model,
      3. hands the local gradients to the global parameters and steps the
         shared optimizer.

    Args:
        global_model: The shared model that is updated in place.
        optimizer: Optimizer bound to ``global_model``'s parameters.
        train_loader: Iterable of ``(data, target)`` batches.
        device: Device the local model and each batch are moved to.
        queue: Stored on the instance; this class does not use it.
    """

    # Shared by all workers: guards reads of the global weights and the
    # "assign gradients + optimizer.step()" sequence, so one worker cannot
    # overwrite another's gradients between assignment and the step.
    _update_lock = threading.Lock()

    def __init__(self, global_model, optimizer, train_loader, device, queue):
        super().__init__()
        self.global_model = global_model
        self.optimizer = optimizer
        self.train_loader = train_loader
        self.device = device
        self.queue = queue
        # Deep-copy the global model so the worker works with any architecture
        # and starts from the global weights. Hold the lock because other
        # workers may already be updating the global model.
        with self._update_lock:
            self.local_model = copy.deepcopy(self.global_model).to(self.device)
        self.criterion = nn.CrossEntropyLoss()

    def run(self):
        """Process every batch of ``train_loader``, updating the global model."""
        for batch_idx, (data, target) in enumerate(self.train_loader):
            data, target = data.to(self.device), target.to(self.device)

            # Pull the latest global weights; otherwise every gradient would
            # be evaluated at the weights the worker was created with.
            with self._update_lock:
                self.local_model.load_state_dict(self.global_model.state_dict())

            # Reset the LOCAL gradients. The shared optimizer only owns the
            # global parameters, so its zero_grad() never clears these.
            for local_param in self.local_model.parameters():
                local_param.grad = None

            # Forward/backward on the local model.
            output = self.local_model(data)
            loss = self.criterion(output, target)
            loss.backward()

            # Push the local gradients to the global model and take one step.
            with self._update_lock, torch.no_grad():
                for global_param, local_param in zip(self.global_model.parameters(), self.local_model.parameters()):
                    global_param.grad = local_param.grad
                self.optimizer.step()

            # Periodic loss report.
            if batch_idx % 100 == 0:
                print(f"Worker {threading.current_thread().name} - Batch {batch_idx} Loss: {loss.item():.4f}")

# Multi-threaded training
def training(global_model, train_loader, num_workers=4):
    """Train ``global_model`` with ``num_workers`` ``Worker`` threads and wait for them.

    All workers share one SGD optimizer (lr=0.01, momentum=0.9) bound to
    ``global_model`` and every worker is given the same ``train_loader``.

    Args:
        global_model: The shared model the workers update.
        train_loader: Batch iterable passed unchanged to each worker.
        num_workers: Number of worker threads to launch.
    """
    shared_optimizer = optim.SGD(global_model.parameters(), lr=0.01, momentum=0.9)
    shared_queue = Queue()
    threads = []

    # Create and launch the workers one after another.
    for _ in range(num_workers):
        thread = Worker(global_model, shared_optimizer, train_loader, torch.device("cpu"), shared_queue)
        thread.start()
        threads.append(thread)

    # Block until every worker has finished.
    for thread in threads:
        thread.join()

# Train on the CPU
device = torch.device("cpu")

# Generate random data (used in place of MNIST).
# NOTE(review): the labels are drawn independently of the inputs.
num_samples = 60000  # number of data samples
num_classes = 10  # number of classes

X_data = torch.randn(num_samples, 1, 28, 28)  # random image data
y_data = torch.randint(0, num_classes, (num_samples,))  # random label data

# Wrap the tensors in a DataLoader
train_dataset = TensorDataset(X_data, y_data)
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)

# Create the global model and run the multi-threaded training
global_model = SimpleMLP().to(device)

start_time = time.time()
training(global_model, train_loader, num_workers=4)
end_time = time.time()

print(f"distributed training time: {end_time - start_time:.4f} seconds")


Worker Thread-23 - Batch 0 Loss: 2.2992
Worker Thread-24 - Batch 0 Loss: 2.3366
Worker Thread-26 - Batch 0 Loss: 2.3325
Worker Thread-25 - Batch 0 Loss: 2.3304
Worker Thread-24 - Batch 100 Loss: 2.3026
Worker Thread-23 - Batch 100 Loss: 2.2897
Worker Thread-25 - Batch 100 Loss: 2.2978
Worker Thread-26 - Batch 100 Loss: 2.3156
Worker Thread-24 - Batch 200 Loss: 2.3023
Worker Thread-25 - Batch 200 Loss: 2.3279
Worker Thread-23 - Batch 200 Loss: 2.2960
Worker Thread-26 - Batch 200 Loss: 2.3019
Worker Thread-24 - Batch 300 Loss: 2.2990
Worker Thread-25 - Batch 300 Loss: 2.3066
Worker Thread-26 - Batch 300 Loss: 2.3111
Worker Thread-23 - Batch 300 Loss: 2.3196
Worker Thread-24 - Batch 400 Loss: 2.3292
Worker Thread-25 - Batch 400 Loss: 2.3094
Worker Thread-23 - Batch 400 Loss: 2.3149
Worker Thread-26 - Batch 400 Loss: 2.3131
Worker Thread-24 - Batch 500 Loss: 2.3117
Worker Thread-25 - Batch 500 Loss: 2.3063
Worker Thread-23 - Batch 500 Loss: 2.3036
Worker Thread-26 - Batch 500 Loss: 2.3187
