In [None]:
import sys
import torch
import pytorch_lightning as pl

from torch.utils.data import DataLoader
from pytorch_lightning.callbacks import ModelCheckpoint
from dotenv import load_dotenv, dotenv_values

# Hydra and Jupyter notebooks conflict over command-line arguments, so reset argv.
sys.argv = ['']

# Read environment variables from the .env file.
load_dotenv()

# If .env declares a PYTHONPATH that is not on sys.path yet, append it
# (presumably so the project-local `src.*` imports can resolve — confirm).
python_path = dotenv_values().get('PYTHONPATH')
if python_path and python_path not in sys.path:
    sys.path.append(python_path)

from src.dataset.CvImageDatasetFastEx import get_datasets
from src.models.tune_model import TuneModel



# 데이터 준비 함수
def prepare_data(model, batch_size=32, num_workers=4):
    """Build the train, validation and test DataLoaders.

    Args:
        model: Forwarded unchanged to ``get_datasets`` to obtain the three
            dataset splits.
        batch_size: Batch size used by all three loaders.
        num_workers: ``num_workers`` for the train and validation loaders.
            The test loader always uses ``num_workers=0``.

    Returns:
        Tuple ``(train_loader, val_loader, test_loader)``.
    """
    train_dataset, val_dataset, test_dataset = get_datasets(model)

    # Options that are identical for every loader.
    shared_options = {
        "batch_size": batch_size,
        "pin_memory": True,
        "drop_last": False,
    }

    # Only the training split is shuffled.
    train_loader = DataLoader(
        train_dataset, shuffle=True, num_workers=num_workers, **shared_options
    )
    # Validation uses its own dataset and is never shuffled.
    val_loader = DataLoader(
        val_dataset, shuffle=False, num_workers=num_workers, **shared_options
    )
    # The test loader ignores the num_workers argument and is fixed at 0.
    test_loader = DataLoader(
        test_dataset, shuffle=False, num_workers=0, **shared_options
    )

    return train_loader, val_loader, test_loader




from ray import tune
from ray.tune.schedulers import ASHAScheduler
from ray.tune.integration.pytorch_lightning import TuneReportCallback
from ray.tune import CLIReporter


def train_efficientnet(config):
    """Train one trial and manually report its final validation metrics to Ray Tune.

    No Lightning callbacks are used: the model is fitted for
    ``config["max_epochs"]`` epochs, validated once, and a single
    ``tune.report`` call publishes ``val_loss`` and ``val_acc``.

    Args:
        config: Hyperparameter dict for this trial. This function reads
            ``config["batch_size"]`` and ``config["max_epochs"]``; the whole
            dict is also handed to ``TuneModel``.

    Returns:
        None. Results are delivered through ``tune.report``.

    Notes:
        Any ``Exception`` raised during the trial is caught (best-effort):
        the message and full traceback are printed and the worst-case
        metrics (``val_loss=inf``, ``val_acc=0.0``) are reported instead.
    """
    # Local import keeps this block self-contained; used only on the failure path.
    import traceback

    try:
        # Fall back to CPU / full precision when CUDA is unavailable.
        use_gpu = torch.cuda.is_available()

        model = TuneModel(
            model_name="tf_efficientnet_b4",
            config=config,
        )

        # The test loader is not needed during tuning.
        train_loader, val_loader, _ = prepare_data(
            model=model,
            batch_size=config["batch_size"],
            num_workers=2,
        )

        # Minimal Trainer: no progress bar, no logger, no explicit callbacks.
        trainer = pl.Trainer(
            max_epochs=config["max_epochs"],
            accelerator="gpu" if use_gpu else "cpu",
            devices=1,
            enable_progress_bar=False,
            logger=False,
            precision="16-mixed" if use_gpu else 32,
        )

        trainer.fit(model, train_loader, val_loader)

        # Run a final validation pass and collect its metrics.
        val_results = trainer.validate(model, val_loader, verbose=False)

        # Use the first result dict; fall back to worst-case values when the
        # result list is empty or a metric key is missing.
        metrics = val_results[0] if val_results else {}
        val_loss = float(metrics.get('val_loss', float('inf')))
        val_acc = float(metrics.get('val_acc', 0.0))

        # Single manual report for the whole trial.
        tune.report(
            val_loss=val_loss,
            val_acc=val_acc,
        )

        print(f"✅ Trial 완료 - Loss: {val_loss:.4f}, Acc: {val_acc:.4f}")

    except Exception as e:
        print(f"❌ Trial 실패: {e}")
        # The message alone drops the exception type and call stack; print the
        # full traceback so a failed trial can actually be diagnosed.
        traceback.print_exc()
        # Report worst-case metrics so the failed trial still has a result.
        tune.report(val_loss=float('inf'), val_acc=0.0)

def main():
    """Run the Ray Tune hyperparameter search for EfficientNet-B4.

    Builds the search space, an ASHA scheduler and a CLI reporter, launches
    ``tune.run`` with ``train_efficientnet`` as the trainable, then prints
    the trial with the highest last-reported ``val_acc``.

    Returns:
        The analysis object returned by ``tune.run``.
    """
    # Must be configured before any model is created.
    torch.set_float32_matmul_precision('medium')

    # Single source for the epoch budget so the per-trial training length and
    # the scheduler's max_t cannot drift apart.
    max_epochs = 50

    # Hyperparameter search space.
    search_space = {
        "learning_rate": tune.loguniform(1e-5, 1e-2),
        "batch_size": tune.choice([16, 32, 64]),
        "dropout_rate": tune.uniform(0.2, 0.6),
        "weight_decay": tune.loguniform(1e-6, 1e-3),
        "num_classes": 17,          # fixed value
        "max_epochs": max_epochs,   # fixed value

        "scheduler_type": tune.choice(["plateau", "cosine"]),

        # ReduceLROnPlateau parameters
        "patience": tune.choice([3, 5, 7, 10]),
        "factor": tune.uniform(0.1, 0.5),

        # CosineAnnealingLR parameters
        "T_max": tune.choice([20, 30, 50]),
        "eta_min": tune.loguniform(1e-7, 1e-5),
    }

    # NOTE(review): this seeds the process that calls main(); confirm whether
    # the individual trials also need to be seeded.
    pl.seed_everything(42)

    # ASHA scheduler, intended to stop weak trials early.
    # NOTE(review): train_efficientnet reports only once, after training has
    # finished, so the scheduler may never get a chance to stop a trial early
    # — confirm against the ASHAScheduler documentation.
    scheduler = ASHAScheduler(
        max_t=max_epochs,    # upper bound, kept equal to the epoch budget
        grace_period=5,      # minimum budget before a trial may be stopped
        reduction_factor=2,  # halving rate
    )

    # Progress reporter for monitoring the search from the console.
    reporter = CLIReporter(
        parameter_columns=["learning_rate", "batch_size", "dropout_rate"],
        metric_columns=["val_loss", "val_acc", "training_iteration"],
    )

    # Launch the search.
    analysis = tune.run(
        train_efficientnet,
        config=search_space,
        metric="val_loss",          # metric driving the search
        mode="min",                 # minimize val_loss
        scheduler=scheduler,
        progress_reporter=reporter,
        num_samples=20,             # number of configurations to try
        name="efficientnet_b4_tune",
        storage_path="/data/ephemeral/home/python_work/git/gx-train/outputs/ray_results",  # where results are stored
    )

    # The search minimizes val_loss, but the trial shown here is deliberately
    # the one with the highest last-reported val_acc.
    best_trial = analysis.get_best_trial("val_acc", "max", "last")
    if best_trial is None:
        # No trial produced a usable 'val_acc'; avoid dereferencing None.
        print("No best trial found: no trial reported 'val_acc'.")
    else:
        print(f"Best trial config: {best_trial.config}")
        print(f"Best trial final validation accuracy: {best_trial.last_result['val_acc']}")

    return analysis

# Entry point: run the search and bind the returned analysis to a module-level name.
if __name__ == "__main__":
    analysis = main()

In [13]:
import psutil

def check_memory():
    """Print total, available and used system memory in GB, plus the usage percentage."""
    bytes_per_gb = 1024 ** 3  # divisor that converts a byte count to GB
    stats = psutil.virtual_memory()

    total_gb = stats.total / bytes_per_gb
    available_gb = stats.available / bytes_per_gb
    used_gb = stats.used / bytes_per_gb

    print(f"Total memory: {total_gb:.2f} GB")
    print(f"Available memory: {available_gb:.2f} GB")
    print(f"Used memory: {used_gb:.2f} GB")
    print(f"Memory percentage: {stats.percent:.1f}%")

# Print the host's current memory usage.
check_memory()

Total memory: 251.62 GB
Available memory: 230.95 GB
Used memory: 18.17 GB
Memory percentage: 8.2%
