In [1]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from ray import tune
from ray.tune.search.optuna import OptunaSearch
from ray.tune.schedulers import ASHAScheduler
from ray.tune import CheckpointConfig

import sys

# Put the project source directory on the import path so the notebook can
# import the project's local packages. Guarded so that re-running this cell
# does not keep appending duplicate entries to sys.path.
if '/home/mei/nas/docker/thesis/model_train' not in sys.path:
    sys.path.append('/home/mei/nas/docker/thesis/model_train')


In [3]:
import gc

import torch

# Run a garbage-collection pass first, then ask PyTorch to release the CUDA
# memory its caching allocator is holding but no longer using.
gc.collect()
torch.cuda.empty_cache()

In [4]:
import ray

# Start (or attach to) a local Ray instance and show what it can schedule on.
# ignore_reinit_error=True keeps this cell re-runnable: a bare ray.init()
# raises if Ray has already been initialised in this kernel.
ray.init(ignore_reinit_error=True)
print(ray.available_resources())

2025-03-05 23:02:41,706	INFO worker.py:1841 -- Started a local Ray instance.


{'node:__internal_head__': 1.0, 'CPU': 64.0, 'memory': 767823092736.0, 'accelerator_type:RTX': 1.0, 'node:192.168.60.144': 1.0, 'GPU': 8.0, 'object_store_memory': 200000000000.0}


In [5]:
import ray

# Tear down the instance started earlier so the runtime environment below is
# applied to a fresh local Ray instance.
ray.shutdown()

# Initialise exactly once. A second ray.init() on an already-running instance
# is a no-op (Ray only logs "Calling ray.init() again after it has already been
# called."), so the `_temp_dir=".../data/ray_results"` override that used to
# follow this call was never applied.
# NOTE(review): if Ray's session/temp files really should be relocated, pass
# `_temp_dir=...` to this call instead -- confirm the target path is suitable
# for Ray's session directory before enabling it.
ray.init(
    runtime_env={"working_dir": "/home/mei/nas/docker/thesis/model_train"},
    ignore_reinit_error=True,
)


2025-03-05 23:02:47,782	INFO worker.py:1841 -- Started a local Ray instance.
2025-03-05 23:02:47,849	INFO packaging.py:575 -- Creating a file package for local module '/home/mei/nas/docker/thesis/model_train'.
2025-03-05 23:02:47,910	INFO packaging.py:367 -- Pushing file package 'gcs://_ray_pkg_e6becee7b608e4df.zip' (1.73MiB) to Ray cluster...
2025-03-05 23:02:47,917	INFO packaging.py:380 -- Successfully pushed file package 'gcs://_ray_pkg_e6becee7b608e4df.zip'.
2025-03-05 23:02:49,002	INFO worker.py:1672 -- Calling ray.init() again after it has already been called.


0,1
Python version:,3.10.14
Ray version:,2.43.0


In [6]:
from dataloader.ts_reader import MultiModalDataset, collate_fn_pre_train
from model.autoencoder_ts import TimeSeriesAutoencoder

In [20]:
def train_autoencoder(config, train_loader, val_loader):
    """Train a TimeSeriesAutoencoder as a single Ray Tune trial.

    Every epoch runs one optimisation pass over ``train_loader`` and one
    evaluation pass over ``val_loader``, then reports the validation loss
    (L1 loss averaged over validation batches) to Ray Tune exactly once.

    Args:
        config: Trial hyperparameters. Reads ``"input_dim"``, ``"hidden_dim"``,
            ``"lr"`` and ``"epochs"``.
        train_loader: Iterable of ``(inputs, lengths)`` batches used for training.
        val_loader: Sized iterable of ``(inputs, lengths)`` batches used for
            validation.

    Side effects:
        * Overwrites ``best_model.pth`` in the hard-coded model directory each
          time this trial's validation loss improves.
        * Attaches a checkpoint (model/optimizer state, epoch, val_loss) to the
          report on epochs 0, 5, 10, ... and on the final epoch.
    """
    import tempfile  # local import: only needed to stage checkpoint files

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = TimeSeriesAutoencoder(
        input_dim=config["input_dim"],
        hidden_dim=config["hidden_dim"],
    ).to(device)

    criterion = nn.L1Loss()
    optimizer = optim.Adam(model.parameters(), lr=config["lr"])

    best_val_loss = float("inf")
    model_dir = "/home/mei/nas/docker/thesis/data/model/pre_train_autoencoder"
    os.makedirs(model_dir, exist_ok=True)
    # NOTE(review): every trial writes to this same file and `best_val_loss` is
    # tracked per trial, so the file holds whichever trial improved last, not
    # necessarily the best model across the whole search.
    best_model_path = os.path.join(model_dir, "best_model.pth")

    for epoch in range(config["epochs"]):
        # ---- training pass ----
        model.train()
        for batch in train_loader:
            inputs, lengths = batch
            inputs = inputs.to(device)
            lengths = lengths.to(device)
            optimizer.zero_grad()
            outputs, _ = model(inputs, lengths)
            loss = criterion(outputs, inputs)
            loss.backward()
            optimizer.step()

        # ---- validation pass ----
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for batch in val_loader:
                inputs, lengths = batch
                inputs = inputs.to(device)
                lengths = lengths.to(device)
                outputs, _ = model(inputs, lengths)
                loss = criterion(outputs, inputs)
                val_loss += loss.item()

        val_loss /= len(val_loader)
        print(f"Epoch {epoch+1}/{config['epochs']}, Validation Loss: {val_loss:.4f}")

        # Persist the best weights *before* reporting: the scheduler may stop
        # the trial in response to a report, which would skip any later code.
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), best_model_path)

        # Report exactly once per epoch. Each tune.report() call counts as one
        # training iteration, so reporting twice would make the scheduler and
        # the `training_iteration` stop criterion see two iterations per epoch.
        metrics = {"val_loss": val_loss}
        is_last_epoch = epoch == config["epochs"] - 1
        if epoch % 5 == 0 or is_last_epoch:
            # Stage the checkpoint in a temporary directory and hand it to Tune
            # as a Checkpoint object. The report happens inside the `with`
            # block so the files still exist while Tune persists them.
            with tempfile.TemporaryDirectory() as checkpoint_dir:
                torch.save(
                    {
                        "model_state_dict": model.state_dict(),
                        "optimizer_state_dict": optimizer.state_dict(),
                        "epoch": epoch,
                        "val_loss": val_loss,
                    },
                    os.path.join(checkpoint_dir, "checkpoint.pt"),
                )
                tune.report(
                    metrics,
                    checkpoint=tune.Checkpoint.from_directory(checkpoint_dir),
                )
        else:
            tune.report(metrics)

    print(f"Best validation loss: {best_val_loss}")
    
    

In [21]:
def tune_autoencoder(train_loader, val_loader, restore_path=None):
    """Run (or resume) the Ray Tune hyperparameter search for the autoencoder.

    Args:
        train_loader: Training DataLoader forwarded to ``train_autoencoder``.
        val_loader: Validation DataLoader forwarded to ``train_autoencoder``.
        restore_path: Optional path of a previous run to resume. May be either
            the experiment directory itself or the ``storage_path`` that
            contains it (``<storage_path>/<experiment name>`` is tried as well).
            When ``None`` a fresh search is started.

    Returns:
        The config dict of the trial with the lowest ``val_loss``.

    Raises:
        ValueError: If ``restore_path`` is given but no restorable experiment
            is found there.
    """
    # Single source of truth for the epoch budget: used for the trial's epoch
    # count, the ASHA horizon and the stop criterion.
    max_epochs = 10
    experiment_name = "pre_train_autoencoder"
    storage_path = "/home/mei/nas/docker/thesis/data/ray_results/pre_train_autoencoder"

    search_space = {
        "input_dim": 324,
        "hidden_dim": tune.choice([32, 64]),
        # loguniform takes (lower, upper, base). The former third positional
        # argument 1e-3 was being passed as `base`, not as a bound, so the
        # sampled range has always been [1e-5, 1e-4]; that range is kept here.
        # NOTE(review): widen the upper bound explicitly if 1e-3 was intended.
        "lr": tune.loguniform(1e-5, 1e-4),
        "epochs": max_epochs,
    }

    # NOTE(review): train_autoencoder places the model on a single
    # torch.device("cuda"), so the second GPU reserved per trial looks unused --
    # confirm whether {"gpu": 1} would suffice.
    trainable = tune.with_resources(
        tune.with_parameters(
            train_autoencoder, train_loader=train_loader, val_loader=val_loader
        ),
        resources={"cpu": 4, "gpu": 2},
    )

    if restore_path is not None:
        # Accept either the experiment directory or its parent storage path.
        tuner = None
        for candidate in (restore_path, os.path.join(restore_path, experiment_name)):
            if tune.Tuner.can_restore(candidate):
                # The trainable must be re-supplied because it was wrapped with
                # with_parameters (it carries references to the loaders).
                tuner = tune.Tuner.restore(candidate, trainable=trainable)
                break
        if tuner is None:
            raise ValueError(
                f"No restorable Ray Tune experiment found at {restore_path!r}"
            )
    else:
        algo = OptunaSearch()
        scheduler = ASHAScheduler(
            max_t=max_epochs,
            grace_period=5,
            reduction_factor=2,
        )
        tuner = tune.Tuner(
            trainable,
            tune_config=tune.TuneConfig(
                metric="val_loss",
                mode="min",
                search_alg=algo,
                num_samples=6,  # number of hyperparameter configurations to try
                scheduler=scheduler,
            ),
            run_config=tune.RunConfig(
                stop={"training_iteration": max_epochs},
                name=experiment_name,
                storage_path=storage_path,
            ),
            param_space=search_space,
        )

    results = tuner.fit()
    # Pass metric/mode explicitly so this also works for a restored tuner.
    best_config = results.get_best_result(metric="val_loss", mode="min").config

    print("Best Hyperparameters:", best_config)
    return best_config


In [18]:
# Directories of the training and validation splits.
train_data_dir, val_data_dir = (
    "/home/mei/nas/docker/thesis/data/hdf/train",
    "/home/mei/nas/docker/thesis/data/hdf/val",
)

# One dataset per split, built in train -> val order.
lstm_dataset_train, lstm_dataset_val = (
    MultiModalDataset(split_dir) for split_dir in (train_data_dir, val_data_dir)
)

# Only the training loader shuffles; both use the pre-training collate function.
lstm_loader_train = DataLoader(
    dataset=lstm_dataset_train,
    batch_size=32,
    shuffle=True,
    collate_fn=collate_fn_pre_train,
)
lstm_loader_val = DataLoader(
    dataset=lstm_dataset_val,
    batch_size=32,
    shuffle=False,
    collate_fn=collate_fn_pre_train,
)

In [None]:
# Launch the hyperparameter search and keep the winning configuration.
best_params = tune_autoencoder(
    train_loader=lstm_loader_train,
    val_loader=lstm_loader_val,
)

0,1
Current time:,2025-03-05 23:21:40
Running for:,00:00:10.21
Memory:,107.8/1007.7 GiB

Trial name,status,loc,hidden_dim,lr
train_autoencoder_3ae7b6a2,RUNNING,192.168.60.144:3832605,64,2.45833e-05
train_autoencoder_22356e2d,RUNNING,192.168.60.144:3832775,64,7.13113e-05
train_autoencoder_2c2df124,PENDING,,32,2.02861e-05


In [None]:
# Invoke the search again, passing the Ray results directory as restore_path.
best_params = tune_autoencoder(
    train_loader=lstm_loader_train,
    val_loader=lstm_loader_val,
    restore_path="/home/mei/nas/docker/thesis/data/ray_results/pre_train_autoencoder",
)

In [5]:
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Autoencoder with fixed hyperparameters, moved onto the chosen device.
model = TimeSeriesAutoencoder(
    input_dim=324,
    hidden_dim=32,
    lstm_layers=2,
    dropout=0,
)
model = model.to(device)

# L1 reconstruction loss, optimised with Adam at a learning rate of 1e-3.
criterion = nn.L1Loss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)