In [9]:
from src.aasp.data_handler.data_handler import AASPConfig, AASPDataHandler
from src.aasp.data_handler.aasp_dataset import AASPDataset

config_path = "config.yaml" # Change this to config file path as needed and revert pathing in AASPConfig in data_handler.py
cfg = AASPConfig(config_path)
handler = AASPDataHandler(cfg)

In [10]:
selected_features = [
    k for k, v in getattr(cfg, "features", {}).items()
    if v and k in {"ref_embedding", "alt_embedding", "biotype", "consequence", "ref_long", "alt_long", "scoreset"}
]
cat_config = getattr(cfg, "categorical_config", {})


In [11]:
records = handler.load_pickle(cfg.file_path)
val_frac = cfg.parameters.get("val_frac", 0.15)
val_size = int(len(records) * val_frac)
train_records = records[:len(records) - val_size]
val_records = records[len(records) - val_size:]


Loading data from: c:\Users\ryanp\Personal Projects\mlm25_mavedb\data\train\combined_train_data.pkl


AttributeError: 'AASPConfig' object has no attribute 'parameters'

In [None]:
vocabs = {k: handler.fit_vocab(train_records, k) for k in cat_config}
for k, vocab in vocabs.items():
    print(f"Vocab for {k}: {len(vocab)} classes")


In [None]:
train_dataset = AASPDataset(
    config_path=config_path,
    fields=selected_features,
    fuse_mode=cfg.fuse_mode,
    embed_metric=cfg.embed_metric,
    categorical_config=cat_config
)
val_dataset = AASPDataset(
    config_path=config_path,
    fields=selected_features,
    fuse_mode=cfg.fuse_mode,
    embed_metric=cfg.embed_metric,
    categorical_config=cat_config
)
train_dataset.records = train_records
val_dataset.records = val_records

from torch.utils.data import DataLoader
train_loader = DataLoader(train_dataset, batch_size=cfg.hyperparameters["train_batch_size"], shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=cfg.hyperparameters["val_batch_size"], shuffle=False)

X, y = train_dataset[0]
print(f"Sample feature vector shape: {X.shape}")
print(f"Sample label: {y}")
print(f"Batch feature shape: {next(iter(train_loader))[0].shape}")



Actual Model Training Example
================================

In [None]:
from training.src.aasp.model.models import BaselineModel
from training.src.aasp.model.trainer import Trainer

cat_dims = {k: (len(vocabs[k]), 4) for k, typ in cat_config.items() if typ == "embedding"}
multi_hot_dims = {k: len(vocabs[k]) for k, typ in cat_config.items() if typ == "multi_hot"}

model = BaselineModel(
    input_dim=1,  # If "distance"
    cat_dims=cat_dims,
    multi_hot_dims=multi_hot_dims,
    hidden_dims=tuple(cfg.hyperparameters["hidden_dims"]),
    dropout_rates=tuple(cfg.hyperparameters["dropout_rates"])
)
print(model)
total_params = sum(p.numel() for p in model.parameters())
print(f"Model parameters: {total_params}")


Dumb Model Training Example
================================

In [None]:
from training.src.aasp.model.models import DumbModel
from training.src.aasp.model.trainer import Trainer

cat_dims = {k: (len(vocabs[k]), 4) for k, typ in cat_config.items() if typ == "embedding"}
multi_hot_dims = {k: len(vocabs[k]) for k, typ in cat_config.items() if typ == "multi_hot"}

model = DumbModel(
    input_dim=1,  # If "distance"
    cat_dims=cat_dims,
    multi_hot_dims=multi_hot_dims,
    hidden_dims=tuple(cfg.hyperparameters["hidden_dims"]),
    dropout_rates=tuple(cfg.hyperparameters["dropout_rates"])
)
print(model)
total_params = sum(p.numel() for p in model.parameters())
print(f"Model parameters: {total_params}")


In [None]:
import torch
optimizer = torch.optim.Adam(model.parameters(), lr=cfg.hyperparameters["learning_rate"])
loss_fn = torch.nn.MSELoss()

trainer = Trainer(
    model=model,
    optimizer=optimizer,
    loss_fn=loss_fn,
    train_loader=train_loader,
    val_loader=val_loader,
    num_epochs=cfg.hyperparameters["num_epochs"],
    save_path="output/baseline_model_best.pth",
    device= "cuda" if torch.cuda.is_available() else "cpu" # Change to "cuda" if GPU is available
)

In [None]:
trainer.run()