In [2]:
import torch
import pandas as pd
from sklearn.preprocessing import StandardScaler   
from objective import objective_ml_model, objective_dl_model
from tester import test_ml_model, test_dl_model
from train_pipeline import cv_folds, run_model_pipeline
import os
import joblib

In [11]:
# ============= Option 1: load the saved artifacts and predict directly =============
# Locations of the fitted scaler and the trained logistic-regression model.
scaler_path = "./out/model/scaler/scaler.save"
model_path = "./out/model/logreg/best_model.pth"
# Replace this path with your own CSV to predict on other data.
data_path = "./data/mouse_glioma/data_split/test_with_labels.csv"

# Only the first five rows are used for this quick demonstration.
df_test = pd.read_csv(data_path).head(5)

# 'label' and 'ID' are optional columns; when absent the matching variable is None.
y_test = None
if 'label' in df_test.columns:
    y_test = df_test['label'].values

ids = None
if 'ID' in df_test.columns:
    ids = df_test['ID'].values

# Every remaining column is treated as a feature.
feature_frame = df_test.drop(columns=['label', 'ID'], errors='ignore')
X_test = feature_frame.values

# Apply the saved scaler before handing the features to the model.
scaler = joblib.load(scaler_path)
X_test_scaled = scaler.transform(X_test)

model = joblib.load(model_path)
preds = model.predict(X_test_scaled)
# Class encoding: 0 = negative, 1 = positive.
print("Prediction Result:", preds)

Prediction Result: [0 0 0 0 0]


In [3]:
#=============Or you can train your own model and make predictions
if __name__ == "__main__":
    # Directory holding the train/test CSVs; replace it with your own dataset.
    data_split_save = "./data/mouse_glioma"
    df_train = pd.read_csv(f"{data_split_save}/train_with_labels.csv")
    df_test = pd.read_csv(f"{data_split_save}/test_with_labels.csv")

    # Features are every column except 'label' and 'ID'; both columns must be present here.
    X_train = df_train.drop(columns=['label', 'ID']).values
    X_test = df_test.drop(columns=['label', 'ID']).values
    y_train = df_train['label'].values
    y_test = df_test['label'].values
    # The training 'ID' column is passed to cv_folds as the grouping key
    # (presumably so rows sharing an ID stay in the same fold -- verify in train_pipeline).
    groups = df_train['ID'].values

    print("Training set size:", len(X_train))
    print("Test set size:", len(X_test))

    # Root directory handed to the pipeline as save_path.
    save_path = "./out"

    folds = cv_folds(X_train, y_train, groups, n_splits=5, save_dir=f"{data_split_save}/cv_folds")
    # Taking ML logistic regression and DL CNN as examples respectively
    model_list = [
        {"model_type": "ml", "model_name": "logreg", "objective_fn": objective_ml_model, "test_fn": test_ml_model},
        #{"model_type": "dl", "model_name": "cnn", "objective_fn": objective_dl_model, "test_fn": test_dl_model},
    ]

    results = {}
    SEED = 1

    # Preferred GPU index. The original hard-coded "cuda:5", which is invalid on any
    # machine with fewer than six GPUs (or none at all). Keep the preference when that
    # device exists, otherwise fall back to the first GPU, and finally to the CPU.
    PREFERRED_GPU_INDEX = 5
    if torch.cuda.is_available():
        if torch.cuda.device_count() > PREFERRED_GPU_INDEX:
            device = torch.device(f"cuda:{PREFERRED_GPU_INDEX}")
        else:
            device = torch.device("cuda:0")
    else:
        device = torch.device("cpu")

    for m in model_list:
        # Emoji restored from mojibake (UTF-8 bytes that had been decoded as cp1252).
        print(f"\n🔧 Running model: {m['model_name']} ({m['model_type']})")

        result = run_model_pipeline(
            X_train,
            y_train,
            X_test,
            y_test,
            folds,
            model_type=m["model_type"],
            model_name=m["model_name"],
            objective_fn=m["objective_fn"],
            test_fn=m["test_fn"],
            search_space=None,  # You can also define your own search space for each model
            n_trials=50,
            score_metric="auroc",
            save_path=save_path,
            device=device,
            seed=SEED
        )
        print(f'{m["model_name"]}:{result}')
        # Keep each model's result keyed by name for the summary below.
        results[m["model_name"]] = result

    print("\n📊 The results of all models: ")
    for name, res in results.items():
        print(f"{name}: {res}")

Training set size: 44454
Test set size: 9297
[✓] Saved 5-fold data has been detected and will be loaded directly

🔧 Running model: logreg (ml)
A new study created in memory with name: search_for_logreg
Trial 0 finished with value: 0.9489835368437343 and parameters: {'C': 0.04656804637919568}. Best is trial 0 with value: 0.9489835368437343.
Trial 1 finished with value: 0.9493037518375795 and parameters: {'C': 0.7608481233714796}. Best is trial 1 with value: 0.9493037518375795.
Trial 2 finished with value: 0.9498888622951951 and parameters: {'C': 0.001001053986051049}. Best is trial 2 with value: 0.9498888622951951.
Trial 3 finished with value: 0.9493764012802588 and parameters: {'C': 0.016193110912440738}. Best is trial 2 with value: 0.9498888622951951.
Trial 4 finished with value: 0.9498457609866573 and parameters: {'C': 0.00386387940515873}. Best is trial 2 with value: 0.9498888622951951.
Trial 5 finished with value: 0.9499519091678039 and parameters: {'C': 0.0023407464805767528}

In [6]:
# Deep-learning training takes longer, so it is recommended to run it via the command line.
# Run on the terminal:
# NOTE: the line below is a plain string literal, not a shell invocation. Evaluating this
# cell only echoes the text; it does not start training.
"python train_pipeline.py"