In [1]:
import pandas as pd

# Load the cleaned ADNI table from the local parquet file (fastparquet engine)
# so it can be inspected interactively.
df = pd.read_parquet('ADNI_cleaned.parquet', engine='fastparquet')
# Bare last expression: the notebook renders the frame as this cell's output.
df

Unnamed: 0,RID,GDTOTAL,MH14BALCH,MH16BSMOK,AGE,PTGENDER,PTEDUCAT,HMHYPERT,DXAD,CLINICAL_LDL_C,bmi,apoe_2/2,apoe_2/3,apoe_2/4,apoe_3/3,apoe_3/4,apoe_4/4
0,2,0.0,0.0,0.0,74.3,0.0,16.0,0.0,0.0,1.807508,27.305574,0,0,0,1,0,0
1,3,0.0,0.0,0.0,81.3,0.0,18.0,1.0,1.0,3.193485,24.038731,0,0,0,0,1,0
2,4,0.0,0.0,0.0,67.5,0.0,10.0,1.0,0.0,2.632129,27.169082,0,0,0,1,0,0
3,5,0.0,0.0,0.0,73.7,0.0,16.0,0.0,0.0,1.397560,26.998770,0,0,0,1,0,0
4,7,0.0,0.0,0.0,75.4,0.0,10.0,1.0,1.0,3.347438,30.721924,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1873,6992,0.0,0.0,0.0,52.7,1.0,18.0,1.0,0.0,2.804546,28.161028,0,0,0,1,0,0
1874,7008,0.0,0.0,0.0,73.5,1.0,18.0,1.0,0.0,2.804546,31.848550,0,0,0,1,0,0
1875,7012,1.0,0.0,0.0,57.0,1.0,16.0,1.0,0.0,2.804546,36.603924,0,0,1,0,0,0
1876,7029,0.0,0.0,0.0,59.8,1.0,14.0,1.0,0.0,2.804546,24.210783,0,0,0,1,0,0


In [None]:
import argparse

def parse_args(argv=None):
    """
    Parse the command-line options selecting the fold and the experiment.

    Parameters:
    argv (list[str] | None): Argument strings to parse. When None (the default)
        argparse reads them from sys.argv[1:], which is the previous behaviour.
        Passing an explicit list makes the function usable from a notebook,
        where sys.argv holds the kernel's own arguments.

    Returns:
    tuple: (fold_index, experiment). fold_index is an int; experiment is one of
        'age', 'apoe' or 'all' and defaults to 'all' when the flag is omitted.

    Raises:
    SystemExit: raised by argparse when --fold_index is missing or when
        --experiment is not one of the accepted names.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--fold_index", type=int, required=True)

    # Selects which feature subset is used (see subset_experiment_vars).
    # Restricting the choices and defaulting to 'all' prevents a typo or an
    # omitted flag from silently turning into a None dataframe downstream.
    parser.add_argument(
        "--experiment",
        type=str,
        choices=["age", "apoe", "all"],
        default="all",
        help="""Experiment name. Options:
        'age' - only age
        'apoe' - apoe only 
        'all' - everything
        """,
        required=False,
    )

    args = parser.parse_args(argv)

    return args.fold_index, args.experiment

In [5]:
def settings_automl(time_budget, metric):
    """
    Build the keyword arguments for an AutoML classification run.

    Only the time budget and the metric vary; every other option is fixed here:
    5-fold cross-validation, early stopping, all available cores (n_jobs=-1),
    a fixed seed, and 'lgbm' as the only candidate estimator.

    Parameters:
    time_budget (int): The time budget for the AutoML search in seconds.
    metric (str): The evaluation metric to optimise (e.g., 'log_loss', 'accuracy', 'f1').

    Returns:
    dict: Settings intended to be unpacked into AutoML.fit(**settings).
    """
    # Keyword order is kept stable because callers print this dictionary.
    return dict(
        task="classification",
        time_budget=time_budget,
        metric=metric,
        n_jobs=-1,
        eval_method="cv",
        n_splits=5,
        early_stop=True,
        log_training_metric=True,
        model_history=True,
        seed=1234321,
        estimator_list=['lgbm'],
    )

In [4]:
def subset_experiment_vars(df, experiment): 
    """
    Keep only the columns used by the requested experiment.

    Parameters:
    df (pd.DataFrame): Table containing the predictors and the 'DXAD' column.
    experiment (str): Which subset to keep:
        'age'  - 'AGE' and 'DXAD'
        'apoe' - the six 'apoe_*' indicator columns and 'DXAD'
        'all'  - every column (the frame is returned unchanged)

    Returns:
    pd.DataFrame: The selected columns of df (df itself for 'all').

    Raises:
    ValueError: If experiment is not one of the names above. Previously an
        unknown name fell through and returned None, which only failed later
        with an unrelated AttributeError.
    KeyError: If df lacks one of the requested columns (raised by pandas).
    """
    if experiment == 'age':
        return df[['AGE', 'DXAD']]
    elif experiment == 'apoe':
        return df[['apoe_2/2', 'apoe_2/3', 'apoe_2/4', 'apoe_3/3', 'apoe_3/4', 'apoe_4/4', 'DXAD']]
    elif experiment == 'all':
        return df
    else:
        raise ValueError(
            f"Unknown experiment {experiment!r}; expected one of 'age', 'apoe', 'all'"
        )

In [6]:
import os
import logging
from flaml import AutoML
import joblib
from datetime import datetime
import sys
sys.path.append('../ukb')
from doubleml_utils import subset_train_test

def main(fold_index=0, experiment='all', time_budget=300): 
    """
    Train an AutoML model on one fold and save the model and its predictions.

    Steps: load 'ADNI_cleaned.parquet', keep the columns for the experiment,
    split into train/test with subset_train_test, fit FLAML AutoML minimising
    log loss, then write the best estimator and the train/test predictions
    under './flaml_results/<fold_index>'.

    Parameters:
    fold_index (int): Fold to train on; also names the results directory.
    experiment (str): Feature subset passed to subset_experiment_vars
        ('age', 'apoe' or 'all').
    time_budget (int): AutoML search budget in seconds.

    Returns:
    None. Results are written to disk.
    """
    # When run as a script the arguments can come from the command line instead:
    # fold_index, experiment = parse_args()

    df = pd.read_parquet('ADNI_cleaned.parquet', engine='fastparquet')
    df = subset_experiment_vars(df, experiment)

    # 'DXAD' is the prediction target; every remaining column is a feature.
    X = df.drop(columns=['DXAD'])
    y = df['DXAD']
    
    print('splitting')
    
    # NOTE(review): the directory depends on the fold only, so runs with
    # different experiments overwrite each other's outputs - confirm intended.
    results_dir = f'./flaml_results/{fold_index}'
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(results_dir, exist_ok=True)
    
    # subset_train_test comes from doubleml_utils; presumably it selects the
    # rows of the given fold and may persist the split in results_dir - TODO confirm.
    X_train, y_train, X_test, y_test = subset_train_test(X, y, results_dir, fold_index)

    print(f'training fold {fold_index} with experiment {experiment}')
    automl = AutoML()
    automl_settings = settings_automl(time_budget, metric="log_loss")
    print(automl_settings)

    automl.fit(X_train, y_train, **automl_settings)

    # Logged after fitting (it used to fire before training had started).
    logging.info(f"Saving the model: {datetime.now().time()}")

    # Save just the best model's underlying estimator, not the AutoML wrapper.
    best_model = automl.model.estimator
    joblib.dump(best_model, f"{results_dir}/flaml_best_model.joblib")

    logging.info(f"Saving the predictions: {datetime.now().time()}")
    # Same export for both splits (test first, as before): the true labels
    # next to the predicted probability taken from column 1 of predict_proba.
    for split_name, X_split, y_split in (
        ("test", X_test, y_test),
        ("train", X_train, y_train),
    ):
        y_pred = automl.predict_proba(X_split)
        results = pd.DataFrame({f"y_{split_name}": y_split, "y_pred": y_pred[:,1]})
        results.to_parquet(
            f"{results_dir}/{split_name}_labels_predictions.parquet", index=False
        )

In [7]:
# Run the training pipeline defined in main() directly from the notebook.
main()

splitting
training fold 0 with experiment all
{'task': 'classification', 'time_budget': 300, 'metric': 'log_loss', 'n_jobs': -1, 'eval_method': 'cv', 'n_splits': 5, 'early_stop': True, 'log_training_metric': True, 'model_history': True, 'seed': 1234321, 'estimator_list': ['lgbm']}
[flaml.automl.logger: 09-04 11:48:13] {1752} INFO - task = classification
[flaml.automl.logger: 09-04 11:48:13] {1763} INFO - Evaluation method: cv
[flaml.automl.logger: 09-04 11:48:13] {1862} INFO - Minimizing error metric: log_loss
[flaml.automl.logger: 09-04 11:48:13] {1979} INFO - List of ML learners in AutoML Run: ['lgbm']
[flaml.automl.logger: 09-04 11:48:13] {2282} INFO - iteration 0, current learner lgbm


[flaml.automl.logger: 09-04 11:48:13] {2417} INFO - Estimated sufficient time budget=1220s. Estimated necessary time budget=1s.
[flaml.automl.logger: 09-04 11:48:13] {2466} INFO -  at 0.2s,	estimator lgbm's best error=0.3147,	best estimator lgbm's best error=0.3147
[flaml.automl.logger: 09-04 11:48:13] {2282} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 09-04 11:48:13] {2466} INFO -  at 0.2s,	estimator lgbm's best error=0.2955,	best estimator lgbm's best error=0.2955
[flaml.automl.logger: 09-04 11:48:13] {2282} INFO - iteration 2, current learner lgbm
[flaml.automl.logger: 09-04 11:48:13] {2466} INFO -  at 0.3s,	estimator lgbm's best error=0.2955,	best estimator lgbm's best error=0.2955
[flaml.automl.logger: 09-04 11:48:13] {2282} INFO - iteration 3, current learner lgbm
[flaml.automl.logger: 09-04 11:48:13] {2466} INFO -  at 0.3s,	estimator lgbm's best error=0.2955,	best estimator lgbm's best error=0.2955
[flaml.automl.logger: 09-04 11:48:13] {2282} INFO - iteration 

KeyboardInterrupt: 

In [11]:
# Script-style entry point: main() runs only when this code is executed as the
# top-level module (which is also the case inside a notebook kernel).
if __name__ == "__main__":
    main()

splitting
{'task': 'classification', 'time_budget': 3600, 'metric': 'log_loss', 'n_jobs': -1, 'eval_method': 'cv', 'n_splits': 5, 'early_stop': True, 'log_training_metric': True, 'model_history': True, 'seed': 1234321, 'estimator_list': ['lgbm']}
[flaml.automl.logger: 08-30 19:03:32] {1752} INFO - task = classification
[flaml.automl.logger: 08-30 19:03:32] {1763} INFO - Evaluation method: cv
[flaml.automl.logger: 08-30 19:03:32] {1862} INFO - Minimizing error metric: log_loss
[flaml.automl.logger: 08-30 19:03:32] {1979} INFO - List of ML learners in AutoML Run: ['lgbm']
[flaml.automl.logger: 08-30 19:03:32] {2282} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 08-30 19:03:33] {2417} INFO - Estimated sufficient time budget=770s. Estimated necessary time budget=1s.


[flaml.automl.logger: 08-30 19:03:33] {2466} INFO -  at 0.1s,	estimator lgbm's best error=0.3147,	best estimator lgbm's best error=0.3147
[flaml.automl.logger: 08-30 19:03:33] {2282} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 08-30 19:03:33] {2466} INFO -  at 0.2s,	estimator lgbm's best error=0.2955,	best estimator lgbm's best error=0.2955
[flaml.automl.logger: 08-30 19:03:33] {2282} INFO - iteration 2, current learner lgbm
[flaml.automl.logger: 08-30 19:03:33] {2466} INFO -  at 0.2s,	estimator lgbm's best error=0.2955,	best estimator lgbm's best error=0.2955
[flaml.automl.logger: 08-30 19:03:33] {2282} INFO - iteration 3, current learner lgbm
[flaml.automl.logger: 08-30 19:03:33] {2466} INFO -  at 0.3s,	estimator lgbm's best error=0.2955,	best estimator lgbm's best error=0.2955
[flaml.automl.logger: 08-30 19:03:33] {2282} INFO - iteration 4, current learner lgbm
[flaml.automl.logger: 08-30 19:03:33] {2466} INFO -  at 0.4s,	estimator lgbm's best error=0.2955,	best e

KeyboardInterrupt: 

In [None]:
# Mapping from the ADNI column names to alternative column names
# (presumably to align with another dataset's naming - TODO confirm).
# Each source column must map to a distinct target, otherwise renaming
# would produce duplicate column names.
rename = {
    'AGE': 'curr_age',
    'GDTOTAL': 'depression',
    'MH14BALCH': 'alcohol_consumption', # not quite right - years of alcohol consumption
    'apoe_3/3' : 'e3/e3',  # was 'e3/e4', which collided with the 'apoe_3/4' target
    'apoe_3/4' : 'e3/e4',
    'apoe_4/4' : 'e4/e4',
    'apoe_2/4' : 'e2/e4',
    'apoe_2/3' : 'e2/e3',
    'apoe_2/2' : 'e2/e2',
}