In [1]:
import os 
import pandas as pd
import joblib
import numpy as np
from mlutils import calc_results
from mlutils import pick_threshold

from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier

experiment_paths = ["./flaml_results"]


def normalize_importances(importance_array):
    """
    Normalize feature importances to sum to 1 (convert to percentages)
    """
    # Ensure we don't divide by zero
    total = np.sum(importance_array)
    if total == 0:
        return np.zeros_like(importance_array)
    return importance_array / total


def get_top_features(automl):
    """
    Extracts the top features from an AutoML model.
    Parameters:
    automl (object): The AutoML model object.
    Returns:
    list: A list of the top features from the AutoML model.
    """
    if len(automl.feature_importances_) == 1:
        feature_names = np.array(automl.feature_names_in_)[
            np.argsort(abs(automl.feature_importances_[0]))[::-1]
        ]
        fi = automl.feature_importances_[0][
            np.argsort(abs(automl.feature_importances_[0]))[::-1]
        ]
    else:
        feature_names = np.array(automl.feature_names_in_)[
            np.argsort(abs(automl.feature_importances_))[::-1]
        ]
        fi = automl.feature_importances_[
            np.argsort(abs(automl.feature_importances_))[::-1]
        ]

    return feature_names, fi

def evaluate_results(root):
    summary = {} 
    features = {}
    for num in os.listdir(root): 
        if num in ["0", "1", "2", "3", "4"]:
            full = os.path.join(root, num)
            if os.path.isdir(full): 
                train = pd.read_parquet(os.path.join(full, "train_labels_predictions.parquet"), engine = 'fastparquet')
                test = pd.read_parquet(os.path.join(full, "test_labels_predictions.parquet"), engine = 'fastparquet')

                threshold = pick_threshold(train['y_train'], train['y_pred'])
            
                test_results = calc_results(test['y_test'], test['y_pred'], threshold = threshold)
            
                summary[num] = test_results

                model = joblib.load(os.path.join(full, "flaml_best_model.joblib"))

                if isinstance(model, LogisticRegression):
                    # For Logistic Regression, we need to handle coefficients
                    fi = model.coef_[0]
                    fnames = model.feature_names_in_
                elif isinstance(model, LGBMClassifier):
                    # For LGBMClassifier, we can use feature importances directly
                    fi = model.feature_importances_
                    fnames = model.feature_name_

                fi = normalize_importances(fi)
                # Create DataFrame
                df = pd.DataFrame({
                    'fi': fi,
                    'fnames': fnames
                })

                features[num] = df
    
    return summary, features

In [8]:
evaluate_results(experiment_paths)

({'0': auroc          0.702427
  avg_prec       0.183266
  threshold      0.140298
  TP            16.000000
  TN           258.000000
  FP            79.000000
  FN            23.000000
  accuracy       0.728723
  bal_acc        0.587918
  prec_n         0.918149
  prec_p         0.168421
  recall_n       0.765579
  recall_p       0.410256
  f1_n           0.834951
  f1_p           0.238806
  mcc            0.123378
  dtype: float64,
  '1': auroc          0.793883
  avg_prec       0.260849
  threshold      0.132745
  TP            31.000000
  TN           239.000000
  FP            98.000000
  FN             8.000000
  accuracy       0.718085
  bal_acc        0.752035
  prec_n         0.967611
  prec_p         0.240310
  recall_n       0.709199
  recall_p       0.794872
  f1_n           0.818493
  f1_p           0.369048
  mcc            0.323739
  dtype: float64,
  '2': auroc          0.742677
  avg_prec       0.282454
  threshold      0.132425
  TP            20.000000
  TN         

In [2]:
def calculate_fi_avg(merged_df):
    """
    Calculate the average feature importance across all folds and drop individual fold columns.
    """
    # Identify columns that start with 'fi_' and calculate their mean
    fi_cols = [col for col in merged_df.columns if col.startswith('fi_')]
    merged_df['avg_fi'] = merged_df[fi_cols].mean(axis=1, skipna=True)
    merged_df = merged_df.drop(columns=fi_cols)
    
    # some cleaning
    sorted_features = merged_df.sort_values(by='avg_fi', ascending=False)
    # only split at "_" if the name starts with "rs"
    sorted_features['fnames'] = sorted_features['fnames'].apply(
        lambda x: x.split("_")[0] if isinstance(x, str) and x.startswith("rs") else x
    )
    sorted_features['fnames'] = sorted_features['fnames'].astype('string')

    return sorted_features

In [3]:
import pandas as pd
def summarize_stats(metrics1, metrics2): 
    path1 = os.path.join(metrics1, "summary_stats/metrics.txt")
    path2 = os.path.join(metrics2, "summary_stats/metrics.txt")

    df_metrics1 = pd.read_csv(path1)
    df_metrics2 = pd.read_csv(path2)

    merged_df = df_metrics1.merge(df_metrics2, on='Unnamed: 0').round(3)

    return merged_df

def summarize_features(features1, features2):
    path1 = os.path.join(features1, "summary_stats/features.txt")
    path2 = os.path.join(features2, "summary_stats/features.txt")

    df_features1 = pd.read_csv(path1)
    df_features2 = pd.read_csv(path2)

    df_features1 = df_features1.drop_duplicates(subset=['fnames'])
    df_features2 = df_features2.drop_duplicates(subset=['fnames'])

    merged_features = pd.concat([df_features1, df_features2], axis = 1)

    return merged_features

In [4]:
def generate_feature_summary(sorted_features, path): 
    # grab snp names and add
    ad_genes = pd.read_csv('../../raw_data/MONDO_0004975_associations_export (1).tsv', sep = "\t")

    snps = ad_genes[['riskAllele', 'mappedGenes', 'pValue', 'beta', 'locations']]
    snps['snp'] = snps['riskAllele'].str.split("-").str[0]
    snps = snps.drop(columns='riskAllele')

    # separate non_snp features
    
    non_snp_feats = sorted_features[~sorted_features['fnames'].str.startswith("rs")]
    # filter by genes only present in fnames, then add to fnames
    snps = snps[snps['snp'].isin(sorted_features['fnames'])]
    sorted_features = sorted_features.merge(snps, right_on = 'snp', left_on = 'fnames')
    sorted_features = sorted_features.drop_duplicates(subset=['fnames'])

    # add non-snp features back to sorted_features
    sorted_features = pd.concat([sorted_features, non_snp_feats])
    sorted_features = sorted_features.sort_values(by='avg_fi', ascending=False)

    # Create the summary_stats directory if it doesn't exist
    summary_stats_dir = os.path.join(path, "summary_stats")
    os.makedirs(summary_stats_dir, exist_ok=True)

    #print(sorted_features.head(10))
    sorted_features.to_csv(f'{summary_stats_dir}/features.txt', index=False)


In [5]:
def generate_metrics_summary(summary, path):
    # Convert summary dictionary to DataFrame
    df_summary = pd.DataFrame(summary)
    
    # Save the summary DataFrame to a CSV file
    summary_stats_dir = os.path.join(path, "summary_stats")
    os.makedirs(summary_stats_dir, exist_ok=True)

    #print(df_summary.head())
    df_summary.to_csv(f'{summary_stats_dir}/metrics.txt', index=True)


In [6]:
def main():
    summary = None
    features = None
    
    '''
    path = './results_all_exp/age_sex_lancet2024/all_snps/65up/AD/lgbm'
    summary, features = evaluate_results(path)

    # merge data from 5 folds together
    merged_df = None
      
    for fold_name, df in features.items():
        # rename
        df_renamed = df.rename(columns={'fi': f'fi_{fold_name}'})

        if merged_df is None:
            merged_df = df_renamed
        else:
            merged_df = pd.merge(merged_df, df_renamed, on='fnames', how='outer')

    sorted_features = calculate_fi_avg(merged_df)

    generate_feature_summary(sorted_features, path)
    generate_metrics_summary(summary, path)

    '''
    for path in experiment_paths:
        print(f"Evaluating {path}")
        summary, features = evaluate_results(path)

        # merge data from 5 folds together
        merged_df = None
        for fold_name, df in features.items():
            df_renamed = df.rename(columns={'fi': f'fi_{fold_name}'})
            if merged_df is None:
                merged_df = df_renamed
            else:
                merged_df = pd.merge(merged_df, df_renamed, on='fnames', how='outer')

        sorted_features = calculate_fi_avg(merged_df)

        generate_feature_summary(sorted_features, path)
        generate_metrics_summary(summary, path)

if __name__ == "__main__":    
    main()

Evaluating ./flaml_results


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  snps['snp'] = snps['riskAllele'].str.split("-").str[0]


In [7]:
path = './flaml_results'
summary, features = evaluate_results(path)

# merge data from 5 folds together
merged_df = None
for fold_name, df in features.items():
    df_renamed = df.rename(columns={'fi': f'fi_{fold_name}'})
    if merged_df is None:
        merged_df = df_renamed
    else:
        merged_df = pd.merge(merged_df, df_renamed, on='fnames', how='outer')

sorted_features = calculate_fi_avg(merged_df)

generate_feature_summary(sorted_features, path)
generate_metrics_summary(summary, path)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  snps['snp'] = snps['riskAllele'].str.split("-").str[0]


In [1]:
import os 
import pandas as pd
def summarize_stats(metrics1, metrics2): 
    path1 = os.path.join(metrics1, "summary_stats/metrics.txt")
    path2 = os.path.join(metrics2, "summary_stats/metrics.txt")

    df_metrics1 = pd.read_csv(path1)
    df_metrics2 = pd.read_csv(path2)

    merged_df = df_metrics1.merge(df_metrics2, on='Unnamed: 0').round(3)

    return merged_df

def summarize_features(features1, features2):
    path1 = os.path.join(features1, "summary_stats/features.txt")
    path2 = os.path.join(features2, "summary_stats/features.txt")

    df_features1 = pd.read_csv(path1)
    df_features2 = pd.read_csv(path2)

    df_features1 = df_features1.drop_duplicates(subset=['fnames'])
    df_features2 = df_features2.drop_duplicates(subset=['fnames'])

    merged_features = pd.concat([df_features1, df_features2], axis = 1)

    return merged_features

path0 = 'flaml_results'
path1 = 'flaml_results'

stats  = summarize_stats(path0, path1).T
features = summarize_features(path0, path1)

avg_auc1 = stats.iloc[1:6, 0].astype(float).mean()
sd_auc1 = stats.iloc[1:6, 0].astype(float).std()
avg_auc2 = stats.iloc[6:11, 0].astype(float).mean()
sd_auc2 = stats.iloc[6:11, 0].astype(float).std()

print(f"Average AUROC for {path0}: {avg_auc1}, sd {sd_auc1}")
print(f"Average AUROC for {path1}: {avg_auc2}, sd {sd_auc2}")

features


Average AUROC for flaml_results: 0.7455999999999999, sd 0.045313353440238825
Average AUROC for flaml_results: 0.7455999999999999, sd 0.045313353440238825


Unnamed: 0,fnames,avg_fi,mappedGenes,pValue,beta,locations,snp,fnames.1,avg_fi.1,mappedGenes.1,pValue.1,beta.1,locations.1,snp.1
0,MH16BSMOK,0.17062,,,,,,MH16BSMOK,0.17062,,,,,
1,CLINICAL_LDL_C,0.167976,,,,,,CLINICAL_LDL_C,0.167976,,,,,
2,bmi,0.162582,,,,,,bmi,0.162582,,,,,
3,AGE,0.144017,,,,,,AGE,0.144017,,,,,
4,PTEDUCAT,0.142195,,,,,,PTEDUCAT,0.142195,,,,,
5,apoe_3/3,0.057643,,,,,,apoe_3/3,0.057643,,,,,
6,apoe_2/3,0.047803,,,,,,apoe_2/3,0.047803,,,,,
7,apoe_4/4,0.041209,,,,,,apoe_4/4,0.041209,,,,,
8,GDTOTAL,0.024695,,,,,,GDTOTAL,0.024695,,,,,
9,HMHYPERT,0.01984,,,,,,HMHYPERT,0.01984,,,,,


In [None]:
[0.674, 0.69,]

In [None]:
pd.DataFrame(stats.iloc[1:6].mean()).T

In [None]:
pd.DataFrame(stats.iloc[6:].mean()).T