In [2]:
import os 
import pandas as pd
import joblib
import numpy as np
from mlutils import calc_results
from mlutils import pick_threshold

from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier

root = "./results_all/"

# Get all experiment folder paths in results_all_exp with nested structure:
# root/experiment/snps/age/alz_cond/model
experiment_paths = []
for experiment in os.listdir(root):
    exp_path = os.path.join(root, experiment)
    if os.path.isdir(exp_path):
        for snps in os.listdir(exp_path):
            snps_path = os.path.join(exp_path, snps)
            if os.path.isdir(snps_path):
                for age in os.listdir(snps_path):
                    age_path = os.path.join(snps_path, age)
                    if os.path.isdir(age_path):
                        for alz_cond in os.listdir(age_path):
                            alz_path = os.path.join(age_path, alz_cond)
                            if os.path.isdir(alz_path):
                                for model in os.listdir(alz_path):
                                    model_path = os.path.join(alz_path, model)
                                    if os.path.isdir(model_path):
                                        experiment_paths.append(model_path)

def normalize_importances(importance_array):
    """
    Normalize feature importances to sum to 1 (convert to percentages)
    """
    # Ensure we don't divide by zero
    total = np.sum(importance_array)
    if total == 0:
        return np.zeros_like(importance_array)
    return importance_array / total


def get_top_features(automl):
    """
    Extracts the top features from an AutoML model.
    Parameters:
    automl (object): The AutoML model object.
    Returns:
    list: A list of the top features from the AutoML model.
    """
    if len(automl.feature_importances_) == 1:
        feature_names = np.array(automl.feature_names_in_)[
            np.argsort(abs(automl.feature_importances_[0]))[::-1]
        ]
        fi = automl.feature_importances_[0][
            np.argsort(abs(automl.feature_importances_[0]))[::-1]
        ]
    else:
        feature_names = np.array(automl.feature_names_in_)[
            np.argsort(abs(automl.feature_importances_))[::-1]
        ]
        fi = automl.feature_importances_[
            np.argsort(abs(automl.feature_importances_))[::-1]
        ]

    return feature_names, fi

def evaluate_results(root):
    summary = {} 
    features = {}
    for num in os.listdir(root): 
        if num in ["0", "1", "2", "3", "4"]:
            full = os.path.join(root, num)
            if os.path.isdir(full): 
                train = pd.read_parquet(os.path.join(full, "train_labels_predictions.parquet"), engine = 'fastparquet')
                test = pd.read_parquet(os.path.join(full, "test_labels_predictions.parquet"), engine = 'fastparquet')

                threshold = pick_threshold(train['y_train'], train['y_pred'])
            
                test_results = calc_results(test['y_test'], test['y_pred'], threshold = threshold)
            
                summary[num] = test_results

                model = joblib.load(os.path.join(full, "flaml_best_model.joblib"))

                if isinstance(model, LogisticRegression):
                    # For Logistic Regression, we need to handle coefficients
                    fi = model.coef_[0]
                    fnames = model.feature_names_in_
                elif isinstance(model, LGBMClassifier):
                    # For LGBMClassifier, we can use feature importances directly
                    fi = model.feature_importances_
                    fnames = model.feature_name_

                fi = normalize_importances(fi)
                # Create DataFrame
                df = pd.DataFrame({
                    'fi': fi,
                    'fnames': fnames
                })

                features[num] = df
    
    return summary, features

In [3]:
for path in ['./results_all/none/LDE/65up/ACD/lgbm', './results_all/none/LDE/65up/AD/lgbm', './results_all/none/only_snps/65up/ACD/lgbm/']:
    if path in experiment_paths: 
        experiment_paths.remove(path)

In [None]:
df = pd.read_parquet('./results_all/age_alone/LDE/65up/ACD/lgbm/0/train_labels_predictions.parquet', engine='fastparquet')

In [None]:
df

In [4]:
def calculate_fi_avg(merged_df):
    """
    Calculate the average feature importance across all folds and drop individual fold columns.
    """
    # Identify columns that start with 'fi_' and calculate their mean
    fi_cols = [col for col in merged_df.columns if col.startswith('fi_')]
    merged_df['avg_fi'] = merged_df[fi_cols].mean(axis=1, skipna=True)
    merged_df['std_fi'] = merged_df[fi_cols].std(axis=1, skipna=True)
    merged_df = merged_df.drop(columns=fi_cols)
    
    # some cleaning
    sorted_features = merged_df.sort_values(by='avg_fi', ascending=False)
    # only split at "_" if the name starts with "rs"
    sorted_features['fnames'] = sorted_features['fnames'].apply(
        lambda x: x.split("_")[0] if isinstance(x, str) and x.startswith("rs") else x
    )
    sorted_features['fnames'] = sorted_features['fnames'].astype('string')

    return sorted_features

In [5]:
import pandas as pd
def summarize_stats(metrics1, metrics2): 
    path1 = os.path.join(metrics1, "summary_stats/metrics.txt")
    path2 = os.path.join(metrics2, "summary_stats/metrics.txt")

    df_metrics1 = pd.read_csv(path1)
    df_metrics2 = pd.read_csv(path2)

    merged_df = df_metrics1.merge(df_metrics2, on='Unnamed: 0').round(3)

    return merged_df

def summarize_features(features1, features2):
    path1 = os.path.join(features1, "summary_stats/features.txt")
    path2 = os.path.join(features2, "summary_stats/features.txt")

    df_features1 = pd.read_csv(path1)
    df_features2 = pd.read_csv(path2)

    df_features1 = df_features1.drop_duplicates(subset=['fnames'])
    df_features2 = df_features2.drop_duplicates(subset=['fnames'])

    merged_features = pd.concat([df_features1, df_features2], axis = 1)

    return merged_features

In [6]:
def generate_feature_summary(sorted_features, path): 
    # grab snp names and add
    ad_genes = pd.read_csv('../../raw_data/MONDO_0004975_associations_export (1).tsv', sep = "\t")

    snps = ad_genes[['riskAllele', 'mappedGenes', 'pValue', 'beta', 'locations']]
    snps['snp'] = snps['riskAllele'].str.split("-").str[0]
    snps = snps.drop(columns='riskAllele')

    # separate non_snp features
    
    non_snp_feats = sorted_features[~sorted_features['fnames'].str.startswith("rs")]
    # filter by genes only present in fnames, then add to fnames
    snps = snps[snps['snp'].isin(sorted_features['fnames'])]
    sorted_features = sorted_features.merge(snps, right_on = 'snp', left_on = 'fnames')
    sorted_features = sorted_features.drop_duplicates(subset=['fnames'])

    # add non-snp features back to sorted_features
    sorted_features = pd.concat([sorted_features, non_snp_feats])
    sorted_features = sorted_features.sort_values(by='avg_fi', ascending=False)

    # Create the summary_stats directory if it doesn't exist
    summary_stats_dir = os.path.join(path, "summary_stats")
    os.makedirs(summary_stats_dir, exist_ok=True)

    #print(sorted_features.head(10))
    sorted_features.to_csv(f'{summary_stats_dir}/features.txt', index=False)


In [7]:
def generate_metrics_summary(summary, path):
    # Convert summary dictionary to DataFrame
    df_summary = pd.DataFrame(summary)
    
    # Save the summary DataFrame to a CSV file
    summary_stats_dir = os.path.join(path, "summary_stats")
    os.makedirs(summary_stats_dir, exist_ok=True)

    #print(df_summary.head())
    df_summary.to_csv(f'{summary_stats_dir}/metrics.txt', index=True)


In [8]:
#experiment_paths = ['./results/flaml_results']

def main():
    summary = None
    features = None
    
    '''
    path = './results_all_exp/age_sex_lancet2024/all_snps/65up/AD/lgbm'
    summary, features = evaluate_results(path)

    # merge data from 5 folds together
    merged_df = None
      
    for fold_name, df in features.items():
        # rename
        df_renamed = df.rename(columns={'fi': f'fi_{fold_name}'})

        if merged_df is None:
            merged_df = df_renamed
        else:
            merged_df = pd.merge(merged_df, df_renamed, on='fnames', how='outer')

    sorted_features = calculate_fi_avg(merged_df)

    generate_feature_summary(sorted_features, path)
    generate_metrics_summary(summary, path)

    '''
    for path in experiment_paths:
        #print(f"Evaluating {path}")
        summary, features = evaluate_results(path)

        # merge data from 5 folds together
        merged_df = None
        for fold_name, df in features.items():
            df_renamed = df.rename(columns={'fi': f'fi_{fold_name}'})
            if merged_df is None:
                merged_df = df_renamed
            else:
                merged_df = pd.merge(merged_df, df_renamed, on='fnames', how='outer')

        sorted_features = calculate_fi_avg(merged_df)

        generate_feature_summary(sorted_features, path)
        generate_metrics_summary(summary, path)

if __name__ == "__main__":    
    main()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  snps['snp'] = snps['riskAllele'].str.split("-").str[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  snps['snp'] = snps['riskAllele'].str.split("-").str[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  snps['snp'] = snps['riskAllele'].str.split("-").str[0]
A value is trying to be set on a copy of

FileNotFoundError: [Errno 2] No such file or directory: './results_all/none/only_snps/65up/ACD/lgbm/2/train_labels_predictions.parquet'

In [9]:
import os 
import pandas as pd
def summarize_stats(metrics1, metrics2): 
    path1 = os.path.join(metrics1, "summary_stats/metrics.txt")
    path2 = os.path.join(metrics2, "summary_stats/metrics.txt")

    df_metrics1 = pd.read_csv(path1)
    df_metrics2 = pd.read_csv(path2)

    merged_df = df_metrics1.merge(df_metrics2, on='Unnamed: 0').round(3)

    return merged_df

def summarize_features(features1, features2):
    path1 = os.path.join(features1, "summary_stats/features.txt")
    path2 = os.path.join(features2, "summary_stats/features.txt")

    df_features1 = pd.read_csv(path1)
    df_features2 = pd.read_csv(path2)

    df_features1 = df_features1.drop_duplicates(subset=['fnames'])
    df_features2 = df_features2.drop_duplicates(subset=['fnames'])

    merged_features = pd.concat([df_features1, df_features2], axis = 1)

    return merged_features

path0 = './results/flaml_results'
path1 = './results_all/none/LDE/allages/AD/lgbm'

stats  = summarize_stats(path0, path1).T
features = summarize_features(path0, path1)

avg_auc1 = stats.iloc[1:6, 0].astype(float).mean()
sd_auc1 = stats.iloc[1:6, 0].astype(float).std()
avg_auc2 = stats.iloc[6:11, 0].astype(float).mean()
sd_auc2 = stats.iloc[6:11, 0].astype(float).std()

print(f"Average AUROC for {path0}: {avg_auc1}, sd {sd_auc1}")
print(f"Average AUROC for {path1}: {avg_auc2}, sd {sd_auc2}")

features


Average AUROC for ./results/flaml_results: 0.8625999999999999, sd 0.005856620185738534
Average AUROC for ./results_all/none/LDE/allages/AD/lgbm: 0.7028, sd 0.011054410884348354


Unnamed: 0,fnames,avg_fi,mappedGenes,pValue,beta,locations,snp,fnames.1,avg_fi.1,std_fi,mappedGenes.1,pValue.1,beta.1,locations.1,snp.1
0,curr_age,0.187675,,,,,,rs4420638,0.061827,0.015202,"APOC1P1,APOC1",2.000000e-44,-,19:44919689,rs4420638
1,30780-0.0,0.115801,,,,,,e4/e4,0.052737,0.007573,,,,,
2,bmi,0.113926,,,,,,e3/e3,0.043336,0.016942,,,,,
3,eid,0.103333,,,,,,e3/e4,0.038288,0.015892,,,,,
4,20161-0.0,0.076082,,,,,,rs4663105,0.038265,0.005759,"BIN1,NIFKP9",4.000000e-58,-,2:127133851,rs4663105
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
702,,,,,,,,rs9401593,0.000000,0.000000,"EIF4EBP2P3,MIR2113",1.000000e-20,99.57901 z score increase,6:98101925,rs9401593
703,,,,,,,,rs9461540,0.000000,0.000000,"MOG,SUMO2P1",5.000000e-18,-,6:29636487,rs9461540
704,,,,,,,,rs9607782,0.000000,0.000000,EP300-AS1,3.000000e-18,-,22:41191552,rs9607782
705,,,,,,,,rs9616906,0.000000,0.000000,"Y_RNA,ARSA",3.000000e-08,40.756695 z score increase,22:50666252,rs9616906


In [None]:
[0.674, 0.69,]

In [None]:
pd.DataFrame(stats.iloc[1:6].mean()).T

In [None]:
pd.DataFrame(stats.iloc[6:].mean()).T