In [None]:
# note this is PART of the code from get_model_info_ALL.ipynb

import pandas as pd
import yaml
import numpy as np

### 1. Get performance info
# Load the matrix of classifier metrics from which the top models are selected.
df = pd.read_csv(
    '../../../08_manuscript/featureSetML_TCGA/src/classifier_metrics_20220511/big_results_matrix.tsv',
    sep='\t',
    low_memory=False,
)

# Every group is evaluated on the five single data types; each group then adds
# its own multi-platform / best-of-everything labels after them.
_single_platforms = ['GEXP', 'CNVR', 'METH', 'MUTA', 'MIR']
platform_options = {
    'CF': _single_platforms + ['All', 'OVERALL'],
    'AKLIMATE': _single_platforms + ['MULTI', 'TOP'],
    'skgrid': _single_platforms + ['OVERALL'],
    'subSCOPE': _single_platforms + ['OVERALL'],
    'jadbio': _single_platforms + ['MULTI'],
}

# Feature-list methods (groups) and cohorts to iterate over.
groups = ['AKLIMATE', 'CF', 'jadbio', 'skgrid', 'subSCOPE']
cancers = [
    'BRCA', 'LGGGBM', 'COADREAD', 'SKCM', 'ACC', 'BLCA', 'CESC', 'ESCC', 'GEA',
    'HNSC', 'KIRCKICH', 'KIRP', 'LIHCCHOL', 'LUAD', 'LUSC', 'MESO', 'OV', 'PAAD',
    'PCPG', 'PRAD', 'SARC', 'TGCT', 'THCA', 'THYM', 'UCEC', 'UVM',
]

# Performance metric used to rank models.
pmetric = 'overall_weighted_f1'
# Maximum feature-list size a model may have to be considered.
filters = 100
    
SINGLE_PLATFORMS = ('GEXP', 'CNVR', 'METH', 'MUTA', 'MIR')


def _feature_id_filter(group, platform):
    """Return the featureID predicate selecting `platform` models of `group`.

    Each group encodes the platform at a different '_'-separated position of
    its featureID, so the test is group specific.

    Returns:
        A callable taking a featureID string and returning True when the model
        belongs to `platform`, or None when no filtering applies (the
        "OVERALL"/"TOP" labels, i.e. best single or multi data type model).

    Raises:
        ValueError: if the group/platform combination is not handled. The
            original code silently reused the previous platform's subset here.
    """
    is_single = platform in SINGLE_PLATFORMS
    if group == 'CF':
        # e.g. CF_ACC_All_Top_100: the platform (or "All") is field 2.
        if platform == 'OVERALL':
            return None
        return lambda ftid: ftid.split('_')[2] == platform
    if group == 'AKLIMATE':
        # Single platform: AKLIMATE_METH_ONLY_BRCA_reduced_model_5_feature_set_BRCA
        if is_single:
            return lambda ftid: ftid.split('_')[1] == platform
        # Multi platform: AKLIMATE_BRCA_reduced_model_100_feature_set_BRCA
        if platform == 'MULTI':
            return lambda ftid: ftid.split('_')[2] != 'ONLY'
        if platform == 'TOP':
            return None
    elif group == 'skgrid':
        if is_single:
            return lambda ftid: ftid.split('_')[3] == 'perplatform' + platform
        if platform == 'OVERALL':
            return None
    elif group == 'subSCOPE':
        # note: there is no case for "ENSEMBLE", which can use multiple datatypes
        if is_single:
            # subSCOPE ids are matched against "CNV" rather than "CNVR".
            prefix = 'subSCOPE-' + ('CNV' if platform == 'CNVR' else platform)
            return lambda ftid: ftid.split('_')[0] == prefix
        if platform == 'OVERALL':
            return None
    elif group == 'jadbio':
        if is_single:
            return lambda ftid: ftid.split('_')[2] == platform
        if platform == 'MULTI':
            return lambda ftid: ftid.split('_')[2] == 'MULTIDATATYPE'
        # Fixed: this branch used to test `group == 'OVERALL'`, which can never
        # be true inside the jadbio branch.
        if platform == 'OVERALL':
            return None
    raise ValueError('no model filter defined for group {!r}, platform {!r}'.format(group, platform))


def _select_platform_models(subset, group, platform):
    """Return the rows of `subset` whose featureID belongs to `platform`."""
    keep = _feature_id_filter(group, platform)
    if keep is None:
        return subset
    kept_ids = [ftid for ftid in subset['featureID'] if keep(ftid)]
    # isin() always yields a boolean mask, even when `subset` is empty.
    return subset[subset['featureID'].isin(kept_ids)].reset_index(drop=True)


# Remove rows where no feature list length is provided and make the column
# numeric. Done once here instead of once per cancer/group iteration.
# note this might be a spot that needs fixing later
df = df[df['total_features'] != '__NO_LIST__'].reset_index(drop=True)
df['total_features'] = df['total_features'].astype('int')

# Max size of feature list a model may use.
max_ft_size = int(filters)

best = {}
for cancer in cancers:
    print(cancer)
    cancer_dict = {}
    for group in groups:
        # All models for this method / cancer / metric within the size limit,
        # best mean score first.
        subset = df[df['feature_list_method'] == group]
        subset = subset[subset['cohort'] == cancer]
        subset = subset[subset['performance_metric'] == pmetric].reset_index(drop=True)
        subset = subset[subset['total_features'] <= max_ft_size].reset_index(drop=True)
        subset = subset.sort_values(by='Mean', ascending=False)

        # Pick the best model for each platform of this group.
        for platform in platform_options[group]:
            platform_subset = _select_platform_models(subset, group, platform)
            platform_subset = platform_subset.sort_values(by='Mean', ascending=False).reset_index(drop=True)

            if platform_subset.shape[0] > 0:
                # Row 0 has the highest mean of the performance metric. The ID
                # and the metrics are read from the same row so they can never
                # disagree. Metrics are cast to built-in floats so yaml.dump
                # writes plain numbers instead of numpy python-object tags.
                ftID = platform_subset.loc[0, 'featureID']
                details = {
                    'Mean_' + pmetric: float(platform_subset.loc[0, 'Mean']),
                    'Std_' + pmetric: float(platform_subset.loc[0, 'Std']),
                    'Max_' + pmetric: float(platform_subset.loc[0, 'Max']),
                    'full_featureID': ftID,
                    'full_model': platform_subset.loc[0, 'model'],
                }
            else:
                # No model passed the filters above.
                ftID = 'NO_MODEL_MATCH_' + group
                details = {
                    'Mean_' + pmetric: np.nan,
                    'Std_' + pmetric: np.nan,
                    'Max_' + pmetric: np.nan,
                    'full_featureID': np.nan,
                    'full_model': np.nan,
                }

            # Save: the platform maps to the winning featureID and
            # 'info_<platform>' holds its metrics.
            group_entry = cancer_dict.setdefault(group, {})
            group_entry[platform] = ftID
            group_entry['info_' + platform] = details
            print(ftID, ' selected as best model for group')

    best[cancer] = cancer_dict



# Write the per-cohort selection to the reference file reloaded in section 2.
with open('../tools/options_extended.yml', 'w') as fh:
    fh.write(yaml.dump(best))

In [None]:
# Spot-check one entry: the stored metrics of the skgrid OVERALL pick for BRCA.
best['BRCA']['skgrid']['info_OVERALL']

# Manual step: pause here and update '../tools/model_info.json' so that it
# contains 'OVERALL' etc.
    


In [None]:
# What has been added:
#   OVERALL - cloudforest


# Load the info used in the docker models. The file is JSON but is parsed
# with the YAML safe loader.
with open('../tools/model_info.json') as info_fh:
    info = yaml.safe_load(info_fh.read())

In [None]:
# Display the BRCA / skgrid OVERALL metrics again (same lookup as the earlier spot-check).
best['BRCA']['skgrid']['info_OVERALL']

In [None]:
platform_options = {
    'CF':['GEXP' , 'CNVR', 'METH', 'MUTA', 'MIR', 'All', 'OVERALL'],
    'AKLIMATE':['GEXP' , 'CNVR', 'METH', 'MUTA', 'MIR', 'MULTI', 'TOP'],
    'skgrid':['GEXP' , 'CNVR', 'METH', 'MUTA', 'MIR', 'OVERALL'],
    'subSCOPE':['GEXP' , 'CNVR', 'METH', 'MUTA', 'MIR', 'OVERALL'],
    'jadbio':['GEXP' , 'CNVR', 'METH', 'MUTA', 'MIR', 'MULTI']

}
# Maps the algorithm names used in `info` to the group names used in `best`.
nameconvert = {
    'aklimate':'AKLIMATE',
    'cloudforest':'CF',
    'jadbio':'jadbio',
    'skgrid':'skgrid',
    'subscope':'subSCOPE'
}


#####
cancer = 'BRCA'
####
# Add in info for "OVERALL": copy the metadata of the best-scoring platform
# into info[algor][cancer]['OVERALL'] for every algorithm that has an OVERALL
# option. Only the cohort named in `cancer` is processed.
score_key = 'Mean_overall_weighted_f1'
for algor in ['skgrid', 'cloudforest', 'aklimate', 'subscope', 'jadbio']:
    group = nameconvert[algor]
    if 'OVERALL' not in platform_options[group]:
        continue
    print(algor)

    # Determine which platform has the best score for this cancer-algor combo.
    # 'info_OVERALL' is excluded: it is the aggregate being filled in, and
    # letting it win produced a self-assignment (or KeyError) below.
    group_best = best[cancer][group]
    opts = [key for key in group_best if key.startswith('info_') and key != 'info_OVERALL']
    print(opts)
    high_score = 0
    high_platform = 'NA'
    for op in opts:
        score = group_best[op][score_key]
        print(high_score, op)
        # NaN scores (no model matched) never compare greater, so are skipped.
        if score > high_score:
            high_score = score
            high_platform = op[len('info_'):]
            print('updated to', high_platform, high_score)
    assert high_platform != 'NA', 'no scored platform for {} {}'.format(algor, cancer)
    print('OVERALL winner was', high_platform)

    # Make it visible when the stored OVERALL model beats every listed
    # platform: its metadata then has no matching platform entry in `info`.
    overall_entry = group_best.get('info_OVERALL')
    if overall_entry is not None and overall_entry[score_key] > high_score:
        print('WARNING: info_OVERALL score', overall_entry[score_key],
              'exceeds best listed platform', high_platform, high_score)

    # Store a copy so later edits to the OVERALL entry do not also change the
    # winning platform's entry.
    info[algor][cancer]['OVERALL'] = dict(info[algor][cancer][high_platform])
    print()

In [None]:
# TODO: check why jadbio-BRCA-OVERALL is not showing GEXP as the winner;
# the updated dict shows MULTI instead.
best['BRCA']['jadbio']['info_METH']['Max_overall_weighted_f1']

# Lookup used when comparing platforms (note: Mean, not Max):
# best[cancer][nameconvert[algor]][op]['Mean_overall_weighted_f1']

In [None]:
import yaml
import pandas as pd

### 2. Build summary table
# Reload the best-model reference file written in section 1.
# NOTE(review): yaml.Loader can construct arbitrary Python objects; only use it
# on files this notebook generated itself.
with open('../tools/options_extended.yml', 'r') as fh2:
    best = yaml.load(fh2, Loader=yaml.Loader)

# Load the info used in the docker models (JSON parsed via the YAML safe loader).
with open('../tools/model_info.json', 'r') as file:
    info = yaml.safe_load(file)

# Algorithm names in `info` -> group names in `best`.
nameconvert = dict(
    aklimate='AKLIMATE',
    cloudforest='CF',
    jadbio='jadbio',
    skgrid='skgrid',
    subscope='subSCOPE',
)

res = {
    column: []
    for column in (
        'Cohort', 'Platform', 'Algorithm_Method',
        'Max_Overall_Weighted_F1', 'Mean_Overall_Weighted_F1', 'Features',
    )
}

# One output row per algorithm / cohort / platform entry of `info`.
for algor, cohorts in info.items():
    for cancer, platforms in cohorts.items():
        for platform, model_meta in platforms.items():
            res['Cohort'].append(cancer)
            res['Platform'].append(platform)
            res['Algorithm_Method'].append(algor)
            # Features are stored as one comma-joined string.
            res['Features'].append(','.join(model_meta['fts']))

            # Scores come from the matching 'info_<platform>' entry in `best`.
            scores = best[cancer][nameconvert[algor]]['info_' + platform]
            res['Max_Overall_Weighted_F1'].append(float(scores['Max_overall_weighted_f1']))
            res['Mean_Overall_Weighted_F1'].append(float(scores['Mean_overall_weighted_f1']))

# Sort with the max score as a key, then remove the max overall weighted f1 column.
summary = (
    pd.DataFrame.from_dict(res)
    .sort_values(
        by=['Cohort', 'Platform', 'Max_Overall_Weighted_F1', 'Algorithm_Method'],
        ascending=[True, False, False, False],
    )
    .reset_index(drop=True)
    .drop('Max_Overall_Weighted_F1', axis=1)
)

In [None]:
# Write the summary as a tab-separated file without the index column.
# NOTE(review): the same path is written again later in this notebook from the
# Christina-based summary, which overwrites this output if both cells are run.
summary.to_csv('../tools/table_models.tsv', sep='\t', index=False)

# Option 2
This section depends on the `summary` table built by the last section of Option 1, so run that section first.

In [None]:
# Table 1: values = single top algorithm (ex. SK Grid, AKLIMATE, etc) with its
# mean overall_weighted_f1 in parentheses. As written to disk the table is
# transposed: rows = cohort, columns = data type. Cohort/data-type pairs with
# no model get "NA (NA)".
res_2 = {'OVERALL':[],'TOP':[],'CNVR':[], 'GEXP':[], 'METH':[], 'MIR':[], 'MULTI':[], 'MUTA':[], 'All':[]}
index_list = []
cancers_list = sorted(set(summary['Cohort']))


for cancer in cancers_list:
    for platform in res_2:
        # Get the best model for cohort-platform (pick best model of the 5 algors)
        s1 = (
            summary[(summary['Platform'] == platform) & (summary['Cohort'] == cancer)]
            .sort_values(by='Mean_Overall_Weighted_F1', ascending=False)
            .reset_index(drop=True)
        )
        # Explicit empty check instead of a bare `except:`, which also hid
        # unrelated errors (and KeyboardInterrupt).
        if s1.shape[0] > 0:
            model = s1.loc[0, 'Algorithm_Method']
            perf = s1.loc[0, 'Mean_Overall_Weighted_F1']
        else:
            # no hits for this cohort-platform pair
            model = 'NA'
            perf = 'NA'

        # build out table (transposed)
        res_2[platform].append(model + ' (' + str(perf) + ')')
    index_list.append(cancer)

summary_2 = pd.DataFrame.from_dict(res_2)
summary_2.index = index_list
summary_2.to_csv('../tools/table_models.option2a.tsv', sep='\t', index=True)

In [None]:
# Table 2: values = feature list of the top algorithm (highest mean
# overall_weighted_f1). As written to disk the table is transposed:
# rows = cohort, columns = data type. Pairs with no model get "NA".
res_2 = {'OVERALL':[],'TOP':[],'CNVR':[], 'GEXP':[], 'METH':[], 'MIR':[], 'MULTI':[], 'MUTA':[], 'All':[]}
index_list = []
cancers_list = sorted(set(summary['Cohort']))


for cancer in cancers_list:
    for platform in res_2:
        # Get the best model for cohort-platform (pick best model of the 5 algors)
        s1 = (
            summary[(summary['Platform'] == platform) & (summary['Cohort'] == cancer)]
            .sort_values(by='Mean_Overall_Weighted_F1', ascending=False)
            .reset_index(drop=True)
        )
        # Explicit empty check instead of a bare `except:`, which also hid
        # unrelated errors (and KeyboardInterrupt).
        if s1.shape[0] > 0:
            fts = s1.loc[0, 'Features']
        else:
            # no hits for this cohort-platform pair
            fts = 'NA'

        # build out table (transposed)
        res_2[platform].append(fts)
    index_list.append(cancer)

summary_2 = pd.DataFrame.from_dict(res_2)
summary_2.index = index_list
summary_2.to_csv('../tools/table_models.option2b.tsv', sep='\t', index=True)

# Option 3
This section depends on the `summary` table produced by the last two sections, so run those first.

In [None]:
# Option 3 table: values = single top algorithm (ex. SK Grid, AKLIMATE, etc)
# with its mean overall_weighted_f1 in parentheses, followed by its feature
# list. As written to disk the table is transposed: rows = cohort,
# columns = data type. Pairs with no model get "NA (NA) NA".
res_2 = {'OVERALL':[],'TOP':[],'CNVR':[], 'GEXP':[], 'METH':[], 'MIR':[], 'MULTI':[], 'MUTA':[], 'All':[]}
index_list = []
cancers_list = sorted(set(summary['Cohort']))


for cancer in cancers_list:
    for platform in res_2:
        # Get the best model for cohort-platform (pick best model of the 5 algors)
        s1 = (
            summary[(summary['Platform'] == platform) & (summary['Cohort'] == cancer)]
            .sort_values(by='Mean_Overall_Weighted_F1', ascending=False)
            .reset_index(drop=True)
        )
        # Explicit empty check instead of a bare `except:`, which also hid
        # unrelated errors (and KeyboardInterrupt).
        if s1.shape[0] > 0:
            model = s1.loc[0, 'Algorithm_Method']
            perf = s1.loc[0, 'Mean_Overall_Weighted_F1']
            fts = s1.loc[0, 'Features']
        else:
            # no hits for this cohort-platform pair
            model = 'NA'
            perf = 'NA'
            fts = 'NA'

        # build out table (transposed)
        res_2[platform].append(model + ' (' + str(perf) + ') ' + fts)
    index_list.append(cancer)

summary_2 = pd.DataFrame.from_dict(res_2)
summary_2.index = index_list
summary_2.to_csv('../tools/table_models.option3.tsv', sep='\t', index=True)

# debug

Platform options for each algorithm, as listed in the GitHub README:

skgrid: OVERALL, (CNVR, GEXP, METH, MIR, MUTA)

aklimate: TOP, MULTI, (GEXP, CNVR, METH) ( MIR AND MUTA???)

cloudforest: OVERALL, All, (CNVR, GEXP, METH, MIR, MUTA)

subscope: allcohorts, (CNVR, GEXP, METH, MIR, MUTA)

jadbio: MULTI, (CNVR, GEXP, METH, MIR, MUTA)

In [None]:
# Alternative check against the raw metrics frame (kept for reference):
# t2 = df[df['cohort']=='BRCA']
# t2 = t2[t2['performance_metric']=='overall_weighted_f1'].sort_values(by='Mean', ascending =False)
# t2

# BRCA rows of the summary for the OVERALL platform, best mean score first.
is_brca_overall = (summary['Cohort'] == 'BRCA') & (summary['Platform'] == 'OVERALL')
t2 = summary[is_brca_overall].sort_values(by='Mean_Overall_Weighted_F1', ascending=False)
t2

In [None]:
# Debug: display the summary table built above.
summary

In [None]:
# Debug: load the lollipop-plot input table for comparison.
# NOTE(review): this is an absolute, user-specific path; the cell only runs on
# a machine where that file exists.
loli = pd.read_csv('/Users/leejor/Downloads/LollipopPlotInput.txt', sep='\t')
loli

In [None]:
# Debug: count how many rows of the lollipop input each feature_list_method has.
from collections import Counter
Counter(loli['feature_list_method'])

In [None]:
# Debug: all summary rows for skgrid in the BRCA cohort.
summary[(summary['Cohort']=='BRCA')&(summary['Algorithm_Method']=='skgrid')]

In [None]:
# Debug: skgrid OVERALL rows across all cohorts.
summary[(summary['Platform']=='OVERALL')&(summary['Algorithm_Method']=='skgrid')]

# Data from Christina

In [None]:
# What has been added:
#   OVERALL - cloudforest
import pandas as pd
import yaml

# Algorithm names in `info` -> group names used in the performance tables.
nameconvert = dict(
    aklimate='AKLIMATE',
    cloudforest='CF',
    jadbio='jadbio',
    skgrid='skgrid',
    subscope='subSCOPE',
)

# Load the info used in the docker models (JSON parsed via the YAML safe loader).
with open('../tools/model_info.json') as info_fh:
    info = yaml.safe_load(info_fh.read())

In [None]:
# Read in Christina's tab-separated table of best models. The cells below use
# its feature_list_method, cohort, datatype and Mean columns.
perf_df =pd.read_csv('../../../08_manuscript/featureSetML_TCGA/src/BestModelPerDataTypePerGroup_deduplicated_2022_06_16_fixed_fromChristina.txt', sep='\t')
perf_df

In [None]:
print('TODO update to get performance info for All and MULTI and OVERALL and TOP (note in df it shows All not ALL)')
# Attach the mean score from Christina's reduced file to every
# algorithm / cohort / platform entry of `info`.
for algor, cohorts in info.items():
    for cancer, platforms in cohorts.items():
        for platform, model_meta in platforms.items():
            row_mask = (
                (perf_df['feature_list_method'] == nameconvert[algor])
                & (perf_df['cohort'] == cancer)
                & (perf_df['datatype'] == platform)
            )
            s1 = perf_df[row_mask]

            # The algorithm was not run for this cancer-platform combo: store
            # the 'NA' string, which a later cell filters on.
            if s1.shape[0] == 0:
                model_meta['Mean_Overall_Weighted_F1'] = 'NA'
                continue

            # Exactly one row is expected in the deduplicated file.
            assert s1.shape[0] == 1, 'shape is {}'.format(s1.shape)
            performance = s1['Mean'].tolist()[0]

            # add that performance to the info dictionary
            model_meta['Mean_Overall_Weighted_F1'] = performance

In [None]:

# Flatten `info` into one row per algorithm / cohort / platform.
res = {
    column: []
    for column in ('Cohort', 'Platform', 'Algorithm_Method', 'Mean_Overall_Weighted_F1', 'Features')
}

for algor, cohorts in info.items():
    for cancer, platforms in cohorts.items():
        for platform, model_meta in platforms.items():
            res['Cohort'].append(cancer)
            res['Platform'].append(platform)
            res['Algorithm_Method'].append(algor)
            # Features are stored as one comma-joined string.
            res['Features'].append(','.join(model_meta['fts']))
            # Score attached to `info` by the previous cell ('NA' when absent).
            res['Mean_Overall_Weighted_F1'].append(model_meta['Mean_Overall_Weighted_F1'])

summary = (
    pd.DataFrame.from_dict(res)
    .sort_values(
        by=['Cohort', 'Mean_Overall_Weighted_F1', 'Platform', 'Algorithm_Method'],
        ascending=[True, False, False, False],
    )
    .reset_index(drop=True)
)

In [None]:
# Temporary fix: rows for {'All', 'MULTI', 'OVERALL', 'TOP'} have no score yet
# (stored as the string 'NA'), so they are removed before re-sorting.
print('TODO update so this removal step of all multi overall top is not occuring')
# To list the affected platforms:
# set(summary[summary['Mean_Overall_Weighted_F1']=='NA']['Platform'])
has_score = summary['Mean_Overall_Weighted_F1'] != 'NA'
summary = (
    summary[has_score]
    .sort_values(
        by=['Cohort', 'Mean_Overall_Weighted_F1', 'Platform', 'Algorithm_Method'],
        ascending=[True, False, False, False],
    )
    .reset_index(drop=True)
)

In [None]:
# Write the filtered summary as a tab-separated file without the index column.
# NOTE(review): this is the same path written after section 2, so it replaces
# that earlier output.
summary.to_csv('../tools/table_models.tsv', sep='\t', index=False)