In [None]:
# note this is PART of the code from get_model_info_ALL.ipynb

import pandas as pd
import yaml
import numpy as np

### 1. Get performance info
# For every cohort / group / platform, pick the feature set whose model has the
# highest mean `pmetric` and save all picks as a YAML reference file.
df = pd.read_csv('../src/classifier_metrics_20220511/big_results_matrix.tsv', sep='\t', low_memory=False)

# Rows without a feature-list length cannot be size-filtered: drop them and make
# the column numeric. This is done once here; it used to be repeated, with the
# same result, inside every cancer/group iteration.
# note this might be a spot need to fix later
df = df[df['total_features'] != '__NO_LIST__'].reset_index(drop=True)
df['total_features'] = df['total_features'].astype('int')

platform_options = {
    'CF': ['GEXP', 'CNVR', 'METH', 'MUTA', 'MIR', 'All', 'OVERALL'],
    'AKLIMATE': ['GEXP', 'CNVR', 'METH', 'MUTA', 'MIR', 'MULTI', 'TOP'],
    'skgrid': ['GEXP', 'CNVR', 'METH', 'MUTA', 'MIR', 'OVERALL'],
    'subSCOPE': ['GEXP', 'CNVR', 'METH', 'MUTA', 'MIR', 'OVERALL'],
    'jadbio': ['GEXP', 'CNVR', 'METH', 'MUTA', 'MIR', 'MULTI'],
}

groups = ['AKLIMATE', 'CF', 'jadbio', 'skgrid', 'subSCOPE']
cancers = ['BRCA', 'LGGGBM', 'COADREAD', 'SKCM', 'ACC', 'BLCA', 'CESC', 'ESCC', 'GEA', 'HNSC', 'KIRCKICH', 'KIRP', 'LIHCCHOL', 'LUAD', 'LUSC', 'MESO', 'OV', 'PAAD', 'PCPG', 'PRAD', 'SARC', 'TGCT', 'THCA', 'THYM', 'UCEC', 'UVM']
pmetric = 'overall_weighted_f1'
filters = 100 # max size of ft list
max_ft_size = int(filters)

# Platforms that correspond to exactly one data type.
single_platforms = ['GEXP', 'CNVR', 'METH', 'MUTA', 'MIR']


def _token_equals(ids, position, expected):
    """Return a boolean mask: is token `position` of each '_'-separated ID equal to `expected`?

    IDs with too few tokens (or missing IDs) do not match instead of raising.
    """
    return ids.str.split('_').str.get(position) == expected


def _select_platform_models(subset, group, platform):
    """Restrict one group's candidate models to a single platform.

    subset   -- models of one group / cohort / metric.
    group    -- key of `platform_options`.
    platform -- one of `platform_options[group]`.

    Returns `subset` unchanged for the "best single or multi data type"
    platforms (OVERALL / TOP), otherwise the rows whose ID names the platform.
    Raises ValueError when no rule exists for the combination, so that a new
    entry in `platform_options` cannot silently reuse the previous platform's rows.
    """
    feature_ids = subset['featureID']
    is_single = platform in single_platforms
    mask = None

    if group == 'CF':
        # featureID looks like CF_ACC_All_Top_100: token 2 is the platform,
        # or "All" for the model that uses all data types.
        if platform == 'OVERALL':
            return subset
        mask = _token_equals(feature_ids, 2, platform)
    elif group == 'AKLIMATE':
        if platform == 'TOP':
            return subset
        if is_single:
            # e.g. AKLIMATE_METH_ONLY_BRCA_reduced_model_5_feature_set_BRCA
            mask = _token_equals(feature_ids, 1, platform)
        elif platform == 'MULTI':
            # multi data type models are recognised from the `model` column,
            # not from featureID (e.g. AKLIMATE_BRCA_reduced_model_100_feature_set_BRCA)
            mask = _token_equals(subset['model'], 1, 'MULTI')
    elif group == 'skgrid':
        if platform == 'OVERALL':
            return subset
        if is_single:
            mask = _token_equals(feature_ids, 3, 'perplatform' + platform)
    elif group == 'subSCOPE':
        # note we do not have a statement for "ENSEMBLE" which can use multiple datatypes
        if platform == 'OVERALL':
            return subset
        if is_single:
            # subSCOPE IDs spell the CNVR platform without the trailing "R"
            label = platform.replace('R', '') if platform == 'CNVR' else platform
            mask = _token_equals(feature_ids, 0, 'subSCOPE-' + label)
    elif group == 'jadbio':
        # This branch used to test `group == 'OVERALL'`, which can never be
        # true here; the platform is what has to be compared.
        if platform == 'OVERALL':
            return subset
        if is_single:
            mask = _token_equals(feature_ids, 2, platform)
        elif platform == 'MULTI':
            mask = _token_equals(feature_ids, 2, 'MULTIDATATYPE')

    if mask is None:
        raise ValueError('no platform filter defined for group {!r} / platform {!r}'.format(group, platform))
    return subset[mask]


best = {}
for cancer in cancers:
    print(cancer)
    cancer_dict = {}
    for group in groups:
        # All models for this method / cohort / metric whose feature list fits
        # the size cap (drop the last condition to lift the cap).
        in_scope = (
            (df['feature_list_method'] == group)
            & (df['cohort'] == cancer)
            & (df['performance_metric'] == pmetric)
            & (df['total_features'] <= max_ft_size)
        )
        subset = df[in_scope].reset_index(drop=True)
        subset = subset.sort_values(by='Mean', ascending=False)

        for platform in platform_options[group]:
            platform_subset = (
                _select_platform_models(subset, group, platform)
                .sort_values(by='Mean', ascending=False)
                .reset_index(drop=True)
            )

            # Row 0 is the model with the highest mean performance metric.
            # The ID and the extra info are read from the same row so they
            # always describe the same model.
            if platform_subset.shape[0] > 0:
                ftID = platform_subset['featureID'][0]
                platform_info = {
                    'Mean_' + pmetric: platform_subset['Mean'][0],
                    'Std_' + pmetric: platform_subset['Std'][0],
                    'Max_' + pmetric: platform_subset['Max'][0],
                    'full_featureID': platform_subset['featureID'][0],
                    'full_model': platform_subset['model'][0],
                }
            else:
                # no models fitting the above filters
                ftID = 'NO_MODEL_MATCH_' + group
                platform_info = {
                    'Mean_' + pmetric: np.nan,
                    'Std_' + pmetric: np.nan,
                    'Max_' + pmetric: np.nan,
                    'full_featureID': np.nan,
                    'full_model': np.nan,
                }

            # save
            group_entry = cancer_dict.setdefault(group, {})
            group_entry[platform] = ftID
            group_entry['info_' + platform] = platform_info
            print(ftID, ' selected as best model for group')

    best[cancer] = cancer_dict

# save this ref file
# NOTE(review): the values are taken straight from the DataFrame, so they are
# presumably numpy scalars; the notebook re-reads this file with yaml.Loader.
with open('../data/table_docker_info/options_extended_100ftmax.yml', 'w') as fh:
    yaml.dump(best, fh)

# Choice 1: blindly use Christina's data

In [None]:
# # what has been added 
# # OVERALL - cloudforest
# import pandas as pd
# import yaml

# nameconvert = {
#     'aklimate':'AKLIMATE', 
#     'cloudforest':'CF',
#     'jadbio':'jadbio',
#     'skgrid':'skgrid',
#     'subscope':'subSCOPE'
# }

# # pull up info used in docker models
# with open('../../../09_docker/gdan-tmp-models/tools/model_info.json', 'r') as file:
#     info = yaml.safe_load(file)
    
    
    
    
    
# # read in christina data 
# perf_df =pd.read_csv('../src/BestModelPerDataTypePerGroup_deduplicated_2022_06_16_fixed_fromChristina.txt', sep='\t')





# print('TODO update to get performance info for All and MULTI and OVERALL and TOP (note in df it shows All not ALL)')
# for algor in info.keys():
#     for cancer in info[algor].keys():
#         for platform in info[algor][cancer].keys():
#             # pull performance info from christina's reduced file
#             s1 = perf_df[(perf_df['feature_list_method']==nameconvert[algor])&(perf_df['cohort']==cancer)&(perf_df['datatype']==platform)]
            
#             if s1.shape[0]!=0:
#                 assert s1.shape[0]==1, 'shape is {}'.format(s1.shape)
#                 performance = list(s1['Mean'])[0]

#                 # add that performance to info dictionary
#                 info[algor][cancer][platform]['Mean_Overall_Weighted_F1']=performance
#             # if algor wasn't ran for that combo of cancer-platform
#             else:
#                 info[algor][cancer][platform]['Mean_Overall_Weighted_F1'] ='NA'
                

                
                
                
                
                
                
                
# res = {'Cohort': [], 'Platform': [], 'Algorithm_Method':[], 'Mean_Overall_Weighted_F1':[],'Features':[]}

# for algor in info.keys():
#     for cancer in info[algor].keys():
#         for platform in info[algor][cancer].keys():

#             res['Cohort'].append(cancer)
#             res['Platform'].append(platform)
#             res['Algorithm_Method'].append(algor)
#             fts_list = info[algor][cancer][platform]['fts']
# #             fts_list = ', '.join([a.split(':')[3] + '('+ a.split(':')[4]+')' for a in fts_list])
#             fts_list = ','.join(fts_list)
#             res['Features'].append(fts_list)
        
# #             perf = float(best[cancer][nameconvert[algor]]['info_'+platform]['Max_overall_weighted_f1'])
# #             res['Max_Overall_Weighted_F1'].append(perf)
            
#             perf = info[algor][cancer][platform]['Mean_Overall_Weighted_F1']
#             res['Mean_Overall_Weighted_F1'].append(perf)      
# summary = pd.DataFrame.from_dict(res)
# summary = summary.sort_values(by = ['Cohort','Mean_Overall_Weighted_F1', 'Platform','Algorithm_Method'], ascending =[True,False, False, False]).reset_index(drop=True)

# # temp fix to drop {'All', 'MULTI', 'OVERALL', 'TOP'} bc not ready with this info yet
# print('TODO update so this removal step of all multi overall top is not occuring')
# # set(summary[summary['Mean_Overall_Weighted_F1']=='NA']['Platform'])
# summary= summary[summary['Mean_Overall_Weighted_F1']!='NA']
# summary = summary.sort_values(by = ['Cohort','Mean_Overall_Weighted_F1', 'Platform','Algorithm_Method'], ascending =[True,False, False, False]).reset_index(drop=True)
# summary.to_csv('../data/table_docker_info/table_models.tsv', sep='\t', index=False)

# Choice 2: sanity check Christina's table but use info from my data

In [None]:
# move into once tableN_docker models notebook runs (this code does a sanity check with my work)

import pandas as pd
import yaml

# Platforms each method reports beyond the five single data types.
extra_options = {
    'CF':['All', 'OVERALL'],
    'AKLIMATE':[ 'MULTI', 'TOP'], 
    'skgrid':[ 'OVERALL'],
    'subSCOPE':[ 'OVERALL'],
    'jadbio':['MULTI']

}

# docker tool name -> method name used in the results tables
nameconvert = {
    'aklimate':'AKLIMATE', 
    'cloudforest':'CF',
    'jadbio':'jadbio',
    'skgrid':'skgrid',
    'subscope':'subSCOPE'
}

# Alternative input without the 100-feature cap:
# with open('../data/table_docker_info/options_extended.yml', 'r') as fh2:
# yaml.Loader can construct arbitrary Python objects; only use it on files
# produced by this project.
with open('../data/table_docker_info/options_extended_100ftmax.yml', 'r') as fh2:
    best = yaml.load(fh2, Loader=yaml.Loader)

# sanity check my filtering matches christina's
# read in christina data 
perf_df =pd.read_csv('../src/BestModelPerDataTypePerGroup_deduplicated_2022_06_16_fixed_fromChristina.txt', sep='\t')

# One message per (cohort, method, platform) where the two sources disagree.
debug_issues = []

for algor in extra_options.keys():
    for cancer in list(best.keys()):
        # platform -> my mean score, for the platforms that can be compared
        check = {}
        for k, v in best[cancer][algor].items():
            if 'info' not in k:
                continue
            my_mean = v['Mean_overall_weighted_f1']
            # aka don't count the nans (platforms for which no model matched)
            if pd.isna(my_mean):
                continue
            plat = k.split('_')[1]
            # ignore the TOP and OVERALL (will calculate this later)
            if plat in ('TOP', 'OVERALL'):
                continue
            check[plat] = my_mean

        in_christina = perf_df[(perf_df['cohort'] == cancer) & (perf_df['feature_list_method'] == algor)]
        for check_plat, my_mean in check.items():
            # aklimate / jadbio "MULTI" and CF "All" are all reported as "ALL"
            # in christina's table
            christina_plat = 'ALL' if check_plat in ('MULTI', 'All') else check_plat
            christina_means = list(in_christina[in_christina['datatype'] == christina_plat]['Mean'])

            # A combination missing from christina's table is recorded as an
            # issue instead of aborting the whole check with an IndexError.
            if not christina_means:
                debug_issues.append('no row in christinas table for {} {} {} (my mean {})'.format(cancer, algor, check_plat, my_mean))
                continue

            perf_df_Mean = christina_means[0]
            if perf_df_Mean != my_mean:
                debug_issues.append('issue with not matchingup for {} christinas mean (perf_df) and {} my mean (options yml). {} {} {}'.format(perf_df_Mean, my_mean, cancer, algor, check_plat))

In [None]:
# #### delete this code block
# import pandas as pd
# import yaml

# extra_options = {
#     'CF':['All', 'OVERALL'],
#     'AKLIMATE':[ 'MULTI', 'TOP'], 
#     'skgrid':[ 'OVERALL'],
#     'subSCOPE':[ 'OVERALL'],
#     'jadbio':['MULTI']

# }

# nameconvert = {
#     'aklimate':'AKLIMATE', 
#     'cloudforest':'CF',
#     'jadbio':'jadbio',
#     'skgrid':'skgrid',
#     'subscope':'subSCOPE'
# }


# with open('../data/table_docker_info/options_extended.yml', 'r') as fh2:
#     best = yaml.load(fh2, Loader=yaml.Loader)
# # sanity check my filtering matches christina's
# # read in christina data 
# perf_df =pd.read_csv('../src/BestModelPerDataTypePerGroup_deduplicated_2022_06_16_fixed_fromChristina.txt', sep='\t')

# debug_issues = []

# Manual look at one reported mismatch (ACC / AKLIMATE / CNVR).
# Christina's file shows 0.529 for this combination.
perf_df.loc[perf_df['cohort'].eq('ACC') & perf_df['feature_list_method'].eq('AKLIMATE')]

# Same combination in the big results file. `df` is reused from the first cell;
# to run this cell on its own, re-read it first:
# df = pd.read_csv('../src/classifier_metrics_20220511/big_results_matrix.tsv', sep='\t', low_memory=False)
#
# All models for: method, cancer, overall_weighted_f1, best mean first.
# To cap the feature-set size as well, add this before the sort:
#         max_ft_size = int(filters)
#         subset = subset[subset['total_features'] <= max_ft_size].reset_index(drop=True)
subset = (
    df.loc[
        df['feature_list_method'].eq('AKLIMATE')
        & df['cohort'].eq('ACC')
        & df['performance_metric'].eq('overall_weighted_f1')
    ]
    .reset_index(drop=True)
    .sort_values(by='Mean', ascending=False)
)
subset

In [None]:
# List every mismatch collected by the sanity check, one per paragraph.
print('issues were')
for issue in debug_issues:
    print(issue, end='\n\n')

### sanity check

AssertionError: issue with not matchingup for 0.529 perfdf mean and 0.553 christinas mean. ACC AKLIMATE CNVR

In [None]:
# Re-read the untouched results matrix (no '__NO_LIST__' filtering, no size cap).
bmatrix_raw = pd.read_csv(
    '../src/classifier_metrics_20220511/big_results_matrix.tsv',
    sep='\t',
    low_memory=False,
)

##### combination to inspect
grp, c, p = 'AKLIMATE', 'ACC', 'CNVR'
#####

# Models of that group / cohort for the metric under study, best mean first.
bmatrix = (
    bmatrix_raw.loc[
        bmatrix_raw['feature_list_method'].eq(grp)
        & bmatrix_raw['cohort'].eq(c)
        & bmatrix_raw['performance_metric'].eq(pmetric)
    ]
    .reset_index(drop=True)
    .sort_values(by='Mean', ascending=False)
)

# Keep only the models whose name mentions the platform.
ms = list(filter(lambda model_name: p in model_name, bmatrix['model']))
bmatrix.loc[bmatrix['model'].isin(ms)]

In [None]:
# Shape of the results matrix once rows without a feature-list length ('__NO_LIST__') are excluded.
bmatrix_raw.loc[bmatrix_raw['total_features'].ne('__NO_LIST__')].shape


In [None]:
# Shape of the unfiltered results matrix, to compare with the '__NO_LIST__'-filtered shape.
bmatrix_raw.shape

### end of sanity check

### WIP - continue with adding in the OVERALL category for that algor


In [None]:
# NOTE(review): the selection cell writes, and the sanity check reads,
# 'options_extended_100ftmax.yml', while this cell loads 'options_extended.yml'.
# Confirm that this is the intended input.
# yaml.Loader can construct arbitrary Python objects; only use it on files
# produced by this project.
with open('../data/table_docker_info/options_extended.yml', 'r') as fh2:
    best = yaml.load(fh2, Loader=yaml.Loader)

# determine which algors need to calculate overall
models_need_overall = [k for k, v in extra_options.items() if 'OVERALL' not in v]

# add in OVERALL = highest performance of all models from that algor
# thus will say which model of AKLIMATE is OVERALL, then for jadbio, etc
for algor in models_need_overall:
    for cancer in list(best.keys()):
        algor_entry = best[cancer][algor]

        # find the 'info_<platform>' entry with the highest mean score
        # (NaN scores never win: `nan > x` is False)
        overall_4_algor = {'model':'NA', 'score': 0}
        for KEY in [a for a in algor_entry.keys() if 'info' in a and a != 'info_OVERALL']:
            perform = algor_entry[KEY]['Mean_overall_weighted_f1']
            if perform > overall_4_algor['score']:
                overall_4_algor['score']=perform
                overall_4_algor['model']=KEY

        winner = overall_4_algor['model']
        if winner == 'NA':
            # No platform has a usable score for this cohort: record that in
            # the same form as the other missing models instead of failing
            # with a KeyError on 'NA'.
            algor_entry['OVERALL'] = 'NO_MODEL_MATCH_' + algor
            algor_entry['info_OVERALL'] = {
                'Mean_overall_weighted_f1': float('nan'),
                'Std_overall_weighted_f1': float('nan'),
                'Max_overall_weighted_f1': float('nan'),
                'full_featureID': float('nan'),
                'full_model': float('nan'),
            }
        else:
            # Both entries are taken from the winning key. Previously the
            # feature ID was looked up with the loop variable, i.e. whichever
            # key happened to be iterated last.
            algor_entry['OVERALL'] = algor_entry[winner.split('_')[1]]
            algor_entry['info_OVERALL'] = algor_entry[winner]


In [None]:
# Details of the models shipped in the docker tools.
# The file is JSON but is parsed here with the YAML safe loader.
model_info_path = '../../../09_docker/gdan-tmp-models/tools/model_info.json'
with open(model_info_path, 'r') as file:
    info = yaml.safe_load(file)

# results-table method name -> name used by the docker tools
reverse_name = dict(
    AKLIMATE='aklimate',
    CF='cloudforest',
    jadbio='jadbio',
    skgrid='skgrid',
    subSCOPE='subscope',
)


In [None]:
# combine all info into 1 df
# One row per cohort / method / platform: the command that runs the model, its
# mean and std score, and its feature list. Every field is 'NA' when no model
# matched.
res = {'Command':[], 'Mean_Overall_Weighted_F1':[],'Std_overall_weighted_f1':[], 'Cohort': [], 'Platform': [], 'Algorithm_Method':[],'Features':[]}

# Platform labels shown in the table; the command keeps the original name.
table_platform = {'TOP': 'OVERALL', 'All': 'MULTI'}

for cancer in best.keys():
    for algor in best[cancer].keys():

        platform_cyles = [a for a in best[cancer][algor] if 'info' in a]

        for pc in platform_cyles:
            platform = pc.split('_')[1]

            # Collect every field first, so a failed lookup cannot leave the
            # columns of `res` with different lengths.
            if 'NO_MODEL' not in best[cancer][algor][platform]:
                command = 'bash RUN_model.sh {} {} {} YOUR-DATA.tsv'.format(cancer, platform, reverse_name[algor])
                mean_value = best[cancer][algor][pc]['Mean_overall_weighted_f1']
                sd_value = best[cancer][algor][pc]['Std_overall_weighted_f1']
                fts = info[reverse_name[algor]][cancer][platform]['fts']
            else:
                # skip when don't have model info on that
                command = 'NA'
                mean_value = 'NA'
                sd_value = 'NA'
                fts = 'NA'

            res['Command'].append(command)
            res['Mean_Overall_Weighted_F1'].append(mean_value)
            res['Std_overall_weighted_f1'].append(sd_value)
            res['Cohort'].append(cancer)
            # modify in table TOP to OVERALL but command col will use TOP still
            res['Platform'].append(table_platform.get(platform, platform))
            res['Algorithm_Method'].append(reverse_name[algor])
            res['Features'].append(fts)



In [None]:
summary = pd.DataFrame.from_dict(res)

# The score column mixes numbers with the string 'NA' (rows without a model),
# which cannot be ordered directly. Sort on a temporary numeric copy instead:
# 'NA' becomes NaN and goes last, and the displayed values stay untouched.
summary = (
    summary
    .assign(_mean_sort=pd.to_numeric(summary['Mean_Overall_Weighted_F1'], errors='coerce'))
    .sort_values(
        by=['_mean_sort', 'Cohort', 'Platform', 'Algorithm_Method'],
        ascending=[False, False, False, False],
        na_position='last',
    )
    .drop(columns='_mean_sort')
    .reset_index(drop=True)
)