In [None]:
import pandas as pd
import yaml

# Maps the algorithm label used as a key in the model tables
# to the lower-case name used in docker commands and lookups.
reverse_name = dict([
    ('AKLIMATE', 'aklimate'),
    ('CF', 'cloudforest'),
    ('jadbio', 'jadbio'),
    ('skgrid', 'skgrid'),
    ('subSCOPE', 'subscope'),
])


# '<platform>_<algorithm>' keys whose platform name really denotes the
# multi-platform (MULTI) model. There is no subscope entry.
harmonize_MULTI = dict([
    ('OVERALL_skgrid', 'MULTI'),
    ('All_cloudforest', 'MULTI'),
    ('MULTI_jadbio', 'MULTI'),    # name already MULTI, no change
    ('MULTI_aklimate', 'MULTI'),  # name already MULTI, no change
])


def get_MULTI_fts(FT_DF, CANCER, ALGOR, PLATFORM, BEST=None):
    '''Return the feature names used by one algorithm's multi-platform model.

    Parameters
    ----------
    FT_DF : pandas.DataFrame
        One row per feature list, indexed by feature-list ID. Must contain the
        columns 'feature_list_cohort', 'feature_list_method' and
        'feature_list_size'; every other column is treated as a feature, and
        the string value '1' marks the feature as part of the list.
    CANCER : str
        Cohort to look up (matched against 'feature_list_cohort').
    ALGOR : str
        One of 'subscope', 'aklimate', 'cloudforest', 'skgrid', 'jadbio'.
    PLATFORM : str
        Platform name; the model details are read from the 'info_' + PLATFORM
        entry of the model table.
    BEST : dict, optional
        Nested model table: cohort -> method -> 'info_<platform>' ->
        {'full_featureID': ...}. Defaults to the notebook-level ``best``.

    Returns
    -------
    list of str
        Feature names flagged '1' for the matching feature list, or the string
        'NA' for subscope (which has no MULTI model).

    Raises
    ------
    AssertionError
        If ALGOR is not a supported name, or if the feature ID does not match
        exactly one feature list of the cohort/method.
    '''
    nameconvert = {
        'aklimate':'AKLIMATE',
        'cloudforest':'CF',
        'jadbio':'jadbio',
        'skgrid':'skgrid',
        'subscope':'subSCOPE'
    }

    # Validate the *argument* (the original read the notebook global `algor`).
    assert ALGOR in ['subscope', 'aklimate', 'cloudforest', 'skgrid', 'jadbio'], '{} not in specified options of [subscope, aklimate, cloudforest, skgrid, jadbio]'.format(ALGOR)

    # Subscope did not have a MULTI model
    if ALGOR == 'subscope':
        return 'NA'

    # Fall back to the notebook-level model table when none is passed in.
    if BEST is None:
        BEST = best

    method = nameconvert[ALGOR]

    # get model details
    ft_id = BEST[CANCER][method]['info_'+PLATFORM]['full_featureID']

    # filter FT_DF for CANCER and ALGOR
    candidates = FT_DF[(FT_DF['feature_list_cohort']==CANCER)&(FT_DF['feature_list_method']==method)]

    # the feature list for that model: its ID must contain ft_id, exactly once
    matches = [fid for fid in candidates.index if ft_id in fid]
    assert len(matches)==1, 'error {} matched {} feature lists, expected exactly 1'.format(ft_id, len(matches))
    row = candidates.loc[matches[0]]

    # keep only the feature columns, then collect those flagged as present;
    # iterating label/value pairs avoids positional integer lookups on a
    # label-indexed Series
    row = row.drop(['feature_list_method','feature_list_cohort','feature_list_size'])
    return [ft for ft, value in row.items() if value == '1']


In [None]:
# Open Christina's tab-separated best-model file and display it
perf_df = pd.read_csv(
    '../src/BestModelPerDataTypePerGroup_deduplicated_2022_06_16_fixed_fromChristina.txt',
    sep='\t',
)
perf_df

In [None]:
# Model metadata used by the docker models (JSON file parsed with the YAML safe loader)
with open('../../../09_docker/gdan-tmp-models/tools/model_info.json', 'r') as file:
    info = yaml.safe_load(file)

# Best-model options table.
# NOTE(review): yaml.Loader can construct arbitrary Python objects; this is only
# acceptable because the file is a trusted, locally generated artifact.
with open('../data/table_docker_info/options_extended_100ftmax.yml', 'r') as fh2:
    best = yaml.load(fh2, Loader=yaml.Loader)

# Feature matrix: drop the meta column, then transpose the matrix
file = '../src/classifier_metrics_20220511/collected_features_matrix.tsv'
ft_df = (
    pd.read_csv(file, sep='\t', index_col=0, low_memory=False)
    .drop('total_number_of_lists', axis=1)
    .transpose()
)

In [None]:
# '<platform>_<algorithm>' keys identifying "best within algorithm" entries.
harmonize_BESTwithinAlgor = dict([
    ('TOP_aklimate', 'pass'),
    ('OVERALL_subscope', 'MULTI'),
    ('OVERALL_cloudforest', 'MULTI'),
])



In [None]:
# Combine all info into one DataFrame: one row per (cohort, algorithm, platform)
# model that has performance info, with the docker command, mean/std of the
# overall weighted F1, and the model's feature list.
print('TODO make sure docker command is TOP when dictionary shows alimate for OVERALL')

# (platform, algorithm, cohort) combinations that are skipped for now,
# until resolved with chris and christina.
pending_fix = {
    ('METH', 'AKLIMATE', 'BRCA'),
    ('GEXP', 'AKLIMATE', 'PAAD'),
    ('CNVR', 'AKLIMATE', 'SKCM'),
}

res = {'Command':[], 'Mean_Overall_Weighted_F1':[],'Std_overall_weighted_f1':[], 'Cohort': [], 'Platform': [], 'Algorithm_Method':[],'Features':[]}
for cancer in best.keys():
    for algor in best[cancer].keys():

        platform_cyles = [a for a in best[cancer][algor] if 'info' in a]
        for pc in platform_cyles:
            platform = pc.split('_')[1]

            # only report entries that have model performance info
            if 'NO_MODEL' in best[cancer][algor][platform]:
                continue

            harmonize_key = platform + '_' + reverse_name[algor]

            # MULTI - special handling of names (ex. OVERALL means MULTI)
            if harmonize_key in harmonize_MULTI:
                # NOTE(review): this branch uses the placeholder DATA.tsv while the
                # standard branch uses YOUR-DATA.tsv - confirm which one is intended.
                command = 'bash RUN_model.sh {} {} {} DATA.tsv'.format(cancer, platform, reverse_name[algor])
                # Table will show col as MULTI - but command will show what was used in model (All, OVERALL, etc)
                platform_label = 'MULTI'
                fts = get_MULTI_fts(ft_df, cancer, reverse_name[algor], platform)

            # SKIP any best of the best calculations for now
            # meaning: AKLIMATE: TOP, subSCOPE: OVERALL, CF: OVERALL
            elif harmonize_key in harmonize_BESTwithinAlgor:
                # `continue`, not `break`: skip only this entry and keep
                # processing the remaining platforms of this algorithm.
                continue

            # TODO skipping for now until resolved with chris and christina
            elif (platform, algor, cancer) in pending_fix:
                print('TODO handle for this, waiting to hear back from chris and christina on this fix, skipping for now', platform, algor, cancer)
                continue

            # standard just GEXP, MUTA, METH, CNVR, MIR
            else:
                command = 'bash RUN_model.sh {} {} {} YOUR-DATA.tsv'.format(cancer, platform, reverse_name[algor])
                platform_label = platform
                fts = info[reverse_name[algor]][cancer][platform]['fts']

            mean_value = best[cancer][algor][pc]['Mean_overall_weighted_f1']
            sd_value = best[cancer][algor][pc]['Std_overall_weighted_f1']

            # Append the whole row only after every value was obtained, so a
            # failed lookup cannot leave the columns with different lengths.
            res['Command'].append(command)
            res['Mean_Overall_Weighted_F1'].append(mean_value)
            res['Std_overall_weighted_f1'].append(sd_value)
            res['Cohort'].append(cancer)
            res['Platform'].append(platform_label)
            res['Algorithm_Method'].append(reverse_name[algor])
            res['Features'].append(fts)

summary = pd.DataFrame.from_dict(res)
summary

In [None]:
# Debug aid: show the loop variables (algor, cancer, platform) as left behind by
# the last executed iteration of the summary loop; requires that loop to have run.
print(algor, cancer, platform)

In [None]:
### inspect issues: aklimate
# 1. AKLIMATE BRCA METH
# 2. AKLIMATE PAAD GEXP
# 3. AKLIMATE SKCM CNVR

# Are they in Christina's file? -> yes
# (cancer / algor are the values left over from the summary loop)
perf_df.loc[
    (perf_df['cohort'] == cancer)
    & (perf_df['feature_list_method'] == algor)
]

# Are they in the docker source files? -> no

In [None]:
### inspect issues: cloudforest
# 1. CF UVM CNVR - odd that info only has OVERALL as a key for cloudforest UVM
#    1B. need to add all of the UVM cloudforest models to info:
#   - CNVR
#   - GEXP
#   - METH
#   - All
#   - MUTA
#   - MIR
#   - skip (OVERALL)
# In Christina's file?
# perf_df[(perf_df['cohort']==cancer)&(perf_df['feature_list_method']==algor)]
# -> yes

# In the docker data (cloudforest/data/models_cf/UVM/CNVR)?
# -> yes
# All 5 single platforms plus All and OVERALL are found in option.yaml and are runnable.

# Re-import and reload model_info.json so this section can be run on its own.
import yaml
import pandas as pd
with open('../../../09_docker/gdan-tmp-models/tools/model_info.json', 'r') as file:
    info = yaml.safe_load(file)


In [None]:
# Display the existing cloudforest BRCA entry of info
# (presumably as a reference for the UVM entries built below - confirm).
info['cloudforest']['BRCA']

In [None]:
# Collects the UVM cloudforest records, keyed by platform name
to_add = dict()



In [None]:
# UVM cloudforest, 'All' platform: the row labels of the model's .fi file are stored as its features
df = pd.read_csv(
    '../../../09_docker/gdan-tmp-models/cloudforest/data/models_cf/UVM/All/CF_UVM_All_Top_100_UVM.fi',
    sep='\t',
    index_col=0,
)
fts = [*df.index]

to_add['All'] = {
    'model': 'cloudforest',
    'model_params': 'CF_UVM_All_Top_100_UVM',
    'fts': fts,
}

In [None]:
# Display the records collected in to_add so far
to_add

In [None]:
# UVM cloudforest, CNVR platform: the row labels of the model's .fi file are stored as its features
plat = 'CNVR'
df = pd.read_csv(
    '../../../09_docker/gdan-tmp-models/cloudforest/data/models_cf/UVM/{}/CF_UVM_{}_Top_100_UVM.fi'.format(plat, plat),
    sep='\t',
    index_col=0,
)
fts = [*df.index]

to_add[plat] = {
    'model': 'cloudforest',
    'model_params': 'CF_UVM_CNVR_Top_100_UVM',
    'fts': fts,
}

In [None]:
# UVM cloudforest, GEXP platform: the row labels of the model's .fi file are stored as its features
plat = 'GEXP'
df = pd.read_csv(
    '../../../09_docker/gdan-tmp-models/cloudforest/data/models_cf/UVM/{}/CF_UVM_{}_Top_100_UVM.fi'.format(plat, plat),
    sep='\t',
    index_col=0,
)
fts = [*df.index]

to_add[plat] = {
    'model': 'cloudforest',
    'model_params': 'CF_UVM_GEXP_Top_100_UVM',
    'fts': fts,
}

In [None]:
# UVM cloudforest, METH platform: the row labels of the model's .fi file are stored as its features
plat = 'METH'
df = pd.read_csv(
    '../../../09_docker/gdan-tmp-models/cloudforest/data/models_cf/UVM/{}/CF_UVM_{}_Top_100_UVM.fi'.format(plat, plat),
    sep='\t',
    index_col=0,
)
fts = [*df.index]

to_add[plat] = {
    'model': 'cloudforest',
    'model_params': 'CF_UVM_METH_Top_100_UVM',
    'fts': fts,
}

In [None]:
# UVM cloudforest, MIR platform: the row labels of the model's .fi file are stored as its features
plat = 'MIR'
df = pd.read_csv(
    '../../../09_docker/gdan-tmp-models/cloudforest/data/models_cf/UVM/{}/CF_UVM_{}_Top_100_UVM.fi'.format(plat, plat),
    sep='\t',
    index_col=0,
)
fts = [*df.index]

to_add[plat] = {
    'model': 'cloudforest',
    'model_params': 'CF_UVM_MIR_Top_100_UVM',
    'fts': fts,
}

In [None]:
# UVM cloudforest, MUTA platform: the row labels of the model's .fi file are stored as its features
plat = 'MUTA'
df = pd.read_csv(
    '../../../09_docker/gdan-tmp-models/cloudforest/data/models_cf/UVM/{}/CF_UVM_{}_Top_100_UVM.fi'.format(plat, plat),
    sep='\t',
    index_col=0,
)
fts = [*df.index]

to_add[plat] = {
    'model': 'cloudforest',
    'model_params': 'CF_UVM_MUTA_Top_100_UVM',
    'fts': fts,
}

In [None]:
# UVM cloudforest, OVERALL entry: read from the GEXP model file stored under the
# OVERALL directory; its row labels are stored as the features
plat = 'OVERALL'
df = pd.read_csv(
    '../../../09_docker/gdan-tmp-models/cloudforest/data/models_cf/UVM/OVERALL/CF_UVM_GEXP_Top_100_UVM.fi',
    sep='\t',
    index_col=0,
)
fts = [*df.index]

to_add[plat] = {
    'model': 'cloudforest',
    'model_params': 'CF_UVM_GEXP_Top_100_UVM',
    'fts': fts,
}

In [None]:
import yaml
import pandas as pd
# Load the v2 copy of the model metadata (a JSON file parsed with the YAML safe
# loader); this rebinds `info`, replacing any previously loaded value.
with open('../../../09_docker/gdan-tmp-models/tools/model_info.v2.json', 'r') as file:
    info = yaml.safe_load(file)


In [None]:
# Set the cloudforest UVM entry of info to the per-platform records collected in to_add
info['cloudforest']['UVM']= to_add

In [None]:
# Check which platform keys the cloudforest UVM entry holds
info['cloudforest']['UVM'].keys()

In [None]:
import json
# Write info back to the v2 metadata file as JSON (overwrites the file)
with open("../../../09_docker/gdan-tmp-models/tools/model_info.v2.json", "w") as outfile:
    outfile.write(json.dumps(info))

In [None]:
# Display the skgrid UVM entry of info
info['skgrid']['UVM']