# Run Once

In [1]:
# Conversion dictionary filled in by later cells: each key is a selected
# model name and each value is the matching feature_importance_ID.
# EXECUTE THIS CELL ONLY ONCE - running it again discards every entry
# collected so far.
perform2imp = dict()

In [2]:
import pandas as pd
import json

In [3]:
# ---- hard-coded input location ----

# Tab-separated feature importance table; it is read here once and the
# later cells filter it by team and cancer.
f_imp = '../src/classifier_metrics_20210821/feature_importance.tsv'
df_imp_raw = pd.read_csv(f_imp, delimiter='\t')

In [4]:
# Each pair is (team label used for model prefixes and output file names,
# team label compared against the `method` column of the importance table).
# The two public lists below are index-aligned: position i in one list
# belongs with position i in the other.
_TEAM_PAIRS = [
    ('aklimate', 'aklimate'),
    ('CF', 'cloudforest'),
    ('jadbio', 'jadbio'),
    ('subSCOPE', 'subscope'),
    ('skgrid', 'skgrid'),
]
team_options = [team for team, _imp in _TEAM_PAIRS]
team_imp_options = [imp for _team, imp in _TEAM_PAIRS]

# Functions

In [5]:
def skgrid_get_more_model_info(cancer, f_pred='../src/classifier_metrics_20210821/top_performing_models_lte_100_features.tsv'):
    '''Look up the skgrid model(s) listed for one cohort in the top-models table.

    skgrid in the normal pipeline doesn't have enough info to pinpoint one
    feature selection method and classifier, so the classification info is
    pulled from the table of top performing models.

    Reason: skgrid needs info on the classifier to match the best model.

    Parameters
    ----------
    cancer : str
        Cohort label; compared for equality with the `cohort` column.
    f_pred : str, optional
        Path of the tab-separated table to read. It must contain the columns
        `feature_list_method`, `cohort`, `model` and `featureID`. Defaults to
        the location this notebook has always used.

    Returns
    -------
    tuple
        `(info, model)` where `info` is a text meant for logging and:
        - exactly one matching row: `model` is that row's `model` value;
        - several matching rows: `info` starts with
          'MULTIPLE TIED PERFORMING MODELS' and `model` is the list of all
          matching `model` values;
        - no matching row: `info` starts with 'NO SKGRID MODELS FOUND' and
          `model` is an empty list.
    '''
    # Open file
    df_pred = pd.read_csv(f_pred, sep='\t')

    # Get classification info: keep only skgrid rows of the requested cohort
    is_match = (df_pred['feature_list_method'] == 'skgrid') & (df_pred['cohort'] == cancer)
    skgrid_s1 = df_pred[is_match].reset_index(drop=True)
    n_found = skgrid_s1.shape[0]

    if n_found == 1:
        selected_skgrid_model = skgrid_s1['model'].iloc[0]
        return(
            '## SKGRID ONLY. featureID and model info\n{}\n{}'.format(skgrid_s1['featureID'].iloc[0], selected_skgrid_model),
            selected_skgrid_model
        )

    if n_found == 0:
        # Zero hits used to be reported as "multiple tied models (N=0)",
        # which is misleading; the second element is still an empty list.
        return(
            'NO SKGRID MODELS FOUND (N=0)',
            []
        )

    return(
        'MULTIPLE TIED PERFORMING MODELS (N={})'.format(n_found),
        list(skgrid_s1['model'])
    )

# Main Analysis

# First attempt: cycle through all cancers for one team, then move on to the next team.
# This is the easiest approach during development.

In [None]:
######
# Cohorts to process; one best_models_<cohort>.tsv file is read per entry.
cancer_list = ['ACC', 'BLCA', 'BRCA', 'CESC', 'COADREAD', 'ESCC', 'GEA', 'HNSC', 'KIRCKICH', 'KIRP', 'LGGGBM', 'LIHCCHOL', 'LUAD', 'LUSC', 'MESO', 'OV', 'PAAD', 'PCPG', 'PRAD', 'SARC', 'SKCM', 'TGCT', 'THCA', 'THYM', 'UCEC', 'UVM']

######
i = 3 # for team selection
######
# Select Team (the same index is used in both team lists). This is done once,
# before the loop: nothing inside the loop changes the team any more.
selected_team = team_options[i]
selected_team_imp = team_imp_options[i]

# Set up out file
f_out = '../src/conversions/' + selected_team + '.json'

# Create source dictionary of conversions - this need to be tested on all cancers to see if works
# TODO

# k:v == team_options to model prefix in top_models
mini_conversion_prefix = {
    'aklimate' : 'AKLIMATE',
    'CF' : 'CF', #no change
    'jadbio' : 'jadbio', #no change
    'subSCOPE' : 'subSCOPE',
    'skgrid' :'skgrid' #nochange
}
team_prefix = mini_conversion_prefix[selected_team]

# src dictionary specific to team key words in feature_importance_ID column
# (skgrid has no entry; it is handled by the else branch at the bottom)
substring_dict = {
    'aklimate' : { # no MIR importances reported
        'CNVR_ONLY' : 'CNVR_ONLY', #nochange
        'GEXP_ONLY' : 'GEXP_ONLY', #nochange
        'METH_ONLY' : 'METH_ONLY', #nochange
        'MULTI_DATA' : 'MULTI_DATA', #nochange
    },
    'CF' :{
        'All' : 'All', #nochange
        'CNVR' : 'CNVR', #nochange
        'GEXP' : 'GEXP', #nochange
        'METH' : 'METH', #nochange
        'MIR' : 'MIR', #nochange
        'MUTA' : 'MUTA', #nochange
    },
    'jadbio' : {
        'CNVR' : 'CNVR', #nochange
        'GEXP' : 'GEXP', #nochange
        'METH' : 'METH', #nochange
        'MIR' : 'MIR', #nochange
        'MUTA' : 'MUTA', #nochange
        'MULTIDATATYPE' : 'MULTIDATATYPE', #nochange
    },
    'subSCOPE' : {
        'CNVR' : 'CNVR', #nochange
        'GEXP' : 'GEXP', #nochange
        'METH' : 'METH', #nochange
        'MIR' : 'MIR', #nochange
        'MUTA' : 'MUTA', #nochange
        'ENSEMBLE' : 'ENSEMBLE', #nochange
    }
}

# src dictionary if no matches from substring_dict. these are the assumed values
gap_substring_dict = {
    'aklimate' : 'MULTI_DATA'
}

######
for cancer in cancer_list:
    print(cancer)
    print('{} and {} selected from list'.format(selected_team, selected_team_imp))

    # Run only if skgrid for more info to pinpoint model. Outputs variable and logging info
    if selected_team == 'skgrid':
        info , selected_skgrid_model = skgrid_get_more_model_info(cancer)
        print(info)

    # Find top model for a team: candidate model names are the column
    # headers of the per-cohort best-models table
    f_top = '../data/figure_panel_a/best_models_{}.tsv'.format(cancer)
    df_top = pd.read_csv(f_top, sep='\t', index_col=0)
    top_models = list(df_top.columns)
    print('Best models options:\n')
    for t in top_models:
        print(t)

    # Select Team to work on: first model whose name starts with the prefix.
    # selected_model is reset for every cohort so that a cohort without a
    # matching model cannot silently reuse the previous cohort's model.
    selected_model = None
    for m in top_models:
        if m.startswith(team_prefix):
            selected_model = m
            print(selected_model, '\nwas assigned to\n', selected_team)
            break  # the original bare `exit` did nothing; stop at the first hit
    if selected_model is None:
        print('no model starting with {} found for {} - skipping'.format(team_prefix, cancer))
        continue

    # Subset for team and cancer (cancer is matched as a substring of the ID)
    df_imp = df_imp_raw[df_imp_raw['method']==selected_team_imp]
    keep = [v for v in df_imp['feature_importance_ID'] if cancer in v]
    df_imp = df_imp[df_imp['feature_importance_ID'].isin(keep)].reset_index(drop=True)
    print('{} models found'.format(df_imp.shape[0]))

    if selected_team != 'skgrid':
        # Find row that matches with selected_model description

        # 1. Find substring present in selected model
        lookup_key = None
        for potential_substring in substring_dict[selected_team]:
            if potential_substring in selected_model:
                lookup_key = potential_substring
                break
        if lookup_key is None: # if no hits from above
            # raises KeyError for teams without an assumed default
            lookup_key = gap_substring_dict[selected_team]
            print('uses this')
        # 2. Use that to find substring to use in df_imp
        df_lookup_key = substring_dict[selected_team][lookup_key]
        print(df_lookup_key)

        # 3. Find matching model and add to perform2imp
        # (a dedicated loop variable is used so the team index `i` is not overwritten)
        for row, df_model in enumerate(df_imp['feature_importance_ID']):
            if df_lookup_key in df_model:
                print('at row {} found match of\n{}\n\tto\n{}'.format(row, selected_model, df_model))
                perform2imp[selected_model] = df_model
                break

        # Output conversion keys - will overwrite old file with new one each loop
        with open(f_out, 'w') as out:
            out.write(json.dumps(perform2imp))
            out.write('\n')

    else: #for skgrid only
        print('skgrid data goes here')

In [None]:
# ######
# cancer_list = ['ACC', 'BLCA', 'BRCA', 'CESC', 'COADREAD', 'ESCC', 'GEA', 'HNSC', 'KIRCKICH', 'KIRP', 'LGGGBM', 'LIHCCHOL', 'LUAD', 'LUSC', 'MESO', 'OV', 'PAAD', 'PCPG', 'PRAD', 'SARC', 'SKCM', 'TGCT', 'THCA', 'THYM', 'UCEC', 'UVM']

# ######
# for cancer in cancer_list:
#     print(cancer)
#     ######
#     i = 3 # for team selection
#     ######
#     # Select Team
#     selected_team = team_options[i]
#     selected_team_imp = team_imp_options[i]
#     print('{} and {} selected from list'.format(selected_team, selected_team_imp))


#     # Set up out file
#     f_out = '../src/conversions/' + selected_team + '.json'

#     # Run only if skgrid for more info to pinpoint model. Outputs variable and logging info
#     if selected_team == 'skgrid':
#         info , selected_skgrid_model = skgrid_get_more_model_info(cancer)
#         print(info)


#     # Find top model for a team
#     f_top = '../data/figure_panel_a/best_models_{}.tsv'.format(cancer)
#     df_top = pd.read_csv(f_top, sep='\t', index_col=0)
#     top_models = list(df_top.columns)
#     print('Best models options:\n')
#     for t in top_models:
#         print(t)

#     # Create source dictionary of conversions - this need to be tested on all cancers to see if works
#     # TODO

#     # k:v == team_options to model prefix in top_models
#     mini_conversion_prefix = {
#         'aklimate' : 'AKLIMATE', 
#         'CF' : 'CF', #no change
#         'jadbio' : 'jadbio', #no change
#         'subSCOPE' : 'subSCOPE', 
#         'skgrid' :'skgrid' #nochange
#     }


#     # Select Team to work on
#     team_prefix = mini_conversion_prefix[selected_team]
#     for m in top_models:
#         if m.startswith(team_prefix):
#             selected_model = m
#             print(selected_model, '\nwas assigned to\n', selected_team)
#             exit


#     # Subset for team and cancer
#     df_imp = df_imp_raw[df_imp_raw['method']==selected_team_imp]
#     keep = []
#     for v in df_imp['feature_importance_ID']:
#         if cancer in v:
#             keep.append(v)
#     df_imp= df_imp[df_imp['feature_importance_ID'].isin(keep)].reset_index(drop=True)
#     print('{} models found'.format(df_imp.shape[0]))


#     # Find row that matches with selected_model description

#     # src dictionary specfic to team key words in feature_importance_ID column
#     substring_dict = {
#         'aklimate' : { # no MIR importances reported
#             'CNVR_ONLY' : 'CNVR_ONLY', #nochange
#             'GEXP_ONLY' : 'GEXP_ONLY', #nochange
#             'METH_ONLY' : 'METH_ONLY', #nochange
#             'MULTI_DATA' : 'MULTI_DATA', #nochange
#         },
#         'CF' :{
#             'All' : 'All', #nochange
#             'CNVR' : 'CNVR', #nochange
#             'GEXP' : 'GEXP', #nochange
#             'METH' : 'METH', #nochange
#             'MIR' : 'MIR', #nochange
#             'MUTA' : 'MUTA', #nochange
#         },
#         'jadbio' : {
#             'CNVR' : 'CNVR', #nochange
#             'GEXP' : 'GEXP', #nochange
#             'METH' : 'METH', #nochange
#             'MIR' : 'MIR', #nochange
#             'MUTA' : 'MUTA', #nochange
#             'MULTIDATATYPE' : 'MULTIDATATYPE', #nochange
#         },
#         'subSCOPE' : {
#             'CNVR' : 'CNVR', #nochange
#             'GEXP' : 'GEXP', #nochange
#             'METH' : 'METH', #nochange
#             'MIR' : 'MIR', #nochange
#             'MUTA' : 'MUTA', #nochange
#             'ENSEMBLE' : 'ENSEMBLE', #nochange
#         }
#     }
    
#     # src dictionary if no matches from substring_dict. these are the assumed values
#     gap_substring_dict = {
#         'aklimate' : 'MULTI_DATA'
#     }


#     # 1. Find substring present in selected model
#     found = 'false'
#     for potential_substring in substring_dict[selected_team].keys():
#         if potential_substring in selected_model:
#             lookup_key = potential_substring
#             found = 'true'
#             exit
#     if found == 'false': # if no hits from above
#         lookup_key = gap_substring_dict[selected_team]
#         print('uses this')
#     # 2. Use that to find substring to use in df_imp
#     df_lookup_key = substring_dict[selected_team][lookup_key]
#     print(df_lookup_key)

#     # 3. Find matching model and add to perform2imp 
#     for i in range(0, df_imp.shape[0]):
#         if df_lookup_key in df_imp['feature_importance_ID'][i]:
#             df_model = df_imp.iloc[i,:]['feature_importance_ID']
#             print('at row {} found match of\n{}\n\tto\n{}'.format(i, selected_model, df_model))
#             perform2imp[selected_model]= df_imp.iloc[i,:]['feature_importance_ID']
#             exit

#     # Output conversion keys - will overwrite old file with new one each loop
#     with open(f_out, 'w') as out:
#         out.write(json.dumps(perform2imp))
#         out.write('\n')

# STOP - the next section runs one cancer at a time; it is kept only for development for now

In [41]:
######
# Single-cohort variant for development: choose one entry of the cohort
# list by position rather than looping over all of them.
cancer_list = (
    'ACC BLCA BRCA CESC COADREAD ESCC GEA HNSC KIRCKICH KIRP LGGGBM LIHCCHOL LUAD '
    'LUSC MESO OV PAAD PCPG PRAD SARC SKCM TGCT THCA THYM UCEC UVM'
).split()
cancer = cancer_list[0]
print(cancer)
######

ACC


In [36]:
# # dev
# practice_cancers = []
# for cancer in cancer_list:
#     ######
#     i = 4
#     ######
#     # Select Team
#     selected_team = team_options[i]
#     selected_team_imp = team_imp_options[i]
# #     print('{} and {} selected from list'.format(selected_team, selected_team_imp))


#     # Set up out file
#     f_out = '../src/conversions/' + selected_team + '.json'
    
#     # Run only if skgrid for more info to pinpoint model. Outputs variable and logging info
#     if selected_team == 'skgrid':
#         info , selected_skgrid_model = skgrid_get_more_model_info(cancer)
#         if 'MULTIPLE TIED PERFORMING MODELS' in info:
# #             practice_cancers.append(cancer)
#             practice_cancers.append(selected_skgrid_model)



In [35]:
# # Use BRCA
# function_list = []
# for i in practice_cancers:
#     b = [a.strip().split('(')[0] for a in i]
#     for item in b:
#         function_list.append(item)
# function_list = set(function_list) # these are functions that I need to double check the wording on
# function_list

In [42]:
######
i = 4
######
# Select Team: one index picks the entry from both team lists
selected_team = team_options[i]
selected_team_imp = team_imp_options[i]
print('{} and {} selected from list'.format(selected_team, selected_team_imp))


# Set up out file (one JSON file per team)
f_out = ''.join(['../src/conversions/', selected_team, '.json'])

skgrid and skgrid selected from list


# need to add this to above 

In [43]:
# skgrid only: fetch extra model information for the current cancer.
# The helper returns a logging text plus the model (or list of models).
if selected_team == 'skgrid':
    skgrid_lookup = skgrid_get_more_model_info(cancer)
    info, selected_skgrid_model = skgrid_lookup
    print(info)

## SKGRID ONLY. featureID and model info
skgrid_ACC_fbedeBIC_perplatformALL_ACC
skgrid_ExtraTrees(criterion=gini,n_estimators=128)|skgrid_ACC.tsv_skgrid_ACC_fbedeBIC_perplatformALL|ACC.tsv_skgrid_ACC_fbedeBIC_perplatformALL|2021-01-13|c


In [44]:
# Find top model for a team: the candidate model names are the column
# headers of the per-cancer best-models table
f_top = '../data/figure_panel_a/best_models_{}.tsv'.format(cancer)
df_top = pd.read_csv(f_top, delimiter='\t', index_col=0)
top_models = df_top.columns.tolist()
print('Best models options:\n')
for model_name in top_models:
    print(model_name)


Best models options:

AKLIMATE_ACC_reduced_model_100_feature_set_ACC
CF_ACC_MIR_Top_100_ACC
jadbio_ACC_GEXP_cumulative_feature_set18_ACC
skgrid_ACC_fbedeBIC_perplatformALL_ACC
subSCOPE-GEXP_2021-04-21_bootstrapfeatures_ACC_ACC


In [45]:
# Create source dictionary of conversions - this still needs to be tested on
# all cancers to see if it works
# TODO

# key: entry of team_options, value: prefix of that team's model names in
# top_models (only aklimate differs from its key)
mini_conversion_prefix = dict(
    aklimate='AKLIMATE',
    CF='CF',
    jadbio='jadbio',
    subSCOPE='subSCOPE',
    skgrid='skgrid',
)

In [46]:
# Select Team to work on: take the first model whose name starts with the
# team's prefix.
team_prefix = mini_conversion_prefix[selected_team]
selected_model = None  # reset so a value left over from an earlier run is never reused
for m in top_models:
    if m.startswith(team_prefix):
        selected_model = m
        print(selected_model, '\nwas assigned to\n', selected_team)
        break  # the original bare `exit` was a no-op, so the loop never stopped here

# Fail here, with a clear message, rather than later with a stale or missing model
assert selected_model is not None, 'no model in top_models starts with {}'.format(team_prefix)

skgrid_ACC_fbedeBIC_perplatformALL_ACC 
was assigned to
 skgrid


In [47]:
### Look up feature importance scores ###

In [48]:
# Subset for team and cancer:
#   1) rows whose `method` equals the selected team's importance label
#   2) of those, rows whose feature_importance_ID contains the cancer name
team_rows = df_imp_raw[df_imp_raw['method'] == selected_team_imp]
keep = [fid for fid in team_rows['feature_importance_ID'] if cancer in fid]
df_imp = team_rows[team_rows['feature_importance_ID'].isin(keep)].reset_index(drop=True)
print('{} models found'.format(df_imp.shape[0]))
# df_imp

6214 models found


### Only for skgrid

In [15]:
# updated
# Maps the short classifier name at the start of a skgrid model string to the
# name used in feature_importance_ID. Most names gain a 'Classifier' suffix;
# the ones listed in _SKGRID_UNCHANGED stay as they are.
_SKGRID_SHORT_NAMES = [
    'ExtraTrees',
    'LogisticRegression',
    'RandomForest',
    'AdaBoost',
    'BernoulliNB',
    'DecisionTree',
    'GaussianNB',
    'GaussianProcess',
    'KNeighbors',
    'SGD',
    'SVC',
]
_SKGRID_UNCHANGED = {'LogisticRegression', 'BernoulliNB', 'GaussianNB', 'SVC'}
skgrid_function_dict = {
    short: (short if short in _SKGRID_UNCHANGED else short + 'Classifier')
    for short in _SKGRID_SHORT_NAMES
}

In [16]:
# updated

# The skgrid model string obtained from skgrid_get_more_model_info is the input
input_model = selected_skgrid_model

# Remove skgrid prefix: keep everything after the first underscore
modified_model = input_model.strip().partition('_')[2]
modified_model

'RandomForest(criterion=entropy,n_estimators=150)|skgrid_KIRP.tsv_skgrid_KIRP_fbedeBIC_perplatformALL|KIRP.tsv_skgrid_KIRP_fbedeBIC_perplatformALL|2021-01-13|c'

In [17]:
# Update ML function name if needed: the text before the first '(' is the
# short classifier name; swap it for the name from skgrid_function_dict and
# re-attach the rest of the string unchanged
func_name, _paren, model_remainder = modified_model.partition('(')
new_func_name = skgrid_function_dict[func_name]
modified_model = '{}({}'.format(new_func_name, model_remainder)
modified_model

'RandomForestClassifier(criterion=entropy,n_estimators=150)|skgrid_KIRP.tsv_skgrid_KIRP_fbedeBIC_perplatformALL|KIRP.tsv_skgrid_KIRP_fbedeBIC_perplatformALL|2021-01-13|c'

In [18]:
# Clean up ft selection and other misc info in model string.
# From here on modified_model is a list of the '|'-separated fields.
modified_model = modified_model.strip().split('|')
modified_model.pop(1) # remove second item

# Keep only what follows '.tsv_' in the (new) second field
new_ft_method = modified_model[1].split('.tsv_')[1] # remove cancer.tsv_ prefix

# Re-assemble: classifier | cleaned ft method | third field | fourth field
final_model = '|'.join((modified_model[0], new_ft_method, modified_model[2], modified_model[3]))

In [19]:
# Now look up model in df: keep only the rows whose ID equals final_model.
# Note that df_imp is overwritten, so re-running this cell filters the
# already filtered frame.
exact_match = df_imp['feature_importance_ID'] == final_model
df_imp = df_imp.loc[exact_match].reset_index(drop=True)

# Exactly one row has to remain
assert len(df_imp) == 1

In [20]:
# 3. Add the matching model to perform2imp: the ID in the first row (index
#    label 0) of the filtered frame is stored under the selected model name
perform2imp[selected_model] = df_imp.loc[0, 'feature_importance_ID']

In [21]:
perform2imp

{'skgrid_KIRP_fbedeBIC_perplatformALL_KIRP': 'RandomForestClassifier(criterion=entropy,n_estimators=150)|skgrid_KIRP_fbedeBIC_perplatformALL|2021-01-13|c'}

### Back to every team

In [None]:
selected_model

In [None]:
selected_team

In [None]:
df_imp

In [None]:
# Find row that matches with selected_model description

# updated

# src dictionary specific to team key words in feature_importance_ID column.
# Outer key: team (entry of team_options); inner key: substring searched for in
# the selected model name; inner value: substring searched for in
# feature_importance_ID. The 'subSCOPE' entry was missing from this copy and is
# added so it agrees with the dictionary used in the main loop cell.
substring_dict = {
    'aklimate' : { # no MIR importances reported
        'CNVR_ONLY' : 'CNVR_ONLY', #nochange
        'GEXP_ONLY' : 'GEXP_ONLY', #nochange
        'METH_ONLY' : 'METH_ONLY', #nochange
        'MULTI_DATA' : 'MULTI_DATA', #nochange
    },
    'CF' :{
        'All' : 'All', #nochange
        'CNVR' : 'CNVR', #nochange
        'GEXP' : 'GEXP', #nochange
        'METH' : 'METH', #nochange
        'MIR' : 'MIR', #nochange
        'MUTA' : 'MUTA', #nochange
    },
    'jadbio' : {
        'CNVR' : 'CNVR', #nochange
        'GEXP' : 'GEXP', #nochange
        'METH' : 'METH', #nochange
        'MIR' : 'MIR', #nochange
        'MUTA' : 'MUTA', #nochange
        'MULTIDATATYPE' : 'MULTIDATATYPE', #nochange
    },
    'subSCOPE' : {
        'CNVR' : 'CNVR', #nochange
        'GEXP' : 'GEXP', #nochange
        'METH' : 'METH', #nochange
        'MIR' : 'MIR', #nochange
        'MUTA' : 'MUTA', #nochange
        'ENSEMBLE' : 'ENSEMBLE', #nochange
    }
}

# src dictionary if no matches from substring_dict. these are the assumed values
gap_substring_dict = {
    'aklimate' : 'MULTI_DATA'
}

In [None]:
# 1. Find substring present in selected model (first key that occurs in it)
found = False
for potential_substring in substring_dict[selected_team]:
    if potential_substring in selected_model:
        lookup_key = potential_substring
        found = True
        break  # the original bare `exit` was a no-op, so the last hit used to win
if not found: # if no hits from above
    # fall back to the team's assumed value; raises KeyError for teams
    # that have no entry in gap_substring_dict
    lookup_key = gap_substring_dict[selected_team]
    print('uses this')
# 2. Use that to find substring to use in df_imp
df_lookup_key = substring_dict[selected_team][lookup_key]
print(df_lookup_key)

In [None]:
# 3. Find matching model and add to perform2imp: the first row of df_imp whose
#    feature_importance_ID contains df_lookup_key is stored under selected_model.
#    A dedicated loop variable is used so the team index `i` is not overwritten.
for row, df_model in enumerate(df_imp['feature_importance_ID']):
    if df_lookup_key in df_model:
        print('at row {} found match of\n{}\n\tto\n{}'.format(row, selected_model, df_model))
        perform2imp[selected_model] = df_model
        break  # the original bare `exit` was a no-op, so the last hit used to win

In [None]:
# Show every stored conversion: model name, then its feature importance ID
# on an indented line, followed by a blank line
for perf_model, imp_model in perform2imp.items():
    print('{} \n\t{}'.format(perf_model, imp_model), end='\n\n')

In [None]:
# Output conversion keys: the whole dictionary is written as a single line of
# JSON, replacing whatever the file held before
with open(f_out, 'w') as out:
    out.write(json.dumps(perform2imp) + '\n')