# Run Once

In [1]:
# Conversion table (performance-model name -> feature_importance_ID) filled in further down.
# RUN ONLY ONCE: executing this cell again empties the table.
perform2imp = dict()

In [2]:
import pandas as pd
import json

In [3]:
##### hardcoded strings
####

# Tab-separated feature-importance table; the later cells filter it by its
# 'method' and 'feature_importance_ID' columns.
f_imp = '../src/classifier_metrics_20210821/feature_importance.tsv'
df_imp_raw = pd.read_csv(
    filepath_or_buffer=f_imp,
    delimiter='\t',
)

In [4]:
# Each team goes by two spellings: the one used for the performance models
# (team_options) and the one used in the 'method' column of the importance table
# (team_imp_options). Both lists are index-aligned, so they are built from pairs.
_team_name_pairs = [
    ('aklimate', 'aklimate'),
    ('CF', 'cloudforest'),
    ('jadbio', 'jadbio'),
    ('subSCOPE', 'subscope'),
    ('skgrid', 'skgrid'),
]
team_options = [performance_name for performance_name, _ in _team_name_pairs]
team_imp_options = [importance_name for _, importance_name in _team_name_pairs]

# Functions

In [5]:
def skgrid_get_more_model_info(
    cancer,
    f_pred='../src/classifier_metrics_20210821/top_performing_models_lte_100_features.tsv',
):
    '''Look up the classifier behind the top-performing skgrid model of one cohort.

    The normal skgrid pipeline output does not carry enough information to pinpoint
    a single feature-selection method and classifier, so the classifier info is
    pulled from the top-performing-models table.

    Parameters
    ----------
    cancer : str
        Cohort name, compared for equality against the table's 'cohort' column.
    f_pred : str, optional
        Path of the tab-separated top-performing-models table. It must provide the
        'feature_list_method', 'cohort', 'model' and 'featureID' columns. Defaults
        to the path that used to be hard-coded in this function.

    Returns
    -------
    tuple
        (info, model) where info is a log message (str).
        * Exactly one skgrid row for the cohort: model is that row's 'model' value
          and info lists its featureID and model.
        * Any other row count (zero included): model is a list with every matching
          'model' value and info starts with 'MULTIPLE TIED PERFORMING MODELS'.
    '''
    # Open file
    df_pred = pd.read_csv(f_pred, sep='\t')

    # Get classification info: keep the skgrid rows of the requested cohort
    is_skgrid = df_pred['feature_list_method'] == 'skgrid'
    is_cohort = df_pred['cohort'] == cancer
    skgrid_s1 = df_pred[is_skgrid & is_cohort].reset_index(drop=True)
    n_hits = skgrid_s1.shape[0]

    if n_hits == 1:
        selected_skgrid_model = skgrid_s1['model'][0]
        info = '## SKGRID ONLY. featureID and model info\n{}\n{}'.format(
            skgrid_s1['featureID'][0], selected_skgrid_model
        )
        return (info, selected_skgrid_model)

    # NOTE: zero matches also ends up here and is reported as N=0.
    return (
        'MULTIPLE TIED PERFORMING MODELS (N={})'.format(n_hits),
        list(skgrid_s1['model'])
    )

# Main Analysis

# First attempt: cycle through all cancers for one team, then move on to the next team.
# For development this is the easiest approach.

In [6]:
######
cancer_list = ['ACC', 'BLCA', 'BRCA', 'CESC', 'COADREAD', 'ESCC', 'GEA', 'HNSC', 'KIRCKICH', 'KIRP', 'LGGGBM', 'LIHCCHOL', 'LUAD', 'LUSC', 'MESO', 'OV', 'PAAD', 'PCPG', 'PRAD', 'SARC', 'SKCM', 'TGCT', 'THCA', 'THYM', 'UCEC', 'UVM']

######
i = 0 # for team selection
######
# Select Team. None of this depends on the cancer, so it is resolved once up front.
selected_team = team_options[i]
selected_team_imp = team_imp_options[i]

# Set up out file
f_out = '../src/conversions/' + selected_team + '.json'

# Create source dictionary of conversions - this need to be tested on all cancers to see if works
# TODO

# k:v == team_options to model prefix in top_models
mini_conversion_prefix = {
    'aklimate' : 'AKLIMATE',
    'CF' : 'CF', #no change
    'jadbio' : 'jadbio', #no change
    'subSCOPE' : 'subSCOPE',
    'skgrid' :'skgrid' #nochange
}
team_prefix = mini_conversion_prefix[selected_team]

# src dictionary specific to team key words in feature_importance_ID column.
# Only 'aklimate' is filled in so far; selecting any other team raises a KeyError below.
substring_dict = {
    'aklimate' : { # no MIR importances reported
        'CNVR_ONLY' : 'CNVR_ONLY', #nochange
        'GEXP_ONLY' : 'GEXP_ONLY', #nochange
        'METH_ONLY' : 'METH_ONLY', #nochange
        'MULTI_DATA' : 'MULTI_DATA', #nochange
    }
}

# src dictionary if no matches from substring_dict. these are the assumed values
gap_substring_dict = {
    'aklimate' : 'MULTI_DATA'
}

######
for cancer in cancer_list:
    print(cancer)
    print('{} and {} selected from list'.format(selected_team, selected_team_imp))

    # Run only if skgrid for more info to pinpoint model. Outputs variable and logging info
    if selected_team == 'skgrid':
        info , selected_skgrid_model = skgrid_get_more_model_info(cancer)
        print(info)

    # Find top model for a team: candidates are the column names of the per-cancer table
    f_top = '../data/figure_panel_a/best_models_{}.tsv'.format(cancer)
    df_top = pd.read_csv(f_top, sep='\t', index_col=0)
    top_models = list(df_top.columns)
    print('Best models options:\n')
    for t in top_models:
        print(t)

    # Select Team to work on.
    # Reset first so the model of the previous cancer is never silently reused.
    selected_model = None
    for m in top_models:
        if m.startswith(team_prefix):
            selected_model = m
            print(selected_model, '\nwas assigned to\n', selected_team)
            break  # first match wins (the former bare `exit` was a no-op expression)
    if selected_model is None:
        print('WARNING - no model starting with {} found for {}; skipping'.format(team_prefix, cancer))
        continue

    # Subset for team and cancer (cancer is matched as a substring of the ID)
    df_team = df_imp_raw[df_imp_raw['method']==selected_team_imp]
    keep = [v for v in df_team['feature_importance_ID'] if cancer in v]
    df_imp = df_team[df_team['feature_importance_ID'].isin(keep)].reset_index(drop=True)
    print('{} models found'.format(df_imp.shape[0]))

    # Find row that matches with selected_model description

    # 1. Find substring present in selected model
    for potential_substring in substring_dict[selected_team].keys():
        if potential_substring in selected_model:
            lookup_key = potential_substring
            break
    else: # if no hits from above
        lookup_key = gap_substring_dict[selected_team]
        print('uses this')
    # 2. Use that to find substring to use in df_imp
    df_lookup_key = substring_dict[selected_team][lookup_key]
    print(df_lookup_key)

    # 3. Find matching model and add to perform2imp.
    # A dedicated row counter is used so the team index `i` is left untouched.
    for row_idx, df_model in enumerate(df_imp['feature_importance_ID']):
        if df_lookup_key in df_model:
            print('at row {} found match of\n{}\n\tto\n{}'.format(row_idx, selected_model, df_model))
            perform2imp[selected_model] = df_model
            break
    else:
        print('WARNING - no {} row found for {}; nothing added'.format(df_lookup_key, selected_model))

    # Output conversion keys - will overwrite old file with new one each loop
    with open(f_out, 'w') as out:
        out.write(json.dumps(perform2imp))
        out.write('\n')

ACC
aklimate and aklimate selected from list
Best models options:

AKLIMATE_ACC_reduced_model_100_feature_set_ACC
CF_ACC_MIR_Top_100_ACC
jadbio_ACC_GEXP_cumulative_feature_set18_ACC
skgrid_ACC_fbedeBIC_perplatformALL_ACC
subSCOPE-GEXP_2021-04-21_bootstrapfeatures_ACC_ACC
AKLIMATE_ACC_reduced_model_100_feature_set_ACC 
was assigned to
 aklimate
3 models found
uses this
MULTI_DATA
at row 2 found match of
AKLIMATE_ACC_reduced_model_100_feature_set_ACC
	to
AKLIMATE_MULTI_DATA_ACC_20200423_FEATURE_IMPORTANCE
BLCA
aklimate and aklimate selected from list
Best models options:

CF_BLCA_GEXP_Top_100_BLCA
skgrid_BLCA_fbedeBIC_combined_BLCA
AKLIMATE_BLCA_reduced_model_100_feature_set_BLCA
jadbio_BLCA_MULTIDATATYPE_cumulative_feature_set25_BLCA
subSCOPE-GEXP_2021-04-21_bootstrapfeatures_BLCA_BLCA
AKLIMATE_BLCA_reduced_model_100_feature_set_BLCA 
was assigned to
 aklimate
4 models found
uses this
MULTI_DATA
at row 3 found match of
AKLIMATE_BLCA_reduced_model_100_feature_set_BLCA
	to
AKLIMATE_MULTI_

Best models options:

subSCOPE-GEXP_2021-04-21_bootstrapfeatures_OV_OV
AKLIMATE_GEXP_ONLY_OV_reduced_model_100_feature_set_OV
jadbio_OV_MULTIDATATYPE_cumulative_feature_set18_OV
skgrid_OV_fbedeBIC_combined_OV
CF_OV_All_Top_50_OV
AKLIMATE_GEXP_ONLY_OV_reduced_model_100_feature_set_OV 
was assigned to
 aklimate
4 models found
GEXP_ONLY
at row 0 found match of
AKLIMATE_GEXP_ONLY_OV_reduced_model_100_feature_set_OV
	to
AKLIMATE_GEXP_ONLY_OV_20200810_FEATURE_IMPORTANCE
PAAD
aklimate and aklimate selected from list
Best models options:

subSCOPE-GEXP_2021-04-21_bootstrapfeatures_PAAD_PAAD
AKLIMATE_PAAD_reduced_model_20_feature_set_PAAD
CF_PAAD_GEXP_Top_10_PAAD
jadbio_PAAD_GEXP_cumulative_feature_set1_PAAD
skgrid_PAAD_fbedeBIC_combined_PAAD
AKLIMATE_PAAD_reduced_model_20_feature_set_PAAD 
was assigned to
 aklimate
4 models found
uses this
MULTI_DATA
at row 3 found match of
AKLIMATE_PAAD_reduced_model_20_feature_set_PAAD
	to
AKLIMATE_MULTI_DATA_PAAD_20200423_FEATURE_IMPORTANCE
PCPG
aklimate an

# STOP - the next section runs one cancer at a time; it is kept only for development for now

In [None]:
######
# All cohorts that can be analysed; the index below picks the one to work on.
cancer_list = [
    'ACC', 'BLCA', 'BRCA', 'CESC', 'COADREAD', 'ESCC', 'GEA',
    'HNSC', 'KIRCKICH', 'KIRP', 'LGGGBM', 'LIHCCHOL', 'LUAD', 'LUSC',
    'MESO', 'OV', 'PAAD', 'PCPG', 'PRAD', 'SARC', 'SKCM',
    'TGCT', 'THCA', 'THYM', 'UCEC', 'UVM',
]
cancer = cancer_list[3]
print(cancer)
######

In [None]:
######
i = 0
######
# Select Team: the same index is applied to both index-aligned name lists
selected_team, selected_team_imp = team_options[i], team_imp_options[i]
print('{} and {} selected from list'.format(selected_team, selected_team_imp))


# Set up out file (one JSON file per team)
f_out = '../src/conversions/{}.json'.format(selected_team)

In [None]:
# skgrid is the only team that needs the extra classifier lookup.
# The lookup yields a log message plus the selected model (or list of tied models).
if selected_team == 'skgrid':
    skgrid_lookup = skgrid_get_more_model_info(cancer)
    info, selected_skgrid_model = skgrid_lookup
    print(info)

In [None]:
# Find top model for a team
# The candidate best models of this cancer are the column names of the per-cancer table.
f_top = '../data/figure_panel_a/best_models_{}.tsv'.format(cancer)
df_top = pd.read_csv(f_top, sep='\t', index_col=0)
top_models = [column for column in df_top.columns]
print('Best models options:\n')
for model_name in top_models:
    print(model_name)


In [None]:
# Create source dictionary of conversions - this need to be tested on all cancers to see if works
# TODO

# Maps each team_options entry to the prefix its models carry in top_models.
# Only aklimate differs (upper-cased); every other team uses its name unchanged.
mini_conversion_prefix = dict(
    aklimate='AKLIMATE',
    CF='CF',
    jadbio='jadbio',
    subSCOPE='subSCOPE',
    skgrid='skgrid',
)

In [None]:
# Select Team to work on
team_prefix = mini_conversion_prefix[selected_team]
# Reset first so a model left over from an earlier run of this cell is never reused.
selected_model = None
for m in top_models:
    if m.startswith(team_prefix):
        selected_model = m
        print(selected_model, '\nwas assigned to\n', selected_team)
        break  # first match wins (the former bare `exit` was a no-op expression)
else:
    print('WARNING - no model starting with {} found for {}'.format(team_prefix, cancer))

In [None]:
### Look up feature importance scores ###

In [None]:
# Subset for team and cancer
# Step 1: rows reported by the selected team.
team_rows = df_imp_raw[df_imp_raw['method']==selected_team_imp]
# Step 2: of those, the IDs that contain the cancer name as a substring.
keep = [v for v in team_rows['feature_importance_ID'] if cancer in v]
df_imp = team_rows[team_rows['feature_importance_ID'].isin(keep)].reset_index(drop=True)
print('{} models found'.format(len(df_imp)))
df_imp

In [None]:
# will need to manually pick the row index that matches the selected model description
# ex. if only gexp then select the only gexp model
# skip down 

### Only for skgrid

In [None]:
# Reminder only: nothing is computed here. When skgrid is the selected team, the
# skgrid-specific cells that follow have to be run by hand before continuing.
if selected_team == 'skgrid':
    print('ERROR - MUST RUN CODE IN SEVERAL CELLS BELOW FOR SKGRID ONLY')

In [None]:
# # for sk grid only
# # filter 1
# keep =[]
# for v in df_imp['feature_importance_ID']:
#     if 'criterion=entropy,n_estimators=200' in v:
#         keep.append(v)
# s3 = df_imp[df_imp['feature_importance_ID'].isin(keep)].reset_index(drop=True)

# # filter 2
# keep =[]
# for v in s3['feature_importance_ID']:
#     if 'BRCA_fbedeBIC_perplatformALL' in v:
#         keep.append(v)
        
# s3 = df_imp[df_imp['feature_importance_ID'].isin(keep)].reset_index(drop=True)      
# for a in s3['feature_importance_ID']:
#     print(a)
    

# # df_imp[df_imp['feature_importance_ID']=='RandomForestClassifier(criterion=entropy,n_estimators=200)|skgrid_BRCA_fbedeBIC_perplatformALL|2021-01-13|c']

# # pulled index from above line
# line3921 = json.loads(df_imp.iloc[3921,:]['json_object'])['feature_importance_scores']
# line10135 = json.loads(df_imp.iloc[10135,:]['json_object'])['feature_importance_scores']
# print('are these dups the same? {}'.format(line10135==line3921))


In [None]:
# Print the skgrid lookup message line by line, each followed by a blank line.
# `info` and `selected_skgrid_model` are only assigned when skgrid is the selected
# team, so the whole cell is guarded; otherwise it would raise a NameError or print
# values left over from an earlier skgrid run.
if selected_team == 'skgrid':
    # `info_line` rather than `i`, so the team index `i` is not overwritten.
    for info_line in info.strip().split('\n'):
        print(info_line)
        print()
        if info_line.startswith('MULTIPLE TIED PERFORMING MODELS'):
            # In the tied case selected_skgrid_model is a list of model names.
            for m in selected_skgrid_model:
                print(m)
                print()

In [None]:
# for sk grid only
# Narrow df_imp down to the rows whose feature_importance_ID contains every
# substring below. Edit the three substrings to describe the model being looked up.
skgrid_id_filters = (
    'LogisticRegression', # HERE # filter 1: classifier
    'C=10,max_iter=500,solver=liblinear', # HERE # filter 2: classifier parameters
    'fbedeBIC_perplatformALL', # HERE # filter 3: feature set
)

# Each filter is applied to the result of the previous one.
s3 = df_imp
for required_substring in skgrid_id_filters:
    keep = [v for v in s3['feature_importance_ID'] if required_substring in v]
    s3 = s3[s3['feature_importance_ID'].isin(keep)].reset_index(drop=True)

matched_ids = list(s3['feature_importance_ID'])
for matched_id in matched_ids:
    print(matched_id)

# Show the df_imp rows (with their df_imp index) of the last ID printed above.
# When nothing matched, show the empty result instead of failing on an undefined name.
df_imp[df_imp['feature_importance_ID']==matched_ids[-1]] if matched_ids else s3

In [None]:
# # debug
# for s in s3['feature_importance_ID']:
#     print(s)

In [None]:
# CHECK if there are dups that the dups are the same
# The two row positions are hard-coded; per the original note they were pulled from
# the output of the cell above, so they must be updated for every new lookup.
lineA, lineB = (
    json.loads(df_imp['json_object'].iloc[row_position])['feature_importance_scores']
    for row_position in (498, 6712)
)
print('are these dups the same? {}'.format(lineA==lineB))


### Back to every team

In [None]:
# Display the model name picked for the selected team (sanity check before matching).
selected_model

In [None]:
# Display the team currently being processed.
selected_team

In [None]:
# Display the feature-importance rows kept for the selected team and cancer.
df_imp

In [None]:
# Find row that matches with selected_model description

# Per-team keywords: key = keyword looked for in the selected model name,
# value = keyword looked for in the feature_importance_ID column.
# For aklimate both spellings are identical (no MIR importances reported).
substring_dict = {
    'aklimate' : {
        keyword: keyword
        for keyword in ('CNVR_ONLY', 'GEXP_ONLY', 'METH_ONLY', 'MULTI_DATA')
    }
}

# Fallback keyword per team, used when the model name contains none of the keywords
# above. These are assumed values.
gap_substring_dict = dict(aklimate='MULTI_DATA')

In [None]:
# 1. Find substring present in selected model
for potential_substring in substring_dict[selected_team].keys():
    if potential_substring in selected_model:
        lookup_key = potential_substring
        break  # first hit wins (the former bare `exit` was a no-op expression)
else: # if no hits from above, fall back to the assumed keyword for this team
    lookup_key = gap_substring_dict[selected_team]
    print('uses this')
# 2. Use that to find substring to use in df_imp
df_lookup_key = substring_dict[selected_team][lookup_key]
print(df_lookup_key)

In [None]:
# 3. Find matching model and add to perform2imp
# The first row whose ID contains the keyword is recorded and the scan stops there
# (the former bare `exit` was a no-op expression, so the scan never stopped).
# `row_idx` is used as counter so the team index `i` is not overwritten.
for row_idx, df_model in enumerate(df_imp['feature_importance_ID']):
    if df_lookup_key in df_model:
        print('at row {} found match of\n{}\n\tto\n{}'.format(row_idx, selected_model, df_model))
        perform2imp[selected_model] = df_model
        break
else:
    print('WARNING - no {} row found for {}; nothing added'.format(df_lookup_key, selected_model))

In [None]:
# Review the conversions collected so far: model name, then its importance ID
# indented on the next line, with a blank line between entries.
for model_name, importance_id in perform2imp.items():
    print('{} \n\t{}'.format(model_name, importance_id))
    print()

In [None]:
# Output conversion keys
# The whole mapping is serialised as a single JSON line; an existing file is overwritten.
with open(f_out, 'w') as out:
    out.write(json.dumps(perform2imp) + '\n')