In [None]:
#!/usr/bin/python

import argparse
import os

import matplotlib.pyplot as plt
import pandas as pd


In [None]:
# def get_arguments():
#     parser = argparse.ArgumentParser(description='')
#     parser.add_argument("-t", "--tumor", help ="cancer cohort", required=True, type=str)
#     parser.add_argument("-m", "--metric", help ="classification performance metric", required=True, type=str)
#     parser.add_argument("-fil", "--filters", help ="none or integer for feature set filter max size to be considered for best model", required=True, type=str)
#     parser.add_argument("-f1", "--file1_fts", help ="classification performance file with all groups", required=True, type=str)
#     parser.add_argument("-f2", "--file2_perform", help ="feature set file with all groups", required=True, type=str)
#     parser.add_argument("-o", "--out", help ="output file", required=True, type=str)
#     return parser.parse_args()

# args = get_arguments()
# cancer = args.tumor
# pmetric = args.metric
# file_fts = args.file1_fts
# file_preds = args.file2_perform
# file_output = args.out
# filters = args.filters

cancer = 'BRCA'
pmetric = 'overall_weighted_f1'
file_fts = '../../src/collected_features_matrix_20200722.tsv.gz'
file_preds = '../../src/feature_list_with_performance_with_subtype_names_20200828.tsv.gz'
file_output = 'data/figure_panel_a/TESTING_best_models_' + cancer + '.tsv'
filters = 'none'



In [None]:
# Cohort identifiers iterated over by the analysis, in processing order.
cohorts = [
    'ACC', 'BLCA', 'BRCA', 'CESC', 'COADREAD', 'ESCC', 'GEA',
    'HNSC', 'KIRCKICH', 'KIRP', 'LGGGBM', 'LIHCCHOL', 'LUAD', 'LUSC',
    'MESO', 'OV', 'PAAD', 'PCPG', 'PRAD', 'SARC', 'SKCM',
    'TGCT', 'THCA', 'THYM', 'UCEC', 'UVM',
]


In [None]:
# Hard-coded model groups to compare. A plain string names one value of the
# 'feature_list_method' column; the nested list bundles several methods into
# a single group.
groups = [
    'gnosis',
    'CF|All',
    'AKLIMATE',
    'nn',
    ['rfe15', 'fbedeBIC'],
]

# Tab-separated performance table. low_memory=False makes pandas read the file
# in one pass instead of chunk-wise.
performance_df = pd.read_csv(file_preds, sep='\t', low_memory=False)

In [None]:
def _save_rank_plot(values, positions, tick_labels, ylabel, out_path):
    """Plot `values` against model rank and save the figure to `out_path`.

    Parameters
    ----------
    values : pandas.Series
        Values to draw, one per ranked model.
    positions : list of int
        X-axis tick positions.
    tick_labels : list
        Tick labels, one per position; drawn rotated by 90 degrees.
    ylabel : str
        Y-axis label.
    out_path : str
        Destination file. Its parent directory is created if missing.
    """
    out_dir = os.path.dirname(out_path)
    if out_dir:
        # savefig does not create missing directories itself.
        os.makedirs(out_dir, exist_ok=True)

    fig, ax = plt.subplots()
    ax.plot(values)
    ax.set_xlabel('Model Rank')
    ax.set_ylabel(ylabel)
    ax.set_xticks(positions)
    ax.set_xticklabels(tick_labels, rotation=90)
    fig.savefig(out_path, bbox_inches='tight')
    # Close explicitly: many figures are produced inside the loops below.
    plt.close(fig)


# The feature-set size cap does not change between iterations, so parse it
# once. None means "no size filter".
max_ft_size = None if filters == 'none' else int(filters)

# NOTE: the loop variable is named `cancer`, so any earlier value of `cancer`
# is overwritten; after the loop it holds the last cohort.
for cancer in cohorts:
    # For a group: select best model
    ct = 1  # not used below; kept in case later cells read it
    best = []
    for group in groups:
        # A list-valued group bundles several feature_list_method values and
        # is reported under the team name 'SciKitGrid'.
        if isinstance(group, list):
            team = 'SciKitGrid'
            in_group = performance_df['feature_list_method'].isin(group)
        else:
            print('###', group, '###')
            team = group
            in_group = performance_df['feature_list_method'] == group

        # All models of this group for the current cohort and metric.
        keep = (
            in_group
            & (performance_df['cohort'] == cancer)
            & (performance_df['performance_metric'] == pmetric)
        )
        if max_ft_size is not None:
            # Strictly fewer features than the configured cap.
            keep &= (performance_df['total_features'] < max_ft_size)

        # Rank models by mean performance, best first.
        subset = (
            performance_df[keep]
            .sort_values(by='Mean', ascending=False)
            .reset_index(drop=True)
        )
        # Get table info for best model
        tab1 = subset

        if tab1.empty:
            # Without this guard the lookup below fails with a bare KeyError
            # and aborts the run for every remaining cohort.
            print('No', pmetric, 'models found for', team, 'in', cancer, '- skipping group')
            continue

        # Grab the name of the model with highest MEAN performance metric
        ftID = tab1['featureID'].iloc[0]

        # Fix naming of CloudForest (to match ft file)
        if "CF|" in ftID:
            ftID = ftID.replace('Top_', 'Top ')

        best.append(ftID)
        print(ftID, ' selected as best model for group')

        ###### additional plots
        # Create tab for plots to use. The copy makes subset2 independent of
        # tab1, so adding a column does not raise SettingWithCopyWarning.
        subset2 = tab1[['featureID', 'cohort', 'Mean', 'performance_metric', 'model']].copy()
        # Absolute drop in Mean between consecutive ranks (first row is NaN).
        subset2['Loss'] = subset2['Mean'].diff().abs()

        positions = list(range(0, subset2.shape[0]))
        labels = list(subset2['model'])

        # Plot of performance of models from a given team
        _save_rank_plot(
            subset2['Mean'],
            positions,
            labels,
            'Mean {}'.format(pmetric),
            'wip_figures/{}_performance_{}.pdf'.format(cancer, team),
        )

        # Plot of performance loss as decrease ft size of models from a given team
        _save_rank_plot(
            subset2['Loss'],
            positions,
            labels,
            'Loss of Mean {}'.format(pmetric),
            'wip_figures/supp/{}_loss_{}.pdf'.format(cancer, team),
        )