In [2]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
from utilities_namespace import *

In [3]:
# Turn off pandas' chained-assignment check (SettingWithCopyWarning) for the whole notebook.
pd.set_option('mode.chained_assignment', None)

### Load data

In [4]:
from helpers.notebooks import notebooks_importer

In [5]:
%%capture
import Breast_cancer_data as data_brca

## Benchmarking

First of all, I define all metrics I will use:

In [6]:
%%capture
from Benchmarking_setup import standard_benchmark

In [7]:
from functools import partial

In [8]:
# Benchmark entry point used throughout this notebook.
# per_test_progress=True forces the pipeline to show full progress and to use
# the author's custom multiprocessing (described by the author as more reliable).
brca_standard_benchmark = partial(standard_benchmark, per_test_progress=True)

### Benchmarking set-up shared across tests

In [9]:
%%capture
from Benchmarking_setup import (
    connectivity_score_cramér_von_mises,
    x_sum,
    score_spearman
)
# Scoring functions used for the single-sample runs
# (passed as `funcs` together with multi_sample=False further down).
selected_single_sample_functions = [
    connectivity_score_cramér_von_mises, x_sum, score_spearman,
]

In [10]:
from Benchmarking_setup import (
    gsea_score_phenotypes_cuda_hallmarks,
    gsea_score_phenotypes_cuda_reactome,
    roast_score_by_substance
)
# Scoring functions used for the multi-sample runs
# (passed as `funcs` together with single_sample=False further down).
selected_multi_sample_functions = [
    gsea_score_phenotypes_cuda_hallmarks, gsea_score_phenotypes_cuda_reactome,
    roast_score_by_substance,
]

In [11]:
# Every selected scoring function: single-sample ones first, then multi-sample ones.
selected_functions = selected_single_sample_functions + selected_multi_sample_functions

In [12]:
from helpers import developer_mode

# Benchmarking results

## Previously reported cancer stratification

Please refer to "Published BRCA stratifications.ipynb"

In [13]:
%%capture
import Published_BRCA_stratifications

In [14]:
from Published_BRCA_stratifications import (
    # this gets us the participant-cluster relation for different clustering methods
    all_stratifications,
    # this is one of them; it differs in being based only on expression,
    # and is thus able to explore tumour heterogeneity
    pam50_brca
)

In [15]:
%store -r validation_perturbations

In [17]:
# Display the `validation_perturbations` frame restored with `%store -r` in the previous cell.
validation_perturbations

Unnamed: 0,CPC004_A375_6H:BRD-A25234499-001-09-2:10,CPC004_A375_6H:BRD-A73741725-001-01-0:10,CPC004_A375_6H:BRD-A55393291-001-05-7:10,CPC005_A375_24H:BRD-K88789588-001-03-2:10,CPC004_HA1E_6H:BRD-A25234499-001-09-2:10,...,PCLB003_HCC515_24H:BRD-K67174588-048-03-3:10,PCLB003_HEPG2_24H:BRD-K67174588-048-03-3:10,PCLB003_HT29_24H:BRD-K67174588-048-03-3:10,PCLB003_MCF7_24H:BRD-K67174588-048-03-3:10,PCLB003_PC3_24H:BRD-K67174588-048-03-3:10
b'5720',0.762115,0.073729,0.098680,0.343495,-0.784936,...,0.014511,0.187159,-0.186928,0.960469,-0.069480
b'466',0.118776,-0.229475,0.363593,0.548189,1.567483,...,0.996179,-1.439621,1.530745,1.347902,-0.050531
b'6009',0.193753,0.031712,0.097308,0.362953,1.859892,...,0.173900,-0.028343,0.307923,-0.397352,-0.567132
b'2309',-0.333629,-0.157194,-0.042633,0.561753,-1.318365,...,0.340044,0.510723,0.469443,0.325793,0.107273
b'387',-0.442743,-0.680534,-0.392947,0.046707,0.646655,...,0.092931,0.187357,-1.745912,-0.891801,0.273071
...,...,...,...,...,...,...,...,...,...,...,...
b'25960',-0.235104,0.105069,-0.878641,-0.807497,-0.420527,...,-0.940007,-0.541327,-0.587832,0.537492,-0.162527
b'6376',-0.404811,0.765964,0.473681,0.088523,-0.861003,...,-1.226313,0.475918,-0.181985,-0.949377,-1.659040
b'11033',0.053027,0.266897,0.321963,-0.104970,0.522353,...,0.102483,0.775585,0.443893,-0.600784,-0.730780
b'54869',-0.933307,-0.453832,-0.750093,-0.210786,-0.055980,...,-0.526668,-0.016443,-0.269761,-0.415777,-0.028903


In [25]:
from signature_scoring.evaluation.subtypes import subtypes_benchmark

In [26]:
# Subtype benchmark with the BRCA benchmark function and the three signature
# collections already bound; the expression data is still left to the caller.
brca_subtypes_benchmark_only_signatures = partial(
    subtypes_benchmark, benchmark_function=brca_standard_benchmark,
    indications_signatures=data_brca.indications_singatures,
    contraindications_signatures=data_brca.contraindications_singatures,
    unassigned_signatures=validation_perturbations,
)

In [27]:
# Same benchmark with the BRCA expression data bound as the first positional argument.
brca_subtypes_benchmark = partial(
    brca_subtypes_benchmark_only_signatures, data_brca.brca_expression
)

## Benchmark

In [28]:
from data_sources.tcga import stratification
from signature_scoring.evaluation import permutations

In [29]:
from data_sources.tcga.stratification import get_subtype_by_sample, group_by_subtype

In [30]:
# Match the samples of the BRCA expression data against the PAM50 stratification,
# using 'subtype_selected' as the subtype column, and preview the first rows.
pam50_subtype_sample_df = get_subtype_by_sample(
    data_brca.brca_expression,
    pam50_brca,
    subtype_column='subtype_selected',
)
pam50_subtype_sample_df.head()

1212 matched exactly on sample ID
0 not matched


Unnamed: 0,sample,subtype_selected
0,TCGA-3C-AAAU-01A-11R-A41B-07,BRCA.LumA
1,TCGA-3C-AALI-01A-11R-A41B-07,BRCA.Her2
2,TCGA-3C-AALJ-01A-31R-A41B-07,BRCA.LumB
3,TCGA-3C-AALK-01A-11R-A41B-07,BRCA.LumA
4,TCGA-4H-AAAK-01A-12R-A41B-07,BRCA.LumA


In [31]:
# Group the matched samples by their value in the 'subtype_selected' column.
pam50_samples_by_type = group_by_subtype(
    pam50_subtype_sample_df,
    subtype_column='subtype_selected',
)

### Permutations generation

In [32]:
from signature_scoring.evaluation.subtypes import random_subtypes_benchmark

In [33]:
from signature_scoring.evaluation import permutations

In [40]:
# Pre-configured permutation generator: binds the random-subtypes benchmark,
# the BRCA expression data and the PAM50 sample grouping positionally, and
# supplies a benchmark partial that already carries the signature collections.
# Callers only pass the scoring functions and run-specific options.
generate_permutations = partial(
    permutations.generate,
    random_subtypes_benchmark, data_brca.brca_expression, pam50_samples_by_type,
    benchmark_partial=partial(
        brca_standard_benchmark,
        indications_signatures=data_brca.indications_singatures,
        contraindications_signatures=data_brca.contraindications_singatures,
        unassigned_signatures=validation_perturbations,
    ),
)

In [None]:
# Permutation run restricted to the single-sample scoring functions.
# NOTE(review): `n` is presumably the number of permutations and `pickle_name`
# the label under which results are pickled — confirm against permutations.generate.
generate_permutations(
    funcs=selected_single_sample_functions,
    n=100,
    pickle_name='ss',
    multi_sample=False,
)

In [None]:
# Permutation run restricted to the multi-sample scoring functions, with processes=6.
# NOTE(review): the pickle name ends in 5 while n=6 — confirm the label is intended
# and that an earlier pickle is not being overwritten or mislabelled.
generate_permutations(
    funcs=selected_multi_sample_functions,
    n=6,
    pickle_name='ms_5',
    single_sample=False,
    processes=6,
)

In [1]:
# NOTE(review): scratch arithmetic left in the notebook (evaluates to 30); its
# purpose is not stated — presumably a count of permutations still to generate.
# Confirm and document, or remove this cell.
100-(50+14)-6

30

In [16]:
%store -r samples_by_statification_and_subtype