In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
from utilities_namespace import *
# from helpers import developer_mode

In [6]:
from signature_scoring.scoring_functions import gsea
# from importlib import reload; reload(gsea)

Please run: sudo mount -t tmpfs -o size=2G tmpfs ~/gsea_home
and reload the offending module.
  f'Failed to mount {root} temporary file system.\n'


In [7]:
# Turn off pandas' chained-assignment check for the rest of this notebook.
pd.set_option('mode.chained_assignment', None)

## Load data & methods definitions

In [8]:
from helpers.notebooks import notebooks_importer

In [9]:
%%capture
import Breast_cancer_data as data_brca

In [10]:
%%capture
from Benchmarking_setup import standard_benchmark

In [11]:
from functools import partial

In [12]:
# Benchmark pre-configured for the BRCA runs below: per-test progress makes
# the pipeline show its full progress and use the custom multiprocessing
# implementation (which is more reliable).
brca_standard_benchmark = partial(standard_benchmark, per_test_progress=True)

In [13]:
%%capture
from Selected_functions import (
    selected_single_sample_functions,
    selected_multi_sample_functions
)

In [17]:
%%capture
import Published_BRCA_stratifications

In [18]:
from Published_BRCA_stratifications import (
    # this gets us the participant-cluster relation for different clustering methods
    all_stratifications,
    # this is one of them; it differs in being based only on expression,
    # and is thus able to explore tumour heterogeneity
    pam50_brca
)

In [19]:
%store -r validation_perturbations

In [25]:
from data_sources.tcga.stratification import get_subtype_by_sample, group_by_subtype

In [26]:
# Pair each BRCA expression sample with its PAM50 subtype, then preview the table.
pam50_subtype_sample_df = get_subtype_by_sample(
    data_brca.brca_expression,
    pam50_brca,
    subtype_column='subtype_selected'
)
pam50_subtype_sample_df.head()

1212 matched exactly on sample ID
0 not matched


Unnamed: 0,sample,subtype_selected
0,TCGA-3C-AAAU-01A-11R-A41B-07,BRCA.LumA
1,TCGA-3C-AALI-01A-11R-A41B-07,BRCA.Her2
2,TCGA-3C-AALJ-01A-31R-A41B-07,BRCA.LumB
3,TCGA-3C-AALK-01A-11R-A41B-07,BRCA.LumA
4,TCGA-4H-AAAK-01A-12R-A41B-07,BRCA.LumA


In [27]:
# Group the sample/subtype table by the selected PAM50 subtype column.
pam50_samples_by_type = group_by_subtype(
    pam50_subtype_sample_df,
    subtype_column='subtype_selected'
)

## Permutations generation

In [28]:
from signature_scoring.evaluation.subtypes import random_subtypes_benchmark

In [29]:
from signature_scoring.evaluation import permutations

In [None]:
def generate_permutations(funcs, stratifications, n, packages, single_sample, comment=''):
    """Generate pickled permutation packages for each given stratification.

    For every `name -> samples_by_type` entry of `stratifications`,
    `permutations.generate` is called `packages` times with `n` forwarded on
    each call (the notebook uses e.g. n=10, packages=20 for 200 permutations).
    Each call gets its own pickle name, so packages are stored separately.

    Args:
        funcs: scoring functions forwarded to `permutations.generate`.
        stratifications: mapping of stratification name to the samples grouped
            by subtype for that stratification.
        n: forwarded to `permutations.generate` for every package.
        packages: number of packages (separate `permutations.generate` calls)
            per stratification.
        single_sample: when true, names get the 'ss' prefix and single-sample
            mode is requested; otherwise 'ms' and multi-sample mode.
        comment: optional label embedded in the pickle name between the
            stratification name and the package number.

    Returns:
        None; the function only prints progress and calls `permutations.generate`.
    """
    prefix = 'ss' if single_sample else 'ms'
    # Iterate the mapping that was passed in rather than a notebook-level
    # global, so the function does not depend on hidden kernel state.
    for name, samples_by_type in stratifications.items():
        print(name)
        generate_permutations_package = partial(
            permutations.generate,
            random_subtypes_benchmark,
            data_brca.brca_expression,
            samples_by_type,
            benchmark_partial=partial(
                brca_standard_benchmark,
                indications_signatures=data_brca.indications_singatures,
                contraindications_signatures=data_brca.contraindications_singatures,
                unassigned_signatures=validation_perturbations
            ),
        )
        for package in range(packages):
            print(package)
            # An empty comment would leave a double underscore; collapse it.
            pickle_name = f'{prefix}_{name}_{comment}_{package}'.replace('__', '_')
            generate_permutations_package(
                funcs=funcs,
                n=n, pickle_name=pickle_name,
                single_sample=single_sample,
                multi_sample=not single_sample
            )

### The odd case - PAM50:
PAM50 has subtypes assigned to samples rather than to participants, so its data is generated separately:

In [31]:
%store -r samples_by_statification_and_subtype

In [None]:
# Add the sample-level PAM50 grouping under the 'pam50' key, so that it is
# passed to generate_permutations along with the restored stratifications.
samples_by_statification_and_subtype['pam50'] = pam50_samples_by_type

### Single sample stratifications

Using 10 * 20 = 200 permutations

In [None]:
# 20 packages with n=10 each: 200 permutations per stratification.
generate_permutations(
    funcs=selected_single_sample_functions,
    stratifications=samples_by_statification_and_subtype,
    n=10,
    packages=20,
    single_sample=True
)

### cudaGSEA functions

These had far fewer permutations (only 50) in the thesis, as they can only be calculated on a GPU and only one GPU-enabled computer was available:

In [None]:
# Kept as a set so that it can be subtracted from the selected multi-sample
# functions further below.
# NOTE(review): neither function is imported explicitly in the visible cells;
# presumably they come from the `utilities_namespace` star import - confirm.
cuda_funcs = {
    gsea_score_phenotypes_cuda_hallmarks,
    gsea_score_phenotypes_cuda_reactome
}

In [None]:
# 10 packages with n=5 each: 50 permutations, labelled 'cuda_only' in the pickle names.
generate_permutations(
    funcs=list(cuda_funcs),
    stratifications=samples_by_statification_and_subtype,
    n=5,
    packages=10,
    single_sample=False,
    comment='cuda_only'
)

### All the other multi-sample functions have the full 200 permutations

In [None]:
# The non-CUDA functions use the default pickle names (no 'cuda_only' label):
# with that label, packages 0-9 would get exactly the same pickle names as
# the CUDA-only run and the results would be mislabelled.
generate_permutations(
    list(set(selected_multi_sample_functions) - cuda_funcs),
    samples_by_statification_and_subtype,
    n=10, packages=20, single_sample=False
)