In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
from utilities_namespace import *

### Load data

For explanations please see Data.ipynb

In [2]:
from helpers.notebooks import notebooks_importer, skip_on_import

In [3]:
%%capture
import Data as data

# Scoring signatures

In [4]:
from signature_scoring.scoring_functions.generic_scorers import (
    score_spearman, score_spearman_max,
    x_sum, x_sum_max,
    x_product, x_product_max
)

Here are some examples of scoring metrics:

In [5]:
show_source(score_spearman)

Corresponding *max* variant:

In [6]:
show_source(score_spearman_max)

In [7]:
show_source(x_product)

### Scoring

In [8]:
from signature_scoring import score_signatures

`score_signatures` uses multiprocessing to apply the given scoring function to:
- a `disease_signature` (disease differential profile) and
- the given `signatures` (or all signatures from the cell line with the most perturbagens)

It will limit the number of analyzed under- and over-expressed genes to the specified `limit`.

In [9]:
show_source(score_signatures)

Pre-processing, cache management and multiprocessing are handled by `SignatureProcessor` (showing only the most relevant parts for brevity):

In [10]:
from signature_scoring.processor import SignatureProcessor
show_source(SignatureProcessor.score_signatures)

In [11]:
show_source(SignatureProcessor.score_signature_group)

Here is an example usage:

In [12]:
%%skip_on_import
# Example run: score the indication signatures against the query signature
# using `score_spearman`, with the progress bar disabled.
scores_indications = score_signatures(
    score_spearman,
    data.query_signature,
    data.indications_singatures,
    limit=500,
    progress=False,
)

Retaining 12276 genes: 99.58% of signature genes and 59.79% of query genes


In [13]:
%%skip_on_import
scores_indications

Unnamed: 0,score,sig_id,pert_id,pert_iname,pert_type,...,pert_dose_unit,pert_idose,pert_time,pert_time_unit,pert_itime
19,0.593922,CPC005_MCF7_24H:BRD-K04210847-001-01-1:10,BRD-K04210847,tamoxifen,trt_cp,...,µM,10 µM,24,h,24 h
282,0.564547,LJP001_MCF10A_24H:BRD-K85606544-001-04-2:10,BRD-K85606544,neratinib,trt_cp,...,µM,10 µM,24,h,24 h
250,0.561589,CPC020_MCF7_24H:BRD-K93754473-001-15-1:10,BRD-K93754473,tamoxifen,trt_cp,...,µM,10 µM,24,h,24 h
0,0.512912,CPC005_A375_24H:BRD-K04210847-001-01-1:10,BRD-K04210847,tamoxifen,trt_cp,...,µM,10 µM,24,h,24 h
283,0.505622,LJP001_HS578T_24H:BRD-K51313569-001-03-7:10,BRD-K51313569,palbociclib,trt_cp,...,µM,10 µM,24,h,24 h
...,...,...,...,...,...,...,...,...,...,...,...
141,-0.290436,CPC012_PC3_6H:BRD-M07438658-001-01-1:10,BRD-M07438658,lapatinib,trt_cp,...,µM,10 µM,6,h,6 h
16,-0.315174,CPC004_MCF7_6H:BRD-A28746609-001-05-7:10,BRD-A28746609,paclitaxel,trt_cp,...,µM,10 µM,6,h,6 h
17,-0.357999,CPC004_HT29_6H:BRD-A28746609-001-05-7:10,BRD-A28746609,paclitaxel,trt_cp,...,µM,10 µM,6,h,6 h
27,-0.365098,CPC004_VCAP_6H:BRD-K09631521-001-05-7:10,BRD-K09631521,thiotepa,trt_cp,...,µM,10 µM,6,h,6 h


The returned `Scores` object allows for multiple ways to aggregate the results:

In [14]:
%%skip_on_import
scores_indications.best_per_substance.sort_values('score')

Unnamed: 0_level_0,score
pert_iname,Unnamed: 1_level_1
lapatinib,0.208130
formestane,0.307808
paclitaxel,0.325528
cyclophosphamide,0.342927
docetaxel,0.353591
...,...
epirubicin,0.465137
vinblastine,0.498748
palbociclib,0.505622
neratinib,0.564547


In [15]:
%%skip_on_import
scores_indications.mean_per_substance_dose_and_cell.sort_values('score')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,score
pert_iname,pert_idose,cell_id,Unnamed: 3_level_1
paclitaxel,1 µM,U937,-0.413651
thiotepa,10 µM,VCAP,-0.365098
paclitaxel,10 µM,HT29,-0.357999
lapatinib,10 µM,PC3,-0.290436
everolimus,10 µM,NEU,-0.271042
...,...,...,...
palbociclib,10 µM,A375,0.488378
palbociclib,10 µM,HS578T,0.505622
tamoxifen,10 µM,A375,0.512912
neratinib,10 µM,MCF10A,0.564547


### Connectivity Score

In [None]:
import signature_scoring.scoring_functions.connectivity_score as connectivity

In [None]:
# Connectivity scorer using the Kolmogorov-Smirnov statistic, with tags
# composed by `conditional_difference`.
# NOTE(review): an extra `ranks='complete'` argument is commented out on the
# `ranks_type` line below; confirm which setting the "Cheng study" remark refers to.
connectivity_score = connectivity.create_scorer(
    negative=True,
    ranks_type='signature',#, ranks='complete',   # as tested in Cheng study
    statistic=connectivity.kolmogorov_smirnov,
    compose_tags=connectivity.conditional_difference
)

In [None]:
# Variant of the connectivity scorer using the Cramér-von Mises statistic,
# with tags composed by plain `difference` and `negative` switched off.
connectivity_score_cramér_von_mises = connectivity.create_scorer(
    negative=False,
    ranks_type='signature',
    statistic=connectivity.cramér_von_mises,
    compose_tags=connectivity.difference
)

Other combinations are possible as well:

In [None]:
connectivity_score_disease = connectivity.create_scorer(negative=True, ranks_type='disease')

In [None]:
connectivity_score_max = connectivity.create_scorer(negative=False, compose_tags=connectivity.max_up_or_down)

### Scoring with Gene Set Enrichment Analysis

For GSEA I decided to use the Java implementation of GSEA 3.0 (released 01-Jul-2017) from the Broad Institute, based on the work of [Subramanian, Tamayo, et al., 2005](https://www.pnas.org/cgi/content/abstract/102/43/15545).

I also briefly evaluated two other implementations: gseapy and pathways-analysis. The first is faster but uses too much RAM for larger datasets, while the second is more memory-efficient but much slower.

So far GSEA Desktop provides the best memory-speed trade-off of the tested solutions.

In [16]:
from methods.gsea import GSEADesktop
gsea = GSEADesktop()

I downloaded the following datasets from the [Molecular Signatures Database (MSigDB)](http://software.broadinstitute.org/gsea/msigdb/index.jsp):

In [17]:
gsea.msigdb.gene_sets

[{'name': 'c2.all', 'id_type': 'symbols'},
 {'name': 'h.all', 'id_type': 'entrez'},
 {'name': 'c2.cp', 'id_type': 'entrez'},
 {'name': 'c7.all', 'id_type': 'entrez'},
 {'name': 'c2.cgp', 'id_type': 'entrez'},
 {'name': 'c2.cp', 'id_type': 'symbols'},
 {'name': 'h.all', 'id_type': 'symbols'},
 {'name': 'c2.all', 'id_type': 'entrez'},
 {'name': 'c2.cp.reactome', 'id_type': 'entrez'},
 {'name': 'c6.all', 'id_type': 'entrez'},
 {'name': 'c2.cp.reactome', 'id_type': 'symbols'},
 {'name': 'c2.cp.kegg', 'id_type': 'entrez'}]

Briefly explaining the notation:
- *h.all* represents all hallmark gene sets,
- *c2* is a collection of curated gene sets,
- *cp* stands for canonical pathways,
- *c6* contains oncogenic gene sets, while *c7* contains immunologic ones.

In [18]:
from signature_scoring.scoring_functions.gsea import create_gsea_scorer

The score is calculated based on the sum of normalized enrichment scores (*nes*) between gene sets expressed in disease and impacted by drug.

It is then scaled to the FDR q-values of enrichment scores. Only gene sets with the FDR q-value below the `q_value_cutoff` threshold are compared.

The *difference of classes* metric is specified because no raw expression data is given to GSEA - just the differential profile and a null vector; therefore the first step of the analysis is faked (for performance reasons).

As multiple instances of GSEA will be launched for scoring, the RAM usage might increase significantly. At least 1GB of memory per instance is required.

Below the code of an example GSEA scoring metric is attached:

In [None]:
gsea_score = create_gsea_scorer(permutations=300, gene_sets='h.all', q_value_cutoff=0.1)

In [None]:
show_source(gsea_score)

Unfortunately, even with a limited number of genes and perturbations and the use of many cores, the GSEA calculations take their time:

In [None]:
%%skip_on_import
# Example run of the GSEA-based scorer on the indication signatures,
# spread over 12 worker processes with a progress bar.
gsea_scores_indications = score_signatures(
    gsea_score,
    data.query_signature,
    data.indications_singatures,
    limit=300,
    progress=True,
    processes=12,
)

Retaining 12276 genes: 99.58% of signature genes and 59.79% of query genes


100%|██████████| 315/315 [02:01<00:00,  2.64it/s]

In [None]:
%%skip_on_import
gsea_scores_indications.best_per_substance

Unnamed: 0_level_0,score
pert_iname,Unnamed: 1_level_1
tamoxifen,0.746441
methotrexate,0.727242
epirubicin,0.724217
doxorubicin,0.697340
palbociclib,0.678138
...,...
docetaxel,0.529660
thiotepa,0.506804
paclitaxel,0.458713
lapatinib,0.442709


In [None]:
# GSEA scorers used further on, one per gene set collection
# (KEGG, Reactome, hallmarks), all with 1000 permutations and a 0.05 q-value cutoff.
# NOTE: this rebinds `gsea_score`, replacing the example scorer created earlier
# in this notebook (h.all, 300 permutations, q-value cutoff of 0.1).
gsea_score = create_gsea_scorer(permutations=1000, gene_sets='c2.cp.kegg', q_value_cutoff=0.05)
gsea_score_reactome = create_gsea_scorer(permutations=1000, gene_sets='c2.cp.reactome', q_value_cutoff=0.05)
gsea_score_hallmarks = create_gsea_scorer(permutations=1000, gene_sets='h.all', q_value_cutoff=0.05)

### Limma

In [None]:
from signature_scoring.scoring_functions.limma import create_roast_scorer, roast

In [None]:
show_source(create_roast_scorer)

In [None]:
show_source(roast)

For the sake of mroast, which requires a different kind of input than all the other scoring functions, adjustments were made to the ScoringFunction class to inform other parts of the pipeline to change their behaviour, so that input of the kind requested by the scoring function is provided:

In [None]:
from signature_scoring.scoring_functions import ScoringFunction
show_source(ScoringFunction)

In [None]:
roast_score = create_roast_scorer()

In [None]:
roast_score_by_substance = create_roast_scorer(gene_sets='c2.cp.kegg', q_value_cutoff=0.05)  

In [None]:
repr(data.brca_with_controls)

'<TCGAExpressionWithControls: 1100 cases, 112 controls>'

In [None]:
%%skip_on_import
# Score with the ROAST-based scorer; unlike the earlier examples, the inputs
# here are the `*_with_controls` datasets.
# NOTE: this overwrites `scores_indications` from the Spearman example above.
scores_indications = score_signatures(
    roast_score,
    data.brca_with_controls, data.indications_with_controls,
    progress=True, processes=12,
)

Retaining 12276 genes: 99.58% of signature genes and 59.79% of query genes
Selected only 12276 genes out of 1000 allowed.


  0%|          | 0/13 [00:00<?, ?it/s]

Following columns contain nulls and will be skipped: ['CPC015_HT29_6H:BRD-K04548931-003-05-8:10_control_control_control', 'CPC015_SKB_24H:BRD-K04548931-003-05-8:10_control_control_control', 'NMH002_FIBRNPC_24H:BRD-A56518012-003-03-7:10_control_control_control', 'NMH002_NEU_6H:BRD-A56518012-003-03-7:10_control_control_control', 'NMH002_NPC_6H:BRD-A56518012-003-03-7:10_control_control_control']
Following columns contain nulls and will be skipped: ['LJP001_HS578T_24H:BRD-K51313569-001-03-7:10_control_control_control', 'LJP001_MDAMB231_24H:BRD-K51313569-001-03-7:0.4_control_control_control']
Following columns contain nulls and will be skipped: ['LJP001_MCF10A_6H:BRD-K19687926-379-03-3:10_control_control_control', 'LJP001_MDAMB231_24H:BRD-K19687926-379-03-3:10_control_control_control']
Following columns contain nulls and will be skipped: ['CPC014_HEPG2_6H:BRD-K13514097-001-01-2:10_control_control_control']
Following columns contain nulls and will be skipped: ['CPC015_A375_6H:BRD-K09631521-0

 31%|███       | 4/13 [01:12<03:41, 24.62s/it]

Following columns contain nulls and will be skipped: ['CPC004_HCC515_6H:BRD-A28746609-001-05-7:10_control_control_control']


 54%|█████▍    | 7/13 [01:13<01:13, 12.26s/it]

Following columns contain nulls and will be skipped: ['CPC006_A549_6H:BRD-K85606544-001-01-8:10_control_control_control', 'CPC006_NOMO1_6H:BRD-K85606544-001-01-8:10_control_control_control', 'CPC006_THP1_6H:BRD-K85606544-001-01-8:10_control_control_control', 'LJP001_HS578T_24H:BRD-K85606544-001-04-2:10_control_control_control', 'LJP001_MDAMB231_24H:BRD-K85606544-001-04-2:10_control_control_control']


 77%|███████▋  | 10/13 [01:17<00:20,  6.73s/it]

Following columns contain nulls and will be skipped: ['AML001_CD34_6H:BRD-K93754473:13.4584_control_control_control', 'CPC006_NOMO1_6H:BRD-K93754473-001-06-0:28.39_control_control_control', 'CPC006_THP1_6H:BRD-K93754473-001-06-0:28.39_control_control_control', 'NMH002_FIBRNPC_24H:BRD-K93754473-048-14-5:10_control_control_control', 'NMH002_NEU_6H:BRD-K93754473-001-16-9:10_control_control_control', 'NMH002_NPC_6H:BRD-K93754473-048-14-5:10_control_control_control']


100%|██████████| 13/13 [01:23<00:00,  4.46s/it]

In [None]:
%%skip_on_import
scores_indications.mean_per_substance_dose_and_cell

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,score
pert_iname,pert_idose,cell_id,Unnamed: 3_level_1
doxorubicin,10 µM,HT29,0.736493
doxorubicin,10 µM,VCAP,0.736493
doxorubicin,10 µM,PHH,0.736493
doxorubicin,10 µM,PC3,0.736493
doxorubicin,10 µM,MCF7,0.736493
...,...,...,...
thiotepa,10 µM,PC3,-0.117309
thiotepa,10 µM,PHH,-0.117309
thiotepa,10 µM,SKB,-0.117309
thiotepa,10 µM,VCAP,-0.117309


### GSVA

#### Multi-sample

In [None]:
from signature_scoring.scoring_functions.gsva import create_gsva_scorer

In [None]:
# Settings shared by all multi-sample scorers below: KEGG canonical pathways
# and a 0.05 q-value cutoff. Only the `method` argument differs between them.
_kegg_gsva_options = {'gene_sets': 'c2.cp.kegg', 'q_value_cutoff': 0.05}

gsva_score_by_substance = create_gsva_scorer(**_kegg_gsva_options)
zscore_by_substance = create_gsva_scorer(**_kegg_gsva_options, method='zscore')
plage_score_by_substance = create_gsva_scorer(**_kegg_gsva_options, method='plage')
ssgsea_score_by_substance = create_gsva_scorer(**_kegg_gsva_options, method='ssgsea')

In [None]:
%%skip_on_import
# Score with the multi-sample GSVA scorer on the `*_with_controls` datasets.
# NOTE: `scores_indications` is reused, so the ROAST results computed above are replaced.
scores_indications = score_signatures(
    gsva_score_by_substance,
    data.brca_with_controls, data.indications_with_controls,
    progress=True, processes=12,
)

Retaining 12276 genes: 99.58% of signature genes and 59.79% of query genes
Selected only 12276 genes out of 1000 allowed.


  0%|          | 0/13 [00:00<?, ?it/s]

Following columns contain nulls and will be skipped: ['LJP001_MCF10A_6H:BRD-K19687926-379-03-3:10_control_control', 'LJP001_MDAMB231_24H:BRD-K19687926-379-03-3:10_control_control']
Following columns contain nulls and will be skipped: ['LJP001_HS578T_24H:BRD-K51313569-001-03-7:10_control_control', 'LJP001_MDAMB231_24H:BRD-K51313569-001-03-7:0.4_control_control']


  8%|▊         | 1/13 [06:26<1:17:23, 386.98s/it]

Following columns contain nulls and will be skipped: ['AML001_CD34_6H:BRD-K93754473:13.4584_control_control', 'CPC006_NOMO1_6H:BRD-K93754473-001-06-0:28.39_control_control', 'CPC006_THP1_6H:BRD-K93754473-001-06-0:28.39_control_control', 'NMH002_FIBRNPC_24H:BRD-K93754473-048-14-5:10_control_control', 'NMH002_NEU_6H:BRD-K93754473-001-16-9:10_control_control', 'NMH002_NPC_6H:BRD-K93754473-048-14-5:10_control_control']


 38%|███▊      | 5/13 [06:30<12:28, 93.52s/it] 

Following columns contain nulls and will be skipped: ['CPC006_NOMO1_6H:BRD-K85606544-001-01-8:10_control_control', 'CPC006_THP1_6H:BRD-K85606544-001-01-8:10_control_control', 'LJP001_HS578T_24H:BRD-K85606544-001-04-2:10_control_control', 'LJP001_MDAMB231_24H:BRD-K85606544-001-04-2:10_control_control']
Following columns contain nulls and will be skipped: ['CPC004_PC3_6H:BRD-K04548931-003-05-8:10_control_control', 'NMH002_FIBRNPC_24H:BRD-A56518012-003-03-7:10_control_control', 'NMH002_NEU_6H:BRD-A56518012-003-03-7:10_control_control', 'NMH002_NPC_6H:BRD-A56518012-003-03-7:10_control_control']


100%|██████████| 13/13 [06:49<00:00,  9.56s/it]

In [None]:
%%skip_on_import
scores_indications

Unnamed: 0,score,sig_id,pert_id,pert_iname,pert_type,...,pert_dose_unit,pert_idose,pert_time,pert_time_unit,pert_itime
17,0.798635,CPC014_A549_6H:BRD-K13514097-001-01-2:10,BRD-K13514097,everolimus,trt_cp,...,µM,10 µM,6,h,6 h
25,0.798635,CPC014_NPC_24H:BRD-K13514097-001-01-2:10,BRD-K13514097,everolimus,trt_cp,...,µM,10 µM,24,h,24 h
19,0.798635,CPC014_HA1E_6H:BRD-K13514097-001-01-2:10,BRD-K13514097,everolimus,trt_cp,...,µM,10 µM,6,h,6 h
20,0.798635,CPC014_HCC515_6H:BRD-K13514097-001-01-2:10,BRD-K13514097,everolimus,trt_cp,...,µM,10 µM,6,h,6 h
21,0.798635,CPC014_HEPG2_6H:BRD-K13514097-001-01-2:10,BRD-K13514097,everolimus,trt_cp,...,µM,10 µM,6,h,6 h
...,...,...,...,...,...,...,...,...,...,...,...
67,-0.278135,CPC015_A375_6H:BRD-K09631521-001-05-7:10,BRD-K09631521,thiotepa,trt_cp,...,µM,10 µM,6,h,6 h
62,-0.278135,CPC004_HA1E_6H:BRD-K09631521-001-05-7:10,BRD-K09631521,thiotepa,trt_cp,...,µM,10 µM,6,h,6 h
63,-0.278135,CPC004_HCC515_6H:BRD-K09631521-001-05-7:10,BRD-K09631521,thiotepa,trt_cp,...,µM,10 µM,6,h,6 h
66,-0.278135,CPC004_VCAP_6H:BRD-K09631521-001-05-7:10,BRD-K09631521,thiotepa,trt_cp,...,µM,10 µM,6,h,6 h


In [None]:
%%skip_on_import
# Same inputs as the GSVA run above, scored with the PLAGE variant
# (progress bar disabled).
# NOTE: `scores_indications` is reused, so the GSVA results computed above are replaced.
scores_indications = score_signatures(
    plage_score_by_substance,
    data.brca_with_controls, data.indications_with_controls,
    progress=False, processes=12,
)

Retaining 12276 genes: 99.58% of signature genes and 59.79% of query genes
Selected only 12276 genes out of 1000 allowed.
Following columns contain nulls and will be skipped: ['NMH002_FIBRNPC_24H:BRD-A56518012-003-03-7:10_control_control', 'NMH002_NEU_6H:BRD-A56518012-003-03-7:10_control_control', 'NMH002_NPC_6H:BRD-A56518012-003-03-7:10_control_control']
Following columns contain nulls and will be skipped: ['CPC020_VCAP_6H:BRD-A09722536-002-13-1:10_control_control']
Following columns contain nulls and will be skipped: ['LJP001_MCF10A_6H:BRD-K19687926-379-03-3:10_control_control', 'LJP001_MDAMB231_24H:BRD-K19687926-379-03-3:10_control_control']
Following columns contain nulls and will be skipped: ['CPC014_A549_6H:BRD-K51313569-001-02-9:10_control_control', 'CPC014_ASC_24H:BRD-K51313569-001-02-9:10_control_control', 'LJP001_HS578T_24H:BRD-K51313569-001-03-7:10_control_control', 'LJP001_MDAMB231_24H:BRD-K51313569-001-03-7:0.4_control_control']
Following columns contain nulls and will be 

In [None]:
%%skip_on_import
scores_indications

Unnamed: 0,score,sig_id,pert_id,pert_iname,pert_type,...,pert_dose_unit,pert_idose,pert_time,pert_time_unit,pert_itime
178,0.326742,CPC003_VCAP_6H:BRD-A55594068-065-03-1:10,BRD-A55594068,vinblastine,trt_cp,...,µM,10 µM,6,h,6 h
179,0.326742,CPC015_A375_6H:BRD-A55594068-065-03-1:10,BRD-A55594068,vinblastine,trt_cp,...,µM,10 µM,6,h,6 h
195,0.326742,CVD001_PHH_24H:BRD-K01188359-065-10-8:2.5,BRD-K01188359,vinblastine,trt_cp,...,µM,3 µM,24,h,24 h
194,0.326742,CVD001_HUH7_24H:BRD-K01188359-065-10-8:10,BRD-K01188359,vinblastine,trt_cp,...,µM,10 µM,24,h,24 h
193,0.326742,CVD001_HEPG2_24H:BRD-K01188359-065-10-8:10,BRD-K01188359,vinblastine,trt_cp,...,µM,10 µM,24,h,24 h
...,...,...,...,...,...,...,...,...,...,...,...
31,-1.302662,CPC004_HA1E_6H:BRD-K04548931-003-05-8:10,BRD-K04548931,epirubicin,trt_cp,...,µM,10 µM,6,h,6 h
32,-1.302662,CPC004_HCC515_6H:BRD-K04548931-003-05-8:10,BRD-K04548931,epirubicin,trt_cp,...,µM,10 µM,6,h,6 h
33,-1.302662,CPC004_HEPG2_6H:BRD-K04548931-003-05-8:10,BRD-K04548931,epirubicin,trt_cp,...,µM,10 µM,6,h,6 h
34,-1.302662,CPC004_PC3_6H:BRD-K04548931-003-05-8:10,BRD-K04548931,epirubicin,trt_cp,...,µM,10 µM,6,h,6 h


#### Single sample (early differential)

In [None]:
from functools import partial

In [47]:
from signature_scoring.scoring_functions.gsva import create_gsva_scorer
# Scorer factory with the shared single-sample settings pre-filled;
# the four scorers below differ only in the `method` passed to it.
single_sample_gsva = partial(create_gsva_scorer, gene_sets='c2.cp.kegg', q_value_cutoff=0.2, single_sample=True, grouping=None, permutations=10)

gsva_score = single_sample_gsva(method='gsva')
ssgsea_score = single_sample_gsva(method='ssgsea')
plage_score = single_sample_gsva(method='plage')
zscore = single_sample_gsva(method='zscore')

In [49]:
# this one takes a lot of time

In [None]:
%%skip_on_import
# NOTE(review): the result is stored as `ssgsea_scores_indications` although the
# `zscore` scorer (not `ssgsea_score`) is passed in - confirm which one was intended.
ssgsea_scores_indications = score_signatures(
    zscore, data.query_signature, data.indications_singatures, limit=400, progress=True, processes=12
)

In [None]:
%%skip_on_import
ssgsea_scores_indications

## Evaluation of scoring metrics

In [50]:
from signature_scoring.evaluation import evaluate, evaluation_summary

There are two essential functions used to evaluate metrics: `evaluate` and `evaluation_summary` (the latter is called by the former).

`evaluate` prepares signatures by restricting analyzed cell lines according to the given criteria, executes the scoring function and performs aggregation of the results by cell line (if requested):

In [51]:
show_source(evaluate)

#### Combining data from multiple cell lines

Data from cell lines are combined by mean-averaging; p-values are combined using Fisher's method.

### Evaluation metrics

`evaluation_summary` calculates various evaluation metrics to verify the performance of the scoring function:

In [52]:
show_source(evaluation_summary)

Each evaluation metric has its objective and belongs to one of four categories:

In [53]:
from signature_scoring.evaluation.metrics import EvaluationMetric

show_source(EvaluationMetric)

The following metrics were defined:

In [54]:
from signature_scoring.evaluation.metrics import metrics_manager

# Table of the defined metrics, as reported by `metrics_manager`.
defined_metrics = metrics_manager.defined_metrics_table()

# `None` lifts the column-width limit so the code snippets are rendered in full.
# The former `-1` sentinel is deprecated since pandas 1.0 and rejected by pandas 2.x.
with pd.option_context('display.max_colwidth', None):
    # escape=False keeps the HTML markup contained in the table cells intact
    display(HTML(defined_metrics.to_html(escape=False)))

Unnamed: 0_level_0,Unnamed: 1_level_0,code,combine,objective
category,name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
indications,Precision,"def precision(scores):  return calc.precision(true_positives=scores.top.indications, all_selected=scores.top.all)",mean,
indications,Recall,"def recall(scores):  return calc.recall(true_positives=scores.top.indications, all_positives=scores.indications)",mean,
indications,F1 Score,"def f1_score(scores):  selected = scores.top  return calc.f1(  calc.precision(true_positives=selected.indications, all_selected=selected.all),  calc.recall(true_positives=selected.indications, all_positives=scores.indications)  )",mean,maximize
indications,Mean,def mean(scores):  return scores.indications.mean(),mean,
contraindications,F1 Score,"def f1_score(scores):  selected = scores.top  return calc.f1(  calc.precision(true_positives=selected.contraindications, all_selected=selected.all),  calc.recall(true_positives=selected.contraindications, all_positives=scores.contraindications)  )",mean,minimize
contraindications,Is Mean Better,def is_mean_better(scores):  return scores.indications.mean() > scores.contraindications.mean(),mean,maximize
contraindications,Mean,def mean(scores):  return scores.contraindications.mean(),mean,
contraindications,KS p-value,"def ks_p(scores):  # See ks_p controls metric for explanation of alternative='less'  ks = r_ks_test(scores.indications, scores.contraindications, alternative='less')  return ks['p.value'][0]",fisher_method,minimize
contraindications,AUC ROC,def roc(scores):  return calc.generalized_roc_auc_score(scores.vector_contraindications),mean,maximize
contraindications,Indications Prioritized,@on_division_by_zero(fill_with=np.nan) def indications_prioritized(scores):  return (  len(scores.indications[scores.indications > scores.contraindications.max()])  /  len(scores.contraindications)  ),mean,maximize


The utility functions used for calculations are shown below: 

In [55]:
import signature_scoring.evaluation.calculation_utilities as calc
show_source(calc)

### Selecting cell lines

In [56]:
from signature_scoring.evaluation import select_cells

When `cell_lines_ratio` is specified, perturbagens are filtered to keep only those from cell lines for which no less than the given proportion of analyzed perturbagens was evaluated.

Also, the presence of at least one perturbagen in each analyzed class (indications, contraindications, controls) is required.

In [57]:
show_source(select_cells)

It is worth noting that there are a number of cell lines in which large numbers of substances were tested; there are also some cell lines with just a few dozen records:

In [59]:
show_table(DataFrame(data.dcm.sig_info.cell_id.value_counts()).T, n_rows=20)

Unnamed: 0,VCAP,MCF7,PC3,A549,A375,HT29,HA1E,HCC515,HEPG2,NPC,...,NCIH1694,T3M10,SNUC5,NCIH1836,U2OS,CD34,U266,MCH58,HS27A,NCIH716
cell_id,68553,63367,58945,46736,42195,38070,34205,29199,28195,9585,...,368,368,366,366,359,352,273,27,24,19


### Benchmarking