In [2]:
import os
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from cuppa.sample_data.cuppa_features import FeatureLoaderNew
from cuppa.classifier.cuppa_classifier import CuppaClassifier
from cuppa.compose.pipeline import PipelineCrossValidator

In [3]:
## Suppress log messages
import logging
# getLogger() without a name returns the root logger. Raising its threshold to
# CRITICAL means loggers that inherit their level from the root only emit
# CRITICAL records, which keeps the notebook output free of progress messages.
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)

In [4]:
## Set working directory as the path of the pycuppa package.
## The relative paths used below all start with "cuppa/resources", so that directory
## marks the package root. Only move up when it is not visible yet: an unguarded
## os.chdir("../..") climbs two more levels every time this cell is re-run, which
## breaks every relative path in the cells that follow.
if not os.path.isdir("cuppa/resources"):
    os.chdir("../..")

# Training data

Features are provided as a pandas dataframe of shape <code>n_samples x n_features</code>.

In [5]:
# Load the mock feature matrix (gzipped TSV); the first column is used as the row index
features_path = "cuppa/resources/mock_data/training_data/features.tsv.gz"
features = pd.read_table(features_path, index_col=0)
features

Unnamed: 0,gen_pos.1_0,gen_pos.1_500000,gen_pos.1_1000000,gen_pos.1_1500000,gen_pos.1_2000000,...,alt_sj.Y;21868494;21868680,alt_sj.Y;21877890;21877985,alt_sj.Y;27009558;27010593,alt_sj.Y;28437284;28463009,alt_sj.Y;28658119;28661203
0_Breast,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0
1_Breast,0,0,1,0,1,...,0.0,0.0,0.0,0.0,0.0
2_Breast,1,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0
3_Breast,0,0,1,1,0,...,0.0,0.0,0.0,0.0,0.0
4_Breast,0,1,0,2,0,...,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
93_AML,0,0,0,0,0,...,,,,,
94_AML,0,0,0,0,0,...,,,,,
95_AML,0,0,0,0,0,...,,,,,
96_AML,0,0,0,0,0,...,,,,,


The metadata provides the training labels for each sample (<code>CancerSubtype</code>).

In [6]:
# Load the per-sample metadata (TSV); the first column is used as the row index
metadata_path = "cuppa/resources/mock_data/training_data/metadata.tsv"
metadata = pd.read_table(metadata_path, index_col=0)
metadata

Unnamed: 0,CancerType,CancerSubtype,RnaReadLength
0_Breast,Breast,Breast,151
1_Breast,Breast,Breast,151
2_Breast,Breast,Breast,151
3_Breast,Breast,Breast,151
4_Breast,Breast,Breast,151
...,...,...,...
93_AML,Myeloid,AML,0
94_AML,Myeloid,AML,0
95_AML,Myeloid,AML,0
96_AML,Myeloid,AML,0


Some gene fusions are known to be cancer type specific, e.g. RUNX1-RUNX1T1 for acute myeloid leukemia (AML), but were not prevalent enough in the training set (i.e. too few training samples) to be picked up as a dominant feature for that cancer type.

We therefore provide 'fusion overrides' to boost the probabilities of such cancer types when such known fusions are present in a sample.

In [7]:
# The path is kept in its own variable because it is also passed to CuppaClassifier
# in later cells; the table is only read here so that its contents can be inspected.
fusion_overrides_path = "cuppa/resources/mock_data/training_data/fusion_overrides.tsv"
fusion_overrides = pd.read_table(fusion_overrides_path)
fusion_overrides

Unnamed: 0,feat_prefix,feat_basename,target_class
0,event.fusion.,CBFB_MYH11,AML
1,event.fusion.,RUNX1_RUNX1T1,AML


# Classifier training

## Training

CUPPA is built on the [sklearn](https://scikit-learn.org/stable/) machine learning Python library. Throughout the sklearn library, features are referred to as capital <code>X</code> and sample labels as lower case <code>y</code>. We will also use this notation here.

In [8]:
# sklearn notation: X is the feature matrix, y holds the CancerSubtype training labels
X, y = features, metadata["CancerSubtype"]

The prefix of each feature name (the column names in <code>X</code>) denotes the feature type. E.g. the feature <code>snv96.C>A_ACA</code> has the feature type <code>snv96</code>. Below are the names of all feature types.

In [9]:
# The feature type is everything before the first "." in a column name:
# split once on ".", keep the first part and drop duplicates.
(
    X.columns
    .str.split(".", n=1, expand=True)
    .get_level_values(0)
    .unique()
)

Index(['gen_pos', 'snv96', 'event', 'sig', 'gene_exp', 'alt_sj'], dtype='object')

CUPPA is based on the sklearn <code>Pipeline</code>, which chains together transformations of the features (each blue cell is a transformer object). Within the <code>Pipeline</code> are <code>ColumnTransformer</code> objects that delegate different sets of transformations to each feature type.

In [10]:
# Build the classifier; it is not trained until fit() is called.
# The fusion overrides are supplied as a file path rather than as the dataframe loaded above.
classifier = CuppaClassifier(fusion_overrides_path=fusion_overrides_path)

We can train the classifier using the <code>fit()</code> method.

In [11]:
# Train on the full training set: X is the feature dataframe, y the CancerSubtype labels
classifier.fit(X=X, y=y)

## Cross-validation

To assess classifier performance on new samples, we can use 10-fold cross-validation. The training data is split into 10 parts; training is performed on 90% of samples, and testing on the remaining 10% of samples. This is repeated for the 10 different 'folds', each yielding cancer type probabilities for a different 10% subset of samples, and ultimately for every sample in the training set. These probabilities are then used to calculate various performance metrics.

We aim to perform **stratified** cross-validation using <code>CancerSubtype</code> as the label. This ensures that the folds are made preserving the percentage of samples for each cancer type. However, only a subset of samples has both RNA and DNA data. We therefore need to use <code>CancerSubtype</code> together with <code>RnaReadLength</code> as the sample labels to make cross-validation splits. This ensures that the percentage of samples for each cancer type within the subset of samples with RNA is preserved.

In [12]:
# Stratification label = cancer subtype joined with the RNA read length by "__"
subtype_labels = metadata["CancerSubtype"]
read_length_labels = metadata["RnaReadLength"].astype(str)

y_split = subtype_labels + "__" + read_length_labels
y_split

0_Breast    Breast__151
1_Breast    Breast__151
2_Breast    Breast__151
3_Breast    Breast__151
4_Breast    Breast__151
               ...     
93_AML           AML__0
94_AML           AML__0
95_AML           AML__0
96_AML           AML__0
97_AML           AML__0
Length: 98, dtype: object

We initialize the cross-validation using <code>PipelineCrossValidator</code> and use the <code>fit()</code> method to actually run the cross-validation.

In [13]:
# Fresh classifier instance dedicated to cross-validation (separate from `classifier` above)
cv_pipeline = CuppaClassifier(fusion_overrides_path=fusion_overrides_path)

# 10 shuffled, stratified folds; the fixed random_state makes the splits reproducible
cv_splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

# y holds the labels to predict, whereas y_split is only used to make the folds
cross_validator = PipelineCrossValidator(
    pipeline=cv_pipeline,
    X=X,
    y=y,
    y_split=y_split,
    cv=cv_splitter,
)
cross_validator.fit(cache_training=False)

The <code>predict()</code> method from the <code>CuppaClassifier</code> class can be called on all 10 test sets using the code below.

In [14]:
# Call the "predict" method on the test set of every cross-validation fold, using a single job.
# NOTE(review): probs_only=False presumably returns more than the raw probabilities — confirm against PipelineCrossValidator.
predictions = cross_validator.apply_on_test_sets(method_name="predict", probs_only=False, n_jobs=1)

Performance metrics can then be calculated from a prediction summary.

In [15]:
# Summarize the cross-validation predictions against the actual labels in y
pred_summ = predictions.summarize(actual_classes=y)
# Derive the performance metrics from the summary
performance = pred_summ.performance()
# Display only the first 10 rows
performance.head(n=10)

Unnamed: 0,class,clf_name,n_total,n_predicted,n_correct,recall,precision
0,.All,combined,78,0,70,0.897436,
1,.All,dna_combined,98,0,94,0.959184,
2,.All,rna_combined,78,0,64,0.820513,
3,.All,gen_pos,98,0,91,0.928571,
4,.All,snv96,98,0,86,0.877551,
5,.All,event,98,0,69,0.704082,
6,.All,gene_exp,78,0,58,0.74359,
7,.All,alt_sj,78,0,38,0.487179,
0,AML,combined,0,0,0,,
5,AML,dna_combined,12,13,12,1.0,0.923077
