In [1]:
from helpers.utilities import (
    create_paths, DataFrame, Series, partial, read_csv, read_table, np, pd
)
%run helpers/notebook_setup.ipynb

In [2]:
%R source('plots/colors.R');

In [3]:
__inputs__ = create_paths(path='data/integration/', csv=['patients_with_both'])
clinical_path = 'data/clean/clinical/data_with_derived_variables.csv'

In [4]:
# `Series.from_csv` was deprecated in pandas 0.21 and removed in pandas 1.0.
# `read_csv` called with the defaults that `from_csv` used (no header row, first
# column as the index, date parsing attempted on the index) is the documented
# replacement; the first remaining column holds the values.
patients_with_both = pd.read_csv(
    __inputs__['patients_with_both'],
    header=None, index_col=0, parse_dates=True
).iloc[:, 0]
# With no header row `from_csv` returned an unnamed series with an unnamed index.
patients_with_both.name = None
patients_with_both.index.name = None

In [5]:
# Patients with both omics whose identifier matches 'TMD' or 'CM' (the pattern is a regex alternation).
joint_cm_and_tmd = patients_with_both[patients_with_both.str.contains('TMD|CM')]
# The same selection, additionally admitting identifiers that match 'TMR'.
intersect_cm_and_tmdr = patients_with_both[patients_with_both.str.contains('TMR|TMD|CM')]

In [6]:
clinical = read_csv(clinical_path, index_col=0)

In [7]:
clinical.loc[patients_with_both].Meningitis_with_tuberculosis_status.value_counts()

Cryptococcal             14
Definite tuberculosis     7
Viral                     7
Possible tuberculosis     4
Probable tuberculosis     4
Name: Meningitis_with_tuberculosis_status, dtype: int64

In [8]:
from jupyter_helpers.selective_import import skip_on_import

## Normalization and pre-processing pipeline

In [9]:
raw_protein_path = 'data/clean/protein/gene_levels_by_entrez.csv'
raw_rna_path = 'data/clean/rna/all_samples_counts.csv'

In [10]:
raw_protein_matrix = read_csv(raw_protein_path, index_col=0)
raw_rna_matrix = read_csv(raw_rna_path, index_col=0)

Re-index proteins to use gene names:

In [11]:
id_to_symbol_df = read_table('data/hgnc/entrez_ids_to_gene_symbol.tsv', index_col=3)
# Only entries with the 'Approved' status take part in the identifier -> symbol mapping.
id_to_symbol = (
    id_to_symbol_df
    .loc[id_to_symbol_df.Status == 'Approved', 'Approved symbol']
    .to_dict()
)

# Identifiers without an approved symbol are kept unchanged.
raw_protein_matrix.index = [
    id_to_symbol.get(identifier, identifier)
    for identifier in raw_protein_matrix.index
]

Re-index transcripts to use gene names:

In [12]:
from helpers.ensembl import Ensembl
ensembl = Ensembl(95)

# Order matters: the isoform-collapsed variant is derived from the matrix as loaded,
# because the following line overwrites `raw_rna_matrix` with its re-indexed version.
rna_matrix_isoforms_collapsed = ensembl.collapse_and_reindex_to(raw_rna_matrix, to='gene_name')
raw_rna_matrix = ensembl.reindex_to(raw_rna_matrix, to='gene_name')

In [13]:
# Use the explicit `Index.union`: the `|` operator on indexes is deprecated as a set
# operation and in recent pandas versions is an element-wise logical OR instead.
patients_union = set(raw_rna_matrix.columns.union(raw_protein_matrix.columns))
patients_union_series = Series(list(patients_union))

# Patients from either omic whose identifier matches 'TMR', 'TMD' or 'CM'.
full_cm_and_tmdr = patients_union_series[patients_union_series.str.contains('TMR|TMD|CM')]

In [14]:
%%skip_on_import
intersect_cm_and_tmdr.map(clinical.Meningitis).value_counts()

The proteins mapped to gene levels were sometimes annotated to two genes as two genes can encode the same (or very similar) protein:

In [15]:
raw_protein_matrix.duplicated().any()

True

In [16]:
%%skip_on_import
raw_protein_matrix[raw_protein_matrix.duplicated(keep=False)]

To enable the use of methods which prohibit exact collinearity I remove the exact duplicates, however retaining the information of alternative gene mappings for future pathway analysis and interpretation; I also keep a copy with all of the genes for the use in gene-gene regression:

In [17]:
raw_protein_matrix_with_duplicates = raw_protein_matrix.copy()

In [18]:
# Fingerprint every row (gene) by hashing the raw bytes of its values,
# so that rows with identical values receive identical hashes.
gene_hashes = raw_protein_matrix.apply(
    lambda abundances: hash(abundances.values.tobytes()),
    axis=1
)

In [19]:
assert all(gene_hashes.duplicated() == raw_protein_matrix.duplicated())

In [20]:
# For every row: the names of all genes sharing that row's hash, joined with ', '.
collapsed_genes = (
    raw_protein_matrix
    .rename_axis('gene')
    .reset_index()
    .groupby(gene_hashes.values)['gene']
    .transform(lambda names: ', '.join(names.astype(str)))
)

In [21]:
# Label every row with the joined names of all genes that share its hash...
raw_protein_matrix.index = collapsed_genes
# ...then drop the repeated rows; the retained row's label still lists every gene mapped to it.
raw_protein_matrix = raw_protein_matrix.drop_duplicates()

In [22]:
raw_protein_matrix.shape

(1271, 82)

Remember to strip the ensembl id off for the pathways analysis!

In [23]:
# `.loc` does not accept a set as an indexer in recent pandas versions (pandas 2.x raises a
# TypeError); a list built from the set selects the same rows in the same iteration order.
clinical_union = clinical.loc[list(patients_union)]

In [24]:
protein_conditions = clinical_union.loc[raw_protein_matrix.columns].Meningitis
protein_conditions.value_counts()

Tuberculosis       26
Healthy control    25
Cryptococcal       24
Viral               7
Name: Meningitis, dtype: int64

In [25]:
rna_conditions = clinical_union.loc[raw_rna_matrix.columns].Meningitis
rna_conditions.value_counts()

Tuberculosis    28
Cryptococcal    18
Viral            8
Bacterial        2
Name: Meningitis, dtype: int64

Note: the group assignments are not used for the normalization, but the sizes of the groups are used for the filtering step (the number of samples in the smallest group).

As explained in [Differential_expression.ipynb notebook](/analyses/rnaseq_vs_clinical/Differential_expression.ipynb) (*Gene filtering* part), I mask the smallest group for RNA data (bacterial) as it is not used in analyses anyway.

In [26]:
rna_conditions_masked = rna_conditions.replace('Bacterial', 'Tuberculosis')

In [27]:
%%R -i raw_protein_matrix -i raw_rna_matrix -i rna_matrix_isoforms_collapsed -i rna_conditions_masked -i protein_conditions
import::here(normalize_abundance, .from='helpers/preprocessing.R')
import::here(remove_leading_X, .from = 'helpers/utilities.R')
import::here(normalize_abundance, choose_regions_above_the_mean, choose_all_regions, .from='helpers/preprocessing.R')


# Clean up the column names of the three matrices passed in from Python.
# NOTE(review): presumably this strips the "X" which R prepends to column names that
# start with a digit - confirm in helpers/utilities.R.
colnames(raw_rna_matrix) = remove_leading_X(colnames(raw_rna_matrix))
colnames(rna_matrix_isoforms_collapsed) = remove_leading_X(colnames(rna_matrix_isoforms_collapsed))
colnames(raw_protein_matrix) = remove_leading_X(colnames(raw_protein_matrix))


# Named sets of arguments for the trend correction. `normalize()` looks a preset up by
# its name (the `trend_preset` argument) and passes it on as `trend_args`.
trend_correction_presets = list(
    loess_local=list(
        blind=FALSE,
        choose_regions_to_correct=choose_regions_above_the_mean
    ),
    loess_global=list(
        choose_regions_to_correct=choose_all_regions,
        blind=FALSE
    ),
    loess_global_relative=list(
        choose_regions_to_correct=choose_all_regions,
        blind=FALSE,
        # NOTE(review): "shirink" looks like a misspelling of "shrink"; kept as is because
        # it has to match the argument name expected by the consumer - confirm there.
        shirink_relative_to_diff=TRUE
    ),
    # Fixed: these two presets previously carried each other's value
    # (`non_blind` had blind=T and `blind` had blind=F).
    non_blind=list(blind=FALSE),
    blind=list(blind=TRUE)
)


#MARICES = list(
#    rna=raw_rna_matrix,
#    protein=raw_protein_matrix
#)


# Normalize the abundance matrix of the selected omic using `normalize_abundance`.
#
# Arguments:
#   what                  - 'rna', 'protein' or 'both' to use the matrices copied into
#                           this R session, or a custom matrix / data frame.
#   subset                - optional column selector (e.g. patient identifiers); applied
#                           after the outliers have been removed.
#   outliers              - 'warn' (default): keep every sample and print a notice;
#                           NULL: keep every sample silently; otherwise a vector of
#                           column names to remove.
#   collapse_rna_isoforms - use `rna_matrix_isoforms_collapsed` instead of
#                           `raw_rna_matrix` for RNA; not supported for custom matrices.
#   trend_preset          - optional name of an entry of `trend_correction_presets`.
#   conditions_vector     - conditions of the samples; required for custom matrices and
#                           replaced by the stored conditions for 'rna' / 'protein'.
#   ...                   - forwarded to `normalize_abundance`.
#
# Returns the result of `normalize_abundance`; for 'both' the RNA and protein results
# combined with `c()`.
# NOTE(review): `c()` flattens plain matrices into a single vector - confirm what
# `normalize_abundance` returns before relying on what='both'.
normalize = function(
    what, subset=NULL, outliers='warn', collapse_rna_isoforms=FALSE,
    trend_preset=NULL, conditions_vector=NULL, ...
) {
    # TRUE only for a single string equal to `keyword`. A plain `value == keyword` is
    # vectorised, so for a matrix or a vector of several outliers it yields a condition
    # of length > 1, which `if` rejects with an error since R 4.2 (a warning before).
    is_keyword = function(value, keyword) {
        isTRUE(is.character(value) && length(value) == 1 && value == keyword)
    }

    if (is_keyword(what, 'both'))
        return(c(
            normalize('rna', subset, outliers, collapse_rna_isoforms, trend_preset, ...),
            normalize('protein', subset, outliers, collapse_rna_isoforms, trend_preset, ...)
        ))

    if (is.character(what)) {
        matrix = switch(
            what,
            rna=(
                if (collapse_rna_isoforms)
                    rna_matrix_isoforms_collapsed
                else
                    raw_rna_matrix
            ),
            protein=raw_protein_matrix
        )
        # `switch` silently returns NULL for an unmatched name; fail early and clearly.
        if (is.null(matrix)) {
            stop(paste0("Unknown omic '", what, "'; expected 'rna', 'protein' or 'both'"))
        }
        conditions_vector = switch(
            what,
            rna=rna_conditions_masked,
            protein=protein_conditions
        )
    } else {
        print(paste0(
            'Using provided matrix. For increased performance in cross validation, ',
            'consider first copying the matrix into the R environment instead, and ',
            'then passing its identifier along with the desired subsets of patients to be used.'
        ))
        if (collapse_rna_isoforms) {
            stop('collapse_rna_isoforms not supported for custom matrices')
        }
        if (is.null(conditions_vector)) {
            stop('conditions vector is required for custom matrices')
        }
        matrix = what
        colnames(matrix) = remove_leading_X(colnames(matrix))
    }

    if (!is.null(outliers)) {
        if (is_keyword(outliers, 'warn')) {
            print('Not removing any outliers')
        }
        else {
            is_outlier = colnames(matrix) %in% outliers
            print(paste('Removing', sum(is_outlier), 'outliers'))

            # The conditions are filtered by position, i.e. they are expected to be
            # in the same order as the columns of the matrix.
            matrix = matrix[, !is_outlier, drop=FALSE]
            conditions_vector = conditions_vector[!is_outlier]
        }
    }
    if (!is.null(subset)) {
        matrix = matrix[, subset, drop=FALSE]
        conditions_vector = conditions_vector[subset]
    }

    trend_args = list()
    if (!is.null(trend_preset)) {
        # `[[` on a list returns NULL for an unknown name, which would silently
        # disable the requested correction - report the mistake instead.
        if (!isTRUE(trend_preset %in% names(trend_correction_presets))) {
            stop(paste0(
                "Unknown trend preset '", trend_preset, "'; available presets: ",
                paste(names(trend_correction_presets), collapse=', ')
            ))
        }
        trend_args = trend_correction_presets[[trend_preset]]
    }

    normalize_abundance(matrix, conditions_vector, trend_args=trend_args, ...)
}

In [28]:
from helpers.r import r_function

normalize = partial(r_function, 'normalize')

In [29]:
%%skip_on_import
_pa = normalize('protein', subset=intersect_cm_and_tmdr, normalization_method='TMM')

[1] "Not removing any outliers"
[1] "Retaining: 99.61%"


### 1. Outliers

Unsupervised analysis identified outliers in RNA data. I exclude those: 

In [30]:
RNA_OUTLIERS = [
    '175.TMD', '074.TMS', '093.TMD', '128.TMD',
    '158.TMD', '167.TMR', '233.CM'
]

In [31]:
%%skip_on_import
_ra = normalize(
    'rna', subset=intersect_cm_and_tmdr, outliers=RNA_OUTLIERS,
    normalization_method='TMM'
)

[1] "Removing 7 outliers"
[1] "Retaining: 34.65%"


In [32]:
%%skip_on_import
_ra_collapsed_isoforms = normalize(
    'rna', subset=intersect_cm_and_tmdr, outliers=RNA_OUTLIERS,
    normalization_method='TMM', collapse_rna_isoforms=True
)

[1] "Removing 7 outliers"
[1] "Retaining: 35.58%"


### 2. Additional filtering

In [33]:
from machine_learning.preprocessing import LowCountsFilter, LowVarianceFilter, RSideNormalizer

While pre-filtering is done in the normalize_abundance function in R (using edgeR's `edgeR::filterByExpr` function), this is done prior to normalization. This may not be enough because:
- prior to the normalization we can only remove the near-zero counts (for RNA-Seq) or very low intensities (for SOMAScan), as otherwise our thresholds would be affected by the library sizes for RNA data.
- addition of median based filtering matches the workflow of SIMCA, which is the standard commercial tool for O2PLS

I reject the variables which have too many values equal to the median to be of practical interest, using a threshold of 33.3% (SIMCA uses a threshold of 2).

I demonstrate the extent of the filtering below, using TMM-normalized data:

In [34]:
filter_out_low_count = partial(LowCountsFilter().fit_transform, y=None)

Low count (or "often not too different from median") does not filter out anything:

In [35]:
%%skip_on_import
_raf_isoforms = filter_out_low_count(_ra_collapsed_isoforms)

LowCountsFilter: filtering out 0 out of 0 variables requested to be filtered out (total variables=25)


In [36]:
%%skip_on_import
_raf = filter_out_low_count(_ra)

LowCountsFilter: filtering out 0 out of 0 variables requested to be filtered out (total variables=25)


Proteins were measured by microarrays, thus identical values will be rare:

In [37]:
%%skip_on_import
_paf = filter_out_low_count(_pa)

LowCountsFilter: filtering out 0 out of 0 variables requested to be filtered out (total variables=25)


Therefore, I additionally filter out the variables with very low variance (below 0.1 percentile):

In [38]:
filter_out_very_low_variance = partial(LowVarianceFilter().fit_transform, y=None)

In [39]:
%%skip_on_import
_raf = filter_out_very_low_variance(_raf)

LowVarianceFilter: filtering out 1 out of 1 variables requested to be filtered out (total variables=25)


In [40]:
%%skip_on_import
_paf = filter_out_very_low_variance(_paf)

LowVarianceFilter: filtering out 1 out of 1 variables requested to be filtered out (total variables=25)


In [41]:
%%skip_on_import
common_genes = list(_raf_isoforms.index.intersection(_paf.index))

In [42]:
%%skip_on_import
len(common_genes)

### 3. Preprocessing pipeline

In [43]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from rpy2.robjects import NULL as Null

from machine_learning.utilities import df_keeping
from machine_learning.preprocessing import OutliersFilter, PreFilterLowestExpresion
%R import::here(filter_out_low_expression_by_n, .from='helpers/preprocessing.R')
filter_out_low_expression_by_n = partial(r_function, 'filter_out_low_expression_by_n')


OMICS = {
    'rna': raw_rna_matrix,
    'protein': raw_protein_matrix
}


def preprocess(
    omic, smallest_group_n, outliers,
    subset=None, verbose=False, omics=OMICS,
    normalization_method='TMM', log=True, **kwargs
):
    """Assemble the preprocessing pipeline for a single omic.

    Args:
        omic: a DataFrame with the data, or a key of `omics` (e.g. 'rna', 'protein').
        smallest_group_n: passed to the low-expression pre-filter.
        outliers: patients handed to `OutliersFilter` as `outlier_patients`.
        subset: optional column selection applied to the data given to the
            pre-filter only; the normalizer is deliberately not subset.
        verbose: verbosity flag shared by the filtering steps.
        omics: mapping from omic name to its matrix.
        normalization_method: forwarded to the normalizer.
        log: forwarded to the normalizer.
        **kwargs: any further arguments forwarded to the normalizer.

    Returns:
        The pipeline created by `make_pipeline` from, in order: outliers filter,
        lowest-expression pre-filter, R-side normalizer, low counts filter,
        low variance filter and a DataFrame-preserving `StandardScaler`.
    """
    omic_data = omic if isinstance(omic, DataFrame) else omics[omic]

    if subset is not None:
        omic_data = omic_data.loc[:, subset]

    outliers_filter = OutliersFilter(outlier_patients=outliers, verbose=verbose)

    lowest_expression_prefilter = PreFilterLowestExpresion(
        func=filter_out_low_expression_by_n,
        data=omic_data,
        smallest_group_n=smallest_group_n,
        verbose=verbose
    )

    # The other experimental groups are included to enhance the correction for
    # the errors of instruments, thus subsetting is not done in this variant
    # (no `subset=subset` here).
    # Outliers were removed prior to this pipeline execution to avert the need
    # of adjusting the train/test split due to patients exclusion, hence
    # `outliers=Null`.
    normalizer = RSideNormalizer(
        normalize,
        omic, normalization_method=normalization_method, log=log, filter=False,
        outliers=Null, **kwargs
    )

    steps = [
        outliers_filter,
        lowest_expression_prefilter,
        normalizer,
        LowCountsFilter(verbose=verbose),
        LowVarianceFilter(verbose=verbose),
        df_keeping(StandardScaler)(),
    ]
    return make_pipeline(*steps)


def two_blocks_with_supervision(data_block: DataFrame, conditions_vector: Series):
    """Pair a data block with a supervision block aligned to its rows.

    The values of `conditions_vector` are turned into a series which is indexed
    by those very values; that series is then looked up with the row labels of
    `data_block`.

    NOTE(review): this lookup only succeeds when the values of
    `conditions_vector` are the row labels of `data_block` - confirm that
    callers pass such values rather than labels indexed by patient.

    Returns:
        A two-element list: `data_block` itself and the aligned series.
    """
    labels = list(conditions_vector)
    self_indexed = Series(labels, index=labels)
    supervision_block = self_indexed.loc[data_block.index]
    return [data_block, supervision_block]


def subset(omic, subset, outliers=None):
    """Select the requested columns of `omic` and return them transposed.

    Args:
        omic: DataFrame whose columns are matched against `subset`.
        subset: labels to keep; those absent from `omic.columns` are ignored.
        outliers: when given, the transposed frame is additionally passed
            through `OutliersFilter` (verbose) created with these patients.

    Returns:
        The transposed selection, outlier-filtered if `outliers` was provided.
    """
    shared_columns = omic.columns.intersection(subset)
    transposed = omic[shared_columns].T
    if outliers is None:
        return transposed
    outliers_filter = OutliersFilter(outlier_patients=outliers, verbose=True)
    return outliers_filter.fit_transform(transposed)

### Uniform train-test split

In [44]:
patients_with_single_omic = Series(
    list(patients_union.difference(patients_with_both))
)

In [45]:
cm_and_tmdr_validation = patients_with_single_omic[
    patients_with_single_omic.str.contains('TMR|TMD|CM')
]

In [46]:
# Combines every patient from the union of both omics whose identifier matches 'TMS'
# with the single-omic patients whose identifier matches 'CM'.
tms_validation = pd.concat([
    patients_union_series[patients_union_series.str.contains('TMS')],
    patients_with_single_omic[patients_with_single_omic.str.contains('CM')]
])