In [1]:
import os
import pandas as pd
import numpy as np

import deconveil
from deconveil.default_inference import DefInference
from deconveil.dds import deconveil_fit
from deconveil.ds import deconveil_stats

import pydeseq2
from pydeseq2.dds import DeseqDataSet
from pydeseq2.default_inference import DefaultInference
from pydeseq2.ds import DeseqStats

In [45]:
def run_pydeseq2(rna_counts, metadata, output_path, design_factors="condition", alpha=0.05,
                 coeff=None, n_cpus=8):
    """
    Runs the copy-number-naive PyDESeq2 analysis and saves the results.

    Parameters:
        rna_counts (pd.DataFrame): Raw count matrix, forwarded unchanged as `counts`
            to DeseqDataSet. PyDESeq2 expects samples as rows and genes as columns.
        metadata (pd.DataFrame): Metadata for the samples with design factors.
        output_path (str): Directory to save the results (created if missing).
        design_factors (str): Column in metadata to use for design.
        alpha (float): Significance level for statistical tests.
        coeff (str | None): Coefficient passed to `lfc_shrink`. Defaults to
            "<design_factors>_B_vs_A", i.e. "condition_B_vs_A" for the default design.
            Pass it explicitly when the levels are not "A"/"B" or when
            `design_factors` is not a single column name.
        n_cpus (int): Number of CPUs handed to DefaultInference.

    Returns:
        pd.DataFrame: `results_df` of the DeseqStats object; the same frame is
        written to "<output_path>/res_CNnaive.csv".
    """
    os.makedirs(output_path, exist_ok=True)

    if coeff is None:
        # Keeps the historical default ("condition_B_vs_A") while following design_factors.
        # NOTE(review): PyDESeq2 may rewrite underscores in factor names — pass `coeff`
        # explicitly if the derived name is rejected.
        coeff = f"{design_factors}_B_vs_A"

    # Initialize DESeq2 analysis
    inference = DefaultInference(n_cpus=n_cpus)
    dds = DeseqDataSet(
        counts=rna_counts,
        metadata=metadata,
        design_factors=design_factors,  # previously hard-coded to "condition"
        refit_cooks=True,
        inference=inference,
    )

    # Fit DESeq2 model step by step
    dds.fit_size_factors()
    dds.fit_genewise_dispersions()
    dds.fit_dispersion_trend()
    dds.fit_dispersion_prior()
    dds.fit_MAP_dispersions()
    dds.fit_LFC()
    dds.calculate_cooks()

    if dds.refit_cooks:
        dds.refit()  # Replace outlier counts

    # Perform statistical analysis
    stat_res_pydeseq = DeseqStats(dds, alpha=alpha, cooks_filter=True, independent_filter=True)
    stat_res_pydeseq.run_wald_test()

    if stat_res_pydeseq.cooks_filter:
        stat_res_pydeseq._cooks_filtering()

    if stat_res_pydeseq.independent_filter:
        stat_res_pydeseq._independent_filtering()
    else:
        stat_res_pydeseq._p_value_adjustment()

    # Log-fold change shrinkage
    stat_res_pydeseq.lfc_shrink(coeff=coeff)
    stat_res_pydeseq.summary()

    # Save results
    results_path = os.path.join(output_path, "res_CNnaive.csv")
    stat_res_pydeseq.results_df.to_csv(results_path)
    return stat_res_pydeseq.results_df


def run_deconveil(rna_counts, metadata, cnv, output_path, design_factors="condition", alpha=0.05,
                  coeff=None, n_cpus=8):
    """
    Runs the copy-number-aware DeConveil analysis and saves the results.

    Parameters:
        rna_counts (pd.DataFrame): Raw count matrix, forwarded unchanged as `counts`
            to deconveil_fit. NOTE(review): presumably samples as rows and genes as
            columns, as in PyDESeq2 — confirm against the DeConveil documentation.
        metadata (pd.DataFrame): Metadata for the samples with design factors.
        cnv (pd.DataFrame): Copy number variation (CNV) data matrix, forwarded
            unchanged as `cnv`; assumed to share the layout of `rna_counts`.
        output_path (str): Directory to save the results (created if missing).
        design_factors (str): Column in metadata to use for design.
        alpha (float): Significance level for statistical tests.
        coeff (str | None): Coefficient passed to `lfc_shrink`. Defaults to
            "<design_factors>_B_vs_A", i.e. "condition_B_vs_A" for the default design.
            Pass it explicitly when the levels are not "A"/"B" or when
            `design_factors` is not a single column name.
        n_cpus (int): Number of CPUs handed to DefInference.

    Returns:
        pd.DataFrame: `results_df` of the deconveil_stats object; the same frame is
        written to "<output_path>/res_CNaware.csv".
    """
    os.makedirs(output_path, exist_ok=True)

    if coeff is None:
        # Keeps the historical default ("condition_B_vs_A") while following design_factors.
        coeff = f"{design_factors}_B_vs_A"

    # Initialize DeConveil inference
    inference = DefInference(n_cpus=n_cpus)

    # Fit DeConveil model
    dds = deconveil_fit(
        counts=rna_counts,
        metadata=metadata,
        cnv=cnv,
        design_factors=design_factors,
        inference=inference,
        refit_cooks=True,
    )
    dds.fit_size_factors()
    dds.fit_genewise_dispersions()
    dds.fit_dispersion_trend()
    dds.fit_dispersion_prior()
    dds.fit_MAP_dispersions()
    dds.fit_LFC()
    dds.calculate_cooks()

    if dds.refit_cooks:
        dds.refit()  # Replace outlier counts

    # Statistical analysis
    stat_res_deconveil = deconveil_stats(
        dds,
        alpha=alpha,
        independent_filter=True,
        cooks_filter=True,
    )
    stat_res_deconveil.run_wald_test()

    # NOTE(review): cooks_filter=True is requested, but unlike run_pydeseq2 no explicit
    # _cooks_filtering() call is made here — confirm whether summary() applies it.
    if stat_res_deconveil.independent_filter:
        stat_res_deconveil._independent_filtering()
    else:
        stat_res_deconveil._p_value_adjustment()

    # Log-fold change shrinkage
    stat_res_deconveil.lfc_shrink(coeff=coeff)
    stat_res_deconveil.summary()

    # Save results
    results_path = os.path.join(output_path, "res_CNaware.csv")
    stat_res_deconveil.results_df.to_csv(results_path)
    return stat_res_deconveil.results_df

In [47]:
# Directory holding the three input tables: rna.csv, metadata.csv and cnv.csv.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
DATA_PATH = "/Users/katsiarynadavydzenka/Documents/PhD_AI/TCGA/BRCA/test"

# rna.csv is transposed right after reading; metadata.csv is used as stored.
rna_counts = pd.read_csv(os.path.join(DATA_PATH, "rna.csv"), index_col=0).T
metadata = pd.read_csv(os.path.join(DATA_PATH, "metadata.csv"), index_col=0)

# cnv.csv is transposed, doubled and cast to int (astype(int) drops any fractional part).
# NOTE(review): presumably converts a ratio relative to diploid into an absolute copy
# number — confirm, and check that the file contains no missing values.
cnv = (pd.read_csv(os.path.join(DATA_PATH, "cnv.csv"), index_col=0).T * 2).astype(int)

In [49]:
# Both analyses write into the same directory; the file names differ
# (res_CNaware.csv vs. res_CNnaive.csv), so nothing is overwritten.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
deconveil_output_path = "/Users/katsiarynadavydzenka/Documents/PhD_AI/deconveilCaseStudies/results/BRCA/"
pydeseq2_output_path = deconveil_output_path

# Keep both result frames so the CN-aware and CN-naive fits can be compared
# without re-reading the CSV files or re-running the models.
res_deconveil = run_deconveil(rna_counts, metadata, cnv, deconveil_output_path)
res_pydeseq2 = run_pydeseq2(rna_counts, metadata, pydeseq2_output_path)

# Displayed as the cell output, as before.
res_pydeseq2

Fitting size factors...
... done in 0.15 seconds.

Fitting dispersions...
... done in 5.22 seconds.

Fitting dispersion trend curve...
... done in 0.38 seconds.

Fitting MAP dispersions...
... done in 5.56 seconds.

Fitting LFCs...
... done in 3.47 seconds.

Calculating cook's distance...
... done in 0.50 seconds.

Replacing 1970 outlier genes.



replace_mask before filtering: (220, 1970)
Number of True values in replace_mask: 3200
replacement_counts_trimmed shape: (187, 1970)


Fitting dispersions...
... done in 0.28 seconds.

Fitting MAP dispersions...
... done in 0.59 seconds.

Fitting LFCs...
... done in 0.22 seconds.

Running Wald tests...
... done in 0.85 seconds.

Fitting MAP LFCs...
... done in 4.92 seconds.



Log2 fold change & Wald test p-value: condition B vs A
              baseMean  log2FoldChange     lfcSE       stat        pvalue  \
A1BG         12.101259       -0.248101  0.128128  -1.859431  6.296607e-02   
A1BG-AS1     77.956081       -0.040320  0.097715  -0.326320  7.441820e-01   
A2M       60702.289378        0.396316  0.099753 -13.259452  3.977901e-40   
A2M-AS1     126.644961       -1.582486  0.109889 -14.425079  3.598561e-47   
A2ML1       464.103817        2.806622  0.303290   9.488769  2.337791e-21   
...                ...             ...       ...        ...           ...   
ZYG11A      241.321578        1.391794  0.161342   8.857747  8.164764e-19   
ZYG11B     3232.777025       -0.603744  0.025856   2.988412  2.804312e-03   
ZYX       10157.755495       -0.680184  0.082936  -8.471871  2.414823e-17   
ZZEF1      4358.415548       -0.316560  0.065445  -5.467741  4.558067e-08   
ZZZ3       3497.912776       -0.331767  0.060655  -5.041385  4.621743e-07   

                  pa

Fitting size factors...
... done in 0.14 seconds.

Fitting dispersions...
... done in 5.29 seconds.

Fitting dispersion trend curve...
... done in 0.38 seconds.

Fitting MAP dispersions...
... done in 6.27 seconds.

Fitting LFCs...
... done in 2.98 seconds.

Calculating cook's distance...
... done in 0.53 seconds.

Replacing 2218 outlier genes.

Fitting dispersions...
... done in 0.39 seconds.

Fitting MAP dispersions...
... done in 0.52 seconds.

Fitting LFCs...
... done in 0.35 seconds.

Running Wald tests...
... done in 0.83 seconds.

Fitting MAP LFCs...


Log2 fold change & Wald test p-value: condition B vs A
              baseMean  log2FoldChange     lfcSE       stat        pvalue  \
A1BG         12.101259        0.216933  0.127349   1.737456  8.230666e-02   
A1BG-AS1     77.956081        0.420832  0.097706   4.359761  1.302049e-05   
A2M       60702.289378       -0.265263  0.098391 -11.738713  8.070482e-32   
A2M-AS1     126.644961       -1.196387  0.109734 -11.021924  2.995857e-28   
A2ML1       371.981713        3.397818  0.293877  10.548439  5.164780e-26   
...                ...             ...       ...        ...           ...   
ZYG11A      241.321578        1.658324  0.161510  10.455134  1.388031e-25   
ZYG11B     3289.954445       -0.357462  0.060106  -3.333155  8.586712e-04   
ZYX       10157.755495       -0.425009  0.082316  -4.406815  1.049018e-05   
ZZEF1      4358.415548       -0.439077  0.065587  -6.128182  8.888876e-10   
ZZZ3       3497.912776       -0.065804  0.060881  -0.948312  3.429706e-01   

                  pa

... done in 4.48 seconds.



Unnamed: 0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
A1BG,12.101259,0.216933,0.127349,1.737456,8.230666e-02,1.032387e-01
A1BG-AS1,77.956081,0.420832,0.097706,4.359761,1.302049e-05,2.449636e-05
A2M,60702.289378,-0.265263,0.098391,-11.738713,8.070482e-32,7.328834e-31
A2M-AS1,126.644961,-1.196387,0.109734,-11.021924,2.995857e-28,2.282932e-27
A2ML1,371.981713,3.397818,0.293877,10.548439,5.164780e-26,3.475090e-25
...,...,...,...,...,...,...
ZYG11A,241.321578,1.658324,0.161510,10.455134,1.388031e-25,9.065731e-25
ZYG11B,3289.954445,-0.357462,0.060106,-3.333155,8.586712e-04,1.370348e-03
ZYX,10157.755495,-0.425009,0.082316,-4.406815,1.049018e-05,1.987821e-05
ZZEF1,4358.415548,-0.439077,0.065587,-6.128182,8.888876e-10,2.301828e-09
