In [1]:
import pandas as pd
from pydeseq2.dds import DeseqDataSet
from pydeseq2.default_inference import DefaultInference
from pydeseq2.ds import DeseqStats

In [None]:
# Read the count table and, in the same expression, rename the 'Unnamed: 0'
# column to 'SampleID' (no in-place mutation of the freshly loaded frame).
counts_df = pd.read_csv('/path/counts_matrix.csv').rename(
    columns={'Unnamed: 0': 'SampleID'}
)

# Sample sheet, written once per condition: each condition label maps to the
# SRA run accessions of its replicates (insertion order is preserved).
runs_by_condition = {
    "hTERT_TP1": ["SRR14646263", "SRR14646264", "SRR14646265"],
    "hTERT_TP5": ["SRR14646272", "SRR14646273", "SRR14646274"],
    "RS_PDL20_TP1": ["SRR14646293", "SRR14646294", "SRR14646295"],
    "RS_PDL50_TP8": ["SRR14646311", "SRR14646312", "SRR14646313"],
}

# Flatten the mapping into two parallel columns: the condition label repeated
# once per run, and the run accessions themselves.
data = {
    "Sample Name": [
        condition
        for condition, runs in runs_by_condition.items()
        for _ in runs
    ],
    "SRA ID": [
        run
        for runs in runs_by_condition.values()
        for run in runs
    ],
}
data_df = pd.DataFrame(data)

# Pull the run accession out of each sample path: it is the directory name
# sitting directly above the trailing "quant.sf".
counts_df["SRA ID"] = counts_df["SampleID"].str.extract(r'/([^/]+)/quant\.sf$')[0]

# A path that does not match the pattern yields NaN; stop here rather than
# carrying an unlabeled sample into the model.
unparsed_paths = counts_df.loc[counts_df["SRA ID"].isna(), "SampleID"]
if not unparsed_paths.empty:
    raise ValueError(
        f"Could not extract an SRA ID from these sample paths: {unparsed_paths.tolist()}"
    )

# Attach the condition label to every sample. Only the key column is merged
# (the gene columns are not needed), the left join keeps the row order of
# counts_df, and validate= rejects a sample sheet that lists a run twice,
# which would otherwise duplicate rows.
metadata = (
    counts_df[["SRA ID"]]
    .merge(data_df, on="SRA ID", how="left", validate="many_to_one")
    [["SRA ID", "Sample Name"]]
    .rename(columns={"Sample Name": "condition"})
)

# A run that is missing from the sample sheet would get a NaN condition.
unlabeled_runs = metadata.loc[metadata["condition"].isna(), "SRA ID"]
if not unlabeled_runs.empty:
    raise ValueError(
        f"No condition recorded for these runs: {unlabeled_runs.tolist()}"
    )

# pydeseq2 rewrites '_' in factor levels to '-' (and warns about it). Doing it
# explicitly keeps the labels identical to the hyphenated names that have to be
# used when a contrast is specified.
metadata["condition"] = metadata["condition"].str.replace("_", "-", regex=False)

# Column labels of the count table that are bookkeeping, not genes.
non_gene_columns = ["SRA ID", "SampleID"]

# One row per column label of counts_df, then keep only the gene labels.
# The original row positions are retained in the index.
column_labels = pd.DataFrame({"Gene Names": counts_df.columns})
is_gene = ~column_labels["Gene Names"].isin(non_gene_columns)
gene_names = column_labels.loc[is_gene]

# Keep only the gene columns.
counts_df = counts_df.drop(columns=["SRA ID", "SampleID"])

# Replace the gene labels with positional integers 0..n_genes-1.
counts_df.columns = range(counts_df.shape[1])

# Validate/convert to numeric BEFORE rounding and casting. Run after
# astype(int) this check could never fire; run first, it reports a
# non-numeric cell with a clear error and lets numeric strings through.
counts_df = counts_df.apply(pd.to_numeric, errors="raise")

# Round fractional values and store the matrix as integers.
counts_df = counts_df.round().astype(int)

In [3]:
# Inference backend, configured with n_cpus=8.
inference = DefaultInference(n_cpus=8)

# Gather the dataset options in one mapping so the configuration reads as a
# table: the count matrix, the per-sample metadata, the single design factor
# ("condition"), Cook's refitting switched on, and the backend defined above.
dds_options = {
    "counts": counts_df,
    "metadata": metadata,
    "design_factors": "condition",
    "refit_cooks": True,
    "inference": inference,
}
dds = DeseqDataSet(**dds_options)

                    They will be converted to hyphens ('-').
  self.obsm["design_matrix"] = build_design_matrix(


In [4]:
# Run the DESeq2 fitting pipeline on the dataset. Per the log it prints, the
# steps are: size factors, dispersions, dispersion trend curve, MAP
# dispersions, LFCs, Cook's distances, and replacement of outlier genes.
dds.deseq2()

Fitting size factors...
... done in 0.01 seconds.

Fitting dispersions...
... done in 1.68 seconds.

Fitting dispersion trend curve...
... done in 0.26 seconds.

Fitting MAP dispersions...
... done in 2.10 seconds.

Fitting LFCs...
... done in 1.25 seconds.

Calculating cook's distance...
... done in 0.02 seconds.

Replacing 0 outlier genes.



In [5]:
# Display the per-sample annotation table held on the dataset (SRA ID and
# condition) to check which samples and condition labels went into the model.
dds.obs

Unnamed: 0,SRA ID,condition
0,SRR14646293,RS-PDL20-TP1
1,SRR14646295,RS-PDL20-TP1
2,SRR14646273,hTERT-TP5
3,SRR14646274,hTERT-TP5
4,SRR14646263,hTERT-TP1
5,SRR14646294,RS-PDL20-TP1
6,SRR14646265,hTERT-TP1
7,SRR14646264,hTERT-TP1
8,SRR14646272,hTERT-TP5


In [6]:
# Contrast passed to DeseqStats: the "condition" factor with the two levels
# hTERT-TP1 and hTERT-TP5 (the summary reports it as "hTERT-TP1 vs hTERT-TP5").
tp1_vs_tp5_contrast = ('condition', 'hTERT-TP1', 'hTERT-TP5')

stat_res = DeseqStats(dds, contrast=tp1_vs_tp5_contrast)
stat_res.summary()

# results_df is indexed by positional gene number; attach the gene identifiers
# by position. `res` is the same object as stat_res.results_df, so the new
# column appears there too.
ensembl_ids = gene_names['Gene Names'].values
res = stat_res.results_df
res['ensembl'] = ensembl_ids

Running Wald tests...


Log2 fold change & Wald test p-value: condition hTERT-TP1 vs hTERT-TP5
           baseMean  log2FoldChange     lfcSE      stat    pvalue      padj
0          0.345406        2.404676  4.273789  0.562657  0.573669       NaN
1          0.000000             NaN       NaN       NaN       NaN       NaN
2        125.564443       -0.234720  0.222210 -1.056301  0.290831  0.495348
3          0.000000             NaN       NaN       NaN       NaN       NaN
4          0.351289       -2.656946  4.283156 -0.620324  0.535044       NaN
...             ...             ...       ...       ...       ...       ...
58202   1034.062358       -0.287724  0.149636 -1.922819  0.054503  0.157353
58203      0.000000             NaN       NaN       NaN       NaN       NaN
58204  53685.151131       -0.358372  0.090052 -3.979617  0.000069  0.000784
58205      0.096500        0.749608  4.425996  0.169365  0.865510       NaN
58206      0.000000             NaN       NaN       NaN       NaN       NaN

[58207 rows x 6 columns]

... done in 1.83 seconds.



In [7]:
# Cut-offs for calling a gene differentially expressed.
padj_cutoff = 0.05
min_abs_lfc = 0.5

# Rows with a NaN padj compare False and are therefore excluded.
passes_padj = res["padj"] < padj_cutoff
passes_effect_size = res["log2FoldChange"].abs() > min_abs_lfc

sigs = res.loc[passes_padj & passes_effect_size]
sigs

Unnamed: 0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,ensembl
66,620.348678,0.520711,0.170154,3.060232,2.211657e-03,1.370868e-02,ENSG00000187608
163,31.955321,3.717904,0.597249,6.225045,4.814164e-10,1.745842e-08,ENSG00000149527
170,92.900898,0.817668,0.208877,3.914599,9.055459e-05,9.962248e-04,ENSG00000157873
199,582.809356,0.549800,0.108466,5.068864,4.001969e-07,8.058738e-06,ENSG00000293478
233,42.706146,-1.010459,0.313218,-3.226056,1.255090e-03,8.627061e-03,ENSG00000069424
...,...,...,...,...,...,...,...
57542,205.122415,2.199494,0.206679,10.642062,1.898797e-26,2.646593e-24,ENSG00000147394
57562,410.778417,0.513367,0.114008,4.502902,6.703184e-06,1.029170e-04,ENSG00000213397
57591,35.659293,3.073326,0.528470,5.815515,6.044738e-09,1.736400e-07,ENSG00000102032
57664,110.739632,-1.721932,0.243238,-7.079212,1.449760e-12,8.040901e-11,ENSG00000155962
