In [1]:
import scanpy as sc
import pandas as pd
import hdf5plugin

In [2]:
ADATA_PATH = "./data/h5ad_concat/alzheimer_data_concat_v1.h5ad"

In [3]:
adata = sc.read_h5ad(ADATA_PATH)
adata

AnnData object with n_obs × n_vars = 37380 × 22915
    obs: 'condition', 'n_genes', 'prc_mt', 'prc_rb', 'n_counts', 'batch'
    var: 'gene_ids'

In [4]:
sc.pp.log1p(adata)

In [6]:
sc.tl.rank_genes_groups(adata, 
                        groupby="condition", 
                        groups=["disease"], 
                        reference="control",
                        method="wilcoxon",
                        )

In [7]:
result = adata.uns["rank_genes_groups"]

In [8]:
degs = pd.DataFrame(
        {
            "genes": result["names"]["disease"],
            "pvals": result["pvals"]["disease"],
            "pvals_adj": result["pvals_adj"]["disease"],
            "logfoldchanges": result["logfoldchanges"]["disease"],
        }
    )

degs

Unnamed: 0,genes,pvals,pvals_adj,logfoldchanges
0,XIST,0.0,0.0,11.588943
1,LRMDA,0.0,0.0,3.178879
2,ITPR2,0.0,0.0,2.146333
3,FOXN3,0.0,0.0,1.901910
4,SFMBT2,0.0,0.0,2.358226
...,...,...,...,...
22910,SLC24A2,0.0,0.0,-4.222770
22911,MT-ND2,0.0,0.0,-3.136454
22912,MT-ND4,0.0,0.0,-2.850127
22913,IL1RAPL1,0.0,0.0,-4.571177


In [9]:
degs_filtered = degs[
        (degs["pvals"] <= 0.05)
        & (degs["pvals"] != 0.0)
        & (degs["logfoldchanges"].abs() > 0.5)
    ].reset_index(drop=True)

degs_filtered

Unnamed: 0,genes,pvals,pvals_adj,logfoldchanges
0,CD74,8.566901e-309,4.895524e-307,2.703661
1,IKZF1,1.041879e-308,5.938968e-307,1.932392
2,LYST,2.103674e-307,1.193210e-305,1.141618
3,SSBP2,3.679205e-306,2.066397e-304,0.885069
4,SH3RF3,1.303389e-305,7.302486e-304,1.010999
...,...,...,...,...
5628,FUT8,3.588293e-306,2.020288e-304,-0.941282
5629,SCD5,1.876496e-306,1.059111e-304,-0.965855
5630,SLC22A23,7.292521e-307,4.126126e-305,-1.078225
5631,TARSL2,1.523884e-307,8.664962e-306,-1.656548


In [10]:
degs_filtered.to_csv("./data/deg_data/alzheimer_data_degs.csv", index=False)