In [1]:
import scanpy as sc
import pandas as pd
import hdf5plugin

In [2]:
ADATA_PATH = "./data/8.data4degs/alzheimer_data_all_ctypes.h5ad"

In [3]:
adata = sc.read_h5ad(ADATA_PATH)
adata

AnnData object with n_obs × n_vars = 12626 × 19965
    obs: 'batch', 'condition', 'n_genes', 'prc_mt', 'prc_rb', 'n_counts', 'doublet', 'batch_num', 'cell_type'
    var: 'gene_ids'

In [4]:
sc.pp.log1p(adata)

In [5]:
adata.obs["condition"]

AAAGATGGTATAAACG-1-0    Alzheimer
AAAGCAAAGACTTTCG-1-0    Alzheimer
AAAGCAACAGCGAACA-1-0    Alzheimer
AACCATGGTCTCAACA-1-0    Alzheimer
AACCGCGCAGAAGCAC-1-0    Alzheimer
                          ...    
TTTGGTTTCGCCAGCA-1-6    Alzheimer
TTTGGTTTCGGTTAAC-1-6    Alzheimer
TTTGTCAAGATCCCAT-1-6    Alzheimer
TTTGTCAAGCTAACTC-1-6    Alzheimer
TTTGTCAGTTCAGGCC-1-6    Alzheimer
Name: condition, Length: 12626, dtype: category
Categories (2, object): ['Alzheimer', 'Control']

In [6]:
sc.tl.rank_genes_groups(adata, 
                        groupby="condition", 
                        groups=["Alzheimer"], 
                        reference="Control",
                        method="wilcoxon",
                        )

In [7]:
result = adata.uns["rank_genes_groups"]

In [8]:
degs = pd.DataFrame(
        {
            "genes": result["names"]["Alzheimer"],
            "pvals": result["pvals"]["Alzheimer"],
            "pvals_adj": result["pvals_adj"]["Alzheimer"],
            "logfoldchanges": result["logfoldchanges"]["Alzheimer"],
        }
    )

degs

Unnamed: 0,genes,pvals,pvals_adj,logfoldchanges
0,PCDH9,8.898130e-193,1.776512e-188,0.772779
1,PPP2R2B,9.468310e-150,6.301161e-146,0.825975
2,RASGEF1B,3.816608e-148,1.904965e-144,0.667339
3,MALAT1,1.991192e-137,7.950829e-134,0.820886
4,INO80D,4.343137e-119,1.445179e-115,0.584602
...,...,...,...,...
19960,MT-CO3,1.219097e-78,1.106330e-75,-0.682048
19961,MT-CYB,1.414127e-82,1.568502e-79,-0.654018
19962,MT-ATP6,3.368273e-87,4.202974e-84,-0.682908
19963,MT-ND2,1.361433e-110,3.883002e-107,-0.731047


In [75]:
row = degs.loc[degs['genes'] == 'CLU']
row

Unnamed: 0,genes,pvals,pvals_adj,logfoldchanges
2244,CLU,5.6e-05,0.000424,0.168439


In [12]:
# Check if the value is in the specified column
is_value_in_column = degs["genes"].isin(["APP"]).any()
is_value_in_column

True

In [55]:
degs_filtered = degs[
        (degs["pvals"] <= 0.05)
        & (degs["pvals"] != 0.0)
        & (degs["logfoldchanges"].abs() > 0.5)
    ].reset_index(drop=True)

degs_filtered

Unnamed: 0,genes,pvals,pvals_adj,logfoldchanges
0,PCDH9,8.898130e-193,1.776512e-188,0.772779
1,PPP2R2B,9.468310e-150,6.301161e-146,0.825975
2,RASGEF1B,3.816608e-148,1.904965e-144,0.667339
3,MALAT1,1.991192e-137,7.950829e-134,0.820886
4,INO80D,4.343137e-119,1.445179e-115,0.584602
...,...,...,...,...
2231,MT-CO3,1.219097e-78,1.106330e-75,-0.682048
2232,MT-CYB,1.414127e-82,1.568502e-79,-0.654018
2233,MT-ATP6,3.368273e-87,4.202974e-84,-0.682908
2234,MT-ND2,1.361433e-110,3.883002e-107,-0.731047


In [98]:
row = degs_filtered.loc[degs_filtered['genes'] == 'VWF']
row

Unnamed: 0,genes,pvals,pvals_adj,logfoldchanges
1395,VWF,0.00093,0.005507,0.860515


In [61]:
degs_filtered.to_csv("./data/8.data4degs/alzheimer_data_degs_v1.csv", index=False)