In [1]:
import pandas as pd
import numpy as np
from utilities import *

### Read data

In [2]:
## variant data
# Directory handed to load_cellsnp (the load log printed by this cell reports
# reading cellSNP.cells.vcf.gz from it).
variant_folder = "/home/linxy29/data/CIVET/visium_breastcancer_10x/spaceranger_cellSNP0/"

# load_cellsnp comes from the star-import of `utilities`; it returns a pair:
# the variant data object and the matching barcodes.
breastcancer_mquad, breastcancer_barcode = load_cellsnp(variant_folder)

# Sanity-check summary: AD and DP shapes should agree, and the variant count
# should equal their first dimension.
# NOTE(review): AD/DP presumably mean alt-allele depth / total depth - confirm in utilities.
summary_lines = (
    ("The dimension of the AD data is: ", breastcancer_mquad.ad.shape),
    ("The dimension of the DP data is: ", breastcancer_mquad.dp.shape),
    ("The number of mtSNPs is: ", len(breastcancer_mquad.variants)),
)
for label, value in summary_lines:
    print(label, value)

Loaded VCF file: /home/linxy29/data/CIVET/visium_breastcancer_10x/spaceranger_cellSNP0/cellSNP.cells.vcf.gz
610 variants detected
variant names detected
The dimension of the AD data is:  (610, 4169)
The dimension of the DP data is:  (610, 4169)
The number of mtSNPs is:  610


In [3]:
## civet results
# NOTE: despite its name, `civet_res_folder` holds the path of a single CSV file.
civet_res_folder = "/home/linxy29/data/CIVET/visium_breastcancer_10x/civet/q05_cell_abundance_res.csv"

# Read the table and drop every column that has no non-missing value at all.
# Rows are left untouched, so variants whose statistics are all NaN remain.
civet_res = pd.read_csv(civet_res_folder).dropna(axis=1, how='all')
civet_res

Unnamed: 0,variant,LR_vals.q05cell_abundance_w_sf_Cancer_Basal_SC,LR_vals.q05cell_abundance_w_sf_Cancer_Cycling,LR_vals.q05cell_abundance_w_sf_Cancer_Her2_SC,LR_vals.q05cell_abundance_w_sf_Cancer_LumA_SC,LR_vals.q05cell_abundance_w_sf_Cancer_LumB_SC,LR_vals.sum_cancer,LRT_pvals.q05cell_abundance_w_sf_Cancer_Basal_SC,LRT_pvals.q05cell_abundance_w_sf_Cancer_Cycling,LRT_pvals.q05cell_abundance_w_sf_Cancer_Her2_SC,...,Wald_pvals.q05cell_abundance_w_sf_Cancer_Her2_SC,Wald_pvals.q05cell_abundance_w_sf_Cancer_LumA_SC,Wald_pvals.q05cell_abundance_w_sf_Cancer_LumB_SC,Wald_pvals.sum_cancer,LRT_fdr.q05cell_abundance_w_sf_Cancer_Basal_SC,LRT_fdr.q05cell_abundance_w_sf_Cancer_Cycling,LRT_fdr.q05cell_abundance_w_sf_Cancer_Her2_SC,LRT_fdr.q05cell_abundance_w_sf_Cancer_LumA_SC,LRT_fdr.q05cell_abundance_w_sf_Cancer_LumB_SC,LRT_fdr.sum_cancer
0,3308T>A,,,,,,,,,,...,,,,,,,,,,
1,3309A>C,,,,,,,,,,...,,,,,,,,,,
2,3310C>A,,,,,,,,,,...,,,,,,,,,,
3,3311C>A,1.116060,7.164926,5.658235,-0.258697,0.296724,0.180680,0.290768,0.007434,0.017374,...,0.006226,0.255466,0.573937,0.666948,0.992034,0.286343,0.407355,1.000000,1.000000,1.000000
4,3312C>A,0.031209,1.459276,1.178371,3.136109,2.081446,2.306359,0.859775,0.227045,0.277688,...,0.451498,0.075086,0.152995,0.090866,1.000000,0.970115,0.986711,0.740237,0.920740,0.903023
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
605,15758A>C,-0.072572,-0.327518,0.587927,0.697046,0.953499,1.340740,1.000000,1.000000,0.443222,...,0.276710,0.385119,0.603423,0.231444,1.000000,1.000000,1.000000,1.000000,0.997772,0.978733
606,15759T>G,2.067343,3.759955,-0.026353,7.994066,6.462778,6.368065,0.150483,0.052494,1.000000,...,0.985219,0.006933,0.017945,0.010915,0.920740,0.613234,1.000000,0.224068,0.351792,0.351792
607,15760C>A,0.020436,0.300016,1.803673,-5.086837,0.508392,0.674060,0.886325,0.583873,0.179269,...,0.145856,0.001777,0.455706,0.414038,1.000000,1.000000,0.939326,1.000000,1.000000,1.000000
608,15761G>N,-0.025495,0.694657,0.365293,0.255084,0.179458,0.257735,1.000000,0.404585,0.545581,...,0.592320,0.616686,0.739058,0.614144,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000


In [4]:
# Keep only the per-cell-type result columns, i.e. the columns whose name
# contains the '.q05cell_abundance_w_sf_' marker
# (pattern: '<statistic>.q05cell_abundance_w_sf_<cell type>').
filtered_columns = [
    column_name
    for column_name in civet_res.columns
    if '.q05cell_abundance_w_sf_' in column_name
]

# `stats_names` is kept for compatibility but is never populated: the
# statistics to process are listed explicitly in `unique_stats_names` below.
stats_names = set()

# The cell type is the part of the column name after the marker. The
# two-element unpacking raises ValueError if a name contains the marker more
# than once.
cell_types = {
    cell_type_part
    for _stat_part, cell_type_part in (
        column_name.split('.q05cell_abundance_w_sf_')
        for column_name in filtered_columns
    )
}

# Statistics used downstream (hard-coded) and the cell types in sorted order.
unique_stats_names = ['LRT_fdr', 'LRT_pvals', 'Wald_pvals']
unique_cell_types = sorted(cell_types)
print(unique_cell_types)

['Cancer_Basal_SC', 'Cancer_Cycling', 'Cancer_Her2_SC', 'Cancer_LumA_SC', 'Cancer_LumB_SC']


In [5]:
# Export one variant subset per (statistic, cell type, cutoff) combination.
#
# For every statistic column '<stat>.q05cell_abundance_w_sf_<cell type>' the
# variants whose value is strictly greater than the cutoff are selected,
# subset out of the full variant data, and written to
#   <output_prefix>_<stat>_<cell type><suffix>
# where the suffix is the cutoff without its decimal point
# ('005' for 0.05, '01' for 0.1), matching the existing naming scheme.
#
# Fix: the previous version computed the subset but then passed the full,
# unfiltered `breastcancer_mquad` / `breastcancer_barcode` to write_mquad, so
# every output contained all variants. The subset is now what gets written.
#
# NOTE(review): the `>` comparison keeps variants that are NOT significant at
# the cutoff (and drops rows whose statistic is NaN, since NaN > x is False).
# Confirm that this direction is intended; the printed messages say '>' too.
output_prefix = '/home/linxy29/data/CIVET/visium_breastcancer_10x/civet/civet'
cutoffs = [(0.05, '005'), (0.1, '01')]

for stat_name in unique_stats_names:
    for cell_type in unique_cell_types:
        col_name = f'{stat_name}.q05cell_abundance_w_sf_{cell_type}'
        for cutoff, suffix in cutoffs:
            # Names of the variants whose statistic exceeds the cutoff.
            selected_variants = civet_res.loc[civet_res[col_name] > cutoff, 'variant']
            print(f"Number of variants with {stat_name} > {cutoff} for {cell_type}: {len(selected_variants)}")
            # Restrict the variant data (and its barcodes) to the selected variants.
            subset_mquad, subset_barcode = select_mquad(
                breastcancer_mquad,
                breastcancer_barcode,
                include_variant_names=selected_variants,
            )
            # Write the subset, not the full dataset.
            write_mquad(
                subset_mquad,
                subset_barcode,
                f'{output_prefix}_{stat_name}_{cell_type}{suffix}',
            )

Number of variants with LRT_fdr > 0.05 for Cancer_Basal_SC: 546
Number of variants with LRT_fdr > 0.1 for Cancer_Basal_SC: 546
Number of variants with LRT_fdr > 0.05 for Cancer_Cycling: 546
Number of variants with LRT_fdr > 0.1 for Cancer_Cycling: 544
Number of variants with LRT_fdr > 0.05 for Cancer_Her2_SC: 546
Number of variants with LRT_fdr > 0.1 for Cancer_Her2_SC: 546
Number of variants with LRT_fdr > 0.05 for Cancer_LumA_SC: 545
Number of variants with LRT_fdr > 0.1 for Cancer_LumA_SC: 544
Number of variants with LRT_fdr > 0.05 for Cancer_LumB_SC: 546
Number of variants with LRT_fdr > 0.1 for Cancer_LumB_SC: 545
Number of variants with LRT_pvals > 0.05 for Cancer_Basal_SC: 506
Number of variants with LRT_pvals > 0.1 for Cancer_Basal_SC: 478
Number of variants with LRT_pvals > 0.05 for Cancer_Cycling: 513
Number of variants with LRT_pvals > 0.1 for Cancer_Cycling: 494
Number of variants with LRT_pvals > 0.05 for Cancer_Her2_SC: 503
Number of variants with LRT_pvals > 0.1 for Canc