# Differential expression of eGene by cluster cell type

Adapted from the single-cell Poisson mixed-effects eQTL model without a cell state interaction from Nathan A, et al. (2022). Single-cell eQTL models reveal dynamic T cell state dependence of disease loci. Nature.

Specifically, we adapt the single-cell linear mixed effects eQTL model without an eQTL <-> cell state interaction to quantify the variance explained in gene expression by cell type cluster assignment.

See: https://github.com/immunogenomics/sceQTL/blob/main/scripts/singlecell/linear_nostate.R

In [1]:
# Package setup for the notebook.
# NOTE(review): argparse is not called in any cell visible here — confirm it is still needed.
library(argparse)
# lme4 supplies glmer()/glmerControl(), used for every mixed model below.
library(lme4)
library(Matrix)
# Fix the random seed for the session.
set.seed(0)

Loading required package: Matrix



## BCL2A1

In [2]:
# Inputs for the BCL2A1 analysis: lead SNP, input directory, the cell
# populations used to build the expression / genotype file names, and the
# genotype file paths.
args <- list(
    lead_snp = "15:80263217:C:T",
    src_filepath = "/data/srlab/lrumker/MCSC_Project/cna-qtl/eqtls/results/sceQTL/inputs/",
    celltype_expr = "allcells",
    celltype_geno = "Myeloid",
    geno = "/data/srlab/lrumker/MCSC_Project/cna-qtl/results/geno_munge/cis_snps/Myeloid_15:80263217:C:T_cis.DS.vcf.gz",
    geno_ids = "/data/srlab/lrumker/datasets/onek1k/geno/sample_labels/sample_list_chr15.txt"
)

In [3]:
# Read the phenotype and covariate tables for this analysis; all three files
# live under args$src_filepath.
input_dir <- args$src_filepath
expr_file <- paste0(
    input_dir, "custom_", args$celltype_geno, "_", args$lead_snp,
    "_csaQTL_test_", args$celltype_expr, "_eQTLs_selgene_exp.csv"
)
# raw UMI counts
exprs_raw <- read.csv(expr_file, row.names = 1)
# gene expression PCs
pca_res <- read.csv(paste0(input_dir, args$celltype_expr, "_ePCs.csv"), row.names = 1)
# cell and donor covariates
cell_meta <- read.csv(paste0(input_dir, args$celltype_expr, "_cellmeta.csv"))

In [4]:
# Build the per-cell modelling table: expression, expression PCs and cell/donor
# covariates are bound column-wise (rows are taken to be in the same order).
gene <- "BCL2A1"
data <- cbind(exprs_raw, pca_res, cell_meta)

# Response: counts of the selected gene, stored as numeric and copied to `E`,
# the name the model formulas use.
data[[gene]] <- as.numeric(data[[gene]])
data[["E"]] <- data[[gene]]

# Covariates: donor id as a grouping factor, standardised age, and
# standardised log total UMI count (nUMI).
data$id <- factor(data$id)
data$age <- scale(data$age)
data$nCount_RNA <- scale(log(data$nCount_RNA))

In [5]:
# How strongly does CD16+ monocyte cell type membership predict BCL2A1 expression?
#
# Fits two Poisson mixed models of E (random intercepts for donor id and batch)
# that differ only in the isCD16pos indicator, compares them with a likelihood
# ratio test, and reports the indicator's effect size, standard error and the
# LRT p-value in `res`.
data[["isCD16pos"]] <- as.numeric(data$celltype == "CD16 Mono")
full_model <- lme4::glmer(
    formula = E ~ isCD16pos + (1 | id) + (1 | batch) + age + sex + nCount_RNA + percent.mt +
        gPC1 + gPC2 + gPC3 + gPC4 + gPC5 + gPC6,
    family = "poisson", nAGQ = 0, data = data,
    control = glmerControl(optimizer = "nloptwrap")
)
null_model <- lme4::glmer(
    formula = E ~ (1 | id) + (1 | batch) + age + sex + nCount_RNA + percent.mt +
        gPC1 + gPC2 + gPC3 + gPC4 + gPC5 + gPC6,
    family = "poisson", nAGQ = 0, data = data,
    control = glmerControl(optimizer = "nloptwrap")
)
model_lrt <- anova(null_model, full_model)

# Look the indicator up by name rather than by row position: if lme4 drops it
# as rank-deficient (e.g. no CD16 Mono cells), row 2 would be another covariate
# and would be reported silently as the cell-type effect. By name, that case
# fails loudly instead.
celltype_coef <- summary(full_model)$coefficients["isCD16pos", ]
res <- data.frame(
    "GENE" = gene,
    "BETA" = celltype_coef["Estimate"],   # celltype beta
    "SE" = celltype_coef["Std. Error"],   # celltype se
    "P" = model_lrt$`Pr(>Chisq)`[2]       # LRT p-value (full vs null)
)
res

Unnamed: 0_level_0,GENE,BETA,SE,P
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>
Estimate,BCL2A1,2.072498,0.0102238,0


## Not included in manuscript: KLRC1 eQTL in NK subtypes

In [6]:
# Inputs for the KLRC1 analysis in NK cells: lead SNP, input directory, the
# cell populations used to build the expression / genotype file names, and the
# genotype file paths.
args <- list(
    lead_snp = "12:10583611:C:T",
    src_filepath = "/data/srlab/lrumker/MCSC_Project/cna-qtl/eqtls/results/sceQTL/inputs/",
    celltype_expr = "NK",
    celltype_geno = "NK",
    geno = "/data/srlab/lrumker/MCSC_Project/cna-qtl/results/geno_munge/cis_snps/NK_12:10583611:C:T_cis.DS.vcf.gz",
    geno_ids = "/data/srlab/lrumker/datasets/onek1k/geno/sample_labels/sample_list_chr12.txt"
)

In [7]:
# Read the phenotype and covariate tables for this analysis; all three files
# live under args$src_filepath.
input_dir <- args$src_filepath
expr_file <- paste0(
    input_dir, args$celltype_geno, "_", args$lead_snp,
    "_csaQTL_test_", args$celltype_expr, "_eQTLs_selgene_exp.csv"
)
# raw UMI counts
exprs_raw <- read.csv(expr_file, row.names = 1)
# gene expression PCs
pca_res <- read.csv(paste0(input_dir, args$celltype_expr, "_ePCs.csv"), row.names = 1)
# cell and donor covariates
cell_meta <- read.csv(paste0(input_dir, args$celltype_expr, "_cellmeta.csv"))

In [8]:
# Build the per-cell modelling table for KLRC1 and load the genotype table.
gene <- "KLRC1"
# Expression, expression PCs and cell/donor covariates are bound column-wise
# (rows are taken to be in the same order).
data <- cbind(cbind(exprs_raw, pca_res), cell_meta)
data[, gene] <- as.numeric(data[, gene])
data$id <- factor(data$id)                       # donor id as grouping factor
data$age <- scale(data$age)                      # standardised age
data$nCount_RNA <- scale(log(data$nCount_RNA))   # nUMI (log, standardised)

# load genotype data
# The first column of the genotype file becomes the row names (compared against
# args$lead_snp downstream); the remaining columns are labelled with the IDs in
# the first row of the sample-label file.
geno <- read.table(args$geno, row.names = 1)
geno_ids <- read.table(args$geno_ids)
geno_ids <- as.character(geno_ids[1, ])
colnames(geno) <- geno_ids
# only donors that passed QC; drop = FALSE keeps `geno` a data.frame even when
# a single donor column is retained, so later row/column indexing still works.
geno <- geno[, colnames(geno) %in% unique(data$id), drop = FALSE]
# Response column used by the model formulas.
data["E"] <- data[, gene]

In [9]:
# Just test csaQTL lead snp
#
# Fits Poisson mixed models of E with and without the lead-SNP genotype G
# (random intercepts for donor id and batch), compares them with a likelihood
# ratio test, and stores SNP, gene, G effect size, standard error and LRT
# p-value in `res`. If model fitting fails, `res` is set to NA and a warning
# reports the reason.

# Pull the lead SNP's genotype row once as a plain numeric vector, then expand
# it to one value per cell via each cell's donor id.
lead_row <- which(rownames(geno) == args$lead_snp)
if (length(lead_row) != 1L) {
    stop("Expected exactly one genotype row for lead SNP ", args$lead_snp,
         ", found ", length(lead_row), call. = FALSE)
}
lead_dosage <- as.numeric(as.character(unlist(geno[lead_row, , drop = FALSE], use.names = FALSE)))
donor_idx <- match(data$id, colnames(geno))
if (anyNA(donor_idx)) {
    stop("Some cells belong to donors that are absent from the genotype table", call. = FALSE)
}
G_snp <- data.frame("G" = lead_dosage[donor_idx])
mod_data <- cbind(data, G_snp)

# Assign the tryCatch value to `res` so a failed fit yields NA instead of
# leaving a stale `res` from an earlier cell to be printed below.
res <- tryCatch({
    full_model <- lme4::glmer(formula = E~G+(1|id)+(1|batch)+age+sex+nCount_RNA+percent.mt+gPC1+gPC2+gPC3+gPC4+gPC5+gPC6+ePC1+ePC2+ePC3+ePC4+ePC5,
                              family = "poisson", nAGQ=0, data= mod_data, control = glmerControl(optimizer = "nloptwrap"))
    null_model <- lme4::glmer(formula = E~(1|id)+(1|batch)+age+sex+nCount_RNA+percent.mt+gPC1+gPC2+gPC3+gPC4+gPC5+gPC6+ePC1+ePC2+ePC3+ePC4+ePC5,
                              family = "poisson", nAGQ=0, data= mod_data, control = glmerControl(optimizer = "nloptwrap"))
    model_lrt <- anova(null_model, full_model)
    # Look G up by name: if it were dropped as rank-deficient, row 2 would be
    # a different covariate.
    g_coef <- summary(full_model)$coefficients["G", ]
    data.frame("SNP" = args$lead_snp, "GENE" = gene,
               "BETA" = g_coef["Estimate"],    # G beta
               "SE" = g_coef["Std. Error"],    # G se
               "P" = model_lrt$`Pr(>Chisq)`[2])
}, error = function(cond) {
    warning("eQTL model failed for ", gene, " at ", args$lead_snp, ": ",
            conditionMessage(cond), call. = FALSE)
    NA
})
res

Unnamed: 0_level_0,SNP,GENE,BETA,SE,P
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<dbl>,<dbl>
Estimate,12:10583611:C:T,KLRC1,-0.3970231,0.02833444,3.4982000000000005e-41


In [10]:
# How strongly does NK subtype predict KLRC1 expression?
#
# Fits two Poisson mixed models of E (random intercepts for donor id and batch)
# that differ only in the isCD56br indicator, compares them with a likelihood
# ratio test, and reports the indicator's effect size, standard error and the
# LRT p-value in `res`.
data[["isCD56br"]] <- as.numeric(data$celltype == "NK_CD56bright")
full_model <- lme4::glmer(
    formula = E ~ isCD56br + (1 | id) + (1 | batch) + age + sex + nCount_RNA + percent.mt,
    family = "poisson", nAGQ = 0, data = data,
    control = glmerControl(optimizer = "nloptwrap")
)
null_model <- lme4::glmer(
    formula = E ~ (1 | id) + (1 | batch) + age + sex + nCount_RNA + percent.mt,
    family = "poisson", nAGQ = 0, data = data,
    control = glmerControl(optimizer = "nloptwrap")
)
model_lrt <- anova(null_model, full_model)

# Look the indicator up by name rather than by row position: if lme4 drops it
# as rank-deficient (e.g. no NK_CD56bright cells), row 2 would be another
# covariate and would be reported silently as the subtype effect. By name, that
# case fails loudly instead.
celltype_coef <- summary(full_model)$coefficients["isCD56br", ]
res <- data.frame(
    "GENE" = gene,
    "BETA" = celltype_coef["Estimate"],   # celltype beta
    "SE" = celltype_coef["Std. Error"],   # celltype se
    "P" = model_lrt$`Pr(>Chisq)`[2]       # LRT p-value (full vs null)
)
res

Unnamed: 0_level_0,GENE,BETA,SE,P
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>
Estimate,KLRC1,1.02341,0.01156346,0


In [11]:
# retest eQTL in CD56br subset
#
# Restricts `data` to NK_CD56bright cells (overwriting `data`), then fits the
# same lead-SNP Poisson mixed models with and without genotype G and stores
# SNP, gene, G effect size, standard error and LRT p-value in `res`. If model
# fitting fails, `res` is set to NA and a warning reports the reason.
data <- data[data$celltype == "NK_CD56bright", ]

# Just test lead snp
# Pull the lead SNP's genotype row once as a plain numeric vector, then expand
# it to one value per cell via each cell's donor id.
lead_row <- which(rownames(geno) == args$lead_snp)
if (length(lead_row) != 1L) {
    stop("Expected exactly one genotype row for lead SNP ", args$lead_snp,
         ", found ", length(lead_row), call. = FALSE)
}
lead_dosage <- as.numeric(as.character(unlist(geno[lead_row, , drop = FALSE], use.names = FALSE)))
donor_idx <- match(data$id, colnames(geno))
if (anyNA(donor_idx)) {
    stop("Some cells belong to donors that are absent from the genotype table", call. = FALSE)
}
G_snp <- data.frame("G" = lead_dosage[donor_idx])
mod_data <- cbind(data, G_snp)

# Assign the tryCatch value to `res` so a failed fit yields NA instead of
# leaving a stale `res` from an earlier cell to be printed below.
res <- tryCatch({
    full_model <- lme4::glmer(formula = E~G+(1|id)+(1|batch)+age+sex+nCount_RNA+percent.mt+gPC1+gPC2+gPC3+gPC4+gPC5+gPC6+ePC1+ePC2+ePC3+ePC4+ePC5,
                              family = "poisson", nAGQ=0, data= mod_data, control = glmerControl(optimizer = "nloptwrap"))
    null_model <- lme4::glmer(formula = E~(1|id)+(1|batch)+age+sex+nCount_RNA+percent.mt+gPC1+gPC2+gPC3+gPC4+gPC5+gPC6+ePC1+ePC2+ePC3+ePC4+ePC5,
                              family = "poisson", nAGQ=0, data= mod_data, control = glmerControl(optimizer = "nloptwrap"))
    model_lrt <- anova(null_model, full_model)
    # Look G up by name: if it were dropped as rank-deficient, row 2 would be
    # a different covariate.
    g_coef <- summary(full_model)$coefficients["G", ]
    data.frame("SNP" = args$lead_snp, "GENE" = gene,
               "BETA" = g_coef["Estimate"],    # G beta
               "SE" = g_coef["Std. Error"],    # G se
               "P" = model_lrt$`Pr(>Chisq)`[2])
}, error = function(cond) {
    warning("eQTL model failed for ", gene, " at ", args$lead_snp, ": ",
            conditionMessage(cond), call. = FALSE)
    NA
})
res

Unnamed: 0_level_0,SNP,GENE,BETA,SE,P
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<dbl>,<dbl>
Estimate,12:10583611:C:T,KLRC1,-0.1120682,0.03416342,0.001040051


In [12]:
# Fraction of all NK cells that are CD56bright
# NOTE(review): both counts are hard-coded literals; presumably 5786 is the
# number of NK_CD56bright cells and 111524 the total number of NK cells in
# this dataset — confirm against the cell metadata.
5786/111524