In [None]:
library(data.table)
library(parallel)

In [None]:
# Set the working directory to the project directory that contains all the
# sub-folders; the relative paths used in the cells below are resolved against it.
setwd(dir = "/broad/regevtmp/jklugham/HD_ST")

#### Calculate cell type likelihoods for each barcode/bin

In [None]:
# Compute, for every bin (x, y) and every reference cluster, a likelihood score
# and optionally an empirical p-value obtained by shuffling reference values.
#
# Arguments:
#   ref     data.table with columns `gene`, `ClusterName` and `log_norm`
#           (`-Inf` entries in `log_norm` are treated as "gene absent").
#   dat     data.table with columns `gene`, `x`, `y` and `count`
#           (plus any columns named in `exp`).
#   exp     optional character vector of extra grouping columns.
#   n_shuff number of shuffles per group; 0 disables the p-value computation.
#
# Returns a data.table with one row per (x, y, ClusterName[, exp]) holding
#   lk          sum(count * log_norm) over the genes of the bin,
#   count       total count of the bin,
#   umi_distrib the per-gene counts serialized as the text "c(c1,c2,...)",
#   emp_pval    empirical p-value (NA when n_shuff == 0, 1 when lk is -Inf).
#
# Side effect: when n_shuff != 0 the global RNG is re-seeded with 1234 for
# every (umi_distrib, ClusterName) group, so the caller's RNG state is
# overwritten and groups with the same cluster and the same number of counts
# draw identical shuffles.
calc_lk <- function(ref, dat, exp = NULL, n_shuff = 0) {
    stopifnot(is.numeric(n_shuff), length(n_shuff) == 1L, n_shuff >= 0)

    # Pair every observed (bin, gene) count with that gene's value in each cluster.
    dat_ct <- merge(dat, ref, by = "gene", allow.cartesian = TRUE)
    likelihoods <- dat_ct[, .(lk = sum(count * log_norm),
                              count = sum(count),
                              umi_distrib = paste0("c(", paste(count, collapse = ","), ")")
                             ), by = c("x", "y", "ClusterName", exp)]

    # Turn the serialized "c(1,2,3)" text back into a numeric vector without
    # evaluating it as R code.
    parse_umi_distrib <- function(txt) {
        inner <- gsub("^c\\(|\\)$", "", txt)
        as.numeric(strsplit(inner, ",", fixed = TRUE)[[1]])
    }

    # Empirical p-value for all bins that share one count distribution and one
    # cluster: fraction of shuffled scores that reach the observed score,
    # multiplied by (1 - share of -Inf reference entries)^(number of counts).
    get_emp_pval <- function(lk_orig, counts, cluster, ref_data, n_shuff) {
        ref_cl <- ref_data[ClusterName == cluster]$log_norm
        counts <- parse_umi_distrib(counts)
        zero_rat <- sum(ref_cl == -Inf) / length(ref_cl)
        p_no0 <- (1 - zero_rat)^length(counts)

        # Only finite reference values take part in the shuffles.
        pool <- ref_cl[ref_cl != -Inf]

        set.seed(1234)
        # One draw without replacement per shuffle, in the same order as the
        # draws would be produced one after another.
        lk_shuff <- vapply(rep(length(counts), n_shuff),
                           function(k) sum(counts * sample(pool, k, replace = FALSE)),
                           numeric(1))
        p <- vapply(lk_orig,
                    function(x) sum(lk_shuff >= x) / n_shuff,
                    numeric(1))
        p * p_no0
    }

    if (n_shuff != 0) {
        # Bins with identical counts and cluster share the same shuffles, so
        # they are processed together.
        likelihoods[lk != -Inf,
                    emp_pval := get_emp_pval(lk, umi_distrib, ClusterName, ref, n_shuff),
                    by = c("umi_distrib", "ClusterName")]
        likelihoods[lk == -Inf, emp_pval := 1]
    } else {
        # Keep the column numeric even when no p-values are computed.
        likelihoods[, emp_pval := NA_real_]
    }
    likelihoods
}

#### Process the output from calc_lk

In [None]:
# Add derived columns to the table returned by calc_lk.
#
# Arguments:
#   likelihoods data.table with at least `x`, `y`, `ClusterName`, `lk`,
#               `count` and `emp_pval` (plus any columns named in `exp`).
#               It is modified by reference via `:=` and also returned.
#   Ng          number used for the random-model score `count * log(1 / Ng)`.
#               The default refers to a global `Ngenes`, which must exist if
#               `Ng` is not supplied.
#   exp         optional character vector of extra grouping columns.
#
# Columns added:
#   lk_rat              exp(lk - max(lk)) within each bin (NaN when every lk
#                       of the bin is -Inf),
#   lk_norm             lk_rat divided by its sum within the bin,
#   lk_rand             count * log(1 / Ng),
#   second_best_ct      ClusterName with the second highest lk in the bin,
#   emp_pval_adjust_BH  BH-adjusted emp_pval within each ClusterName,
#   emp_pval_adjust_Bf  Bonferroni-adjusted emp_pval within each ClusterName,
#   bc                  "<x>_<y>" bin identifier,
#   ClusterName_simpl   second "_"-separated field of ClusterName when there
#                       is one, otherwise ClusterName itself.
process_calk_lk <- function(likelihoods, Ng = Ngenes, exp = NULL) {
    # Second "_"-separated field of each name, falling back to the full name.
    # Names without a second field must not be indexed with [[2]], which
    # would raise "subscript out of bounds".
    simplify_cluster_name <- function(cluster) {
        cluster <- as.character(cluster)
        second <- vapply(strsplit(cluster, "_"),
                         function(parts) {
                             if (length(parts) >= 2L) parts[[2L]] else NA_character_
                         },
                         character(1))
        ifelse(is.na(second), cluster, second)
    }

    # Drop adjusted p-values left over from a previous call; only remove
    # columns that exist so the first call does not trigger a warning.
    stale <- intersect(c("emp_pval_adjust_BH", "emp_pval_adjust_Bf"),
                       names(likelihoods))
    if (length(stale) > 0L) {
        likelihoods[, (stale) := NULL]
    }

    # `base::exp` is spelled out because the parameter `exp` shadows the name.
    likelihoods[, lk_rat := base::exp(lk - max(lk)), by = c("x", "y", exp)]
    likelihoods[, lk_norm := lk_rat / sum(lk_rat), by = c("x", "y", exp)]
    likelihoods[, lk_rand := count * log(1 / Ng)]
    likelihoods[, second_best_ct := ClusterName[order(-lk)][2], by = c("x", "y", exp)]
    likelihoods[, emp_pval_adjust_BH := p.adjust(emp_pval, method = "BH"),
                by = c("ClusterName", exp)]
    likelihoods[, emp_pval_adjust_Bf := p.adjust(emp_pval, method = "bonferroni"),
                by = c("ClusterName", exp)]
    likelihoods[, bc := paste0(x, "_", y)]
    likelihoods[, ClusterName_simpl := simplify_cluster_name(ClusterName)]

    likelihoods
}

### MOB

In [None]:
# Cutoffs used when counting confident cell types per bin:
# thres[1] is the p-value cutoff, thres[2] the lk_norm cutoff.
thres <- c(0.01, 0.1)

In [None]:
# Reference table passed to calc_lk as `ref` for the MOB samples.
li_norm <- fread(input = "results/li_norm.tsv")

#### 1x (hd)

In [None]:
# Each cell in this group assigns `dat` (input table) and `tag` (label used in
# the output file name); the cell executed last decides what the run cell uses.
dat <- fread(input = "MOB/CN13_D2_filtered_red_ut.tsv")
tag <- "li_hd"

In [None]:
dat <- fread(input = "MOB_nc/CN13_D2_unmodgtf_filtered_red_ut.tsv")
tag <- "unmodgtf_li_hd"

In [None]:
dat <- fread(input = "MOB_nc/CN24_D1_unmodgtf_filtered_red_ut.tsv")
tag <- "D1_unmodgtf_li_hd"

In [None]:
dat <- fread(input = "MOB_nc/CN24_E1_unmodgtf_filtered_red_ut.tsv")
tag <- "E1_unmodgtf_li_hd"

#### binned 

In [None]:
# Binned inputs: each cell assigns `dat` and the matching `tag`; the cell
# executed last decides what the run cell uses.
dat <- fread(input = "MOB_binned/hdst-lowres.tsv")
tag <- "li_low"

In [None]:
dat <- fread(input = "MOB_binned_nc/hdst-lowres.tsv")
tag <- "unmodgtf_li_low"

In [None]:
dat <- fread(input = "MOB_binned_nc/D1/hdst-lowres.tsv")
tag <- "D1_unmodgtf_li_low"

In [None]:
dat <- fread(input = "MOB_binned_nc/E1/hdst-lowres.tsv")
tag <- "E1_unmodgtf_li_low"

#### segmented

In [None]:
# Segmented inputs: each cell assigns `dat` and the matching `tag`; the cell
# executed last decides what the run cell uses.
dat <- fread(input = "MOB/CN13_D2_filtered_red_ut_segmented.tsv")
tag <- "li_seg"

In [None]:
dat <- fread(input = "MOB_nc/CN13_D2_unmodgtf_filtered_red_ut_segmented.tsv")
tag <- "unmodgtf_li_seg"

In [None]:
dat <- fread(input = "MOB_nc/CN24_D1_unmodgtf_filtered_red_ut_segmented.tsv")
tag <- "D1_unmodgtf_li_seg"

In [None]:
dat <- fread(input = "MOB_nc/CN24_E1_unmodgtf_filtered_red_ut_segmented.tsv")
tag <- "E1_unmodgtf_li_seg"

#### here actually run

In [None]:
# Number of shuffles used for the empirical p-values.
ns <- 1000

# Score the currently selected `dat` against the MOB reference and report the
# elapsed time.
system.time(
    likelihoods_li <- calc_lk(li_norm, dat, n_shuff = ns)
)

# Save the raw calc_lk output first; the same file is overwritten below once
# the derived columns have been added.
write.table(
    likelihoods_li,
    paste0("results/ct_likelihoods_", tag, "_fastp_", ns, ".tsv"),
    sep = "\t",
    quote = FALSE,
    row.names = FALSE
)

likelihoods_li <- process_calk_lk(likelihoods = likelihoods_li, Ng = 10000)

# Per bin, count the clusters that pass all three filters
# (lk_norm cutoff, BH-adjusted p-value cutoff, lk_rat of at least 0.8).
likelihoods_li[
    ,
    N_ct := sum(
        lk_norm >= thres[2] & emp_pval_adjust_BH <= thres[1] & lk_rat >= 0.8,
        na.rm = TRUE
    ),
    by = c("x", "y")
]

write.table(
    likelihoods_li,
    paste0("results/ct_likelihoods_", tag, "_fastp_", ns, ".tsv"),
    sep = "\t",
    quote = FALSE,
    row.names = FALSE
)

### Breast cancer

In [None]:
# Cutoffs used when counting confident cell types per bin:
# thres[1] is the p-value cutoff, thres[2] the lk_norm cutoff.
thres <- c(0.05, 0.7)

In [None]:
# Reference table passed to calc_lk as `ref` for the breast cancer samples.
tnbc_norm <- fread(input = "results/tnbc_norm.tsv")

#### 1x (hd)

In [None]:
# Each cell in this group assigns `dat` (input table) and `tag` (label used in
# the output file name); the cell executed last decides what the run cell uses.
dat <- fread(input = "BC/CN21_BC24350_E2_filtered_red_ut.tsv")
tag <- "E2_tnbc_hd"

In [None]:
dat <- fread(input = "BC_nc/CN21_BC24350_E2_unmodgtf_filtered_red_ut.tsv")
tag <- "E2_unmodgtf_tnbc_hd"

In [None]:
dat <- fread(input = "BC_nc/CN21_BC24350_C1_unmodgtf_filtered_red_ut.tsv")
tag <- "C1_unmodgtf_tnbc_hd"

In [None]:
dat <- fread(input = "BC_nc/CN21_BC24350_D1_unmodgtf_filtered_red_ut.tsv")
tag <- "D1_unmodgtf_tnbc_hd"

#### binned 

In [None]:
# Binned inputs: each cell assigns `dat` and the matching `tag`; the cell
# executed last decides what the run cell uses.
dat <- fread(input = "BC_binned/hdst-lowres.tsv")
tag <- "E2_tnbc_low"

In [None]:
dat <- fread(input = "BC_binned_nc/hdst-lowres.tsv")
tag <- "E2_unmodgtf_tnbc_low"

In [None]:
dat <- fread(input = "BC_binned_nc/C1/hdst-lowres.tsv")
tag <- "C1_unmodgtf_tnbc_low"

In [None]:
dat <- fread(input = "BC_binned_nc/D1/hdst-lowres.tsv")
tag <- "D1_unmodgtf_tnbc_low"

#### segmented

In [None]:
# Segmented inputs: each cell assigns `dat` and the matching `tag`; the cell
# executed last decides what the run cell uses.
# NOTE(review): these file names end in `_segmentd.tsv`, whereas the MOB
# segmented inputs end in `_segmented.tsv` - confirm the names on disk.
dat <- fread(input = "BC/CN21_BC24350_E2_filtered_red_ut_segmentd.tsv")
tag <- "E2_tnbc_seg"

In [None]:
# NOTE(review): this unmodgtf file is read from `BC/`, while the other
# unmodgtf breast cancer inputs are read from `BC_nc/` - confirm the path.
dat <- fread(input = "BC/CN21_BC24350_E2_unmodgtf_filtered_red_ut_segmentd.tsv")
tag <- "E2_unmodgtf_tnbc_seg"

#### here actually run

In [None]:
# Number of shuffles used for the empirical p-values.
ns <- 1000

# Score the currently selected `dat` against the breast cancer reference and
# report the elapsed time.
system.time(
    likelihoods_tnbc <- calc_lk(tnbc_norm, dat, n_shuff = ns)
)

# Save the raw calc_lk output first; the same file is overwritten below once
# the derived columns have been added.
# (The "fastp_<ns>" part of the file name was only introduced 5/15.)
write.table(
    likelihoods_tnbc,
    paste0("results/ct_likelihoods_", tag, "_fastp_", ns, ".tsv"),
    sep = "\t",
    quote = FALSE,
    row.names = FALSE
)

likelihoods_tnbc <- process_calk_lk(likelihoods = likelihoods_tnbc, Ng = 10000)

# Per bin, count the clusters that pass all three filters
# (lk_norm cutoff, BH-adjusted p-value cutoff, lk_rat of at least 0.8).
likelihoods_tnbc[
    ,
    N_ct := sum(
        lk_norm >= thres[2] & emp_pval_adjust_BH <= thres[1] & lk_rat >= 0.8,
        na.rm = TRUE
    ),
    by = c("x", "y")
]

write.table(
    likelihoods_tnbc,
    paste0("results/ct_likelihoods_", tag, "_fastp_", ns, ".tsv"),
    sep = "\t",
    quote = FALSE,
    row.names = FALSE
)