In [1]:
using Distributed
# Start 22 worker processes; this must happen before the `@everywhere` line so the workers exist.
addprocs(22)

# Load the analysis packages on the main process and on every worker.
@everywhere using Analytical, CSV, DataFrames, JLD2, ProgressMeter
# Root directory for all inputs and outputs. Later cells append relative paths with `*`,
# so the trailing slash is required.
PATH = "/home/jmurga/mkt/202004/"
# NOTE(review): presumably makes the R plotting helpers (and RCall's `@rput` / `R"""`) available
# for the cells further down — confirm against the Analytical documentation.
Analytical.sourcePlotMapR(script="/home/jmurga/script.jl")

"/home/jmurga/mkt/202004/"

We used R to compute the maximum a posteriori (MAP) estimates, following the ABCreg examples.

In [2]:
# Model parameter container shared by every dataset below.
# n = 661 matches the `sample=661` passed to parseSfs and the "1000/661" key read from the rates file.
adap = Analytical.parameters(N = 1000, n = 661)

In [3]:
# Open the JLD2 file once; the handle is reused as `rates` by every summaryStatsFromRates call,
# so it is intentionally left open here.
ratesPath = string(PATH, "rawData/tgp_v1.jld2")
h5file = jldopen(ratesPath)
# Copy the dac vector stored under the N=1000 / n=661 key into the parameter container.
adap.dac = h5file["1000/661/dac"]

9-element Array{Int64,1}:
    2
    4
    5
   10
   20
   50
  200
  500
 1000

# Whole-genome

In [None]:
# Working folder for the whole-genome analysis; the cells below write into it and pass it on.
analysisFolder = string(PATH, "rawData/summStat/tgp/wg/")

In [None]:
# Parse the whole-genome input table using the dac vector stored on `adap`.
wg = Analytical.parseSfs(
    sample = 661,
    data   = string(PATH, "rawData/tgp/wg.tsv"),
    dac    = adap.dac,
)

Writing the SFS and divergence files to a new analysis folder

In [None]:
# Create the analysis folder (and any missing parents) natively instead of shelling out to `mkdir -p`.
mkpath(analysisFolder)
# NOTE(review): the other four datasets read elements 2 and 3 of the parseSfs result, while this
# cell reads 4 and 5 — confirm which indices hold the SFS and divergence tables.
# `joinpath` avoids the double slash that `analysisFolder * "/..."` produced (the folder already ends in "/").
CSV.write(joinpath(analysisFolder, "sfsWg.tsv"), DataFrame(wg[4], [:f, :pi, :p0]), delim='\t')
CSV.write(joinpath(analysisFolder, "divWg.tsv"), DataFrame(wg[5]', [:di, :d0]), delim='\t')

Bootstrapping data following polyDFE manual and estimating summary statistics

In [None]:
# Timed run; the trailing `;` suppresses the cell output.
@time summstat = Analytical.summaryStatsFromRates(
    param          = adap,
    rates          = h5file,
    analysisFolder = analysisFolder,
    summstatSize   = 500_000,
    replicas       = 100,
    bootstrap      = true,
);

Performing inference

In [None]:
# Run the external ABCreg binary on the files in `analysisFolder`.
# S is the number of entries in adap.dac.
Analytical.ABCreg(
    analysis = analysisFolder,
    replicas = 100,
    P        = 5,
    S        = size(adap.dac, 1),
    tol      = 0.002,
    workers  = 20,
    abcreg   = "/home/jmurga/ABCreg/src/reg",
    parallel = true,
);

Estimating MAP distribution

In [None]:
# Build the whole-genome MAP table (the figure path is passed as `output`), then summarise it.
wgmap = Analytical.plotMap(
    analysis = analysisFolder,
    output   = string(PATH, "results/abc/tgp/wg_map.svg"),
);
describe(wgmap)

In [None]:
# Save the whole-genome MAP table as a tab-separated file.
CSV.write(string(PATH, "results/abc/tgp/wg_map.tsv"), wgmap; delim = '\t')

# VIPs

In [None]:
# Working folder for the VIPs analysis; the cells below write into it and pass it on.
analysisFolder = string(PATH, "rawData/summStat/tgp/vips/")

In [None]:
# Parse the VIPs input table using the dac vector stored on `adap`.
vips = Analytical.parseSfs(
    sample = 661,
    data   = string(PATH, "rawData/tgp/vips.tsv"),
    dac    = adap.dac,
)

Writing the SFS and divergence files to a new analysis folder

In [None]:
# Create the analysis folder (and any missing parents) natively instead of shelling out to `mkdir -p`.
mkpath(analysisFolder)
# Elements 2 and 3 of the parseSfs result are written as the SFS (f, pi, p0) and divergence (di, d0) tables.
# `joinpath` avoids the double slash that `analysisFolder * "/..."` produced (the folder already ends in "/").
CSV.write(joinpath(analysisFolder, "sfsVips.tsv"), DataFrame(vips[2], [:f, :pi, :p0]), delim='\t')
CSV.write(joinpath(analysisFolder, "divVips.tsv"), DataFrame(vips[3]', [:di, :d0]), delim='\t')

Bootstrapping data following polyDFE manual and estimating summary statistics

In [None]:
# Timed run; the trailing `;` suppresses the cell output.
@time summstat = Analytical.summaryStatsFromRates(
    param          = adap,
    rates          = h5file,
    analysisFolder = analysisFolder,
    summstatSize   = 500_000,
    replicas       = 100,
    bootstrap      = true,
);

Performing inference

In [None]:
# Run the external ABCreg binary on the files in `analysisFolder`.
# S is the number of entries in adap.dac.
Analytical.ABCreg(
    analysis = analysisFolder,
    replicas = 100,
    P        = 5,
    S        = size(adap.dac, 1),
    tol      = 0.002,
    workers  = 20,
    abcreg   = "/home/jmurga/ABCreg/src/reg",
    parallel = true,
);

Estimating MAP distribution

In [None]:
# Build the VIPs MAP table (the figure path is passed as `output`), then summarise it.
vipsmap = Analytical.plotMap(
    analysis = analysisFolder,
    output   = string(PATH, "results/abc/tgp/vips_map.svg"),
);
describe(vipsmap)

In [None]:
# Save the VIPs MAP table as a tab-separated file.
CSV.write(string(PATH, "results/abc/tgp/vips_map.tsv"), vipsmap; delim = '\t')

# DNA VIPs

In [None]:
# Working folder for the DNA-VIPs analysis; the cells below write into it and pass it on.
analysisFolder = string(PATH, "rawData/summStat/tgp/dna_vips/")

In [None]:
# Parse the DNA-VIPs input table using the dac vector stored on `adap`.
dnaVips = Analytical.parseSfs(
    sample = 661,
    data   = string(PATH, "rawData/tgp/dna_vips.tsv"),
    dac    = adap.dac,
)

Writing the SFS and divergence files to a new analysis folder

In [None]:
# Create the analysis folder (and any missing parents) natively instead of shelling out to `mkdir -p`.
mkpath(analysisFolder)
# Elements 2 and 3 of the parseSfs result are written as the SFS (f, pi, p0) and divergence (di, d0) tables.
# `joinpath` avoids the double slash that `analysisFolder * "/..."` produced (the folder already ends in "/").
CSV.write(joinpath(analysisFolder, "sfsDnaV.tsv"), DataFrame(dnaVips[2], [:f, :pi, :p0]), delim='\t')
CSV.write(joinpath(analysisFolder, "divDnaV.tsv"), DataFrame(dnaVips[3]', [:di, :d0]), delim='\t')

Bootstrapping data following polyDFE manual and estimating summary statistics

In [None]:
# Timed run; the trailing `;` suppresses the cell output.
@time summstat = Analytical.summaryStatsFromRates(
    param          = adap,
    rates          = h5file,
    analysisFolder = analysisFolder,
    summstatSize   = 500_000,
    replicas       = 100,
    bootstrap      = true,
);

Performing inference

In [None]:
# Run the external ABCreg binary on the files in `analysisFolder`.
# S is the number of entries in adap.dac.
Analytical.ABCreg(
    analysis = analysisFolder,
    replicas = 100,
    P        = 5,
    S        = size(adap.dac, 1),
    tol      = 0.002,
    workers  = 20,
    abcreg   = "/home/jmurga/ABCreg/src/reg",
    parallel = true,
);

Estimating MAP distribution

In [None]:
# Build the DNA-VIPs MAP table (the figure path is passed as `output`), then summarise it.
dnaVipsMap = Analytical.plotMap(
    analysis = analysisFolder,
    output   = string(PATH, "results/abc/tgp/dna_vips_map.svg"),
);
describe(dnaVipsMap)

In [None]:
# Save the DNA-VIPs MAP table as a tab-separated file.
CSV.write(string(PATH, "results/abc/tgp/dna_vips_map.tsv"), dnaVipsMap; delim = '\t')

# RNA VIPs

In [None]:
# Working folder for the RNA-VIPs analysis; the cells below write into it and pass it on.
analysisFolder = string(PATH, "rawData/summStat/tgp/rna_vips/")

In [None]:
# Parse the RNA-VIPs input table using the dac vector stored on `adap`.
rnaVips = Analytical.parseSfs(
    sample = 661,
    data   = string(PATH, "rawData/tgp/rna_vips.tsv"),
    dac    = adap.dac,
)

Writing the SFS and divergence files to a new analysis folder

In [None]:
# Create the analysis folder (and any missing parents) natively instead of shelling out to `mkdir -p`.
mkpath(analysisFolder)
# Elements 2 and 3 of the parseSfs result are written as the SFS (f, pi, p0) and divergence (di, d0) tables.
# `joinpath` avoids the double slash that `analysisFolder * "/..."` produced (the folder already ends in "/").
CSV.write(joinpath(analysisFolder, "sfsRnaV.tsv"), DataFrame(rnaVips[2], [:f, :pi, :p0]), delim='\t')
CSV.write(joinpath(analysisFolder, "divRnaV.tsv"), DataFrame(rnaVips[3]', [:di, :d0]), delim='\t')

Bootstrapping data following polyDFE manual and estimating summary statistics

In [None]:
# Timed run; the trailing `;` suppresses the cell output.
@time summstat = Analytical.summaryStatsFromRates(
    param          = adap,
    rates          = h5file,
    analysisFolder = analysisFolder,
    summstatSize   = 500_000,
    replicas       = 100,
    bootstrap      = true,
);

Performing inference

In [None]:
# Run the external ABCreg binary on the files in `analysisFolder`.
# S is the number of entries in adap.dac.
Analytical.ABCreg(
    analysis = analysisFolder,
    replicas = 100,
    P        = 5,
    S        = size(adap.dac, 1),
    tol      = 0.002,
    workers  = 20,
    abcreg   = "/home/jmurga/ABCreg/src/reg",
    parallel = true,
);

Estimating MAP distribution

In [None]:
# Build the RNA-VIPs MAP table (the figure path is passed as `output`), then summarise it.
rnaVipsMap = Analytical.plotMap(
    analysis = analysisFolder,
    output   = string(PATH, "results/abc/tgp/rna_vips_map.svg"),
);
describe(rnaVipsMap)

In [None]:
# Save the RNA-VIPs MAP table as a tab-separated file.
CSV.write(string(PATH, "results/abc/tgp/rna_vips_map.tsv"), rnaVipsMap; delim = '\t')

# Non-VIPs

In [None]:
# Working folder for the non-VIPs analysis; the cells below write into it and pass it on.
# NOTE(review): every other dataset lives under "rawData/summStat/tgp/"; confirm "tgp_v2" is intended here.
analysisFolder = string(PATH, "rawData/summStat/tgp_v2/nonvips/")

In [None]:
# Parse the non-VIPs input table using the dac vector stored on `adap`.
nonvips = Analytical.parseSfs(
    sample = 661,
    data   = string(PATH, "rawData/tgp/nonvips.tsv"),
    dac    = adap.dac,
)

Writing the SFS and divergence files to a new analysis folder

In [None]:
# Create the analysis folder (and any missing parents) natively instead of shelling out to `mkdir -p`.
mkpath(analysisFolder)
# Elements 2 and 3 of the parseSfs result are written as the SFS (f, pi, p0) and divergence (di, d0) tables.
# `joinpath` avoids the double slash that `analysisFolder * "/..."` produced (the folder already ends in "/").
CSV.write(joinpath(analysisFolder, "sfsNonvips.tsv"), DataFrame(nonvips[2], [:f, :pi, :p0]), delim='\t')
CSV.write(joinpath(analysisFolder, "divNonvips.tsv"), DataFrame(nonvips[3]', [:di, :d0]), delim='\t')

Bootstrapping data following polyDFE manual and estimating summary statistics

In [None]:
# Timed run; the trailing `;` suppresses the cell output.
@time summstat = Analytical.summaryStatsFromRates(
    param          = adap,
    rates          = h5file,
    analysisFolder = analysisFolder,
    summstatSize   = 500_000,
    replicas       = 100,
    bootstrap      = true,
);

Performing inference

In [None]:
# Run the external ABCreg binary on the files in `analysisFolder`.
# S is the number of entries in adap.dac.
Analytical.ABCreg(
    analysis = analysisFolder,
    replicas = 100,
    P        = 5,
    S        = size(adap.dac, 1),
    tol      = 0.002,
    workers  = 20,
    abcreg   = "/home/jmurga/ABCreg/src/reg",
    parallel = true,
);

Estimating MAP distribution

In [None]:
# Build the non-VIPs MAP table (the figure path is passed as `output`), then summarise it.
nonvipsmap = Analytical.plotMap(
    analysis = analysisFolder,
    output   = string(PATH, "results/abc/tgp/nonvips_map.svg"),
);
describe(nonvipsmap)

In [None]:
# Save the non-VIPs MAP table as a tab-separated file.
CSV.write(string(PATH, "results/abc/tgp/nonvips_map.tsv"), nonvipsmap; delim = '\t')

# Plot

In [None]:
# Copy the five MAP tables and the root path into the R session under the same names.
@rput wgmap vipsmap dnaVipsMap rnaVipsMap nonvipsmap PATH

## Whole-genome, VIPs, non-VIPs

In [None]:
R"""
# Label each posterior table so the three datasets can be stacked and faceted.
wgmap$analysis = "Whole-genome dataset"
vipsmap$analysis = "VIPs dataset"
nonvipsmap$analysis = "Non-VIPs dataset"
dfAll = as.data.table(rbind(wgmap,vipsmap,nonvipsmap))

# Keep the first three columns (renamed to the alpha labels below) plus column 6,
# which is expected to be the analysis label added above - TODO confirm column order.
alphas = dfAll[,c(1:3,6)]
names(alphas) = c(paste(expression(alpha[w])),paste(expression(alpha[s])),paste(expression(alpha)),'analysis')

# Explicit id.vars, as in the quantile cell, instead of relying on the guessed id column.
alphasPlot = melt(alphas,id.vars='analysis')

tgpPlot = ggplot(alphasPlot) +
    geom_density(aes(x=value,fill=variable),alpha=0.75) +
    facet_wrap(~analysis) +
    scale_fill_manual(
        "Posterior distribution",
        values = paletteSanMiguel,
        labels = c(
            expression(paste("Posterior ",alpha[w])),
            expression(paste("Posterior ",alpha[s])),
            expression(paste("Posterior ",alpha))
        )
    ) +
    xlab(expression(alpha)) +
    ylab("") +
    theme_bw()

# Write the stacked table. The original passed the undefined name df, which in base R
# resolves to the F-density function rather than a table.
fwrite(dfAll,paste0(PATH,'results/abc/tgp_map.tsv'),sep='\t')
ggsave(tgpPlot,filename=paste0(PATH,'results/abc/tgp/tgp_map.svg'),width=14,dpi=600)
ggsave(tgpPlot,filename=paste0(PATH,'results/abc/tgp/tgp_map.jpg'),width=14,dpi=600)
tgpPlot
"""

In [None]:
R"""
# Summarise every posterior column per dataset as: mean [10% quantile-90% quantile].
d = melt(dfAll,id.vars='analysis')
dfQ = d %>%
    group_by(analysis,variable) %>%
    summarize(q=paste0(round(mean(value),3)," [",quantile(round(value,3),c(0.1)),"-",quantile(round(value,3),0.9),"]"))
# One row per dataset, one column per parameter.
dfQ = reshape2::dcast(dfQ,analysis~variable,value.var='q')

# sep is an argument of fwrite. Inside paste0 it was concatenated onto the file name
# (appending a tab) and the file was written with the default separator.
# PATH already ends with a slash, so the relative part must not start with one.
fwrite(dfQ,paste0(PATH,'results/abc/tgp/tgp_comparison_map_quantiles.tsv'),sep='\t')
"""

## non-VIPs vs VIPs

In [None]:
R"""
# Label and stack the VIPs and non-VIPs posterior tables.
vipsmap$analysis = "VIPs dataset"
nonvipsmap$analysis = "Non-VIPs dataset"
dfAll = as.data.table(rbind(vipsmap,nonvipsmap))

# Keep the first three columns (renamed to the alpha labels below) plus column 6,
# which is expected to be the analysis label added above - TODO confirm column order.
alphas = dfAll[,c(1:3,6)]
names(alphas) = c(paste(expression(alpha[w])),paste(expression(alpha[s])),paste(expression(alpha)),'analysis')

# Explicit id.vars instead of relying on the guessed id column.
alphasPlot = melt(alphas,id.vars='analysis')

nv = ggplot(alphasPlot) +
    geom_density(aes(x=value,fill=variable),alpha=0.75) +
    facet_wrap(~analysis) +
    scale_fill_manual(
        "Posterior distribution",
        values = paletteSanMiguel,
        labels = c(
            expression(paste("Posterior ",alpha[w])),
            expression(paste("Posterior ",alpha[s])),
            expression(paste("Posterior ",alpha))
        )
    ) +
    xlab(expression(alpha)) +
    ylab("") +
    theme_bw()

# Write the stacked table. The original passed the undefined name df and reused the
# tgp_map.tsv path of the three-dataset cell, which would have overwritten that table.
# The file is now named after this comparison's figures.
fwrite(dfAll,paste0(PATH,'results/abc/nonVips_vips_map.tsv'),sep='\t')
ggsave(nv,filename=paste0(PATH,'results/abc/tgp/nonVips_vips_map.svg'),width=14,dpi=600)
ggsave(nv,filename=paste0(PATH,'results/abc/tgp/nonVips_vips_map.jpg'),width=14,dpi=600)
nv
"""

## All VIPs

In [None]:
R"""
# Label and stack the three VIPs posterior tables.
vipsmap$analysis = "VIPs dataset"
dnaVipsMap$analysis = "DNA-VIPs dataset"
rnaVipsMap$analysis = "RNA-VIPs dataset"
dfVips = as.data.table(rbind(vipsmap,dnaVipsMap,rnaVipsMap))

# Keep the first three columns (renamed to the alpha labels below) plus column 6,
# which is expected to be the analysis label added above - TODO confirm column order.
alphas = dfVips[,c(1:3,6)]
names(alphas) = c(paste(expression(alpha[w])),paste(expression(alpha[s])),paste(expression(alpha)),'analysis')

# Explicit id.vars instead of relying on the guessed id column.
alphasPlot = melt(alphas,id.vars='analysis')

vipsPlots = ggplot(alphasPlot) +
    geom_density(aes(x=value,fill=variable),alpha=0.75) +
    facet_wrap(~analysis) +
    scale_fill_manual(
        "Posterior distribution",
        values = paletteSanMiguel,
        labels = c(
            expression(paste("Posterior ",alpha[w])),
            expression(paste("Posterior ",alpha[s])),
            expression(paste("Posterior ",alpha))
        )
    ) +
    xlab(expression(alpha)) +
    ylab("") +
    theme_bw()

# Write the stacked table. The original passed the undefined name df, which in base R
# resolves to the F-density function rather than a table.
fwrite(dfVips,paste0(PATH,'results/abc/vips_comparison_map.tsv'),sep='\t')
ggsave(vipsPlots,filename=paste0(PATH,'results/abc/tgp/vips_comparison_map.svg'),width=14,dpi=600)
ggsave(vipsPlots,filename=paste0(PATH,'results/abc/tgp/vips_comparison_map.jpg'),width=14,dpi=600)

vipsPlots
"""

In [None]:
R"""
# Summarise every posterior column per VIPs dataset as: mean [10% quantile-90% quantile].
d = melt(dfVips,id.vars='analysis')
dfQ = d %>%
    group_by(analysis,variable) %>%
    summarize(q=paste0(round(mean(value),3)," [",quantile(round(value,3),c(0.1)),"-",quantile(round(value,3),0.9),"]"))
# One row per dataset, one column per parameter.
dfQ = reshape2::dcast(dfQ,analysis~variable,value.var='q')

# sep is an argument of fwrite. Inside paste0 it was concatenated onto the file name
# (appending a tab) and the file was written with the default separator.
# PATH already ends with a slash, so the relative part must not start with one.
fwrite(dfQ,paste0(PATH,'results/abc/tgp/vips_comparison_map_quantiles.tsv'),sep='\t')
"""

## DNA-VIPs vs RNA-VIPs

In [None]:
R"""
# Label and stack the DNA-VIPs and RNA-VIPs posterior tables.
dnaVipsMap$analysis = "DNA-VIPs dataset"
rnaVipsMap$analysis = "RNA-VIPs dataset"
dfVips = as.data.table(rbind(dnaVipsMap,rnaVipsMap))

# Keep the first three columns (renamed to the alpha labels below) plus column 6,
# which is expected to be the analysis label added above - TODO confirm column order.
alphas = dfVips[,c(1:3,6)]
names(alphas) = c(paste(expression(alpha[w])),paste(expression(alpha[s])),paste(expression(alpha)),'analysis')

# Explicit id.vars instead of relying on the guessed id column.
alphasPlot = melt(alphas,id.vars='analysis')

vipsPlots = ggplot(alphasPlot) +
    geom_density(aes(x=value,fill=variable),alpha=0.75) +
    facet_wrap(~analysis) +
    scale_fill_manual(
        "Posterior distribution",
        values = paletteSanMiguel,
        labels = c(
            expression(paste("Posterior ",alpha[w])),
            expression(paste("Posterior ",alpha[s])),
            expression(paste("Posterior ",alpha))
        )
    ) +
    xlab(expression(alpha)) +
    ylab("") +
    theme_bw()

# Write the stacked table. The original passed the undefined name df and reused the
# vips_comparison_map.tsv path of the all-VIPs cell, which would have overwritten that
# table. The file is now named after this comparison's figures.
fwrite(dfVips,paste0(PATH,'results/abc/dna_rna_vips.tsv'),sep='\t')
ggsave(vipsPlots,filename=paste0(PATH,'results/abc/tgp/dna_rna_vips.svg'),width=14,dpi=600)
ggsave(vipsPlots,filename=paste0(PATH,'results/abc/tgp/dna_rna_vips.jpg'),width=14,dpi=600)

vipsPlots
"""