Install Julia dependencies

In [None]:
using Pkg

In [None]:
# MKtest.jl is installed straight from its GitHub repository. `url` is the keyword
# Pkg documents for remote Git repositories; `path` is documented for local directories.
Pkg.add(url="https://github.com/jmurga/MKtest.jl")
# Remaining packages are added by name.
Pkg.add(["CSV", "Unzip", "DataFrames", "StatsBase", "RCall", "JLD2", "Suppressor","CondaPkg"])

In [None]:
using MKtest, CSV, Unzip, DataFrames, StatsBase, RCall, JLD2, Suppressor

In [None]:
# Base directories used to build every file path in this notebook. Both end with "/"
# because paths are formed by plain string concatenation.
labstorage = "/labstorage/jmurgamoreno/Immune_Adaptation_Atlas_2023/"
path = "/home/jmurgamoreno/Immune_Adaptation_Atlas_2023/"
# Copy both variables into the embedded R session under the same names.
@rput labstorage path;

In [None]:
"""
    cell_analysis(param; file, data_tgp, rates=nothing)

Run the ABC-MK steps for one gene list and return the summary as a `DataFrame`.

The data for the genes listed in `file` is parsed with `MKtest.parse_sfs`. Summary
statistics are then written with `MKtest.summary_statistics` into a folder named after
`file` (with `.txt` removed), posteriors are obtained with `MKtest.ABCreg`, and they are
summarised with `MKtest.summary_abc(...; stat="mode")`. Output printed by these three
steps is silenced with `@suppress`.

# Arguments
- `param`: parameter object forwarded to the `MKtest` calls; `param.dac` is read to set
  the `S` argument of `MKtest.ABCreg`.

# Keywords
- `file`: path of the gene list. The file name (minus the `_TOP_case.txt` /
  `_TOP_control.txt` suffix) becomes the `:cell` value, the directory two levels above
  the file becomes `:line`, and `:type` is `"case"` when the file name contains
  `"case"`, otherwise `"control"`.
- `data_tgp`: forwarded as `data` to `MKtest.parse_sfs`.
- `rates`: forwarded as `h5_file` to `MKtest.summary_statistics`.

# Returns
The first element of the `MKtest.summary_abc` output with the columns `:line`, `:cell`
and `:type` prepended (in that order). If the inference steps throw, a warning carrying
the exception is logged and an empty `DataFrame()` is returned. Errors raised by
`MKtest.parse_sfs` are not caught.
"""
function cell_analysis(param;file,data_tgp,rates=nothing)

    @show file
    # Deliberately outside the `try` below: unreadable input data still throws.
    _,sfs,divergence = MKtest.parse_sfs(param,data=data_tgp,gene_list=file)

    file_name = basename(file)

    # Get cell name
    cell_name = replace(file_name,"_TOP_control.txt"=>"")
    cell_name = replace(cell_name,"_TOP_case.txt"=>"")

    line_name = split(file,"/")[end-2]

    # Classify on the file name only, so a parent directory containing "case"
    # cannot turn a control list into a case.
    c_type = occursin("case",file_name) ? "case" : "control"

    folder = replace(file,".txt" => "")
    mkpath(folder)

    # The failure (if any) is carried out of the `@suppress` block so it can be
    # reported afterwards; logging inside the block would be silenced as well.
    out,failure = @suppress begin
        try
            MKtest.summary_statistics(param,sfs,divergence,h5_file=rates,output_folder= folder,summstat_size=10^5);

            posteriors = MKtest.ABCreg(output_folder=folder,S=length(param.dac),tol=0.025,rm_summaries=true);

            summaries = MKtest.summary_abc(posteriors,stat="mode");
            table = summaries[1]

            insertcols!(table,1,:type=>c_type)
            insertcols!(table,1,:cell=>cell_name)
            insertcols!(table,1,:line=>line_name)

            (table,nothing)
        catch err
            # Never swallow a user interrupt.
            err isa InterruptException && rethrow()
            (DataFrame(),(err,catch_backtrace()))
        end
    end

    if failure !== nothing
        @warn "ABC-MK inference failed; returning an empty DataFrame" file exception=failure
    end

    return out
end

Change paths as needed!

# Estimating analytical rates

In [None]:
# Be sure you're using multiple threads to estimate the rates
# `adap` is created here (same values as in the ABC-MK sections further down) so that
# this cell also works in a fresh session; previously it was referenced before any cell
# above had defined it.
adap = MKtest.parameters(n=661,dac=[2,4,5,10,20,50,200,661],cutoff=[0.0,0.7]);
# The rates are written to the JLD2 file that the ABC-MK cells later pass as `rates`.
df = MKtest.rates(adap,gH=[200,2000],gL=[1,10],gam_dfe=[-2000,-200],gam_flanking=[-1000,-500],alpha=[0.0,0.9],iterations=10^5,output= labstorage * "abcmk/rates_hpc.jld2");

# Cell lines

In [None]:
# One entry per cell-line directory. Plain files are dropped because the bootstrap loop
# calls `readdir` on every entry, which throws for anything that is not a directory
# (the ABC-MK cell applies the same `isdir` filter).
lines = filter(isdir,readdir(labstorage * "Developmental/ENS_FULL_genelists_wilcox",join=true))
# Make the same list available in the embedded R session.
@rput lines

## Bootstrap cell lines genes

In [None]:
# Annotation tables: the first is read without a header row, the second with one.
genes = CSV.read(string(labstorage, "annotations/MKdata_may2023.txt"), DataFrame; header=false);
orthologs = CSV.read(string(labstorage, "annotations/mammals_orthologs.txt"), DataFrame; header=true);

# Rename column `mgPropSitesAdapt` to `ids`, the key the bootstrap loop joins on.
rename!(orthologs, :mgPropSitesAdapt => :ids);

In [None]:
# Bootstrap settings; `data` and `output` are filled in per gene list by the loop below.
param_boot = MKtest.bootstrap_parameters(;
    factors=string(path, "annotations/confounding_factors_orthologs.txt"),
    annotation=string(path, "annotations/ensembl_gene_coords_v109.bed"),
)

In [None]:
# For every cell-line directory: take each "*FULL_ENSEMBL.txt" gene list, keep the genes
# present in `orthologs`, write the first 500 of them to "top_500_orthologs/" and hand
# that list to `MKtest.bootstrap`.
for l in lines

    files = filter(x -> occursin("FULL_ENSEMBL.txt",x) ,readdir(l,join=true))

    # All matching files are processed (as in the adult-tissue loop). The former
    # `files[1:2]` skipped any further file and threw a BoundsError when a directory
    # held fewer than two matches.
    for f ∈ files
        @show f

        f_path,f_de = splitdir(f)

        # Base name of the list: drop the "_FULL_ENSEMBL" tag and the extension.
        f_de = replace(f_de,"_FULL_ENSEMBL"=>"")
        f_de = split(f_de,".")[1]

        df = CSV.read(f,DataFrame,header=false)

        # Get orthologs only
        rename!(df,:Column1=>:ids)

        # `order=:left` keeps the row order of the input list.
        df_orthologs = innerjoin(df,orthologs,on=:ids,order=:left)

        # Extracting the top 500 DE genes by cell line (the first 500 rows;
        # presumably the input list is already ranked — confirm).
        top_de = first(df_orthologs,500)

        tmp_path = f_path * "/top_500_orthologs/"
        mkpath(tmp_path)

        # Only the first column (the ids) is written, without a header line.
        CSV.write(tmp_path * f_de * ".txt",Tables.table(top_de[:,1]),header=false)

        param_boot.data   = tmp_path * f_de * ".txt"
        param_boot.output = tmp_path * f_de
        MKtest.bootstrap(param_boot)
    end
end

## ABC-MK

In [None]:
# Parameter object shared by the ABC-MK calls below.
adap = MKtest.parameters(;
    n=661,
    dac=[2, 4, 5, 10, 20, 50, 200, 661],
    cutoff=[0.0, 0.7],
);

In [None]:
# Results per cell lineage, keyed by the lineage directory name; concatenated into a
# single DataFrame at the end of the cell.
results_abc = Dict{String,DataFrame}()

# Bootstrap files
lines = filter(isdir,readdir(labstorage * "Developmental/ENS_FULL_genelists_wilcox",join=true))

for l in lines
    @show l

    files = filter(x -> occursin("case.txt",x) || occursin("control.txt",x),readdir(l*"/top_500_orthologs/",join=true))
    # Nothing to analyse for this lineage.
    isempty(files) && continue

    abcmk = map(files) do f
        res = cell_analysis(adap,file=f,data_tgp=labstorage * "annotations/MKdata_may2023.txt",rates=labstorage * "abcmk/rates_hpc.jld2")
        # `:n` is the number of lines of the gene list for case files and 0 for controls.
        # It is attached per file, so a failed analysis (empty DataFrame) can no longer
        # misalign the counts with the rows of the concatenated table.
        if !isempty(res)
            insertcols!(res,:n=>(occursin("case",f) ? countlines(f) : 0))
        end
        res
    end

    # Empty (column-less) data frames are ignored by `vcat`.
    results_abc[basename(l)] = reduce(vcat,abcmk)
end
results_abc = vcat(values(results_abc)...)

In [None]:
# Store the developmental results under the key "cell_lines" (file opened in "a+" mode).
# NOTE(review): the directory is spelled "abmkc" here, whereas the adult and macrophage
# cells write to "results/" — confirm this is the intended location.
JLD2.jldopen(string(path, "abmkc/results_immune_abc.jld2"), "a+") do store
    store["cell_lines"] = results_abc
end

## Get CI

In [None]:
# One table per case gene list is collected here and concatenated at the end.
results_ci = DataFrame[]

# Cell-line directories that hold the bootstrap / ABC output.
lines = filter(isdir, readdir(labstorage * "Developmental/ENS_FULL_genelists_wilcox"; join=true))

for l in lines

    # Output folders of the case lists; each is expected to contain a posterior file.
    case_dirs = filter(readdir(l * "/top_500_orthologs/"; join=true)) do x
        isdir(x) && occursin("case", x)
    end
    post_files = case_dirs .* "/out_1.0.post.gz"
    posteriors = CSV.read.(post_files, DataFrame; header=false)

    for (post_file, posterior) in zip(post_files, posteriors)
        # .../<line>/top_500_orthologs/<cell>_case/out_1.0.post.gz
        parts = split(post_file, "/")
        line_name = parts[end - 3]
        cell_name = replace(parts[end - 1], "_case" => "")

        # The second element of the `summary_abc` output is the table written out below.
        ci = MKtest.summary_abc([posterior]; stat="mode")[2]
        insertcols!(ci, 1, :cell => cell_name)
        insertcols!(ci, 1, :line => line_name)
        push!(results_ci, ci)
    end
end

results_ci = vcat(results_ci...)

CSV.write(path * "Developmental/abcmk_ci.txt", results_ci)

# Adult tissues

## Bootstrap

In [None]:
# Bootstrap settings; `data` and `output` are filled in per gene list by the loop below.
# NOTE(review): the other two sections read these two files from `path` rather than
# `labstorage` — confirm both locations hold the same files.
param_boot = MKtest.bootstrap_parameters(;
    factors=string(labstorage, "annotations/confounding_factors_orthologs.txt"),
    annotation=string(labstorage, "annotations/ensembl_gene_coords_v109.bed"),
)

In [None]:
# Annotation tables: the first is read without a header row, the second with one.
genes = CSV.read(string(labstorage, "annotations/MKdata_may2023.txt"), DataFrame; header=false);
orthologs = CSV.read(string(labstorage, "annotations/mammals_orthologs.txt"), DataFrame; header=true);

# Rename column `mgPropSitesAdapt` to `ids`, the key the bootstrap loop joins on.
rename!(orthologs, :mgPropSitesAdapt => :ids)

In [None]:
# One entry per tissue directory. Plain files are dropped because the bootstrap loop
# calls `readdir` on every entry, which throws for anything that is not a directory
# (the CI cell of this section applies the same `isdir` filter).
lines = filter(isdir,readdir(labstorage * "Adult/ENS_FULL_genelists_wilcox/",join=true))

In [None]:
# For every directory in `lines`: take each "*FULL_ENSEMBL.txt" gene list, keep the genes
# present in `orthologs`, write the first 500 of them to "top_500_orthologs/" and hand
# that list to `MKtest.bootstrap`.
for l in lines
    for f ∈ readdir(l, join=true)
        # Only the full gene lists are of interest.
        occursin("FULL_ENSEMBL.txt", f) || continue
        @show f

        list_dir, list_name = splitdir(f)
        # Base name of the list: drop the "_FULL_ENSEMBL" tag and the extension.
        list_name = first(split(replace(list_name, "_FULL_ENSEMBL" => ""), "."))

        df = CSV.read(f, DataFrame; header=false)

        # Keep only genes found in `orthologs`, preserving the order of the input list.
        rename!(df, :Column1 => :ids)
        shared = innerjoin(df, orthologs; on=:ids, order=:left)

        # First 500 rows of the filtered list.
        top_rows = first(shared, 500)

        out_dir = list_dir * "/top_500_orthologs/"
        mkpath(out_dir)

        # Only the first column (the ids) is written, without a header line.
        gene_list = out_dir * list_name * ".txt"
        CSV.write(gene_list, Tables.table(top_rows[:, 1]); header=false)

        param_boot.data = gene_list
        param_boot.output = out_dir * list_name
        MKtest.bootstrap(param_boot)
    end
end

## ABC-MK

In [None]:
# Parameter object shared by the ABC-MK calls below.
adap = MKtest.parameters(;
    n=661,
    dac=[2, 4, 5, 10, 20, 50, 200, 661],
    cutoff=[0.0, 0.7],
);

In [None]:
# Results per tissue, keyed by the tissue directory name; concatenated into a single
# DataFrame at the end of the cell.
results_abc = Dict{String,DataFrame}()
# Same directory the bootstrap and CI cells of this section use ("Adult/", not
# "Adult_tissues/"), restricted to sub-directories.
lines = filter(isdir,readdir(labstorage * "Adult/ENS_FULL_genelists_wilcox/",join=true))

for l in lines
    @show l

    files = filter(x -> occursin("case.txt",x) || occursin("control.txt",x),readdir(l * "/top_500_orthologs/",join=true))
    # Nothing to analyse for this tissue.
    isempty(files) && continue

    abcmk = map(files) do f
        # `data_tgp` now points at the same annotation file every other cell reads
        # (`labstorage * "annotations/..."`).
        res = cell_analysis(adap,file=f,data_tgp=labstorage * "annotations/MKdata_may2023.txt",rates=labstorage * "abcmk/rates_hpc.jld2")
        # `:n` is the number of lines of the gene list for case files and 0 for controls.
        # It is attached per file, so a failed analysis (empty DataFrame) cannot
        # misalign the counts with the rows of the concatenated table.
        if !isempty(res)
            insertcols!(res,:n=>(occursin("case",f) ? countlines(f) : 0))
        end
        res
    end

    # Keyed by directory name; the previous key `i` was not defined in this loop.
    results_abc[basename(l)] = reduce(vcat,abcmk)
end
results_abc = vcat(values(results_abc)...)

In [None]:
# Store the adult-tissue results under the key "adult_tissues" (file opened in "a+" mode).
JLD2.jldopen(string(path, "results/results_immune_abc.jld2"), "a+") do store
    store["adult_tissues"] = results_abc
end

In [None]:
# One table per case gene list is collected here and concatenated at the end.
results_ci = DataFrame[]

# Tissue directories that hold the bootstrap / ABC output.
lines = filter(isdir, readdir(labstorage * "Adult/ENS_FULL_genelists_wilcox"; join=true))

for l in lines

    # Output folders of the case lists; each is expected to contain a posterior file.
    case_dirs = filter(readdir(l * "/top_500_orthologs/"; join=true)) do x
        isdir(x) && occursin("case", x)
    end
    post_files = case_dirs .* "/out_1.0.post.gz"
    posteriors = CSV.read.(post_files, DataFrame; header=false)

    for (post_file, posterior) in zip(post_files, posteriors)
        # .../<line>/top_500_orthologs/<cell>_case/out_1.0.post.gz
        parts = split(post_file, "/")
        line_name = parts[end - 3]
        cell_name = replace(parts[end - 1], "_case" => "")

        # The second element of the `summary_abc` output is the table written out below.
        ci = MKtest.summary_abc([posterior]; stat="mode")[2]
        insertcols!(ci, 1, :cell => cell_name)
        insertcols!(ci, 1, :line => line_name)
        push!(results_ci, ci)
    end
end

results_ci = vcat(results_ci...)
CSV.write(path * "Adult/abcmk_ci.txt", results_ci)

# Macrophages activation

In [None]:
# Gene lists of the macrophage activation experiment: every non-directory entry of the
# folder, as full paths.
activation_files = filter(!isdir, readdir(string(labstorage, "Macrophages/Output_lists/pval5x10-2_sorted"); join=true));

## Bootstrap macrophages

In [None]:
# Annotation tables: the first is read without a header row, the second with one.
genes = CSV.read(string(labstorage, "annotations/MKdata_may2023.txt"), DataFrame; header=false);
orthologs = CSV.read(string(labstorage, "annotations/mammals_orthologs.txt"), DataFrame; header=true);

# Rename column `mgPropSitesAdapt` to `ids`, the key the bootstrap loop joins on.
rename!(orthologs, :mgPropSitesAdapt => :ids)

In [None]:
# Bootstrap settings; `data` and `output` are filled in per gene list by the loop below.
param_boot = MKtest.bootstrap_parameters(;
    factors=string(path, "annotations/confounding_factors_orthologs.txt"),
    annotation=string(path, "annotations/ensembl_gene_coords_v109.bed"),
)

In [None]:
# For each activation gene list: keep the genes present in `orthologs`, write the first
# 500 of them to "top_500_orthologs/" and hand that list to `MKtest.bootstrap`.
for f ∈ activation_files
    @show f

    list_dir, list_name = splitdir(f)
    # Base name of the list: everything before the first ".".
    list_name = first(split(list_name, "."))

    df = CSV.read(f, DataFrame; header=false)

    # Keep only genes found in `orthologs`, preserving the order of the input list.
    rename!(df, :Column1 => :ids)
    shared = innerjoin(df, orthologs; on=:ids, order=:left)

    # First 500 rows of the filtered list.
    top_rows = first(shared, 500)

    out_dir = list_dir * "/top_500_orthologs/"
    mkpath(out_dir)

    # Only the first column (the ids) is written, without a header line.
    gene_list = out_dir * list_name * ".txt"
    CSV.write(gene_list, Tables.table(top_rows[:, 1]); header=false)

    param_boot.data = gene_list
    param_boot.output = out_dir * list_name
    MKtest.bootstrap(param_boot)
end

## ABC-MK

In [None]:
adap = MKtest.parameters(n=661,dac=[2,4,5,10,20,50,200,661],cutoff=[0.0,0.7]);

# Results keyed by gene-list file name (without extension).
results_abc    = Dict{String,DataFrame}()

# Case/control lists located in the "top_500_orthologs" folder.
activation_files = filter(x -> occursin("_case.txt",x) || occursin("_control.txt",x),readdir(labstorage * "Macrophages/Output_lists/pval5x10-2_sorted/top_500_orthologs",join=true));

for i in activation_files
    # The rates file lives under "abcmk/": that is where the rates cell writes it and
    # where the other ABC-MK cells read it. With the former path (no "abcmk/") the
    # inference step failed and `cell_analysis` returned an empty table.
    abcmk      = cell_analysis(adap,file=i,data_tgp=labstorage*"annotations/MKdata_may2023.txt",rates=labstorage*"abcmk/rates_hpc.jld2")

    k = split(basename(i),".")[1]
    # Number of lines of the gene list for case files, 0 for controls.
    n = occursin("case",i) ? countlines(i) : 0
    # An empty table signals a failed analysis; leave it untouched.
    isempty(abcmk) || insertcols!(abcmk,:n=>n)
    results_abc[k]    = abcmk
end

In [None]:
# Store the macrophage results under the key "macrophages_activation" (file opened in
# "a+" mode).
# NOTE(review): unlike the other two sections, the cell that builds `results_abc` does
# not concatenate it, so a Dict is stored here — confirm this is intended.
JLD2.jldopen(string(path, "results/results_immune_abc.jld2"), "a+") do store
    store["macrophages_activation"] = results_abc
end

In [None]:
# One table per case gene list is collected here and concatenated below.
results_ci = DataFrame[]
# Posterior files: one per case output folder inside "top_500_orthologs".
activation_files = filter(readdir(labstorage * "Macrophages/Output_lists/pval5x10-2_sorted/top_500_orthologs"; join=true)) do x
    isdir(x) && occursin("_case", x)
end .* "/out_1.0.post.gz";

posteriors = CSV.read.(activation_files, DataFrame; header=false)

for (post_file, posterior) in zip(activation_files, posteriors)
    # .../<line>/top_500_orthologs/<cell>_case/out_1.0.post.gz
    parts = split(post_file, "/")
    line_name = parts[end - 3]
    cell_name = replace(parts[end - 1], "_case" => "")

    # The second element of the `summary_abc` output is the table written out below.
    ci = MKtest.summary_abc([posterior]; stat="mode")[2]
    insertcols!(ci, 1, :cell => cell_name)
    insertcols!(ci, 1, :line => line_name)
    push!(results_ci, ci)
end

results_ci = vcat(results_ci...)

# Round-trip through R: `cell` is split on "_" into `cell` and `time`, and `line` is
# dropped. NOTE(review): assumes the R packages providing `%>%`, `separate`, `select`
# and `as.data.table` are already loaded in the R session — not loaded in the visible cells.
@rput results_ci

R"""
results_ci = results_ci %>% separate(cell, sep = "_", into = c("cell", "time")) %>% select(-c(line)) %>% as.data.table
"""

@rget results_ci
CSV.write(path * "Macrophages/abcmk_ci.txt", results_ci)