In [1]:
## Include packages

using DelimitedFiles
using DCAUtils
using Plots
using Statistics
using KitMSA
using FastaIO
using LinearAlgebra
using StatsBase

include("../src/read_write.jl")
include("../src/energy.jl")
include("../src/msa_tools.jl")
include("../src/extra_tools.jl")
;

In [6]:
"""
    get_entropy(f)

Return a `Vector{Float64}` with one entry per column of `f`, holding the Shannon
entropy (natural logarithm) of that column.

Only rows `1:20` of `f` are read; any further rows are ignored. Entries that are
not strictly positive contribute nothing to the sum.
"""
function get_entropy(f)
    ncols = size(f, 2)
    entr = zeros(Float64, ncols)
    for col in 1:ncols
        for state in 1:20
            p = f[state, col]
            # Skip zero (or negative) entries: p * log(p) is taken as 0 there.
            p > 0 || continue
            entr[col] -= p * log(p)
        end
    end
    return entr
end

"""
    proba_DNA_gibbs_without_deg(k, mutated_seq, h, J, N, T = 1)

Return a length-20 vector of probabilities (summing to 1) for the 20 states
`1:20` at position `k`, given the rest of `mutated_seq`.

For each state `q` the unnormalised log-weight is
`(h[q, k] + sum(J[mutated_seq[j], q, j, k] for j in 1:N)) / T`.

# Arguments
- `k`: position whose state distribution is computed.
- `mutated_seq`: sequence of state indices; elements `1:N` are read.
- `h`: array indexed as `h[state, position]`.
- `J`: array indexed as `J[state_j, state_k, j, k]`.
- `N`: number of positions summed over.
- `T`: temperature dividing the log-weights (default `1`).

The log-weights are shifted by their maximum before exponentiation, so very
large or very negative energies no longer overflow/underflow into `NaN`.
"""
function proba_DNA_gibbs_without_deg(k, mutated_seq, h, J, N, T = 1)
    log_weights = zeros(Float64, 20)
    for q_k in 1:20
        log_proba = h[q_k, k]
        # NOTE(review): the sum includes j == k, i.e. J[mutated_seq[k], q_k, k, k];
        # this presumably relies on the diagonal blocks of J being zero — confirm.
        for j in 1:N
            log_proba += J[mutated_seq[j], q_k, j, k]
        end
        log_weights[q_k] = log_proba / T
    end
    # Subtracting the maximum leaves the normalised result unchanged but keeps
    # exp() in a safe range (the largest term becomes exp(0) == 1).
    shift = maximum(log_weights)
    prob = exp.(log_weights .- shift)
    return normalize(prob, 1)
end

"""
    cont_dep_entr_without_deg(background, h, J)

Return one entropy value per position of `background`.

For every position the state distribution from `proba_DNA_gibbs_without_deg`
(at temperature 1.0, with `background` as context) is computed; the
distributions are stacked as columns and passed to `get_entropy`.
"""
function cont_dep_entr_without_deg(background, h, J)
    temperature = 1.0
    seq_len = length(background)

    # One probability vector per position, later assembled column by column.
    site_probs = [
        ProbabilityWeights(
            proba_DNA_gibbs_without_deg(site, background, h, J, seq_len, temperature)
        ) for site in 1:seq_len
    ]
    prob = hcat(site_probs...)
    return get_entropy(prob)[:]
end

"""
    write_cde_of_chains(folder, out_path, mask, h, J, n)

Compute entropy profiles (via `cont_dep_entr_without_deg`) for the first `n`
`.mixedDNA` files found in `folder` and write them to `out_path` with `writedlm`.

# Arguments
- `folder`: directory that is scanned for files ending in `.mixedDNA`.
- `out_path`: destination file.
- `mask`: row selector applied to each chain matrix (`chain[mask, :]`).
- `h`, `J`: model parameters forwarded to `cont_dep_entr_without_deg`.
- `n`: number of files (in `readdir` order) to process.

# Output layout
Each processed file becomes one column of the written matrix. Within a column
the values are the flattened (selected rows × positions) matrix in column-major
order, i.e. all selected rows for position 1, then for position 2, and so on.

Prints the file counter every 10 files.

# Throws
- `ArgumentError` if `folder` contains fewer than `n` `.mixedDNA` files.
"""
function write_cde_of_chains(folder, out_path, mask, h, J, n)
    fasta_files = filter(file -> endswith(file, ".mixedDNA"), readdir(folder))
    n <= length(fasta_files) || throw(ArgumentError(
        "requested n = $n chains but only $(length(fasta_files)) .mixedDNA files in $folder",
    ))
    res = []
    for i in 1:n
        # Read from the `folder` argument; the previous code used a global path here.
        chain = Int8.(fasta2matrix(joinpath(folder, fasta_files[i])))[mask, :]
        cde_msa = hcat([cont_dep_entr_without_deg(chain[step, :], h, J)
                for step in 1:size(chain, 1)]...)'
        push!(res, vec(cde_msa))
        if i % 10 == 0
            println(i)
        end
    end
    writedlm(out_path, hcat(res...))
end

write_cde_of_chains (generic function with 1 method)

In [4]:
# Load the natural PF13354 alignment; the matrix returned by fasta2matrix is
# converted element-wise to Int8. The trailing `;` suppresses the cell output.
path_nat = "../data/alignments/natural/PF13354_noinsert_max19gaps_nodupl_noclose.faa"
nat_MSA = Int8.(fasta2matrix(path_nat))
;

LoadError: SystemError: Genie.jl/data/alignments/natural/PF13354_noinsert_max19gaps_nodupl_noclose.faa: No such file or directory

In [5]:
# Read the model parameters h and J from the zip archive with KitMSA.extract_params.
# q=21 is forwarded as a keyword (presumably the number of states per site — TODO confirm).
PATH_PAR = "../data/model_parameters/Parameters_conv_Matteo_pc.zip"
h, J = KitMSA.extract_params(PATH_PAR;q=21)
;

In [19]:
# Read the first 100 `.mixedDNA` chain files (as Int8 matrices, timed) and hand
# them to build_seq_matrices.
folder_path = "../data/chains/files_amino_mixed_pse_steps60000000_seqs1000_T1.0p0.5"
fasta_files = filter(endswith(".mixedDNA"), readdir(folder_path))
@time matrices = map(
    file -> Int8.(fasta2matrix(joinpath(folder_path, file))),
    fasta_files[1:100],
)
step_matrices = build_seq_matrices(matrices)
;

  0.090508 seconds (141.81 k allocations: 22.000 MiB, 53.06% compilation time)


In [21]:
path_wt_pse = "../data/alignments/refs/PSE1_pfam_DNA.fasta"

# Load the wild-type DNA (first line skipped), cut it into consecutive codons of
# three characters and map every codon through `cod2amino`.
wt_pse = join(readdlm(path_wt_pse, skipstart = 1))
# Int64(...) throws if the sequence length is not a multiple of 3.
L_pse = Int64(length(wt_pse)/3)
pse_DNA_seq = [wt_pse[(3i - 2):(3i)] for i in 1:L_pse]
pse_amino_seq = map(codon -> cod2amino[codon], pse_DNA_seq)
;

## Data for 2A

In [22]:
# Print the residues at the selected sites for the wild type, then for three
# natural sequences with dd < 92 and three with dd > 140
# (dd presumably counts mutations relative to the wild type — TODO confirm).
# 18 vc
# 45 v
# 77 cv
#179 c
dd = count_muts_msa(nat_MSA, pse_amino_seq)
sites = [18, 19, 45, 46, 99, 100, 179, 180]
pse = pse_amino_seq[sites]
println(map(num2letter, pse))
close_pse = nat_MSA[dd .< 92, sites]
for i in (1, 10, 14)
    println(map(num2letter, close_pse[i, :]))
end
nat = nat_MSA[dd .> 140, sites]
for i in (2, 7, 11)
    println(map(num2letter, nat[i, :]))
end

["T", "S", "E", "I", "F", "L", "G", "A"]
["M", "S", "R", "I", "F", "L", "G", "A"]
["A", "S", "L", "V", "F", "I", "G", "A"]
["L", "S", "E", "V", "F", "M", "G", "A"]
["A", "S", "K", "Y", "T", "F", "G", "D"]
["A", "S", "I", "A", "K", "A", "G", "F"]
["A", "S", "R", "L", "V", "L", "G", "G"]


## Data for 2B-C-D

In [9]:
#cde_NAT = readdlm("../cde_NAT", '\t') # if you have already created it
;

In [37]:
# Compute the entropy profile of every row of nat_MSA, print the elapsed
# wall-clock time in seconds, and write all profiles to disk.
start = time()
cde_NAT = []
for i in axes(nat_MSA, 1)
    # nat_MSA[i, :] is already a fresh copy of the row.
    push!(cde_NAT, cont_dep_entr_without_deg(nat_MSA[i, :], h, J))
end
fine = time()
println(fine - start)
writedlm("../data/figures/data_fig2/cde_NAT", cde_NAT)

558.7889490127563


In [21]:
# Entropy profile of the wild-type sequence (pse_amino_seq), one value per
# position, written to disk.
cde_pse = cont_dep_entr_without_deg(pse_amino_seq, h, J)
writedlm("../data/figures/data_fig2/cde_PSE", cde_pse)

In [17]:
# Per-site entropy of the natural alignment.
# compute_weighted_frequencies receives the transposed alignment and 0.2 as its
# second argument (presumably the reweighting threshold θ — TODO confirm); only
# its first return value is used, reshaped to 20 × L before get_entropy is applied.
L = length(pse_amino_seq)
freqs_nat = reshape(compute_weighted_frequencies(Int8.(nat_MSA'), 0.2)[1], (20, L))
entr_nat = get_entropy(freqs_nat)
writedlm("../data/figures/data_fig2/cie_betalac", entr_nat)

θ = 0.2 threshold = 40.0
M = 18334 N = 202 Meff = 6875.804074759526


In [23]:
# Read every `.mixedDNA` chain file in the folder (as Int8 matrices, timed) and
# hand them to build_seq_matrices.
folder_path = "../data/chains/files_amino_mixed_pse_steps60000000_seqs1000_T1.0p0.5/"
fasta_files = filter(endswith(".mixedDNA"), readdir(folder_path))
@time matrices = map(
    file -> Int8.(fasta2matrix(joinpath(folder_path, file))),
    fasta_files,
)
step_matrices = build_seq_matrices(matrices)
;

  0.521247 seconds (765.51 k allocations: 175.079 MiB, 6.93% gc time, 9.50% compilation time)


In [14]:
# Per-site entropy for every element of step_matrices: frequencies are computed
# with second argument 0, reshaped to 20 × L, and reduced with get_entropy.
L = length(pse_amino_seq)
freqs = [
    reshape(compute_weighted_frequencies(Int8.(MSA'), 0)[1], (20, L))
    for MSA in step_matrices
]
entr = map(get_entropy, freqs)
writedlm("../data/figures/data_fig2/evol_entr_betalac", entr)

θ = 0.0 threshold = 0.0
M = 1000 N = 202 Meff = 1000
θ = 0.0 threshold = 0.0
M = 1000 N = 202 Meff = 1000
θ = 0.0 threshold = 0.0
M = 1000 N = 202 Meff = 1000
θ = 0.0 threshold = 0.0
M = 1000 N = 202 Meff = 1000
θ = 0.0 threshold = 0.0
M = 1000 N = 202 Meff = 1000
θ = 0.0 threshold = 0.0
M = 1000 N = 202 Meff = 1000
θ = 0.0 threshold = 0.0
M = 1000 N = 202 Meff = 1000
θ = 0.0 threshold = 0.0
M = 1000 N = 202 Meff = 1000
θ = 0.0 threshold = 0.0
M = 1000 N = 202 Meff = 1000
θ = 0.0 threshold = 0.0
M = 1000 N = 202 Meff = 1000
θ = 0.0 threshold = 0.0
M = 1000 N = 202 Meff = 1000
θ = 0.0 threshold = 0.0
M = 1000 N = 202 Meff = 1000
θ = 0.0 threshold = 0.0
M = 1000 N = 202 Meff = 1000
θ = 0.0 threshold = 0.0
M = 1000 N = 202 Meff = 1000
θ = 0.0 threshold = 0.0
M = 1000 N = 202 Meff = 1000
θ = 0.0 threshold = 0.0
M = 1000 N = 202 Meff = 1000
θ = 0.0 threshold = 0.0
M = 1000 N = 202 Meff = 1000
θ = 0.0 threshold = 0.0
M = 1000 N = 202 Meff = 1000
θ = 0.0 threshold = 0.0
M = 1000 N = 202 Meff 

In [15]:
# Boolean selector over 112 indices. An index i is kept when
#   - i < 8, or
#   - 8 < i < 20 and i is a multiple of 3, or
#   - i is outside 9..19 and a multiple of 5.
# Index 8 itself is therefore not selected.
mask = BitArray(
    i < 8 || (8 < i < 20 ? i % 3 == 0 : i % 5 == 0) for i in 1:112
)


# Export the entropy profiles of the first n chain files (masked rows only), timing the call.
n = 100
folder_path = "../data/chains/files_amino_mixed_pse_steps60000000_seqs1000_T1.0p0.5"
@time write_cde_of_chains(
    folder_path, "../data/figures/data_fig2/cde_chains", mask, h, J, n
)

10
20
30
40
50
60
70
80
90
100
 95.942394 seconds (3.93 M allocations: 739.275 MiB, 0.52% gc time, 2.72% compilation time)


In [16]:
# Values n^3.8 for n = 1..112, written to disk
# (presumably the MCMC step count of each stored chain step — TODO confirm).
X = (1:112) .^ 3.8
writedlm("../data/figures/data_fig2/mcmc_steps_betalac", X)