In [1]:
## Include packages

using DelimitedFiles
using DCAUtils
using Plots
using Statistics
using KitMSA
using FastaIO
using LinearAlgebra
using StatsBase

include("../src/read_write.jl")
include("../src/energy.jl")
include("../src/msa_tools.jl")
include("../src/extra_tools.jl")
;

In [2]:
"""
    get_entropy(f; q=20)

Return a `Vector{Float64}` holding, for every column `i` of `f`, the entropy
`-sum(f[a, i] * log(f[a, i]) for a in 1:q)` (natural logarithm).

Entries that are not strictly positive are skipped, so they contribute zero.
Only the first `q` rows of `f` are read; `f` must therefore have at least `q`
rows.
"""
function get_entropy(f; q=20)
    ncols = length(@view f[1, :])
    entropies = zeros(Float64, ncols)
    for col in 1:ncols, state in 1:q
        p = f[state, col]
        # Skip non-positive entries: they add nothing to the sum.
        p > 0 || continue
        entropies[col] -= p * log(p)
    end
    return entropies
end

"""
    proba_DNA_gibbs_without_deg(k, mutated_seq, h, J, N; q=20, T=1)

Return the normalized conditional distribution over the `q` states of site `k`.

For each state `a` in `1:q` the unnormalized weight is
`exp((h[a, k] + sum(J[mutated_seq[j], a, j, k] for j in 1:N)) / T)`.
The sum runs over every `j` in `1:N`, including `j == k`. The weights are
divided by their 1-norm before being returned.
"""
function proba_DNA_gibbs_without_deg(k, mutated_seq, h, J,N; q = 20, T = 1)
    weights = zeros(q)
    for state in eachindex(weights)
        # Start from the local field and add the couplings site by site,
        # left to right.
        log_weight = foldl(1:N; init=h[state, k]) do acc, site
            acc + J[mutated_seq[site], state, site, k]
        end
        weights[state] = exp(log_weight / T)
    end
    return normalize(weights, 1)
end

"""
    cont_dep_entr_without_deg(background, h, J; q=20)

Return, for every site of `background`, the entropy of the conditional
distribution produced by `proba_DNA_gibbs_without_deg` at that site.

The distributions are computed at fixed temperature `T = 1.0`, assembled as
the columns of a matrix, and passed to `get_entropy` with the same `q`. The
result has one entry per element of `background`.
"""
function cont_dep_entr_without_deg(background, h, J; q =20)
    temperature = 1.0
    seq_len = length(background)

    # One conditional distribution per site, given the rest of `background`.
    site_distributions = map(1:seq_len) do site
        ProbabilityWeights(
            proba_DNA_gibbs_without_deg(
                site, background, h, J, seq_len; q=q, T=temperature
            ),
        )
    end
    prob = hcat(site_distributions...)
    return vec(get_entropy(prob; q=q))
end

"""
    write_cde_of_chains(folder, out_path, mask, h, J, n)

Compute context-dependent entropies for the first `n` `.mixedDNA` files found
in `folder`, write the result to `out_path` with `writedlm`, and return it.

For each file, the matrix returned by `fasta2matrix` is converted to `Int8`
and restricted to the rows listed in `mask`. Every kept row is passed to
`cont_dep_entr_without_deg` with `q = 21`. The per-row results are stacked
with one row per kept row of the file, then flattened column-major into one
vector. The returned matrix has one such vector per file, as columns.

# Arguments
- `folder`: directory that is both listed and read from.
- `out_path`: destination file for `writedlm`.
- `mask`: row indices to keep from every file.
- `h`, `J`: model parameters forwarded to `cont_dep_entr_without_deg`.
- `n`: number of files to process, taken in `readdir` order; must not exceed
  the number of `.mixedDNA` files in `folder`.
"""
function write_cde_of_chains(folder, out_path, mask, h, J, n)
    fasta_files = filter(file -> endswith(file, ".mixedDNA"), readdir(folder))
    res = Vector{Float64}[]
    for i in 1:n
        # Read from the `folder` argument, the same directory that was listed
        # above, rather than from a global variable.
        chain = Int8.(fasta2matrix(joinpath(folder, fasta_files[i])))[mask, :]
        per_row = [
            cont_dep_entr_without_deg(chain[step, :], h, J; q=21)
            for step in axes(chain, 1)
        ]
        # One row per kept row of the file, one column per site.
        cde_msa = permutedims(hcat(per_row...))
        push!(res, vec(cde_msa))
        if i % 10 == 0
            println(i)
        end
    end
    # Build the output matrix once and use it for both the file and the return.
    out = hcat(res...)
    writedlm(out_path, out)
    return out
end

write_cde_of_chains (generic function with 1 method)

In [8]:
# Scratch check: evaluate n^3.8 for n = 1..40 and display the values.
[n^(3.8) for n in 1:40]

40-element Vector{Float64}:
      1.0
     13.928809012737984
     65.02206650257867
    194.0117205133309
    452.9872897985596
    905.6799459279664
   1626.9438030745541
   2702.3522012628864
   4227.869132265763
   6309.57344480193
   9063.42204273218
  12615.042993497509
  17099.55225712071
      ⋮
 360671.970030609
 410261.5041308155
 464701.88760151825
 524287.99999999965
 589322.4308034691
 660115.4305885407
 736984.8640093037
 820256.1644560841
 910262.290287913
      1.0073436825413486e6
      1.111848224026822e6
      1.2241311997312466e6

In [3]:
# Load the natural DBD alignment and convert the matrix returned by
# `fasta2matrix` to Int8 entries.
# NOTE(review): presumably one row per sequence (later cells index rows as
# sequences) — confirm against `fasta2matrix`.
path_nat = "../data/alignments/natural/DBD_alignment.uniref90.cov80.noclose.a2m"
nat_MSA = Int8.(fasta2matrix(path_nat))
;

In [4]:
# Read the model parameters and post-process them into the `h` and `J` used by
# the entropy functions.
PATH_PAR_dbd = "../data/model_parameters/Parameters_conv_denseDBDnc.dat"
htmp, Jtmp = KitMSA.read_par_BM(PATH_PAR_dbd)
# NOTE(review): judging by the names, this shifts the fields so the largest is
# 0 and symmetrizes the couplings — confirm against KitMSA.
h = KitMSA.set_max_field_to_0(htmp)
J = KitMSA.symmetrize_J(Jtmp)
# The value is neither stored nor displayed (the trailing `;` hides it).
maximum(J)
;

In [23]:
# Alternative chain set:
#   "../data/chains/amino_mixed_dbd_steps600000_seqs100_T1.0p0.5/"
folder_path = "../../scra_data_beta/amino_mixed_dbd_thornton_steps1000002_seqs100_T1.0p0.5/"

# Keep only the chain files and read each one into an Int8 matrix (timed).
fasta_files = filter(name -> endswith(name, ".mixedDNA"), readdir(folder_path))
@time matrices = map(
    name -> Int8.(fasta2matrix(joinpath(folder_path, name))), fasta_files
)

# Regroup the per-file matrices with `build_seq_matrices`.
step_matrices = build_seq_matrices(matrices)
;

  0.064913 seconds (97.01 k allocations: 9.506 MiB, 86.64% compilation time)


In [12]:
# Integer-encoded wild-type sequence (76 entries); its length defines `L` in
# the cells below.
# NOTE(review): presumably the same integer alphabet as `fasta2matrix` output
# — confirm.
wt_amino_seq = [2, 10, 18, 2, 6, 3, 15, 1, 16, 6, 20, 7, 20, 6, 18, 10, 16, 2, 4, 6, 2, 9, 1, 5, 5, 9, 15, 16, 8, 14, 6, 16, 18, 4, 20, 17, 2, 13, 1, 17, 12, 4, 2, 9, 8, 3, 9, 7, 15, 15, 9, 16, 2, 14, 1, 2, 15, 10, 14, 9, 2, 5, 4, 18, 6, 11, 11, 9, 4, 6, 18, 15, 9, 3, 15, 12]
;

## Data for 2B-C-D

In [9]:
# Context-dependent entropy of every row of the natural alignment, with the
# elapsed wall-clock time printed in seconds.
start = time()
# `nat_MSA[i, :]` already allocates a fresh vector, so no extra copy is needed.
# The element type is stated explicitly instead of growing an untyped `[]`.
cde_NAT = Vector{Float64}[
    cont_dep_entr_without_deg(nat_MSA[i, :], h, J; q=21) for i in axes(nat_MSA, 1)
]
fine = time()
println(fine-start)
# One line per alignment row.
writedlm("../data/figures/suppl_data_fig2/cde_dbd", cde_NAT)

35.94149708747864


In [22]:
# Context-dependent entropy of the wild-type sequence with q = 21 states,
# saved for the figure.
cde_wt = cont_dep_entr_without_deg(wt_amino_seq, h, J, q = 21)

writedlm("../data/figures/suppl_data_fig2/cde_wt", cde_wt)

In [21]:
# Per-site entropy of the natural alignment, with a 21st state appended.
L = length(wt_amino_seq)
# First return value of `compute_weighted_frequencies`, called with 0.2 as
# second argument, viewed as a 20 x L matrix.
freqs_nat = reshape(compute_weighted_frequencies(Int8.(nat_MSA'), 0.2)[1], (20, L))
# The 21st state receives whatever mass the 20 listed states leave over.
# NOTE(review): presumably this is the gap frequency — confirm.
gap_f = 1 .- [sum(freqs_nat[:, i]) for i in 1:L]
# Zero-initialised buffer sized from `L`: no random draw and no hard-coded 76.
freqs_nat_gap = zeros(21, L)
freqs_nat_gap[1:20, :] = freqs_nat
freqs_nat_gap[21, :] = gap_f
entr_nat = get_entropy(freqs_nat_gap, q=21)
writedlm("../data/figures/suppl_data_fig2/cie_dbd", entr_nat)

θ = 0.2 threshold = 15.0
M = 23599 N = 76 Meff = 3133.8287015310534


In [27]:
# Integer grid n^3.8 for n = 1..38, each value rounded to the nearest Int and
# saved for the figure.
x = [round(Int, n^3.8) for n in 1:38]
writedlm("../data/figures/suppl_data_fig2/x_steps", x)
;

In [45]:
# Per-site entropy (default q = 20 states) of every element of `step_matrices`
# except the first one.
L = length(wt_amino_seq)
@time freqs = [
    reshape(first(compute_weighted_frequencies(Int8.(MSA'), 21, 0)), (20, L))
    for MSA in step_matrices[2:end]
]
entr = get_entropy.(freqs)
writedlm("../data/figures/suppl_data_fig2/evol_entr_dbd", entr)

θ = 0.0 threshold = 0.0
M = 100 N = 76 Meff = 100
θ = 0.0 threshold = 0.0
M = 100 N = 76 Meff = 100
θ = 0.0 threshold = 0.0
M = 100 N = 76 Meff = 100
θ = 0.0 threshold = 0.0
M = 100 N = 76 Meff = 100
θ = 0.0 threshold = 0.0
M = 100 N = 76 Meff = 100
θ = 0.0 threshold = 0.0
M = 100 N = 76 Meff = 100
θ = 0.0 threshold = 0.0
M = 100 N = 76 Meff = 100
θ = 0.0 threshold = 0.0
M = 100 N = 76 Meff = 100
θ = 0.0 threshold = 0.0
M = 100 N = 76 Meff = 100
θ = 0.0 threshold = 0.0
M = 100 N = 76 Meff = 100
θ = 0.0 threshold = 0.0
M = 100 N = 76 Meff = 100
θ = 0.0 threshold = 0.0
M = 100 N = 76 Meff = 100
θ = 0.0 threshold = 0.0
M = 100 N = 76 Meff = 100
θ = 0.0 threshold = 0.0
M = 100 N = 76 Meff = 100
θ = 0.0 threshold = 0.0
M = 100 N = 76 Meff = 100
θ = 0.0 threshold = 0.0
M = 100 N = 76 Meff = 100
θ = 0.0 threshold = 0.0
M = 100 N = 76 Meff = 100
θ = 0.0 threshold = 0.0
M = 100 N = 76 Meff = 100
θ = 0.0 threshold = 0.0
M = 100 N = 76 Meff = 100
θ = 0.0 threshold = 0.0
M = 100 N = 76 Meff = 100


In [46]:
# Keep rows 1..38 of every chain file. An earlier, sparser mask
# (2:2:16, 40:40:1600, 2000:200:12000) was assigned here and immediately
# overwritten; that dead store has been removed.
mask = collect(1:38)

folder_path = "../data/chains/amino_mixed_dbd_steps600000_seqs100_T1.0p0.5/"
@time cde_t = write_cde_of_chains(
    folder_path, "../data/figures/suppl_data_fig2/cde_chains", mask, h, J, 100
)


10
20
30
40
50
60
70
80
90
100
  8.561492 seconds (9.43 M allocations: 1.303 GiB, 1.76% gc time, 5.35% compilation time)


2888×100 Matrix{Float64}:
 0.140968  0.140968   0.140968   0.140968   …  0.140968   0.140968  0.140968
 0.22109   0.123962   0.139459   0.125925      0.126946   0.17547   0.208315
 0.252176  0.119453   0.100324   0.173242      0.157699   0.200558  0.165232
 0.202736  0.126402   0.104107   0.128776      0.110252   0.186663  0.158632
 0.172707  0.111282   0.0964172  0.126935      0.119809   0.190531  0.133344
 0.146947  0.134373   0.0854507  0.116377   …  0.109368   0.204448  0.163031
 0.170802  0.123472   0.0854507  0.106729      0.105702   0.197324  0.247775
 0.15596   0.140183   0.0898295  0.104887      0.126668   0.200023  0.253773
 0.15596   0.104053   0.109566   0.107358      0.111729   0.147603  0.253006
 0.134043  0.0767089  0.0985499  0.101814      0.0985268  0.114384  0.244284
 0.172134  0.0814216  0.0922003  0.102559   …  0.0764265  0.109852  0.252922
 0.21307   0.0667384  0.0757489  0.11419       0.0789027  0.101284  0.256535
 0.176542  0.0585661  0.0591667  0.0981487     0.0