In [6]:
## Include packages

using DelimitedFiles
using DCAUtils
using Plots
using Statistics
using KitMSA
using FastaIO
using LinearAlgebra
using StatsBase
using JLD2

include("../src/read_write.jl")
include("../src/energy.jl")
include("../src/msa_tools.jl")
include("../src/extra_tools.jl")
;

In [2]:
"""
    get_entropy(f)

Return the per-column Shannon entropy (natural log) of the frequency table `f`.

Only rows `1:20` of `f` contribute; any further rows are ignored. Entries that
are not strictly positive are skipped, so they add nothing to the sum.

# Returns
A `Vector{Float64}` with one entropy value per column of `f`.
"""
function get_entropy(f)
    n_sites = size(f, 2)
    site_entropy = zeros(Float64, n_sites)
    for site in 1:n_sites, state in 1:20
        p = f[state, site]
        # Non-positive frequencies carry no entropy (and log would be undefined).
        p > 0 || continue
        site_entropy[site] -= p * log(p)
    end
    return site_entropy
end

"""
    proba_DNA_gibbs_without_deg(k, mutated_seq, h, J, N, T = 1)

Return the normalized conditional probability of each state `1:20` at position
`k`, given the states of `mutated_seq` at positions `1:N`.

For every candidate state `q` the log-weight is
`(h[q, k] + sum(J[mutated_seq[j], q, j, k] for j in 1:N)) / T`.

# Arguments
- `k`: position whose state distribution is computed.
- `mutated_seq`: integer-encoded sequence used to index the first axis of `J`.
- `h`: fields, indexed as `h[state, position]`.
- `J`: couplings, indexed as `J[state_j, state_k, j, k]`.
- `N`: number of positions summed over.
- `T`: temperature dividing the log-weights (default `1`).

# Returns
A length-20 `Vector{Float64}` summing to one.
"""
function proba_DNA_gibbs_without_deg(k, mutated_seq, h, J, N, T = 1)
    log_weights = zeros(Float64, 20)
    for q_k in 1:20
        log_proba = h[q_k, k]
        # NOTE(review): the sum includes j == k, so this presumably relies on
        # J[:, :, k, k] being zero — confirm against the parameter files.
        for j in 1:N
            log_proba += J[mutated_seq[j], q_k, j, k]
        end
        log_weights[q_k] = log_proba / T
    end
    # Shift by the largest log-weight before exponentiating: the normalized
    # result is mathematically unchanged, but exp can no longer overflow to Inf
    # (which previously turned the whole distribution into NaN).
    log_weights .-= maximum(log_weights)
    return normalize(exp.(log_weights), 1)
end

"""
    cont_dep_entr_without_deg(background, h, J)

Return, for every position of `background`, the entropy of the conditional
state distribution at that position given the rest of the sequence.

The distributions come from `proba_DNA_gibbs_without_deg` at temperature 1.0
and are stacked column-wise before being passed to `get_entropy`.

# Returns
A vector with one entropy value per position of `background`.
"""
function cont_dep_entr_without_deg(background, h, J)
    temperature = 1.0
    seq_len = length(background)

    # One probability column per position of the background sequence.
    columns = map(1:seq_len) do site
        site_probs = proba_DNA_gibbs_without_deg(
            site, background, h, J, seq_len, temperature
        )
        ProbabilityWeights(site_probs)
    end
    prob_table = hcat(columns...)

    return get_entropy(prob_table)[:]
end

"""
    write_cde_of_chains(folder, out_path, mask, h, J, n)

Compute context-dependent entropies for the first `n` `.mixedDNA` files found in
`folder` and write them to `out_path` as a delimited table (one column per file).

# Arguments
- `folder`: directory scanned for files ending in `.mixedDNA`.
- `out_path`: destination passed to `writedlm`.
- `mask`: row selector applied to each loaded chain (`chain[mask, :]`).
- `h`, `J`: model parameters forwarded to `cont_dep_entr_without_deg`.
- `n`: number of files to process; must not exceed the number of matching files.

Prints the file counter every 10 files as a progress indicator.
"""
function write_cde_of_chains(folder, out_path, mask, h, J, n)
    fasta_files = filter(file -> endswith(file, ".mixedDNA"), readdir(folder))
    res = Vector{Float64}[]
    for i in 1:n
        # Read from the `folder` argument; the previous code used the notebook
        # global `folder_path`, silently ignoring the caller's directory.
        chain = Int8.(fasta2matrix(joinpath(folder, fasta_files[i])))[mask, :]
        # Rows of `cde_msa` are the selected chain steps, columns are positions.
        cde_msa = hcat([cont_dep_entr_without_deg(chain[step, :], h, J)
                        for step in 1:size(chain, 1)]...)'
        push!(res, collect(vec(cde_msa)))
        if i % 10 == 0
            println(i)
        end
    end
    writedlm(out_path, hcat(res...))
end

write_cde_of_chains (generic function with 1 method)

In [3]:
# Load the natural-sequence alignment from its FASTA file and store it with
# Int8 entries.
# NOTE(review): `fasta2matrix` is defined outside this file; confirm the
# orientation of the matrix it returns (sequences x positions?).
path_nat = "../data/alignments/natural/PF13354_noinsert_max19gaps_nodupl_noclose.faa"
nat_MSA = Int8.(fasta2matrix(path_nat))
;

In [4]:
# Read the model parameters `h` and `J` from the parameter file, with q = 21.
# NOTE(review): the recorded run of this cell was interrupted (see the
# LoadError output below); `h` and `J` are reassigned by the following cell.
PATH_PAR = "../../scra_data_beta/Parameters_conv_Matteo_pc.dat"
h, J = KitMSA.extract_params(PATH_PAR;q=21)
;

LoadError: at row 8878971, column 6 : InterruptException()

In [7]:
# `@load` without a variable list brings every variable stored in the JLD2 file
# into scope; `h_lp` is expected to be one of them.
# NOTE(review): naming the variables explicitly would make the dependency on
# the file contents visible — confirm what else the file holds first.
@load "../../scra_data_beta/local_global_beta_profile.jld2"
# Use the loaded profile fields as `h`.
h = h_lp
# All couplings are zero here; the sizes 21 and 202 are hard-coded.
J = zeros(21,21,202,202);

In [10]:
# Load the first 10 `.mixedDNA` chain files of the simulation folder as Int8
# matrices (timed with `@time`), then regroup them with `build_seq_matrices`.
# NOTE(review): `build_seq_matrices` is defined outside this file; presumably it
# returns one matrix per chain step collecting that step across chains — confirm.
# The slice `fasta_files[1:10]` throws if fewer than 10 matching files exist.
folder_path = "../../scra_data_beta/amino_mixed_pse_local_steps60000000_seqs100_T1.0p0.5/"
fasta_files = filter(file -> endswith(file, ".mixedDNA"), readdir(folder_path))
@time matrices = [Int8.(fasta2matrix(joinpath(folder_path, file))) 
    for file in fasta_files[1:10]]
step_matrices = build_seq_matrices(matrices)
;

 79.862881 seconds (72.07 M allocations: 16.784 GiB, 3.62% gc time, 0.06% compilation time)


In [17]:
path_wt_pse = "../data/alignments/refs/PSE1_pfam_DNA.fasta"

# Read the wild-type DNA: skip the first line of the file and concatenate the
# remaining lines into a single string.
wt_pse = join(readdlm(path_wt_pse, skipstart = 1))
# Number of codons; the conversion throws if the length is not a multiple of 3.
L_pse = Int64(length(wt_pse)/3)
# Cut the DNA string into consecutive 3-character codons.
pse_DNA_seq = map(i -> wt_pse[(3i - 2):(3i)], 1:L_pse)
# Translate each codon through the `cod2amino` lookup.
pse_amino_seq = map(codon -> cod2amino[codon], pse_DNA_seq)
;

## Data for 2B-C-D

In [53]:
# 0/1 indicator (stored as Float64) over 1_200_001 chain steps: entry i is 1
# when step i is one of the power-law sampled indices round(n^3.8), n = 1:112.
# NOTE(review): as a Float64 vector this cannot be used directly as an index
# (e.g. `chain[mask, :]`); convert to Bool first if that is the intent.
mask = zeros(1200001)
idxs = round.(Int, [n^(3.8) for n in 1:112])
# Flag the sampled steps directly instead of testing every one of the 1.2M
# positions for membership in `idxs`. Sampled indices beyond the end of `mask`
# are ignored, exactly as the membership loop did.
mask[filter(i -> 1 <= i <= length(mask), idxs)] .= 1;

In [58]:
# Display the first ten sampled step indices as a quick check.
idxs[1:10]

10-element Vector{Int64}:
    1
   14
   65
  194
  453
  906
 1627
 2702
 4228
 6310

In [62]:
# Keep only the per-step alignments at the first 39 sampled step indices.
# The typed comprehension preserves the Vector{Matrix{Int8}} container type of
# the previous preallocate-and-overwrite loop, without the hard-coded 10x202
# placeholders and without a global counter that only works under notebook
# soft scope.
new_step_m = Matrix{Int8}[step_matrices[i] for i in idxs[1:39]];

In [64]:
# Per-position entropy of the sampled alignments at each selected chain step,
# written to disk for the figure.
L = length(pse_amino_seq)
# For each step: transpose the alignment, compute frequencies with weighting
# threshold 0, and view the first returned element as a (20, L) table.
# NOTE(review): the (20, L) reshape presumably assumes 21 states with the last
# one dropped by `compute_weighted_frequencies` — confirm against DCAUtils.
freqs = [reshape(compute_weighted_frequencies(Int8.(MSA'),0)[1],(20, L)) 
        for MSA in new_step_m]
# One entropy vector (length L) per selected step.
entr = [get_entropy(f) for f in freqs]
writedlm("../data/figures/local_data_fig2/evol_entr_betalac", entr)

θ = 0.0 threshold = 0.0
M = 10 N = 202 Meff = 10
θ = 0.0 threshold = 0.0
M = 10 N = 202 Meff = 10
θ = 0.0 threshold = 0.0
M = 10 N = 202 Meff = 10
θ = 0.0 threshold = 0.0
M = 10 N = 202 Meff = 10
θ = 0.0 threshold = 0.0
M = 10 N = 202 Meff = 10
θ = 0.0 threshold = 0.0
M = 10 N = 202 Meff = 10
θ = 0.0 threshold = 0.0
M = 10 N = 202 Meff = 10
θ = 0.0 threshold = 0.0
M = 10 N = 202 Meff = 10
θ = 0.0 threshold = 0.0
M = 10 N = 202 Meff = 10
θ = 0.0 threshold = 0.0
M = 10 N = 202 Meff = 10
θ = 0.0 threshold = 0.0
M = 10 N = 202 Meff = 10
θ = 0.0 threshold = 0.0
M = 10 N = 202 Meff = 10
θ = 0.0 threshold = 0.0
M = 10 N = 202 Meff = 10
θ = 0.0 threshold = 0.0
M = 10 N = 202 Meff = 10
θ = 0.0 threshold = 0.0
M = 10 N = 202 Meff = 10
θ = 0.0 threshold = 0.0
M = 10 N = 202 Meff = 10
θ = 0.0 threshold = 0.0
M = 10 N = 202 Meff = 10
θ = 0.0 threshold = 0.0
M = 10 N = 202 Meff = 10
θ = 0.0 threshold = 0.0
M = 10 N = 202 Meff = 10
θ = 0.0 threshold = 0.0
M = 10 N = 202 Meff = 10
θ = 0.0 threshold = 

In [65]:
# X axis for the entropy curves: 50 times n^3.8 for the 39 selected steps.
# NOTE(review): these values are not rounded, unlike `idxs`; the factor 50 is
# presumably the number of MCMC steps between saved chain entries — confirm.
X = 50 .* [n^(3.8) for n in 1:39]
writedlm("../data/figures/local_data_fig2/mcmc_steps_betalac", X)