## Import and set parameters and sequences

In [1]:
# Reset to use Julia's internal Conda Python
ENV["PYTHON"] = "" 
using Pkg
Pkg.build("PyCall")  # Rebuild PyCall to use the internal Python
using Revise, Genie, ArDCA, DelimitedFiles, DCAUtils, JLD2, PyPlot, Statistics, LinearAlgebra

# Load the stored model parameters. The code below reads `hs` and `Js` (and, later in
# this cell, `hs_fix_V` / `Js_fix_V`); presumably they are all provided by this file —
# TODO confirm.
@load "../data_paper_dense.jld2"

# Number of states used when drawing the random start sequences.
q = 21
# Pfam family numbers; in this section only the length of `fs` is used.
fs = [14, 35, 595, 13354];

# Number of sampler steps per family, matched by position with `fs`.
steps = [10^5, 10^5, 10^5, 10^6]

# One sampled alignment per family, in the order of `fs`.
sil_msas = []

for k in eachindex(fs)
    n_sites = size(hs[k], 2)
    # Random start alignment: n_sites × 10^5 entries drawn uniformly from 1:q.
    init_msa = Int8.(rand(1:q, n_sites, 10^5))
    chain = @time run_evolution(
        init_msa,
        Float64.(hs[k]),
        Float64.(Js[k]);
        p = 0.5,
        temp = 1.0,
        N_steps = steps[k],
        verbose = false,
    )
    push!(sil_msas, chain.msa)
end



# FASTA files with the natural alignments, one per family, in the order of `fs`.
seq_paths = [
    "../DataAttentionDCA/data/PF00014/PF00014_mgap6.fasta.gz",
    "../DataAttentionDCA/data/PF00035/PF00035_full.fasta",
    "../DataAttentionDCA/data/PF00595/PF00595_mgap6.fasta.gz",
    "../DataAttentionDCA/data/PF13354/PF13354_wo_ref_seqs.fasta.gz",
]

fs = [14, 35, 595, 13354];

# 0.9 is forwarded as the second positional argument of `read_fasta_alignment`
# (presumably the maximum gap fraction per sequence — TODO confirm).
nat_msas = map(path -> read_fasta_alignment(path, 0.9), seq_paths)

# Per-family accumulators: one- and two-point frequencies of the natural (`*_nat`)
# and sampled alignments, and the Pearson correlations between them.
f1s_nat = []
f2s_nat = []
f1s = []
f2s = []
c1 = []
c2 = []
c_conn = []  # declared but never filled in this section

for k in eachindex(fs)
    # Natural data is computed with 0.2 as the last argument, sampled data with 0.0
    # (presumably the reweighting threshold — TODO confirm).
    p1_nat, p2_nat = compute_weighted_frequencies(nat_msas[k], 22, 0.2)
    p1_sil, p2_sil = compute_weighted_frequencies(sil_msas[k], 22, 0.0)

    push!(f1s_nat, p1_nat)
    push!(f2s_nat, p2_nat)
    push!(f1s, p1_sil)
    push!(f2s, p2_sil)

    # Correlate the flattened statistics of sampled vs. natural sequences.
    push!(c1, cor(vec(p1_sil), vec(p1_nat)))
    push!(c2, cor(vec(p2_sil), vec(p2_nat)))
end

# Side-by-side view: column 1 = one-point, column 2 = two-point correlations.
[c1 c2]

@save "../new_data_generativity_dense_plm.jld2" f1s f2s f1s_nat f2s_nat c1 c2


# Re-sample every family, this time from the `*_fix_V` parameters. `sil_msas` is reset
# so that it only holds the alignments produced by this loop.
sil_msas = [];

for k in eachindex(steps)
    # Take the sequence length from the model that is actually sampled. The original
    # read it from `hs[k]`, which only works while both parameter sets share a shape.
    n_sites = size(hs_fix_V[k], 2)
    # Random start alignment: n_sites × 10^5 entries drawn uniformly from 1:q.
    init_msa = Int8.(rand(1:q, n_sites, 10^5))
    chain = @time run_evolution(
        init_msa,
        Float64.(hs_fix_V[k]),
        Float64.(Js_fix_V[k]);
        p = 0.5,
        temp = 1.0,
        N_steps = steps[k],
        verbose = false,
    )
    push!(sil_msas, chain.msa)
end

# Same statistics as for the plm samples, now for the alignments sampled from the
# `*_fix_V` parameters. The natural-frequency accumulators are reset and refilled.
f1s_nat = []
f2s_nat = []
f1s_fix_V = []
f2s_fix_V = []
c1_fix_V = []
c2_fix_V = []
c_conn = []  # declared but never filled in this section

for k in eachindex(fs)
    # Natural data is computed with 0.2 as the last argument, sampled data with 0.0
    # (presumably the reweighting threshold — TODO confirm).
    p1_nat, p2_nat = compute_weighted_frequencies(nat_msas[k], 22, 0.2)
    p1_sil, p2_sil = compute_weighted_frequencies(sil_msas[k], 22, 0.0)

    push!(f1s_nat, p1_nat)
    push!(f2s_nat, p2_nat)
    push!(f1s_fix_V, p1_sil)
    push!(f2s_fix_V, p2_sil)

    # Correlate the flattened statistics of sampled vs. natural sequences.
    push!(c1_fix_V, cor(vec(p1_sil), vec(p1_nat)))
    push!(c2_fix_V, cor(vec(p2_sil), vec(p2_nat)))
end

# Side-by-side view: column 1 = one-point, column 2 = two-point correlations.
[c1_fix_V c2_fix_V]

@save "../new_data_generativity_dense_plm_fix_V.jld2" f1s_fix_V f2s_fix_V f1s_nat f2s_nat c1_fix_V c2_fix_V



[32m[1mPrecompiling[22m[39m Genie
[32m  ✓ [39mGenie
  1 dependency successfully precompiled in 29 seconds. 179 already precompiled.


LoadError: SystemError: opening file "../model_natural_PSE.dat": No such file or directory

In [None]:
ENV["PYTHON"] = "" 
using Pkg
Pkg.build("PyCall")
using JLD2, PyPlot, LinearAlgebra

# NOTE(review): the sampling cell saves "../new_data_generativity_dense_plm*.jld2",
# while these loads read the un-prefixed file names — confirm these are the intended
# inputs.
@load "../data_generativity_dense_plm.jld2"
@load "../data_generativity_dense_plm_fix_V.jld2"
namess = ["PF00014", "PF00035", "PF00595", "PF13354"];

close("all")
axis_font = 12
fig, axs = subplots(2, 2, figsize=(8, 8))

# One panel per family, filled row by row. The family index is derived from the panel
# position instead of incrementing a global counter inside the loop: updating a global
# from a top-level `for` only works under the notebook's soft scope and raises
# `UndefVarError` when the same code is run as a script.
for i in 1:2, j in 1:2
    fam = 2 * (i - 1) + j
    ax = axs[i, j]
    ax.scatter(vec(f1s_nat[fam]), vec(f1s[fam]),
        label = "Dense Model plm, Pearson = $(round(c1[fam], digits = 2))")
    ax.scatter(vec(f1s_nat[fam]), vec(f1s_fix_V[fam]),
        label = "Dense Model plm fix V, Pearson = $(round(c1_fix_V[fam], digits = 2))")
    # Diagonal reference line: simulated frequency equal to natural frequency.
    ax.plot([0.,1.], [0., 1.], color = "grey", linestyle = "--")
    ax.set_title(namess[fam], fontsize = axis_font)
    ax.legend(fontsize = axis_font - 4, loc="upper left")
end
fig.supxlabel("Natural frequencies", fontsize = axis_font+10)
fig.supylabel("Simulated frequencies", fontsize = axis_font+10)
fig.suptitle("One-Point Frequencies", fontsize = axis_font+15)

savefig("../1point_plm.png", dpi = 300)


axis_font = 12

close("all")
fig, axs = subplots(2, 2, figsize=(8, 8))

# One panel per family, filled row by row. The family index is derived from the panel
# position instead of incrementing a global counter inside the loop: updating a global
# from a top-level `for` only works under the notebook's soft scope and raises
# `UndefVarError` when the same code is run as a script.
for i in 1:2, j in 1:2
    fam = 2 * (i - 1) + j
    ax = axs[i, j]
    # `vec` flattens the two-point matrices without copying them.
    ax.scatter(vec(f2s_nat[fam]), vec(f2s[fam]),
        label = "Dense Model plm, Pearson = $(round(c2[fam], digits = 2))")
    ax.scatter(vec(f2s_nat[fam]), vec(f2s_fix_V[fam]),
        label = "Dense Model plm fix V, Pearson = $(round(c2_fix_V[fam], digits = 2))")
    # Diagonal reference line: simulated frequency equal to natural frequency.
    ax.plot([0.,1.], [0., 1.], color = "grey", linestyle = "--")
    ax.set_title(namess[fam], fontsize = axis_font)
    ax.legend(fontsize = axis_font - 4, loc="upper left")
end
fig.supxlabel("Natural frequencies", fontsize = axis_font+10)
fig.supylabel("Simulated frequencies", fontsize = axis_font+10)
fig.suptitle("Two-Point Frequencies", fontsize = axis_font+15)

savefig("../2point_plm.png", dpi = 300)

In [None]:
using Statistics  # provides `cor`; the plotting cell's `using` line does not load it

# NOTE(review): the sampling cell saves "../new_data_generativity_dense_plm*.jld2",
# while these loads read the un-prefixed file names — confirm these are the intended
# inputs.
@load "../data_generativity_dense_plm.jld2" 
@load "../data_generativity_dense_plm_fix_V.jld2"

"""
    connected_offsite(f1, f2, s)

Return, as a vector, the entries of the connected correlation `f2 - f1 * f1'` that
couple two different sites, each unordered site pair counted once.

Assumes the site-major layout implied by the original `triu(·, 21)` call: entry
`(site - 1) * s + state` of `f1`, and the matching rows/columns of `f2`
— TODO confirm against the routine that produced the frequencies.

The original `triu(C, s)` kept entries with `col - row >= s`. For neighbouring sites
that drops every entry whose column state is smaller than its row state, and the
zeroed-out part of the matrix was then fed into `cor` as if it were data.
"""
function connected_offsite(f1, f2, s::Integer)
    p = vec(f1)
    site = (0:length(p) - 1) .÷ s   # zero-based site of every row/column
    keep = site .< site'            # row site strictly before column site
    return (f2 - p * p')[keep]
end

# 21 is the block size used by the original offset (states per site).
conn_nat = [connected_offsite(f1s_nat[k], f2s_nat[k], 21) for k in 1:4];
conn = [connected_offsite(f1s[k], f2s[k], 21) for k in 1:4];
conn_fix_V = [connected_offsite(f1s_fix_V[k], f2s_fix_V[k], 21) for k in 1:4];

# Pearson correlation of sampled vs. natural connected correlations, per family.
cc = [cor(conn[k], conn_nat[k]) for k in 1:4]

cc_fix_V = [cor(conn_fix_V[k], conn_nat[k]) for k in 1:4]

In [None]:
@load "../data_generativity_dense.jld2"
@load "../data_generativity_ar.jld2"

# Connected correlations from the statistics available after the two loads above.
# They must be computed before the next `@load`, which replaces `f1s` / `f2s`.
conn_ar = [Genie.conn_corr(f1s[k], f2s[k]) for k in 1:4]
conn_ardca = [Genie.conn_corr(f1s_ar[k], f2s_ar[k]) for k in 1:4]
conn_nat = [Genie.conn_corr(f1s_nat[k], f2s_nat[k]) for k in 1:4]

@load "../new_data_generativity_dense_plm.jld2"
@load "../new_data_generativity_dense_plm_fix_V.jld2"

# Same quantity for the plm and plm fix_V statistics loaded just above.
conn_plm = [Genie.conn_corr(f1s[k], f2s[k]) for k in 1:4]
conn_plm_fix_V = [Genie.conn_corr(f1s_fix_V[k], f2s_fix_V[k]) for k in 1:4]

@load "../data_generativity_fix_V_ar.jld2"

conn_ar_fix_V = [Genie.conn_corr(f1s_fix_V_ar[k], f2s_fix_V_ar[k]) for k in 1:4]

# Pearson correlation between two connected-correlation arrays, flattened.
_flat_cor(a, b) = cor(vec(a), vec(b))

corr_ar = map(_flat_cor, conn_nat, conn_ar)
corr_ardca = map(_flat_cor, conn_nat, conn_ardca)
corr_ar_fix_V = map(_flat_cor, conn_nat, conn_ar_fix_V)
corr_plm = map(_flat_cor, conn_nat, conn_plm)
corr_plm_fix_V = map(_flat_cor, conn_nat, conn_plm_fix_V)

# One row per family, one column per model.
hcat(corr_ar, corr_ar_fix_V, corr_plm, corr_plm_fix_V, corr_ardca)

@save "../all_generativity_results_4_fams.jld2" corr_ar corr_ardca corr_ar_fix_V corr_plm corr_plm_fix_V




## Only family 595

In [None]:
ENV["PYTHON"] = "" 
using Pkg
Pkg.build("PyCall")  # Rebuild PyCall to use the internal Python
using Revise, Genie, ArDCA, DelimitedFiles, DCAUtils, JLD2, PyPlot, Statistics, LinearAlgebra

# The code below reads `h_plm`, `J_plm`, `h_plm_fix_V` and `J_plm_fix_V`; presumably
# they are provided by this file — TODO confirm.
@load "../pars_595_low_regularization.jld2"

# Number of states and sequence length, taken from the field matrix.
q, L = size(h_plm);

# Sampler settings shared by both runs.
evolution_settings = (p = 0.5, temp = 1.0, N_steps = 10^3, verbose = false)

# Run 1: plm parameters, started from a fresh random L × 10^5 alignment.
start_msa = Int8.(rand(1:q, L, 10^5));
mcmc_plm = @time run_evolution(
    start_msa, Float64.(h_plm), Float64.(J_plm); evolution_settings...
);
msa_plm = mcmc_plm.msa;

# Run 2: plm fix_V parameters, again from a fresh random start alignment.
start_msa = Int8.(rand(1:q, L, 10^5));
mcmc_plm_fix_V = @time run_evolution(
    start_msa, Float64.(h_plm_fix_V), Float64.(J_plm_fix_V); evolution_settings...
);
msa_plm_fix_V = mcmc_plm_fix_V.msa;

@save "../equil_seqs_595_msa.jld2"  msa_plm msa_plm_fix_V




