In [1]:
ENV["PYTHON"] = "" 
using Pkg
Pkg.build("PyCall")  # Rebuild PyCall to use the internal Python
using Revise, Genie, KitMSA, PyPlot, JLD2, Statistics, StatsBase, DCAUtils, DelimitedFiles, CSV, DataFrames

# Natural-sequence alignments, converted to Int8. The trailing `'` transposes the
# matrix returned by `KitMSA.fasta2matrix`.
MSA_train = Int8.(KitMSA.fasta2matrix("../files_nobu_evolution_exp/natural_seqs/B1_all_Sevan_hmm03_max010rowgaps_max075flankcolgaps_max080wtid.fasta")')
MSA_train2 = Int8.(KitMSA.fasta2matrix("../files_nobu_evolution_exp/natural_seqs/B1_all_Sevan_hmm03_max010rowgaps_max075flankcolgaps.fasta")')
# `@load` brings the variables stored in the JLD2 file into scope.
# NOTE(review): later cells use `h` and `J`; presumably they come from this file — confirm.
@load "../data_Genie/pars_beta_B1_integrated.jld2"

# Contents of the weights file, read as a delimited table.
w_fra = readdlm("../data_Genie/Weights_leonardo.dat")

# L and M are the two dimensions of MSA_train; q is passed to
# `Genie.dist_proba_2_point` in later cells.
L,M = size(MSA_train); q = 21;
#VIM2 = Int.(KitMSA.fasta2matrix("../files_nobu_evolution_exp/wts/VIM2_V185I_hmmstruct_symfrac03_max075flankcolgaps.fasta"));

# NDM-1 (there is a DMS of this sequence)
#NDM1 = Int.(KitMSA.fasta2matrix("../files_nobu_evolution_exp/wts/NDM1_hmmstruct_symfrac03_max075flankcolgaps.fasta"));

# The two reference sequences actually used below (the "equal_length" files).
VIM2 = Int.(KitMSA.fasta2matrix("../files_nobu_evolution_exp/wts/VIM2_V185I_equal_length_hmmstruct_symfrac03_max075flankcolgaps.fasta"));

# NDM-1 (there is a DMS of this sequence)
NDM1 = Int.(KitMSA.fasta2matrix("../files_nobu_evolution_exp/wts/NDM1_equal_length_hmmstruct_symfrac03_max075flankcolgaps.fasta"));


# Tables used by later cells: `double` supplies the Mutation_1 / Mutation_2 /
# Additive_effect columns and `revers` the "Mutation NDM-1" column.
dms_map = CSV.read("metallo_beta/Dataframe_matching_mutations_model_removed100_9_2024.txt", DataFrame; delim=',');
pos_map = CSV.read("metallo_beta/simple_matching_positions_DCAmodel_experiments_VIM_NDM.csv", DataFrame);
revers = CSV.read("metallo_beta/DCA effect reversion latest model.txt", DataFrame; delim='\t')
double = CSV.read("metallo_beta/DCA effect double mutants.txt", DataFrame; delim='\t')

# Lookup dictionaries pairing columns 1, 2, 4 and 5 of `pos_map` in both directions.
# NOTE(review): the names suggest "r" = reference (experimental) numbering and
# "mod" = model/alignment numbering — confirm against the CSV header.
ndm_r2mod = Dict(pos_map[:, 1] .=> pos_map[:, 2]);
ndm_mod2r = Dict(pos_map[:, 2] .=> pos_map[:, 1]);

vim_r2mod = Dict(pos_map[:, 4] .=> pos_map[:, 5]);
vim_mod2r = Dict(pos_map[:, 5] .=> pos_map[:, 4]);

mod_ndm2vim = Dict(pos_map[:, 2] .=> pos_map[:, 5]);
mod_vim2ndm = Dict(pos_map[:, 5] .=> pos_map[:, 2]);





[32m[1mPrecompiling[22m[39m Genie
[32m  ✓ [39mGenie
  1 dependency successfully precompiled in 22 seconds. 179 already precompiled.


## Checks and preliminary tests on the family

In [None]:
# For every column of MSA_train2, pair its DCA energy with the mean of
# `ham_dist` between that column and the whole alignment.
L, M = size(MSA_train2)
ee = Genie.energy(MSA_train2, h, J)
dd = zeros(M)
@time for col in 1:M
    dd[col] = mean(ham_dist(MSA_train2[:, col], MSA_train2))
end

cor(ee, dd)

# Scatter of the two quantities, written to disk.
close("all")
plt.scatter(ee, dd)
plt.xlabel("DCA energy")
plt.ylabel("Hamming from rest of MSA")
savefig("../en_vs_ham_intra_metallo_beta.png")

In [None]:
# Principal-component coordinates of the natural alignment and of the two
# reference sequences (VIM2 first, NDM1 second), as returned by Genie.check_pca.
pc_nat, pc_sil = Genie.check_pca(Int8.(MSA_train2), Int8.(hcat(VIM2, NDM1)))

close("all")
plt.scatter(pc_nat[:, 1], pc_nat[:, 2]; label = "Natural sequences")
# Rows of pc_sil follow the order in which the sequences were concatenated.
for (row, name) in enumerate(("VIM2", "NDM1"))
    plt.scatter(pc_sil[row, 1], pc_sil[row, 2]; label = name)
end
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.legend()
savefig("../pca_metallo_beta.png")

In [None]:
# Energies of the natural alignment and of the two reference sequences.
ee = Genie.energy(MSA_train2, h, J)
en_vim = Genie.energy(VIM2, h, J, L)
en_ndm = Genie.energy(NDM1, h, J, L)

close("all")
plt.hist(ee; linewidth = 3.0, histtype = "step", density = true, label = "nat")
# Mark each reference energy with a short vertical segment.
for (en, name) in ((en_vim, "VIM2"), (en_ndm, "NDM1"))
    plt.plot([en, en], [0., 10^-2]; linewidth = 3.0, label = name)
end
plt.legend()
plt.xlabel("DCA energy")
plt.yscale("log")
savefig("../ens_metallo_beta_nat.png")

In [16]:
# Rebuild the additive effect of the first 8 double mutants from the two
# single-mutation rows of `revers` (value taken from column 2 of the first
# matching row).
test_double_mut = let names = revers.var"Mutation NDM-1"
    single_effect(mut) = revers[names .== mut, 2][1]
    Float64[
        single_effect(double.Mutation_1[k]) + single_effect(double.Mutation_2[k])
        for k in 1:8
    ]
end

# Difference to the tabulated additive effect.
test_double_mut .- double.Additive_effect

8-element Vector{Float64}:
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0

In [None]:
"""
    entropy(p_ij; pseudocount=1e-12)

Return the Shannon entropy, in bits, of the weights in `p_ij`.

`pseudocount` is added to every entry and the result is rescaled to sum to one
before the entropy is accumulated, so zero entries do not produce `NaN`.
"""
function entropy(p_ij; pseudocount=1e-12)
    shifted = p_ij .+ pseudocount
    probs = shifted ./ sum(shifted)
    # Left fold keeps the same accumulation order as an explicit loop.
    return foldl((acc, p) -> acc - p * log2(p), probs; init = 0.0)
end


# For every site pair (i < j), take the two-site distribution returned by
# Genie.dist_proba_2_point in the VIM2 background and record:
#   ps     - the entry at the residues VIM2 carries at (i, j)
#   ps_max - the largest entry of the distribution
#   entr   - the entropy of the distribution
# Typed accumulators avoid building Vector{Any}.
ps = Float64[]
entr = Float64[]
ps_max = Float64[]
for i in 1:L
    for j in i+1:L
        p2 = Genie.dist_proba_2_point(i, j, VIM2, h, J, L, q, 1.)
        push!(ps, p2[VIM2[i], VIM2[j]])
        push!(ps_max, maximum(p2))
        push!(entr, entropy(p2))
    end
end

# Histograms of the maximum entry and of the native-pair entry.
close("all")
plt.hist(ps_max; histtype = "step", linewidth = 3.5, density = true,
    label = "maximum Pij(a,b) in VIM-2")
plt.hist(ps; histtype = "step", linewidth = 3.5, density = true,
    label = "Pij(a*,b*) of VIM-2 residues")
plt.legend()
savefig("../ps_VIM.png")

# Histogram of the pair entropies.
close("all")
plt.hist(entr; histtype = "step", linewidth = 3.5, density = true)
plt.xlabel("Entropy of Pij(a,b) in VIM-2")
savefig("../entr_ij_VIM.png")

# Native-pair entry against the maximum entry; the dashed line is the diagonal.
close("all")
plt.scatter(ps, ps_max; alpha = 0.1)
plt.plot([0, 1], [0, 1]; linestyle = "--", color = "black")
plt.xlabel("Pij(a*,b*) of VIM-2 residues")
# The y axis shows the maximum over all (a, b), so it uses the same wording as
# the histogram label above rather than the native-residue notation (a*, b*).
plt.ylabel("maximum Pij(a,b) in VIM-2")
savefig("../ps_VIM_scatter.png")


## Compare algorithms

In [15]:

# Temperature sweep: rerun Genie.run_Dkl_path_double for the first double
# mutant at several values of `temp`.
singles = []
doubles = []
temps = [0.2 * i for i in 1:5]
n_subs = 130

# The mutant pair and its two model sites do not depend on the temperature, so
# they are looked up once instead of on every iteration. The site is the number
# embedded in the mutation label, mapped through ndm_r2mod.
mut1 = double.Mutation_1[1]
mut2 = double.Mutation_2[1]
site1 = ndm_r2mod[parse(Int, match(r"\d+", mut1).match)]
site2 = ndm_r2mod[parse(Int, match(r"\d+", mut2).match)]

for temp_now in temps
    alg = Genie.run_Dkl_path_double(site1, site2, n_subs, NDM1,
        VIM2, h, J, temp = temp_now)
    push!(doubles, alg)
end


# One figure per recorded quantity, one curve per temperature.
for (field, ylab, out) in ((:ps_ij, "P_ij", "../diff_T_p_ij_muts.png"),
                           (:ens, "DCA energy", "../diff_T_ens_muts.png"))
    close("all")
    for i in eachindex(doubles)
        plt.plot(getproperty(doubles[i], field);
            label = "T = $(round(temps[i], digits = 2))")
    end
    plt.xlabel("# reversions")
    plt.ylabel(ylab)
    plt.legend()
    plt.title("$(double.Mutation_1[1]) + $(double.Mutation_2[1])")
    savefig(out)
end






LoadError: UndefVarError: `seq2` not defined

In [None]:
# For each of the first 8 double mutants, run both path builders from NDM1 to
# VIM2: `doubles` collects run_Dkl_path_double results, `singles` collects
# run_pij_path_double results.
singles = []
doubles = []
T = 1.
n_subs = 130
for k in 1:8
    mut1 = double.Mutation_1[k]
    mut2 = double.Mutation_2[k]
    # Model site = number embedded in the mutation label, mapped via ndm_r2mod.
    site1 = ndm_r2mod[parse(Int, match(r"\d+", mut1).match)]
    site2 = ndm_r2mod[parse(Int, match(r"\d+", mut2).match)]
    println("$(double.Mutation_1[k]) + $(double.Mutation_2[k])")
    alg = Genie.run_Dkl_path_double(site1, site2, n_subs, NDM1, VIM2, h, J;
        temp = T)
    alg_s = Genie.run_pij_path_double(site1, site2, n_subs, NDM1, VIM2, h, J;
        temp = T)
    push!(singles, alg_s)
    push!(doubles, alg)
end


# One figure per recorded quantity of the `doubles` runs, one curve per mutant pair.
for (field, ylab, out) in ((:ps_ij, "P_ij", "../p_ij_muts.png"),
                           (:ens, "DCA energy", "../ens_muts.png"))
    close("all")
    for i in 1:length(singles)
        plt.plot(getproperty(doubles[i], field);
            label = "$(double.Mutation_1[i]) + $(double.Mutation_2[i])")
    end
    plt.xlabel("# reversions")
    plt.ylabel(ylab)
    plt.legend()
    savefig(out)
end



# Entry 40 of the stored sequences for runs 1, 4, 5 and 6, one per column.
seqs = reduce(hcat, [doubles[k].seqs[40] for k in (1, 4, 5, 6)])


ham_dist(NDM1, seqs)

# Upper-triangular table of ham_dist between the selected sequences.
dists = zeros(4, 4)
for a in 1:size(seqs, 2), b in a+1:size(seqs, 2)
    dists[a, b] = ham_dist(seqs[:, a], seqs[:, b])
end


dists ./ 40


color_names = ["red", "orange", "yellow", "green", "blue", "purple", "brown", "gray"]

# Overlay the two path builders for every mutant pair: solid = `doubles` run,
# dashed = `singles` run, same colour per pair. One figure per quantity.
for (field, ylab, out) in ((:ps_ij, "P_ij", "../single_p_ij_muts.png"),
                           (:ens, "DCA energy", "../single_ens_muts.png"))
    close("all")
    for i in 1:length(singles)
        pair = "$(double.Mutation_1[i]) + $(double.Mutation_2[i])"
        plt.plot(getproperty(doubles[i], field);
            c = color_names[i], label = "Dkl P_ij $(pair)")
        plt.plot(getproperty(singles[i], field);
            linestyle = "--", c = color_names[i], label = "Only P_ij $(pair)")
    end
    plt.xlabel("# reversions")
    plt.ylabel(ylab)
    plt.legend()
    savefig(out)
end

In [None]:
# For each of the first 8 double mutants: print the two-site probabilities of
# the initial and final residue pairs in both backgrounds, then run
# run_Dkl_path_double (-> doubles) and run_Dkl_path_energy (-> energies).
energies = []
doubles = []
T = 1.
n_subs = 130
for i in 1:8
    mut1 = double.Mutation_1[i]
    mut2 = double.Mutation_2[i]

    # Residue codes from the first ("i") and last ("f") character of each label.
    a1i = KitMSA.letter2num(first(mut1))
    a2i = KitMSA.letter2num(first(mut2))
    a1f = KitMSA.letter2num(last(mut1))
    a2f = KitMSA.letter2num(last(mut2))

    # Model site = number embedded in the label, mapped via ndm_r2mod.
    site1 = ndm_r2mod[parse(Int, match(r"\d+", mut1).match)]
    site2 = ndm_r2mod[parse(Int, match(r"\d+", mut2).match)]

    # Disabled sanity check (each difference should print 0):
    #   println(NDM1[site1] - a1i); println(NDM1[site2] - a2i)
    #   println(VIM2[site1] - a1f); println(VIM2[site2] - a2f)

    println("$(double.Mutation_1[i]) + $(double.Mutation_2[i])")

    ndm_p = Genie.dist_proba_2_point(site1, site2, NDM1, h, J, L, q, 1.)
    vim_p = Genie.dist_proba_2_point(site1, site2, VIM2, h, J, L, q, 1.)

    for (tag, p) in (("NDM1 i->f", ndm_p), ("VIM2 i->f", vim_p))
        println(tag)
        println([p[a1i, a2i], p[a1f, a2f]])
    end

    alg = Genie.run_Dkl_path_double(site1, site2, n_subs, NDM1, VIM2, h, J;
        temp = T)
    alg_s = Genie.run_Dkl_path_energy(site1, site2, n_subs, NDM1, VIM2, h, J;
        temp = T)
    push!(energies, alg_s)
    push!(doubles, alg)
end

color_names = ["red", "orange", "yellow", "green", "blue", "purple", "brown", "gray"]

# Overlay the two runs of every mutant pair: solid = `doubles`, dashed =
# `energies`, same colour per pair. The loop is driven by `energies`, which
# this cell fills, rather than by `singles`, which belongs to another cell and
# may be undefined or of a different length.
for (field, ylab, out) in ((:ps_ij, "P_ij", "../comparison_p_ij_muts.png"),
                           (:ens, "DCA energy", "../comparison ens_muts.png"))
    close("all")
    for i in eachindex(energies)
        pair = "$(double.Mutation_1[i]) + $(double.Mutation_2[i])"
        plt.plot(getproperty(doubles[i], field);
            c = color_names[i], label = "P-ij $(pair)")
        plt.plot(getproperty(energies[i], field);
            c = color_names[i], linestyle = "--", label = "En $(pair)")
    end
    plt.xlabel("# reversions")
    plt.ylabel(ylab)
    plt.legend()
    savefig(out)
end



In [None]:
# Regularisation sweep for double mutant 7: one run_Dkl_path_mixed run per
# value of `reg`, plus one run_Dkl_path_double run (alg2) for comparison.
mixed = []
doubles = []
regs = [0.02*i for i in 1:5]
n_subs = 139

mut1 = double.Mutation_1[7]
mut2 = double.Mutation_2[7]

# Residue codes from the first ("i") and last ("f") character of each label.
a1i = KitMSA.letter2num(first(mut1))
a2i = KitMSA.letter2num(first(mut2))
a1f = KitMSA.letter2num(last(mut1))
a2f = KitMSA.letter2num(last(mut2))

# Model site = number embedded in the label, mapped via ndm_r2mod.
m = match(r"\d+", mut1)
site1 = ndm_r2mod[parse(Int, m.match)]
m = match(r"\d+", mut2)
site2 = ndm_r2mod[parse(Int, m.match)]

for reg_now in regs
    alg = Genie.run_Dkl_path_mixed(site1, site2, n_subs, NDM1, VIM2, h, J;
        reg = reg_now)
    push!(mixed, alg)
end

alg2 = Genie.run_Dkl_path_double(site1, site2, n_subs, NDM1, VIM2, h, J);


# Figures for the regularisation sweep. The titles are built from mut1/mut2,
# the pair this cell actually analyses, instead of always naming double
# mutant 1.
for (field, ylab, out) in ((:ps_ij, "P_ij", "../diff_reg_p_ij_muts.png"),
                           (:ens, "DCA energy", "../diff_reg_ens_muts.png"))
    close("all")
    for i in eachindex(regs)
        plt.plot(getproperty(mixed[i], field); label = "reg = $(regs[i])")
    end
    # Reference curve: the run without regularisation.
    plt.plot(getproperty(alg2, field); color = "black", linewidth = 4, label = "No reg")
    plt.xlabel("# reversions")
    plt.ylabel(ylab)
    plt.legend()
    plt.title("$(mut1) + $(mut2)")
    savefig(out)
end


# P_ij against DCA energy along each path.
close("all")
for i in eachindex(regs)
    plt.plot(mixed[i].ps_ij, mixed[i].ens; label = "reg = $(regs[i])")
end
plt.plot(alg2.ps_ij, alg2.ens; color = "black", linewidth = 5, label = "No reg")
plt.xlabel("P_ij")
plt.ylabel("DCA energy")
plt.legend()
plt.title("$(mut1) + $(mut2)")
savefig("../diff_reg_performance.png")


# For every run in `mixed`, record which position changed at each step (first
# differing position between consecutive stored sequences, from the 3rd on).
orders = [];
for n in eachindex(mixed)
    diff_positions = []
    for i in 3:length(mixed[n].seqs)
        push!(diff_positions, findall(mixed[n].seqs[i] .!= mixed[n].seqs[i-1])[1])
    end
    push!(orders, diff_positions)
end

# One column per run; `pp` holds the matching P_ij values.
orders = hcat(orders...);

pp = hcat([mixed[i].ps_ij[3:end] for i in eachindex(mixed)]...);

# Mean and variance, across runs, of the P_ij recorded when each site changed.
sites = Int[]
media = Float64[]      # per-site mean
varianza = Float64[]   # per-site variance
for i in 1:222
    mask = orders .== i

    println(i)
    # Keep only sites seen at least twice. `length(mask[:])` was the total
    # number of entries and therefore always true, which let sites with zero
    # or one occurrence through and produced NaN statistics.
    if count(mask) > 1
        vals = pp[mask]
        push!(media, mean(vals))
        push!(varianza, var(vals))
        push!(sites, i)
    end
end

close("all")
plt.scatter(media, varianza)
plt.xlabel("Mean_p_ij")
plt.ylabel("Var_p_ij")
savefig("../ciao.png")

## Matteo's global algorithm on energy

In [1]:
# Double mutant analysed in this cell and its two model sites (number embedded
# in the mutation label, mapped via ndm_r2mod).
idx = 7;
mut1 = double.Mutation_1[idx]
mut2 = double.Mutation_2[idx]
m = match(r"\d+", mut1)
site1 = ndm_r2mod[parse(Int, m.match)]
m = match(r"\d+", mut2)
site2 = ndm_r2mod[parse(Int, m.match)]
# Index with `idx`: there is no loop variable `i` in this cell.
println("$(double.Mutation_1[idx]) + $(double.Mutation_2[idx])")


# Sampling settings passed to Genie.sample_n_paths_ij.
β = 2000.
npath = 1000
n_steps = 3000
# Two samples of paths from NDM1 to VIM2, without and with regularisation.
@time paths = Genie.sample_n_paths_ij(site1, site2, NDM1, VIM2, h, J, npath,
    reg = 0., n_steps = n_steps, β = β);
@time paths2 = Genie.sample_n_paths_ij(site1, site2, NDM1, VIM2, h, J, npath,
    reg = 0.005, n_steps = n_steps, β = β);

# NOTE(review): `n_subs` is not assigned in this cell; it must already be
# defined by an earlier cell.
alg2 = Genie.run_Dkl_path_double(site1, site2, n_subs, NDM1,
    VIM2, h, J);

@save "../paths_mut7.jld2" paths paths2



# One figure per recorded quantity; every sampled path is drawn, black for
# `paths` and red for `paths2`. Only the score figure uses a log x axis.
for (field, xlab, ylab, logx, out) in (
        (:scores, "# iterations", "Score (mean P_ij)", true, "../global_score_muts.png"),
        (:ens, "# reversions", "DCA energy", false, "../global_ens_muts.png"),
        (:ps_ij, "# reversions", "P_ij", false, "../global_p_ij_muts.png"))
    close("all")
    series_a = getproperty(paths, field)
    series_b = getproperty(paths2, field)
    for i in 1:npath
        plt.plot(series_a[i]; color = "black")
        plt.plot(series_b[i]; color = "red")
    end
    plt.xlabel(xlab)
    plt.ylabel(ylab)
    if logx
        plt.xscale("log")
    end
    plt.title("$(double.Mutation_1[idx]) + $(double.Mutation_2[idx])")
    savefig(out)
end


# ham_dist between the stored sequences of the first two sampled paths.
pair_d = ham_dist(paths.seqs[1], paths.seqs[2])

close("all")
plt.scatter(1:length(pair_d), pair_d)
plt.xlabel("# reversions")
plt.ylabel("Pairwise hamming distance")
plt.title("$(double.Mutation_1[idx]) + $(double.Mutation_2[idx])")
savefig("../pair_paths.png")



LoadError: UndefVarError: `double` not defined

In [None]:
"""
    find_ord(i, paths)

For every entry of `paths.muts`, return the index of the first element equal to
`i`, or `nothing` when `i` does not occur in that entry.
"""
function find_ord(i, paths)
    return [findfirst(==(i), path_muts) for path_muts in paths.muts]
end
    
"""
    find_all_ord(paths, L)

Return `find_ord(s, paths)` for every site `s` in `1:L`, with `s` converted to
floating point before the lookup.
"""
function find_all_ord(paths, L)
    return map(site -> find_ord(float(site), paths), 1:L)
end

ords = find_all_ord(paths, length(VIM2))

# Sites where the two references differ, excluding the two mutated sites.
diff_sites = findall(NDM1 .!= VIM2)
filter!(s -> s != site1 && s != site2, diff_sites)

"""
    mean_pij_at_reversion(paths, ords, sites, n_sites)

Return a length-`n_sites` vector whose entry `s` (for `s` in `sites`) is the
mean over all paths of `paths.ps_ij[n][ords[s][n]]`, i.e. the P_ij recorded at
the step where site `s` appears in path `n`. Other entries stay zero.
"""
function mean_pij_at_reversion(paths, ords, sites, n_sites)
    out = zeros(n_sites)
    for s in sites
        # Loop over however many paths were sampled instead of a fixed 1000.
        out[s] = mean(paths.ps_ij[n][ords[s][n]] for n in eachindex(paths.ps_ij))
    end
    return out
end

ps_ords = mean_pij_at_reversion(paths, ords, diff_sites, length(VIM2));


ords2 = find_all_ord(paths2, length(VIM2))

ps_ords2 = mean_pij_at_reversion(paths2, ords2, diff_sites, length(VIM2));



close("all")

# Per-site mean and standard deviation of the step at which the site appears,
# coloured by the mean P_ij at that step; first for `paths` ...
x_vals = [mean(ords[i]) for i in diff_sites]
y_vals = [std(ords[i]) for i in diff_sites]
colors = [ps_ords[i] for i in diff_sites]  # color values

# ... and then for `paths2`.
x_vals2 = [mean(ords2[i]) for i in diff_sites]
y_vals2 = [std(ords2[i]) for i in diff_sites]
colors2 = [ps_ords2[i] for i in diff_sites]  # color values

# Both layers share one colour scale. Without explicit limits each scatter is
# normalised on its own data and the single colorbar would only be valid for
# the layer drawn last.
cmin, cmax = extrema(vcat(colors, colors2))

plt.scatter(x_vals, y_vals; c = colors, label = "reg = 0", cmap = "viridis",
    vmin = cmin, vmax = cmax)
plt.scatter(x_vals2, y_vals2; c = colors2, marker = "*", label = "reg = 0.005",
    cmap = "viridis", vmin = cmin, vmax = cmax)
plt.xlabel("Mean order in the path")
plt.ylabel("Std order in the path")

# Add colorbar and label it
cb = colorbar()
cb.set_label("Mean p_ij of appearance")  # label of the colormap

plt.legend()

savefig("../order.png")

# Step at which each site appears in the greedy path (alg2), and its z-score
# against the distribution of steps over the sampled paths.
greedy_ord = [findfirst(==(i), alg2.muts) for i in 1:length(VIM2)];

z_scores = [(greedy_ord[i] - mean(ords[i]))/std(ords[i]) for i in diff_sites];

close("all")
# Per-point colours and a colormap are scatter features; `plt.plot` rejects
# `cmap` and an array-valued `c`, and `colorbar()` needs the mappable that
# scatter creates.
plt.scatter(x_vals, y_vals; c = z_scores, label = "reg = 0", cmap = "viridis")
plt.xlabel("Mean order in the path")
plt.ylabel("Std order in the path")

# Add colorbar and label it
cb = colorbar()
cb.set_label("Z score to greedy algo")  # label of the colormap

plt.legend()

savefig("../order_zscore.png")


sel_muts = alg2.muts[16:22]; ## muts with higher gradient
println(alg2.ps_ij[18])
println(alg2.ps_ij[6])

# Apply the selected mutations, then try 1000 orderings of them and report the
# best value found.
p_sel = Genie.insert_muts(sel_muts, NDM1, VIM2, site1, site2, h, J);
@time mm, pp = Genie.try_order_muts(sel_muts, 1000, NDM1, VIM2, site1, site2, h, J);
maximum(pp)
  
    

In [None]:
"""
    common_muts(a, b, n_start::Int, n_end::Int)

Return the fraction of distinct elements shared by `a[n_start:n_end]` and
`b[n_start:n_end]`, normalised by the window length `n_end - n_start + 1`.

An empty window (`n_end < n_start`) gives `0.0`. Dividing by `n_end - n_start`
was off by one: it produced `Inf`/`NaN` for a one-element window and values
above one for identical windows.
"""
function common_muts(a, b, n_start::Int, n_end::Int)
    window = n_start:n_end
    isempty(window) && return 0.0
    return length(intersect(a[window], b[window])) / length(window)
end



# Overlap between the greedy path (alg2) and each of the first 5 sampled paths:
# `common` grows the window from the start (1:n), `common2` shrinks it towards
# the end (n:60).
common = map(n -> [common_muts(alg2.muts, paths.muts[p], 1, n) for p in 1:5], 1:60);

common2 = map(n -> [common_muts(alg2.muts, paths.muts[p], n, 60) for p in 1:5], 1:60);

close("all")
plt.plot(common; color = "red")
plt.plot(common2; color = "blue")
savefig("../ecco.png")

## All 5-point reversions

In [None]:
using Combinatorics, OhMyThreads

"""
    all_n_point_reversions(mut_seq, wt_seq, n; gap = 21)

Return every sequence obtained from `mut_seq` by copying the `wt_seq` value
into exactly `n` positions, one sequence per combination of positions.

Only positions where the two sequences differ and neither holds `gap` are
eligible. The result is a `Vector{Vector{T}}`; it is empty when fewer than `n`
positions are eligible. `mut_seq` itself is not modified.

# Arguments
- `mut_seq`, `wt_seq`: sequences of equal length.
- `n`: number of positions reverted in each returned sequence.

# Keywords
- `gap`: value treated as a gap and never reverted (default `21`).
"""
function all_n_point_reversions(mut_seq::Vector{T}, wt_seq::Vector{T}, n::Int;
        gap = 21) where T
    @assert length(mut_seq) == length(wt_seq) "Sequences must be of same length"

    # Positions that differ and hold a gap in neither sequence.
    valid_positions = [
        i for i in eachindex(mut_seq)
        if mut_seq[i] != wt_seq[i] && mut_seq[i] != gap && wt_seq[i] != gap
    ]

    # Typed container, so both return paths yield a Vector{Vector{T}}.
    reverted_seqs = Vector{T}[]
    if length(valid_positions) < n
        return reverted_seqs
    end

    for combo in combinations(valid_positions, n)
        new_seq = copy(mut_seq)
        for idx in combo
            new_seq[idx] = wt_seq[idx]
        end
        push!(reverted_seqs, new_seq)
    end

    return reverted_seqs
end


In [None]:
# For each double mutant, enumerate all 1- to 4-point reversions of the double
# mutant built on NDM1 and compare the best two-site probability found with the
# first four entries of the stored path doubles[i].
mm = [];

for i in 1:8
    # Use the i-th pair: the result is compared with doubles[i] below, so
    # always taking pair 1 compared seven of the eight runs against the wrong
    # mutant.
    mut1 = double.Mutation_1[i]
    mut2 = double.Mutation_2[i]

    # Residue codes from the last character of each label.
    a1f = KitMSA.letter2num(last(mut1))
    a2f = KitMSA.letter2num(last(mut2))
    # Model site = number embedded in the label, mapped via ndm_r2mod.
    m = match(r"\d+", mut1)
    site1 = ndm_r2mod[parse(Int, m.match)]
    m = match(r"\d+", mut2)
    site2 = ndm_r2mod[parse(Int, m.match)]

    # NDM1 carrying the VIM2 residues at the two sites.
    start_seq = copy(NDM1)
    start_seq[site1] = VIM2[site1]
    start_seq[site2] = VIM2[site2]

    @time res = [all_n_point_reversions(start_seq, VIM2, n) for n in 1:4];
    # Size each buffer from the enumeration itself rather than assuming 134
    # revertible sites.
    ps = [zeros(length(r)) for r in res]
    @time for k in 1:4
        @tasks for j in 1:length(res[k])
            ps[k][j] = Genie.dist_proba_2_point(site1, site2, res[k][j], h,
                J, L, q, 1.)[a1f, a2f]
        end
    end

    ps_max = [maximum(ps[a]) for a in 1:4];
    println(ps_max .- doubles[i].ps_ij[1:4])

    push!(mm, ps_max)
end





## Beyond reversions

In [None]:
# For each of the first 8 double mutants run both builders: `singles` collects
# run_Dkl_path_double_beyond_rev results and `doubles` collects
# run_Dkl_path_double results.
singles = []
doubles = []
n_subs = 130
for i in 1:8
    mut1 = double.Mutation_1[i]
    mut2 = double.Mutation_2[i]

    # Residue codes from the first ("i") and last ("f") character of each label.
    a1i = KitMSA.letter2num(first(mut1))
    a2i = KitMSA.letter2num(first(mut2))
    a1f = KitMSA.letter2num(last(mut1))
    a2f = KitMSA.letter2num(last(mut2))

    # Model site = number embedded in the label, mapped via ndm_r2mod.
    site1 = ndm_r2mod[parse(Int, match(r"\d+", mut1).match)]
    site2 = ndm_r2mod[parse(Int, match(r"\d+", mut2).match)]

    @time alg = Genie.run_Dkl_path_double_beyond_rev(site1, site2, n_subs, NDM1,
        VIM2, h, J)
    @time alg_rev = Genie.run_Dkl_path_double(site1, site2, n_subs, NDM1,
        VIM2, h, J)
    push!(singles, alg)
    push!(doubles, alg_rev)
end


color_names = ["red", "orange", "yellow", "green", "blue", "purple", "brown", "gray"]

# Overlay both runs per mutant pair: solid = `doubles` (label "P_ij"), dashed =
# `singles` (label "Free"), same colour per pair. One figure per quantity.
for (field, ylab, out) in ((:ps_ij, "P_ij", "../beyond_rev_p_ij_muts.png"),
                           (:ens, "DCA energy", "../beyond_rev_ens_muts.png"))
    close("all")
    for i in 1:length(singles)
        pair = "$(double.Mutation_1[i]) + $(double.Mutation_2[i])"
        plt.plot(getproperty(doubles[i], field);
            c = color_names[i], label = "P_ij $(pair)")
        plt.plot(getproperty(singles[i], field);
            linestyle = "--", c = color_names[i], label = "Free $(pair)")
    end
    plt.xlabel("# reversions")
    plt.ylabel(ylab)
    plt.legend()
    savefig(out)
end

In [None]:

    # Regularisation sweep of run_Dkl_path_double_beyond_rev for double mutant 1,
    # plus one run_Dkl_path_double run (alg2) for comparison.
    mut1 = double.Mutation_1[1]
    mut2 = double.Mutation_2[1]

    # Residue codes from the first ("i") and last ("f") character of each label.
    a1i = KitMSA.letter2num(first(mut1))
    a2i = KitMSA.letter2num(first(mut2))
    a1f = KitMSA.letter2num(last(mut1))
    a2f = KitMSA.letter2num(last(mut2))

    # Model site = number embedded in the label, mapped via ndm_r2mod.
    m = match(r"\d+", mut1)
    site1 = ndm_r2mod[parse(Int, m.match)]
    m = match(r"\d+", mut2)
    site2 = ndm_r2mod[parse(Int, m.match)]

    mixed = []
    regs = [0., 0.0001, 0.001, 0.01, 0.1, 1.]
    n_subs = 130
    for reg_now in regs
        @time alg = Genie.run_Dkl_path_double_beyond_rev(site1, site2, n_subs, NDM1,
            VIM2, h, J; reg = reg_now)
        push!(mixed, alg)
    end

    alg2 = Genie.run_Dkl_path_double(site1, site2, n_subs, NDM1, VIM2, h, J);


# One figure per recorded quantity: a curve per `reg` value plus the
# reversions-only run (alg2) in black.
for (field, ylab, out) in ((:ps_ij, "P_ij", "../new_diff_reg_p_ij_muts.png"),
                           (:ens, "DCA energy", "../new_diff_reg_ens_muts.png"))
    close("all")
    for i in eachindex(regs)
        plt.plot(getproperty(mixed[i], field); label = "reg = $(regs[i])")
    end
    plt.plot(getproperty(alg2, field);
        color = "black", linewidth = 4, label = "Only reversions")
    plt.xlabel("# reversions")
    plt.ylabel(ylab)
    plt.legend()
    plt.title("$(double.Mutation_1[1]) + $(double.Mutation_2[1])")
    savefig(out)
end


# P_ij against DCA energy along each path.
close("all")
for i in eachindex(regs)
    plt.plot(mixed[i].ps_ij, mixed[i].ens; label = "reg = $(regs[i])")
end
plt.plot(alg2.ps_ij, alg2.ens; color = "black", linewidth = 5, label = "Only reversions")
plt.xlabel("P_ij")
plt.ylabel("DCA energy")
plt.legend()
plt.title("$(double.Mutation_1[1]) + $(double.Mutation_2[1])")
savefig("../new_diff_reg_performance.png")


## DBD

In [1]:
using Revise, Genie, KitMSA, PyPlot, JLD2, Statistics, StatsBase, DCAUtils, DelimitedFiles, CSV, DataFrames

# DBD alignment and the first element of what compute_weights returns.
nat_msa = read_fasta_alignment("../Gen.jl/data/alignments/natural/DBD_alignment.uniref90.cov80.a2m", 0.9)
w = first(compute_weights(nat_msa, 22, 0.2))

# Model parameters for this family replace the global h and J.
@load "../data_Genie/pars_dbd.jld2"
J = J_dbd
h = h_dbd

# Start and end sequences: columns 1 and 10000 of the alignment.
seq_in = Int.(nat_msa[:, 1])
seq_fin = Int.(nat_msa[:, 10000])
q = 21
L = 76;




# Principal-component coordinates of the natural alignment and of the two
# chosen sequences (seq_fin first, seq_in second). The call is qualified as
# Genie.check_pca, as in the metallo-beta cell, so it does not depend on the
# name being exported.
pc_nat, pc_sil = Genie.check_pca(Int8.(nat_msa), Int8.(hcat(seq_fin, seq_in)))

close("all")
plt.scatter(pc_nat[:, 1], pc_nat[:, 2]; label = "Natural sequences")
# Rows of pc_sil follow the order in which the sequences were concatenated.
for (row, name) in enumerate(("seq_fin", "seq_in"))
    plt.scatter(pc_sil[row, 1], pc_sil[row, 2]; label = name)
end
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.legend()
savefig("../pca_dbd.png")

# Energies of the natural alignment and of the two chosen sequences.
ee = Genie.energy(nat_msa, h, J)
en_in = Genie.energy(seq_in, h, J, L)
en_fin = Genie.energy(seq_fin, h, J, L)

close("all")
plt.hist(ee; linewidth = 3.0, histtype = "step", density = true, label = "nat")
# Mark each sequence energy with a short vertical segment.
for (en, name) in ((en_fin, "seq_fin"), (en_in, "seq_in"))
    plt.plot([en, en], [0., 10^-2]; linewidth = 3.0, label = name)
end
plt.legend()
plt.xlabel("DCA energy")
plt.yscale("log")
savefig("../ens_dbd_nat.png")





# Two-site probability of the native residue pair in each background, for every
# site pair where both sites differ between seq_in and seq_fin and hold the
# value 21 in neither sequence.
p_ijs_in = zeros(L, L);
p_ijs_fin = zeros(L, L);

# Value comparison (`!=`) instead of identity (`!==`): the latter is always
# true when the element type differs from the literal's type.
revertible = [
    seq_fin[k] != seq_in[k] && seq_fin[k] != 21 && seq_in[k] != 21 for k in 1:L
]

# Pairs that are not eligible keep -Inf so that `argmax(score)` can never rank
# them above an eligible pair whose score is negative (they used to score 0).
score = fill(-Inf, L, L)

@time for site1 in 1:L
    for site2 in site1+1:L
        if revertible[site1] && revertible[site2]
            p_ijs_in[site1, site2] = Genie.dist_proba_2_point(site1, site2, seq_in, h,
                J, L, q, 1.)[seq_in[site1], seq_in[site2]]
            p_ijs_fin[site1, site2] = Genie.dist_proba_2_point(site1, site2, seq_fin, h,
                J, L, q, 1.)[seq_fin[site1], seq_fin[site2]]
            score[site1, site2] = p_ijs_fin[site1, site2] - p_ijs_in[site1, site2]
        end
    end
end

# Run the path builder for the 8 site pairs with the largest score, removing
# each pair from the ranking once it has been used.
algs = [];
n_subs = 40

for _ in 1:8
    best = argmax(score)
    # Stop if every remaining entry has already been excluded.
    isfinite(score[best]) || break
    site1, site2 = Tuple(best)
    # Print with a separator; the bare concatenation was ambiguous
    # (1,23 and 12,3 both printed "123").
    println(site1, " ", site2)
    score[best] = -Inf
    alg = Genie.run_Dkl_path_double(site1, site2, n_subs, seq_in, seq_fin, h, J,
        temp = 1.0)
    push!(algs, alg)
end



# One figure per recorded quantity, one curve per selected site pair.
for (field, ylab, out) in ((:ps_ij, "P_ij", "../dbd_p_ij_muts.png"),
                           (:ens, "DCA energy", "../dbd_ens_muts.png"))
    close("all")
    for i in 1:8
        plt.plot(getproperty(algs[i], field))
    end
    plt.xlabel("# reversions")
    plt.ylabel(ylab)
    plt.title("DBD")
    savefig(out)
end


LoadError: SystemError: ../Gen.jl/data/alignments/natural/DBD_alignment.uniref90.cov80.a2m: No such file or directory

In [3]:
# Positions where neither sequence holds the value 21. Value comparison (`.!=`)
# is used because identity (`.!==`) is always true when the element type
# differs from the literal's type.
mask = (seq_in .!= 21) .& (seq_fin .!= 21)


LoadError: UndefVarError: `seq_in` not defined

In [None]:
# Compare the two sequences only at positions where neither holds the value 21.
# Dropping those positions from seq_fin alone shortened one argument and
# shifted its residues against seq_in.
nogap = (seq_in .!= 21) .& (seq_fin .!= 21)
Genie.ham_dist(seq_in[nogap], seq_fin[nogap])