In [None]:
# Environment bootstrap for this notebook cell.
# ENV["PYTHON"] is cleared before PyCall is rebuilt, then the plotting, I/O and
# modelling packages used further down are loaded.
# NOTE(review): the ENV / Pkg.build / using sequence below appears twice
# verbatim, so PyCall is rebuilt twice per run — presumably a copy-paste
# leftover; confirm before removing the second pass.
ENV["PYTHON"] = "" 
using Pkg
Pkg.build("PyCall")  # Rebuild PyCall to use the internal Python
using Revise, PyPlot, DelimitedFiles, Genie
ENV["PYTHON"] = "" 
using Pkg
Pkg.build("PyCall")  # Rebuild PyCall to use the internal Python
using PyPlot, DelimitedFiles
using Revise, Genie, DelimitedFiles, DCAUtils, JLD2, PyPlot, Statistics, LinearAlgebra
import KitMSA: fasta2matrix, matrix2fasta


"""
    encode_codons_to_nuc_int(codon_seq::Vector{String})

Flatten a list of codon strings into a single vector of integer nucleotide codes.

Each character is upper-cased before lookup, so the encoding is case-insensitive:
`A => 1`, `C => 2`, `G => 3`, `T => 4`; `-`, `N` and every other character map
to `5`. The result holds one entry per character, in input order.
"""
function encode_codons_to_nuc_int(codon_seq::Vector{String})
    code_of = Dict('A' => 1, 'C' => 2, 'G' => 3, 'T' => 4, '-' => 5, 'N' => 5)
    fallback = 5  # code assigned to any character missing from `code_of`
    return Int[
        get(code_of, uppercase(base), fallback) for codon in codon_seq for base in codon
    ]
end


"""
    read_dna_to_int(filename::String)

Read a FASTA file of aligned DNA sequences and return it integer-encoded.

Lines starting with `>` are headers and delimit records; the remaining lines of a
record are stripped of surrounding whitespace and concatenated. Records without
any sequence data are skipped. Nucleotides are encoded case-insensitively as
`A => 1`, `C => 2`, `G => 3`, `T => 4`; the gap `-` and any other character map
to `5`.

# Returns
A `Matrix{Int8}` of size `(sequence length, number of sequences)`, i.e. one column
per sequence.

# Throws
- `ArgumentError` if the file contains no sequence data.
- `DimensionMismatch` if the sequences do not all have the same length.
"""
function read_dna_to_int(filename::String)
    # Encoding table; unknown characters fall back to the gap code 5.
    encoding = Dict(
        'A' => 1, 'C' => 2, 'G' => 3, 'T' => 4,
        'a' => 1, 'c' => 2, 'g' => 3, 't' => 4,
        '-' => 5,
    )

    # Collect the sequences, ignoring header lines.
    seqs_list = String[]
    chunks = String[]  # non-empty pieces of the record currently being read
    for line in eachline(filename)
        if startswith(line, ">")
            isempty(chunks) || push!(seqs_list, join(chunks))
            empty!(chunks)
        else
            piece = strip(line)
            isempty(piece) || push!(chunks, piece)
        end
    end
    # Last record: only keep it if it actually carries sequence data.
    isempty(chunks) || push!(seqs_list, join(chunks))

    isempty(seqs_list) &&
        throw(ArgumentError("no sequence data found in $filename"))

    # Fill the (positions x sequences) matrix directly in its final layout.
    seq_len = length(first(seqs_list))
    msa = Matrix{Int8}(undef, seq_len, length(seqs_list))
    for (col, seq) in enumerate(seqs_list)
        length(seq) == seq_len || throw(
            DimensionMismatch(
                "sequence $col has length $(length(seq)), expected $seq_len",
            ),
        )
        # Iterate characters rather than byte indices so non-ASCII input cannot
        # trigger a string indexing error.
        for (row, char) in enumerate(seq)
            msa[row, col] = get(encoding, char, 5)
        end
    end

    return msa
end



# --- Wild type and input files -----------------------------------------------
wt_name = "TEM"
file_wt = "../data_directed_evolution/TEM1_pfam_DNA.fasta"
file_model = "../model_natural_PSE.dat"

# --- Model parameters --------------------------------------------------------
# "AAC" and "DHFR" go through the `0gapsave` reader; every other wild type goes
# through the `lettersave` reader.
println("Reading model parameters")
h_tmp, J_tmp = if wt_name in ("AAC", "DHFR")
    read_par_BM_0gapsave(file_model)
else
    read_par_BM_lettersave(file_model)
end
h = set_max_field_to_0save(h_tmp)
J_tmp2 = symmetrize_Jsave(J_tmp)
# Swap the second and third axes of `J_tmp2`.
J = permutedims(J_tmp2, (1, 3, 2, 4))


# Load the wild-type DNA: skip the first line of the file (`skipstart = 1`) and
# join every remaining field into one string.
println("Reading sequences in the seed")
wt = join(readdlm(file_wt; skipstart = 1))
L_big = round(Int, length(wt) / 3)  # string length divided by 3, rounded

# Cut `wt` into consecutive 3-character codons. "AAC" and "DHFR" keep codons
# 1..L_big; every other wild type keeps codons 3..L_big-1.
wt_DNA_seq = if wt_name in ("AAC", "DHFR")
    map(i -> wt[(3i - 2):(3i)], 1:L_big)
else
    map(i -> wt[(3i - 2):(3i)], 3:(L_big - 1))
end

# Per-nucleotide integer codes and per-codon `cod2amino` lookup of the wild type.
wt_seq = encode_codons_to_nuc_int(wt_DNA_seq)
wt_amino_seq = map(codon -> cod2amino[codon], wt_DNA_seq)

# Settings passed to both runs below. `Npairs` is assigned here but is not passed
# to any call in this section.
Nseqs, rounds = 10^4, 8
mu, mu_bind = 0.014, 0.0
temp = 0.0
Npairs = 100

# Two runs with identical arguments: a reference and a replicate.
@time res = Genie.run_dir_evol_nucleo(
    wt_DNA_seq, Nseqs, h, J;
    rounds = rounds, seq_reads = Nseqs, temp = temp,
    mu = mu, mu_bind = mu_bind, neutral = true,
);

@time res_repl = Genie.run_dir_evol_nucleo(
    wt_DNA_seq, Nseqs, h, J;
    rounds = rounds, seq_reads = Nseqs, temp = temp,
    mu = mu, mu_bind = mu_bind, neutral = true,
);

# Write the final DNA alignment of each run to its own FASTA file.
Genie.dna2fasta("../tem_neutral.fa", res.final_msa_dna)
Genie.dna2fasta("../tem_neutral_repl.fa", res_repl.final_msa_dna)


# Reload both alignments as integer matrices; `read_dna_to_int` returns one
# column per sequence.
msa_dna = read_dna_to_int("../tem_neutral.fa")
msa_dna_repl = read_dna_to_int("../tem_neutral_repl.fa")

# Single-site frequencies of each run, reshaped to 5 rows (state codes 1..5) by
# one column per alignment position. The column count is inferred from the data
# instead of being hard-coded to 597.
# NOTE(review): the arguments `6, 0.` are passed through unchanged — presumably
# q = 6 and a reweighting threshold of 0; confirm against DCAUtils.
@time f1, _ = compute_weighted_frequencies(msa_dna, 6, 0.)
f1 = reshape(f1, 5, :)
@time f1_repl, _ = compute_weighted_frequencies(msa_dna_repl, 6, 0.)
f1_repl = reshape(f1_repl, 5, :)

# Blank out the wild-type state at every position so that only the remaining
# entries survive the `isfinite` filter.
for (pos, state) in enumerate(wt_seq)
    f1[state, pos] = NaN
    f1_repl[state, pos] = NaN
end
f = filter(isfinite, vec(f1))
f_repl = filter(isfinite, vec(f1_repl))

# Replicate-vs-replicate scatter on log-log axes; `off` shifts every value so
# that zero frequencies stay strictly positive.
off = 1e-9
close("all")
plt.scatter(f .+ off, f_repl .+ off, s=4, alpha=0.6)
# Identity line
plt.plot([1e-6, 1], [1e-6, 1], color="red", linestyle="--", linewidth=0.8, alpha=0.7)
plt.xscale("log")
plt.yscale("log")
savefig("../tem_repl_freqs_dna.png")



In [None]:
# Environment bootstrap for this notebook cell.
# ENV["PYTHON"] is cleared before PyCall is rebuilt, then the plotting, I/O and
# modelling packages used further down are loaded.
# NOTE(review): the ENV / Pkg.build / using sequence below appears twice
# verbatim, so PyCall is rebuilt twice per run — presumably a copy-paste
# leftover; confirm before removing the second pass.
ENV["PYTHON"] = "" 
using Pkg
Pkg.build("PyCall")  # Rebuild PyCall to use the internal Python
using Revise, PyPlot, DelimitedFiles, Genie
ENV["PYTHON"] = "" 
using Pkg
Pkg.build("PyCall")  # Rebuild PyCall to use the internal Python
using PyPlot, DelimitedFiles
using Revise, Genie, DelimitedFiles, DCAUtils, JLD2, PyPlot, Statistics, LinearAlgebra
import KitMSA: fasta2matrix, matrix2fasta


"""
    encode_codons_to_nuc_int(codon_seq::Vector{String})

Flatten a list of codon strings into a single vector of integer nucleotide codes.

Each character is upper-cased before lookup, so the encoding is case-insensitive:
`A => 1`, `C => 2`, `G => 3`, `T => 4`; `-`, `N` and every other character map
to `5`. The result holds one entry per character, in input order.
"""
function encode_codons_to_nuc_int(codon_seq::Vector{String})
    code_of = Dict('A' => 1, 'C' => 2, 'G' => 3, 'T' => 4, '-' => 5, 'N' => 5)
    fallback = 5  # code assigned to any character missing from `code_of`
    return Int[
        get(code_of, uppercase(base), fallback) for codon in codon_seq for base in codon
    ]
end


"""
    read_dna_to_int(filename::String)

Read a FASTA file of aligned DNA sequences and return it integer-encoded.

Lines starting with `>` are headers and delimit records; the remaining lines of a
record are stripped of surrounding whitespace and concatenated. Records without
any sequence data are skipped. Nucleotides are encoded case-insensitively as
`A => 1`, `C => 2`, `G => 3`, `T => 4`; the gap `-` and any other character map
to `5`.

# Returns
A `Matrix{Int8}` of size `(sequence length, number of sequences)`, i.e. one column
per sequence.

# Throws
- `ArgumentError` if the file contains no sequence data.
- `DimensionMismatch` if the sequences do not all have the same length.
"""
function read_dna_to_int(filename::String)
    # Encoding table; unknown characters fall back to the gap code 5.
    encoding = Dict(
        'A' => 1, 'C' => 2, 'G' => 3, 'T' => 4,
        'a' => 1, 'c' => 2, 'g' => 3, 't' => 4,
        '-' => 5,
    )

    # Collect the sequences, ignoring header lines.
    seqs_list = String[]
    chunks = String[]  # non-empty pieces of the record currently being read
    for line in eachline(filename)
        if startswith(line, ">")
            isempty(chunks) || push!(seqs_list, join(chunks))
            empty!(chunks)
        else
            piece = strip(line)
            isempty(piece) || push!(chunks, piece)
        end
    end
    # Last record: only keep it if it actually carries sequence data.
    isempty(chunks) || push!(seqs_list, join(chunks))

    isempty(seqs_list) &&
        throw(ArgumentError("no sequence data found in $filename"))

    # Fill the (positions x sequences) matrix directly in its final layout.
    seq_len = length(first(seqs_list))
    msa = Matrix{Int8}(undef, seq_len, length(seqs_list))
    for (col, seq) in enumerate(seqs_list)
        length(seq) == seq_len || throw(
            DimensionMismatch(
                "sequence $col has length $(length(seq)), expected $seq_len",
            ),
        )
        # Iterate characters rather than byte indices so non-ASCII input cannot
        # trigger a string indexing error.
        for (row, char) in enumerate(seq)
            msa[row, col] = get(encoding, char, 5)
        end
    end

    return msa
end



# --- Wild type and input files -----------------------------------------------
wt_name = "TEM"
file_wt = "../data_directed_evolution/TEM1_pfam_DNA.fasta"
file_model = "../model_natural_PSE.dat"

# --- Model parameters --------------------------------------------------------
# "AAC" and "DHFR" go through the `0gapsave` reader; every other wild type goes
# through the `lettersave` reader.
println("Reading model parameters")
h_tmp, J_tmp = if wt_name in ("AAC", "DHFR")
    read_par_BM_0gapsave(file_model)
else
    read_par_BM_lettersave(file_model)
end
h = set_max_field_to_0save(h_tmp)
J_tmp2 = symmetrize_Jsave(J_tmp)
# Swap the second and third axes of `J_tmp2`.
J = permutedims(J_tmp2, (1, 3, 2, 4))


# Alternative input (disabled):
#file_wt = "../data_directed_evolution/PSE1_pfam_DNA.fasta"

# Load the wild-type DNA: skip the first line of the file (`skipstart = 1`) and
# join every remaining field into one string.
println("Reading sequences in the seed")
wt = join(readdlm(file_wt; skipstart = 1))
L_big = round(Int, length(wt) / 3)  # string length divided by 3, rounded

# Cut `wt` into consecutive 3-character codons. "AAC" and "DHFR" keep codons
# 1..L_big; every other wild type keeps codons 3..L_big-1.
wt_DNA_seq = if wt_name in ("AAC", "DHFR")
    map(i -> wt[(3i - 2):(3i)], 1:L_big)
else
    map(i -> wt[(3i - 2):(3i)], 3:(L_big - 1))
end

# Per-nucleotide integer codes and per-codon `cod2amino` lookup of the wild type.
wt_seq = encode_codons_to_nuc_int(wt_DNA_seq)
wt_amino_seq = map(codon -> cod2amino[codon], wt_DNA_seq)

# Settings for the runs below. `mu_bind`, `Npairs` and `temp` are assigned here
# but are not passed to any call in this section.
Nseqs = 10^4

# Disabled sweep over several values of Nseqs:
#for Nseqs in [10^3, 10^3, 10^5]
rounds, mu = 8, 0.014
mu_bind, temp = 0.0, 0.0
Npairs = 100

# Two runs with identical arguments: a reference and a replicate.
@time res = Genie.run_neutral_evol_nucleo(
    wt_DNA_seq, Nseqs; rounds = rounds, mu = mu
);

@time res_repl = Genie.run_neutral_evol_nucleo(
    wt_DNA_seq, Nseqs; rounds = rounds, mu = mu
);

# Write the final DNA alignment of each run, tagging the file name with `Nseqs`.
Genie.dna2fasta("../tem_neutral_new_$(Nseqs).fa", res.final_msa_dna)
Genie.dna2fasta("../tem_neutral_repl_new_$(Nseqs).fa", res_repl.final_msa_dna)

# Reload both alignments as integer matrices; `read_dna_to_int` returns one
# column per sequence.
msa_dna = read_dna_to_int("../tem_neutral_new_$(Nseqs).fa")
msa_dna_repl = read_dna_to_int("../tem_neutral_repl_new_$(Nseqs).fa")

# Single-site frequencies of each run, reshaped to 5 rows (state codes 1..5) by
# one column per alignment position. The column count is inferred from the data
# instead of being hard-coded to 597.
# NOTE(review): the arguments `6, 0.` are passed through unchanged — presumably
# q = 6 and a reweighting threshold of 0; confirm against DCAUtils.
@time f1, _ = compute_weighted_frequencies(msa_dna, 6, 0.)
f1 = reshape(f1, 5, :)
@time f1_repl, _ = compute_weighted_frequencies(msa_dna_repl, 6, 0.)
f1_repl = reshape(f1_repl, 5, :)

# Blank out the wild-type state at every position so that only the remaining
# entries survive the `isfinite` filter.
for (pos, state) in enumerate(wt_seq)
    f1[state, pos] = NaN
    f1_repl[state, pos] = NaN
end
f = filter(isfinite, vec(f1))
f_repl = filter(isfinite, vec(f1_repl))

# NOTE(review): with `off = 0` an exactly-zero frequency has no position on the
# log axes below — confirm that dropping such points is intended.
off = 0.

# Replicate-vs-replicate scatter on log-log axes.
close("all")
plt.scatter(f .+ off, f_repl .+ off, s=4, alpha=0.6)
# Identity line
plt.plot([1e-6, 1], [1e-6, 1], color="red", linestyle="--", linewidth=0.8, alpha=0.7)
plt.xscale("log")
plt.yscale("log")
savefig("../new_tem_repl_freqs_dna_$(Nseqs).png")
#end

# For every wild-type codon: the number of positions (out of 3) at which it
# differs from a stop codon, averaged over `stop_codons[1:3]`.
# Built with `map` so the result is a concretely typed vector instead of `Any[]`.
distances = map(wt_DNA_seq) do codon
    mean([3 - sum(collect(codon) .== collect(stop_codons[j])) for j in 1:3])
end

# One value per nucleotide: each codon's distance is repeated for its 3 positions.
stretched = repeat(distances, inner=3)

# Colour matrix with 5 rows (state codes) and one column per nucleotide position;
# every row carries the same per-position distance. The column count follows the
# data instead of a hard-coded 597.
color_matrix = zeros(5, length(stretched))
for row in axes(color_matrix, 1)
    color_matrix[row, :] .= stretched
end

# Mask the wild-type state at each position, then drop the masked entries.
for (pos, state) in enumerate(wt_seq)
    color_matrix[state, pos] = NaN
end

color_filt = filter(isfinite, vec(color_matrix))

# NOTE(review): with `off = 0` an exactly-zero frequency has no position on the
# log axes below — confirm that dropping such points is intended.
off = 0.

# Same replicate-vs-replicate scatter, coloured by stop-codon distance.
close("all")
plt.scatter(f .+ off, f_repl .+ off, s=40, c = color_filt, alpha=0.6)
# Identity line
plt.plot([1e-4, 1], [1e-4, 1], color="red", linestyle="--", linewidth=0.8, alpha=0.7)
plt.xscale("log")
plt.yscale("log")
plt.colorbar()
# NOTE(review): the file name says "pse" although the other outputs of this cell
# are tagged "tem" — confirm the intended name.
savefig("../new_pse_repl_freqs_dna_$(Nseqs).png")

#end

In [None]:
# Load the alignments written for Nseqs = 10^3.
Nseqs = 10^3
msa_dna_small = read_dna_to_int("../tem_neutral_new_$(Nseqs).fa")
msa_dna_repl_small = read_dna_to_int("../tem_neutral_repl_new_$(Nseqs).fa")

# Single-site frequencies reshaped to 5 rows (state codes 1..5); the column count
# is inferred from the data instead of being hard-coded to 597.
@time f1_small, _ = compute_weighted_frequencies(msa_dna_small, 6, 0.)
f1_small = reshape(f1_small, 5, :)
@time f1_repl_small, _ = compute_weighted_frequencies(msa_dna_repl_small, 6, 0.)
f1_repl_small = reshape(f1_repl_small, 5, :)

# Blank out the wild-type state at every position.
for (pos, state) in enumerate(wt_seq)
    f1_small[state, pos] = NaN
    f1_repl_small[state, pos] = NaN
end
# These overwrite the globals `f` / `f_repl` with the small-sample values.
f = filter(isfinite, vec(f1_small))
f_repl = filter(isfinite, vec(f1_repl_small))

# Compare the frequency distributions of the previously computed `f1` and of the
# small sample.
# NOTE(review): the "10^5" label assumes `f1` currently holds the 10^5-sequence
# result; `f1` is whatever the last executed cell left behind — confirm.
close("all")
plt.hist(f1[:], histtype = "step", label = "10^5")
plt.hist(f1_small[:], histtype = "step", label = "10^3")
plt.xlabel("freq")
plt.legend()
savefig("../ciao.png")



using StatsBase
using PyPlot

# Entries of `f1` lying strictly inside (0.001, 0.018); NaN entries fail both
# comparisons and are therefore never selected.
idx = findall(x -> 0.001 < x < 0.018, f1)
# Alternative selection (disabled):
#idx = findall(x -> x > 0.01, f1)

# ----------------------------
# Extract letters and sites
# ----------------------------
letters = getindex.(idx, 1)  # row index of each hit: state code
sites = getindex.(idx, 2)    # column index of each hit: alignment position

# ----------------------------
# Parameters
# ----------------------------
# Taken from the matrix itself rather than the literals 5 and 597.
q, L = size(f1)

# ----------------------------
# Abundance profiles
# ----------------------------
count_letters = countmap(letters)
count_sites = countmap(sites)

# Number of selected entries per state code and per site (0 when absent).
letters_profile = [get(count_letters, l, 0) for l in 1:q]
sites_profile = [get(count_sites, s, 0) for s in 1:L]

# ----------------------------
# Plot
# ----------------------------
figure(figsize=(12,4))

# Letters. The rows of `f1` are nucleotide state codes, so the panel is titled
# accordingly (it previously read "aminoacids").
subplot(1,2,1)
bar(1:q, letters_profile)
title("Nucleotides")
xticks(1:q)

# Sites
subplot(1,2,2)
plot(1:L, sites_profile, linewidth=2)
title("Sites")

tight_layout()

# ----------------------------
# Save figure
# ----------------------------
savefig("../profilo_lettere_siti.png", dpi=300, bbox_inches="tight")

show()


In [None]:
# Settings for the unfiltered runs below. `mu_bind`, `Npairs` and `temp` are
# assigned here but are not passed to any call in this section.
Nseqs = 10^5 + 3
rounds, mu = 8, 0.014
mu_bind, temp = 0.0, 0.0
Npairs = 100

# Disabled synthetic wild type:
#wt_DNA_seq2 = ["AAA" for i in 1:199]
#wt_seq2 = [1 for i in 1:597]

# Two runs with identical arguments: a reference and a replicate.
@time res = Genie.run_neutral_evol_nucleo_nofilt(
    wt_DNA_seq, Nseqs; rounds = rounds, mu = mu
);

@time res_repl = Genie.run_neutral_evol_nucleo_nofilt(
    wt_DNA_seq, Nseqs; rounds = rounds, mu = mu
);

# Write the final DNA alignment of each run, tagging the file name with `Nseqs`.
Genie.dna2fasta("../TEM_neutral_new_$(Nseqs).fa", res.final_msa_dna)
Genie.dna2fasta("../TEM_neutral_repl_new_$(Nseqs).fa", res_repl.final_msa_dna)

# Reload both alignments as integer matrices; `read_dna_to_int` returns one
# column per sequence.
msa_dna = read_dna_to_int("../TEM_neutral_new_$(Nseqs).fa")
msa_dna_repl = read_dna_to_int("../TEM_neutral_repl_new_$(Nseqs).fa")

# Single-site frequencies of each run, reshaped to 5 rows (state codes 1..5) by
# one column per alignment position. The column count is inferred from the data
# instead of being hard-coded to 597.
# NOTE(review): the arguments `6, 0.` are passed through unchanged — presumably
# q = 6 and a reweighting threshold of 0; confirm against DCAUtils.
@time f1, _ = compute_weighted_frequencies(msa_dna, 6, 0.)
f1 = reshape(f1, 5, :)
@time f1_repl, _ = compute_weighted_frequencies(msa_dna_repl, 6, 0.)
f1_repl = reshape(f1_repl, 5, :)

# Blank out the wild-type state at every position so that only the remaining
# entries survive the `isfinite` filter.
for (pos, state) in enumerate(wt_seq)
    f1[state, pos] = NaN
    f1_repl[state, pos] = NaN
end
f = filter(isfinite, vec(f1))
f_repl = filter(isfinite, vec(f1_repl))

# NOTE(review): with `off = 0` an exactly-zero frequency has no position on the
# log axes below — confirm that dropping such points is intended.
off = 0.

# Replicate-vs-replicate scatter on log-log axes.
close("all")
plt.scatter(f .+ off, f_repl .+ off, s=4, alpha=0.6)
# Identity line (disabled):
#plt.plot([1e-6, 1], [1e-6, 1], color="red", linestyle="--", linewidth=0.8, alpha=0.7)
plt.xscale("log")
plt.yscale("log")
savefig("../new_TEM_repl_freqs_dna_$(Nseqs)_nofilt.png")
#end