In [None]:
# An empty PYTHON variable asks PyCall to use its own private Python rather
# than a system interpreter; the setting is only picked up when PyCall is built,
# hence the rebuild. One rebuild is enough: running these three lines twice
# just repeats the same build.
ENV["PYTHON"] = ""
using Pkg
Pkg.build("PyCall")  # Rebuild PyCall to use the internal Python
using Revise, Genie, DelimitedFiles, DCAUtils, JLD2, PyPlot, Statistics, LinearAlgebra
import KitMSA: fasta2matrix, matrix2fasta


"""
    read_fasta_headers(filepath::String)

Return the header of every record in the FASTA file at `filepath`, in file order.

A header is any line that starts with `'>'`. The first four characters of the line
(the `'>'` marker and the three characters that follow it) are dropped and the
remainder is stripped of surrounding whitespace. A header line shorter than four
characters yields an empty string.

# Returns
A `Vector{String}` with one entry per header line.
"""
function read_fasta_headers(filepath::String)
    headers = String[]
    open(filepath, "r") do io
        for line in eachline(io)
            startswith(line, '>') || continue
            # NOTE(review): four characters are removed, not just the '>' the
            # previous comment mentioned -- confirm the offset is intentional.
            # `chop` counts characters, so a multibyte character near the start
            # of the header cannot trigger a StringIndexError the way the
            # byte-index slice `line[5:end]` could.
            push!(headers, strip(chop(line; head=4, tail=0)))
        end
    end
    return headers
end

"""
    read_fasta_sequences(filepath::String)

Return the sequence of every record in the FASTA file at `filepath`, in file order.

Lines starting with `'>'` delimit records. All other lines are stripped of
surrounding whitespace and concatenated into the current record's sequence.

Every header produces exactly one entry, so the result lines up index-by-index
with the headers of the same file; a record with no sequence lines yields `""`.
Sequence text that appears before the first header is returned as a leading
entry of its own.

# Returns
A `Vector{String}` of concatenated sequences.
"""
function read_fasta_sequences(filepath::String)
    return open(filepath, "r") do io
        sequences = String[]
        current_seq = IOBuffer()
        in_record = false  # becomes true once the first header has been read

        for line in eachline(io)
            if startswith(line, '>')
                # Close the previous record. Records with an empty sequence are
                # kept so that entry i still belongs to header i.
                if in_record || position(current_seq) > 0
                    push!(sequences, String(take!(current_seq)))
                end
                in_record = true
            else
                write(current_seq, strip(line))
            end
        end

        # Close the final record (or headerless leading text).
        if in_record || position(current_seq) > 0
            push!(sequences, String(take!(current_seq)))
        end

        sequences
    end
end



In [None]:
# Natural sequences: sequences and headers are read from the same FASTA file.
nat_fasta_path = "../data_directed_evolution/HIV/HIV1_ALL_2022_2253-2549_DNA.fasta";
nat_sequences = read_fasta_sequences(nat_fasta_path);
nat_headers = read_fasta_headers(nat_fasta_path);

# Per-patient haplotypes: entry i holds the sequences / headers of patient i.
# Typed containers instead of `[]` so the element type is not `Any`.
pat_sequences = Vector{String}[];
pat_headers = Vector{String}[];
for i in 1:11
    local fasta_path = "../data_directed_evolution/HIV/haplotypes_p$(i)_PR.fasta"
    push!(pat_sequences, read_fasta_sequences(fasta_path))
    push!(pat_headers, read_fasta_headers(fasta_path))
end

using StatsBase
using PyPlot

# --- Days since infection, parsed from every header and grouped by patient ---
days = map(pat_headers) do headers
    [parse(Int, match(r"days since infection: (\d+)", h).captures[1]) for h in headers]
end

# --- How many samples each patient has on each day ---
counts = map(countmap, days)

# --- x (distinct days) and y (sample counts) per patient ---
xs = map(c -> collect(keys(c)), counts)
ys = map(c -> collect(values(c)), counts)

# --- One scatter series per patient ---
figure()
for (patient, (x, y)) in enumerate(zip(xs, ys))
    scatter(x, y, label="patient $patient", s=30)  # s = marker size
end

xlabel("Days since infection")
ylabel("Number of samples")
title("Sample distribution per patient over time")
legend()

# --- Write the figure to disk ---
savefig("../patient_samples_scatter.png", dpi=300, bbox_inches="tight")


"""
    dna_to_amino_indices(seqs::Vector{String}, cod2amino::Dict{String,Int8})

Translate each string in `seqs` into a vector of `Int8` codes.

Every sequence is read three characters at a time starting at position 1, and
each codon is looked up in `cod2amino`. Codons containing `-` and codons that are
not keys of `cod2amino` are dropped, so a translated sequence may be shorter than
`length(seq) ÷ 3`. Trailing characters that do not fill a whole codon are ignored.

# Returns
A `Vector{Vector{Int8}}` with one entry per input sequence, in input order.
"""
function dna_to_amino_indices(seqs::Vector{String}, cod2amino::Dict{String,Int8})
    translated = Vector{Vector{Int8}}()
    sizehint!(translated, length(seqs))

    for seq in seqs
        codes = Int8[]
        for start in 1:3:(length(seq) - 2)
            codon = seq[start:(start + 2)]
            occursin("-", codon) && continue        # gapped codon: skip
            haskey(cod2amino, codon) || continue    # unknown codon: skip
            push!(codes, cod2amino[codon])
        end
        push!(translated, codes)
    end

    return translated
end

# Usage: translate the natural sequences, keep the ones whose translation has
# length 99, stack them as the columns of a matrix and drop duplicates.
nat_aa_sequences = dna_to_amino_indices(nat_sequences, cod2amino)
Ls = length.(nat_aa_sequences)
# reduce(hcat, ...) builds the same matrix as hcat(v...) without splatting
# thousands of vectors into a single call.
nat_aa_seqs = remove_duplicate_sequences(reduce(hcat, nat_aa_sequences[Ls .== 99]))
# NOTE(review): n_seq = 4076 is hard-coded -- presumably the number of sequences
# left in nat_aa_seqs; confirm it still matches if the input file changes.
@time d_pair = pairwise_ham_dist(nat_aa_seqs, n_seq = 4076, all = true);

# Histogram of pairwise distances, divided by the sequence length 99.
close("all")
plt.hist(d_pair ./ 99, histtype = "step", linewidth = 3., density = true)
plt.xlabel("Pairwise hamming")
savefig("../HIV_nat_pairwise.png")



# Translate every patient's haplotypes; entry i belongs to patient i.
pat_aa_sequences = map(i -> dna_to_amino_indices(pat_sequences[i], cod2amino), 1:11)
# Translated length of each haplotype, per patient.
pat_Ls = [length.(seqs) for seqs in pat_aa_sequences]
# Per patient: the length-99 translations stacked as matrix columns.
pat_aa_seqs = [hcat(seqs[lens .== 99]...) for (seqs, lens) in zip(pat_aa_sequences, pat_Ls)]
# Distances from each of patient 7's sequences to the natural sequences.
d7 = let p7 = pat_aa_seqs[7]
    [ham_dist(p7[:, j], nat_aa_seqs) for j in 1:size(p7, 2)]
end

# Histogram of those distances, divided by the sequence length 99.
close("all")
plt.hist(vcat(d7...) ./ 99, histtype = "step", linewidth = 3., density = true)
plt.xlabel("Hamming from natural")
savefig("../HIV_nat_p7.png")