In [22]:
using CSV, DataFrames, Plots, Plots.Measures, StatsBase

In [23]:
# Trajectories are based on this tree topology, branches indexed 1-8 from left to right:
#   Af Ar Ap Sm Mm
#   |__|  |  |  |
#     |___|  |  |
#       |____|  |
#          |____|
#             |
#
# A trajectory has one flag per branch plus a ninth flag holding the state at the root.
# The ninth entry below is 0.0 so that indexing this vector with a 9-element trajectory
# never adds length for the root flag. Units of the lengths are not stated in this file.

branch_lengths = [
    3.1,    # 1: tip branch to Af
    3.1,    # 2: tip branch to Ar
    47.6,   # 3: stem of (Af, Ar)
    50.7,   # 4: tip branch to Ap
    38.3,   # 5: stem of (Af, Ar, Ap)
    89.0,   # 6: tip branch to Sm
    64.0,   # 7: stem of (Af, Ar, Ap, Sm)
    153.0,  # 8: tip branch to Mm
    0.0,    # 9: root state (not a branch)
]

"""
    trajectory2pattern(trajectory::BitVector)

Convert a 9-element evolutionary trajectory into the 5-element pattern observed at the
tips, ordered Af, Ar, Ap, Sm, Mm.

`trajectory[9]` is the state at the root; `trajectory[1:8]` flag, per branch, whether the
state flips along that branch. The state at each tip is the root state XOR-ed with every
flag on the path from the root to that tip. Returns a `Vector{Bool}`.
"""
function trajectory2pattern(trajectory::BitVector)
    # Walk from the root towards the tips, carrying the state of each internal node.
    root = trajectory[9]
    below7 = root ⊻ trajectory[7]      # ancestor of Af, Ar, Ap, Sm
    below5 = below7 ⊻ trajectory[5]    # ancestor of Af, Ar, Ap
    below3 = below5 ⊻ trajectory[3]    # ancestor of Af, Ar
    return [
        below3 ⊻ trajectory[1],    # Af
        below3 ⊻ trajectory[2],    # Ar
        below5 ⊻ trajectory[4],    # Ap
        below7 ⊻ trajectory[6],    # Sm
        root ⊻ trajectory[8],      # Mm
    ]
end

"""
    gains_and_losses(trajectory::BitVector)

Classify every change in a 9-element `trajectory` as a gain or a loss and return
`(gains, losses)`, two 9-element `Vector{Int}` indexed like the trajectory.

A flagged branch is a loss when the node it descends from is in the `true` state and a
gain when that node is `false`. Index 9 records the ancestral state itself: a `true` root
is counted as one gain, a `false` root as one loss.
"""
function gains_and_losses(trajectory::BitVector)
    gains = zeros(Int, 9)
    losses = zeros(Int, 9)

    # Ancestral state.
    state = trajectory[9]
    (state ? gains : losses)[9] += 1

    # Each pair hangs off the same node: a tip branch and the stem leading further into
    # the tree. Going from the root: (Mm, stem), (Sm, stem), (Ap, stem), (Ar, Af).
    for (tip, stem) in ((8, 7), (6, 5), (4, 3), (2, 1))
        # A change away from a `true` parent is a loss, away from `false` a gain.
        tally = state ? losses : gains
        for branch in (tip, stem)
            if trajectory[branch]
                tally[branch] += 1
            end
        end
        # Descend along the stem: the state flips if that branch carries a change.
        state ⊻= trajectory[stem]
    end

    return gains, losses
end

"""
    extend(perms::Vector{BitVector})

Return a new vector that contains, for every bit vector in `perms`, two longer bit
vectors: first the one with `0` appended, then the one with `1` appended.

Neither `perms` nor any of its elements is modified; every returned bit vector is a fresh
copy.
"""
function extend(perms::Vector{BitVector})
    newperms = Vector{BitVector}()
    sizehint!(newperms, 2 * length(perms))
    for perm in perms
        # Copy before both appends so the caller's bit vectors keep their length.
        push!(newperms, push!(copy(perm), false))
        push!(newperms, push!(copy(perm), true))
    end
    return newperms
end

"""
    bitperm(length::Int)

Enumerate all bit vectors with `length` elements.

The result starts from a single empty bit vector and is doubled `length` times, each
round appending `0` and then `1` to every vector built so far, so the vectors come out in
ascending binary order with the first element most significant. For `length <= 0` the
result is the single empty bit vector.
"""
function bitperm(length::Int)
    perms = [BitVector()]
    for _ in 1:length
        grown = BitVector[]
        for prefix in perms, bit in (false, true)
            push!(grown, push!(copy(prefix), bit))
        end
        perms = grown
    end
    return perms
end

bitperm (generic function with 1 method)

In [24]:
editing_pattern_perms = bitperm(5)
evolutionary_trajectory_perms = bitperm(9)

# One row per possible trajectory:
#   pattern   - tip states the trajectory produces
#   parsimony - number of branches (1-8) that carry a change
#   interval  - summed length of the branches that carry a change
evolutionary_trajectories = DataFrame(
    trajectory = evolutionary_trajectory_perms,
    pattern = BitVector[trajectory2pattern(t) for t in evolutionary_trajectory_perms],
    parsimony = Int[count(t[1:8]) for t in evolutionary_trajectory_perms],
    interval = Float64[sum(branch_lengths[t]) for t in evolutionary_trajectory_perms],
)
evolutionary_trajectories

Row,trajectory,pattern,parsimony,interval
Unnamed: 0_level_1,BitVector,BitVector,Int64,Float64
1,"Bool[0, 0, 0, 0, 0, 0, 0, 0, 0]","Bool[0, 0, 0, 0, 0]",0,0.0
2,"Bool[0, 0, 0, 0, 0, 0, 0, 0, 1]","Bool[1, 1, 1, 1, 1]",0,0.0
3,"Bool[0, 0, 0, 0, 0, 0, 0, 1, 0]","Bool[0, 0, 0, 0, 1]",1,153.0
4,"Bool[0, 0, 0, 0, 0, 0, 0, 1, 1]","Bool[1, 1, 1, 1, 0]",1,153.0
5,"Bool[0, 0, 0, 0, 0, 0, 1, 0, 0]","Bool[1, 1, 1, 1, 0]",1,64.0
6,"Bool[0, 0, 0, 0, 0, 0, 1, 0, 1]","Bool[0, 0, 0, 0, 1]",1,64.0
7,"Bool[0, 0, 0, 0, 0, 0, 1, 1, 0]","Bool[1, 1, 1, 1, 1]",2,217.0
8,"Bool[0, 0, 0, 0, 0, 0, 1, 1, 1]","Bool[0, 0, 0, 0, 0]",2,217.0
9,"Bool[0, 0, 0, 0, 0, 1, 0, 0, 0]","Bool[0, 0, 0, 1, 0]",1,89.0
10,"Bool[0, 0, 0, 0, 0, 1, 0, 0, 1]","Bool[1, 1, 1, 0, 1]",1,89.0


In [25]:
organelle = "mt"
sites = CSV.File("../data/$organelle/edit_sites/all_" * organelle * "_sites.tsv") |> DataFrame
#= # filter out sites with max editing < 0.5
function significant_site(site)
    any(skipmissing([site.af, site.ap, site.ar, site.sm, site.mm]) .>= 0.5) && return true
    false
end
filter!(x -> significant_site(x), sites) =#

# generate editing patterns: a species counts as edited when its value is not missing;
# the order (af, ar, ap, sm, mm) matches the pattern order of trajectory2pattern
epatterns = BitVector[
    .~ismissing.([site.af, site.ar, site.ap, site.sm, site.mm]) for site in eachrow(sites)
]
sites.epattern = epatterns

bypattern = groupby(evolutionary_trajectories, :pattern)

# For every site pick one of the most parsimonious trajectories that explain its pattern.
# Ties are broken at random, weighted by the summed length of the changed branches.
# NOTE: `sample` uses the default RNG here, so the choice is not reproducible across runs.
mp_trajectories = BitVector[]
for p in epatterns
    possible_trajectories = bypattern[(pattern = p,)]
    # fewest changes among the candidates; no need to sort the whole group for this
    minpar = minimum(possible_trajectories.parsimony)
    most_parsimonious = possible_trajectories[possible_trajectories.parsimony .== minpar, :]
    chosen = sample(1:nrow(most_parsimonious), Weights(most_parsimonious.interval))
    push!(mp_trajectories, most_parsimonious[chosen, :trajectory])
end
sites.mp_trajectory = mp_trajectories
sites

Row,uid,synonymous,creates_start,creates_stop,removes_stop,af,ar,ap,sm,mm,epattern,mp_trajectory
Unnamed: 0_level_1,String31,Bool,Bool,Bool,Bool,Float64?,Float64?,Float64?,Float64?,Float64?,BitVector,BitVector
1,nad1eU2,false,true,false,false,0.783477,0.789587,0.979823,0.707998,0.673296,"Bool[1, 1, 1, 1, 1]","Bool[0, 0, 0, 0, 0, 0, 0, 0, 1]"
2,nad1eU48,true,false,false,false,0.964773,0.947472,0.984967,0.918747,0.768791,"Bool[1, 1, 1, 1, 1]","Bool[0, 0, 0, 0, 0, 0, 0, 0, 1]"
3,nad1eC52,false,false,false,false,0.914793,0.981148,0.982638,0.949271,0.958699,"Bool[1, 1, 1, 1, 1]","Bool[0, 0, 0, 0, 0, 0, 0, 0, 1]"
4,nad1eU62,false,false,false,false,0.850588,0.900706,0.968411,0.912445,0.972368,"Bool[1, 1, 1, 1, 1]","Bool[0, 0, 0, 0, 0, 0, 0, 0, 1]"
5,nad1eU77,false,false,false,false,0.983301,0.987214,0.992654,0.974023,0.963365,"Bool[1, 1, 1, 1, 1]","Bool[0, 0, 0, 0, 0, 0, 0, 0, 1]"
6,nad1eU83,false,false,false,false,0.961727,0.983077,0.990226,0.96946,0.961838,"Bool[1, 1, 1, 1, 1]","Bool[0, 0, 0, 0, 0, 0, 0, 0, 1]"
7,nad1eU108,true,false,false,false,0.848101,0.952073,0.924956,0.154754,0.963846,"Bool[1, 1, 1, 1, 1]","Bool[0, 0, 0, 0, 0, 0, 0, 0, 1]"
8,nad1eC112,false,false,false,true,0.907484,0.955455,0.942531,0.935439,0.922621,"Bool[1, 1, 1, 1, 1]","Bool[0, 0, 0, 0, 0, 0, 0, 0, 1]"
9,nad1eC128,false,false,false,false,0.908873,0.954293,0.946806,0.871972,0.922306,"Bool[1, 1, 1, 1, 1]","Bool[0, 0, 0, 0, 0, 0, 0, 0, 1]"
10,nad1eU143,false,false,false,false,0.845098,0.942611,0.922639,0.851991,0.901762,"Bool[1, 1, 1, 1, 1]","Bool[0, 0, 0, 0, 0, 0, 0, 0, 1]"


In [26]:
# One row per partition of the sites. Column Bk holds a (gains, losses) tuple for index k
# of the trajectory (k = 1-8 are the tree branches, k = 9 is the ancestral state).
evolutionary_changes = DataFrame(
    :partition => String[],
    (Symbol(:B, k) => Tuple{Int, Int}[] for k in 1:9)...
)

"""
    total_gains_and_losses(partition::AbstractDataFrame)

Add up the per-index gains and losses over every row of `partition`.

Each row must have an `mp_trajectory` field, which is passed to `gains_and_losses`.
Returns a 9-element vector of `(gains, losses)` tuples, one per trajectory index. Any
data frame type is accepted, so views of a larger table can be passed without copying.
"""
function total_gains_and_losses(partition::AbstractDataFrame)
    total_gains = zeros(Int, 9)
    total_losses = zeros(Int, 9)
    for site in eachrow(partition)
        gains, losses = gains_and_losses(site.mp_trajectory)
        total_gains .+= gains
        total_losses .+= losses
    end
    # pair the totals element by element: [(gains[1], losses[1]), ...]
    return tuple.(total_gains, total_losses)
end

# Tally gains and losses for the whole data set and for each class of site.
# "sense" is every site that is neither synonymous, nor creates a start, nor removes a
# stop.
for (label, subset) in (
    ("all sites", sites),
    ("synonymous", sites[sites.synonymous, :]),
    ("sense", sites[.!(sites.synonymous .| sites.creates_start .| sites.removes_stop), :]),
    ("creates start", sites[sites.creates_start, :]),
    ("removes stop", sites[sites.removes_stop, :]),
)
    push!(evolutionary_changes, (label, total_gains_and_losses(subset)...))
end

evolutionary_changes


Row,partition,B1,B2,B3,B4,B5,B6,B7,B8,B9
Unnamed: 0_level_1,String,Tuple…,Tuple…,Tuple…,Tuple…,Tuple…,Tuple…,Tuple…,Tuple…,Tuple…
1,all sites,"(69, 29)","(23, 23)","(52, 16)","(73, 26)","(56, 31)","(111, 42)","(227, 136)","(312, 539)","(1948, 872)"
2,synonymous,"(52, 7)","(13, 12)","(31, 8)","(46, 8)","(19, 8)","(48, 6)","(13, 19)","(68, 40)","(103, 259)"
3,sense,"(17, 21)","(10, 11)","(21, 8)","(27, 18)","(33, 23)","(62, 32)","(180, 110)","(223, 408)","(1490, 553)"
4,creates start,"(0, 0)","(0, 0)","(0, 0)","(0, 0)","(0, 0)","(1, 0)","(0, 0)","(4, 2)","(16, 5)"
5,removes stop,"(0, 1)","(0, 0)","(0, 0)","(0, 0)","(4, 0)","(0, 4)","(34, 7)","(17, 89)","(339, 55)"


In [27]:
# Save the gain/loss table as a tab-separated file; the path is relative, so it lands in
# the current working directory.
evolutionary_changes |> CSV.write("$organelle gains&losses.tsv"; delim = '\t')

"mt gains&losses.tsv"