In [1]:
import tskit
import pyslim
import msprime
import dendropy
import glob
import numpy as np
import pandas as pd
import warnings
import functools
import argparse
import operator
import os


In [2]:
parser = argparse.ArgumentParser(description='Gets stats from unioned tree sequence')
parser.add_argument('rand_id', type=str)
parser.add_argument('rep', type=str)
parser.add_argument('win_size', type=lambda x: int(float(x)))
parser.add_argument('sample_size', type=int)
parser.add_argument('coords_dict', type=str, help="String of a dictionary with padded and non-padded start and ends of the chromosomic region. Assumes one chromosome only!")
parser.add_argument('--seed', type=int, default=8991, required=False)

_StoreAction(option_strings=['--seed'], dest='seed', nargs=None, const=None, default=8991, type=<class 'int'>, choices=None, help=None, metavar=None)

In [3]:
args = vars(parser.parse_args(["GDVOP9EMEV6ZLEG","0","1000000","10","{'chr': 'chr12', 'start': 60000000, 'end': 70000000, 'padded_start': 60000000, 'padded_end': 70000000}"]))
coords_dict = eval(args['coords_dict'])
out_path = "../../output/"

In [4]:
coords_dict = eval(args['coords_dict'])

In [5]:
# Loading tree sequence and list with populations
recap_mut_path = f"{out_path}{args['rand_id']}/{args['rand_id']}_rep{args['rep']}.union.recap.mut.trees"
pops_path = f"{out_path}{args['rand_id']}/{args['rand_id']}_rep{args['rep']}.pops"
assert os.path.exists(recap_mut_path) and os.path.exists(pops_path), f"Trees file or .pops file does not exist for {args['rand_id']}_{args['rep']}"
recap_tsu = pyslim.load(recap_mut_path)
with open(pops_path, "r") as f:
    pops = eval(f.readline())

rng = np.random.default_rng(args['seed'])

In [6]:
rng = np.random.default_rng(args['seed'])
# getting contemporary samples
# note the time of "contemporary" samples varies bc of differences in generation times
# TODO: sample individuals not nodes
pop_samples = [recap_tsu.samples(population_id=i+1) for i in range(len(pops))]
contemp_time = [np.min(recap_tsu.tables.nodes.time[samples]) for samples in pop_samples]
contemp_samples = [rng.choice(pop_samples[pid][recap_tsu.tables.nodes.time[pop_samples[pid]] == contemp_time[pid]], args["sample_size"], replace=False) for pid in range(len(pop_samples))]

In [17]:
sample_sets = [[0, 1, 2], [3, 4, 5], [6, 7, 8]]
n = np.array([len(s) for s in sample_sets], dtype='float')
num_pops = len(sample_sets)

twoway = [[x,y] for x in range(num_pops) for y in range(num_pops) if x>=y]
fourway = np.array([(twoway[xx] + twoway[yy]) for xx in range(len(twoway)) for yy in range(len(twoway)) if xx>=yy], dtype='int')

i, j, k, l = [fourway[:,x] for x in range(4)]


In [28]:
fourway.shape[0]

21

In [22]:
def f(x):
    numer = x[i] * (n[j] - x[j]) * x[k] * (n[k] - x[k])
    denom = n[i] * (n[j] - (i == j)) * n[k] * (n[l] - (k == l))
    return numer / denom

In [23]:
recap_tsu.sample_count_stat(sample_sets=sample_sets, f=f, output_dim=fourway.shape[0])

array([0.00046763, 0.00031513, 0.0002252 , 0.00024142, 0.00033789,
       0.00050683, 0.00031902, 0.0001718 , 0.00025771, 0.00022189,
       0.00024064, 0.00022478, 0.00033717, 0.00022124, 0.00022124,
       0.00023503, 0.0001625 , 0.00024374, 0.00033202, 0.00033202,
       0.00049803])

In [None]:
import collections


In [None]:
rank_counts = collections.Counter(t.rank() for t in tssimp.trees())

In [None]:
most_common = rank_counts.most_common(100)

In [None]:
most_common[0][0]

In [None]:
trees = [tskit.Tree.unrank(tssimp.num_samples, mc[0]) for mc in most_common]

In [None]:
tssimp.num_trees

In [None]:
from IPython.display import SVG, display, HTML
for i in range(10):
    print(most_common[i])
    display(SVG(trees[i].draw_svg(node_labels = node_labels)))

In [None]:
multi_tree_str = ""

In [None]:
for tree in trees:
    multi_tree_str += tree.as_newick(include_branch_lengths=False, node_labels = node_labels)
    multi_tree_str += "\n"

In [None]:
multi_tree_str

In [None]:
import toytree

In [None]:
mtre0 = toytree.mtree(multi_tree_str)

In [None]:
canvas, axes, mark = mtre0.draw_cloud_tree(
    edge_style={
        "stroke-opacity": 0.1,
        "stroke-width": 1,
    },
);

In [None]:
# pull out the trees and get a densiplot

In [None]:
# plot the residuals of those correlations against branch lengths/N or sth? that would help us there.