In [23]:
import msprime
import numpy as np
from itertools import combinations

In [25]:
def get_pi(haplotypes):
    ## If no seg sites in a pop then haplotypes will be 0 length
    if haplotypes.size == 0:
        return 0
    n = len(haplotypes[0])
    n_comparisons = float(n) * (n - 1) / 2

    pi = 0
    for hap in haplotypes:
        k = np.count_nonzero(hap)
        pi += float(k) * (n - k) / n_comparisons
    return(pi)


In [132]:
# Run `reps` number of simulations accumulating pi and pi* values then average over reps
def simulate(ss=10, Ne=1e4, length=5e2, reps=100):
    ts_pis = []
    my_pis = []
    uniq_pis = []
    resampled_pis = []
    nuniq = 0
    for i in range(reps):
        tree_sequence = msprime.simulate(sample_size=ss, Ne=Ne, length=length, mutation_rate=1e-7)
        ts_pis.append(tree_sequence.get_pairwise_diversity()/length)

        haps = list(tree_sequence.haplotypes())
        haps_t = np.transpose(np.array([list(map(int, list(x))) for x in haps]))
        my_pis.append(get_pi(haps_t)/length)

        haps = set(haps)
        haps_t = np.transpose(np.array([list(map(int, list(x))) for x in haps]))
        uniq_pis.append(get_pi(haps_t)/length)
        nuniq += len(haps)

        resampled_pis.append(resample(haps, length))

    print("  TS: {}".format(np.mean(ts_pis)))
    print("  My: {}".format(np.mean(my_pis)))
    print("  Uniq (avg# {}): {}".format(np.mean(nuniq),
                                        np.mean(uniq_pis)))
    print("  Resampled to <=5: {}".format(np.mean(resampled_pis)))

    return haps

# Ne 10000, 500bp loci
for ss in [5, 10, 25, 50]:
    print("Sample size {}".format(ss))
    _ = simulate(ss=ss, reps=100)

Sample size 5
  TS: 0.00344
  My: 0.00344
  Uniq (avg# 272.0): 0.005098
  Resampled to <=5: 0.00275946666667
Sample size 10
  TS: 0.00393111111111
  My: 0.00393111111111
  Uniq (avg# 405.0): 0.00604114285714
  Resampled to <=5: 0.00212506666667
Sample size 25
  TS: 0.0037844
  My: 0.0037844
  Uniq (avg# 542.0): 0.00623296825397
  Resampled to <=5: 0.00319906666667
Sample size 50
  TS: 0.00385696326531
  My: 0.00385696326531
  Uniq (avg# 696.0): 0.00642703318903
  Resampled to <=5: 0.00491266666667


In [129]:
def resample(haps, length=5e3, nsamps=50):
    totpi = 0
    minsamps = 5
    if len(haps) < minsamps:
        minsamps = len(haps) - 1
    if minsamps <=1:
        haps_t = np.transpose(np.array([list(map(int, list(x))) for x in haps]))
        return get_pi(haps_t)/length 
    for comb in list(combinations(haps, minsamps))[:10]:
        haps_t = np.transpose(np.array([list(map(int, list(x))) for x in comb]))
        totpi += get_pi(haps_t)/length
    ncombs = len(list(combinations(haps, minsamps)))
    return totpi/float(10)
haps = simulate(ss=50, reps=1)
resample(haps)

  TS: 0.00189551020408
  My: 0.00189551020408
  Uniq (avg# 5.0): 0.006
  Resampled to <=5: 0.0006


5.9999999999999995e-05

In [11]:
# Ne 10000, for 5000bp loci
for ss in [5, 10, 25, 50]:
    print("Sample size {}".format(ss))
    simulate(ss=ss, length=5e4, reps=100)

Sample size 5
  TS: 0.00398404
  My: 0.00398404
  Uniq: 0.00400111333333
Sample size 10
  TS: 0.00371806666667
  My: 0.00371806666667
  Uniq: 0.00374890269841
Sample size 25
  TS: 0.00428718133333
  My: 0.00428718133333
  Uniq: 0.00427695312015
Sample size 50
  TS: 0.00374426187755
  My: 0.00374426187755
  Uniq: 0.00376571732038


In [12]:
for ss in [5, 10, 25, 50]:
    print("Sample size {}".format(ss))
    simulate(ss=ss, Ne=1e5, length=5e3, reps=100)

Sample size 5
  TS: 0.038754
  My: 0.038754
  Uniq: 0.0389556
Sample size 10
  TS: 0.0421256888889
  My: 0.0421256888889
  Uniq: 0.0420385285714
Sample size 25
  TS: 0.0383534266667
  My: 0.0383534266667
  Uniq: 0.0382341022148
Sample size 50
  TS: 0.0396354677551
  My: 0.0396354677551
  Uniq: 0.0402361897382


In [13]:
for ss in [5, 10, 25, 50]:
    print("Sample size {}".format(ss))
    simulate(ss=ss, Ne=1e5, length=5e4, reps=100)

Sample size 5
  TS: 0.04143968
  My: 0.04143968
  Uniq: 0.04143968
Sample size 10
  TS: 0.0401874888889
  My: 0.0401874888889
  Uniq: 0.0401889688889
Sample size 25
  TS: 0.0405720106667
  My: 0.0405720106667
  Uniq: 0.0406147050804
Sample size 50
  TS: 0.0386323533061
  My: 0.0386323533061
  Uniq: 0.0386258339373
