In [1]:
import numpy as np
import scipy.stats as ss
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
# Simulate founder data: every genomic position of a sample carries that
# sample's "ancestry" label (5 classes, assigned round-robin by sample index).
n_classes=5 # number of distinct ancestry labels
l=100000 # genome length (kept as an int so it can be used directly as a size/index)
n=2500 # number of samples to start with
m=10000 # number of samples to generate (not referenced by the cells visible here)
# One label per sample, repeated across the whole genome. Building this with
# numpy avoids materialising n*l Python ints in nested lists first. int8 is
# enough for labels 0..n_classes-1 and keeps the matrix at n*l bytes.
labels=(np.arange(n) % n_classes).astype(np.int8)
data=np.repeat(labels[:, np.newaxis], l, axis=1)
data.shape

(2500, 100000)

In [5]:
# Generate admixed pseudo-individuals by stitching founder genomes together.
# The number of breakpoints per individual is 2 + Pois(2.86 * generation_time),
# for generation times 1..maxgen inclusive; the +2 offset means every
# individual has at least two breakpoints (i.e. at least three founders).
maxgen=10
n_new=1000 # total number of individuals to generate, split evenly over generations
genome_len=int(l) # l may be a float (e.g. 1e5); sizes and indices must be ints
rng=np.random.RandomState(0) # fixed seed so Restart & Run All reproduces the same samples
# range(1, maxgen+1) so that generation `maxgen` itself is included
n_splits=2+np.hstack([
    ss.poisson.rvs(2.86*gen, size=n_new//maxgen, random_state=rng)
    for gen in range(1, maxgen+1)
])
# new individuals
new=[]
for j in n_splits:
    # sample breakpoints uniformly along the genome (Beta(1,1) is Uniform(0,1))
    breaks=np.sort(genome_len * ss.beta.rvs(a=1, b=1, size=j, random_state=rng)).astype(int)
    # pick founders uniformly at random without replacement, one per segment
    founds=rng.choice(n, size=j+1, replace=False)
    # segment k spans edges[k]:edges[k+1] and is copied from founds[k];
    # with j breakpoints this gives exactly j+1 segments covering [0, genome_len)
    edges=np.concatenate(([0], breaks, [genome_len]))
    new.append(np.concatenate([
        data[found, start:stop]
        for found, start, stop in zip(founds, edges[:-1], edges[1:])
    ]))
# done!
# Stack onto the first n (founder) rows only, so that re-running this cell
# replaces the generated samples instead of appending another batch.
data=np.vstack((data[:n], np.array(new)))

In [None]:
# plot ancestry fractions
# The first n rows are the founders and the remaining rows are the generated
# samples, so the split point is n rather than a hard-coded row number.
# Founders are sorted down the sample axis (axis=0), which groups rows with the
# same label together; generated samples are sorted along the genome (axis=1),
# so the width of each colour band in a row is that sample's ancestry fraction.
plt.figure(figsize=(8,8))
plt.imshow(
    np.vstack((
        np.sort(data[:n,:], axis=0),
        np.sort(data[n:,:], axis=1),
    )),
    aspect='auto',
)
plt.xlabel('ancestry fraction'); plt.ylabel('sample');

In [None]:
# plot "local ancestry"
# Show a 30-row window straddling the founder/generated boundary at row n:
# the last 5 founders followed by the first 25 generated samples. Indexing
# relative to n keeps the window on the boundary when n changes.
plt.figure(figsize=(12,4))
plt.imshow(data[n-5:n+25,:], aspect='auto')
plt.xlabel('genomic position'); plt.ylabel('sample');

In [None]:
# Histogram of the per-individual breakpoint counts drawn for the generated
# samples; the bin count is half the number of distinct values observed.
plt.hist(
    n_splits,
    bins=np.unique(n_splits).size//2,
)
plt.xlabel('switch count')
plt.ylabel('number of generated samples');