## define additional demographic models in `stdpopsim` model format

To-do:
- integrate into stdpopsim github repository

In [None]:
class TennessenTwoPopOutOfAfrica(models.Model):
    """
    The model is derived from the Tennesen et al.
    `analysis <https://doi.org/10.1126/science.1219240>`_  of the jSFS from
    European Americans and African Americans.

    Model parameters are taken from Fig. S5 in
    `Fu et al. (2013) <https://doi.org/10.1038/nature11690>`_.

    .. todo:: document this model, including the original publications
        and clear information about what the different population indexes
        mean.

    """

    def __init__(self):
        super().__init__()

        generation_time = 25
        T_AF = 148e3 / generation_time
        T_OOA = 51e3 / generation_time
        T_EU0 = 23e3 / generation_time
        T_EG = 5115 / generation_time

        # Growth rates
        r_EU0 = 0.00307
        r_EU = 0.0195
        r_AF = 0.0166

        # population sizes
        N_A = 7310
        N_AF = 14474
        N_B = 1861
        N_EU0 = 1032
        N_EU1 = N_EU0 / math.exp(-r_EU0 * (T_EU0-T_EG))

        # migration rates
        m_AF_B = 15e-5
        m_AF_EU = 2.5e-5

        # present Ne
        N_EU = N_EU1 / math.exp(-r_EU * T_EG)
        N_AF = N_AF / math.exp(-r_AF * T_EG)

        self.population_configurations = [
            msprime.PopulationConfiguration(initial_size=N_AF, growth_rate=r_AF),
            msprime.PopulationConfiguration(initial_size=N_EU, growth_rate=r_EU)
        ]

        self.migration_matrix = [
            [0, 0],
            [0, 0],
        ]

        self.demographic_events = [
            msprime.MigrationRateChange(
                time=T_EG, rate=m_AF_EU, matrix_index=(0, 1)),
            msprime.MigrationRateChange(
                time=T_EG, rate=m_AF_EU, matrix_index=(1, 0)),
            msprime.PopulationParametersChange(
                time=T_EG, growth_rate=r_EU0, initial_size=N_EU1, population_id=1),
            msprime.PopulationParametersChange(
                time=T_EG, growth_rate=0, initial_size=N_AF, population_id=0),
            msprime.MigrationRateChange(
                time=T_EU0, rate=m_AF_B, matrix_index=(0, 1)),
            msprime.MigrationRateChange(
                time=T_EU0, rate=m_AF_B, matrix_index=(1, 0)),
            msprime.PopulationParametersChange(
                time=T_EU0, initial_size=N_B, growth_rate=0, population_id=1),
            msprime.MassMigration(
                time=T_OOA, source=1, destination=0, proportion=1.0),
            msprime.PopulationParametersChange(
                time=T_AF, initial_size=N_A, population_id=0)
        ]

In [None]:
def no_mnms(ts, model_label, output_dir="./", rep_label=0):
    prefix = model_label + "_" + str(rep_label)
    
    geno = np.zeros(200, dtype=np.int8)
#     eur_geno = np.zeros(100, dtype=np.int8)
#     afr_geno = np.zeros(100, dtype=np.int8)

    mnm_dict = {}
    with open(output_dir + prefix + ".snp", "w") as text_file:
        for variant in list(ts.variants()):
            print("\t".join(["1:"+str(round(variant.site.position)), 
                             "1", 
                             str(variant.site.position/10e6), 
                             str(round(variant.site.position)), 
                             "A", 
                             "G"]), file=text_file)
            geno = np.vstack([geno, variant.genotypes])

    # delete empty first row used to initialize genotype matrix            
    geno = np.delete(geno, (0), axis=0)
    
    geno_afr = geno[:,:100]
    geno_eur = geno[:,100:200]
    
    geno_pop = [geno_afr, geno_eur]
    
    for i, pop in enumerate(["afr", "eur"]):
        np.savetxt(output_dir + prefix + "_" + pop + ".geno", geno_pop[i], delimiter="", fmt='%i')

    return prefix

In [None]:
def add_mnms(ts, model_label, output_dir="./", rep_label=0, mnm_dist=100, mnm_frac=0.015):
    
    # .snp file names are underscore-delimited with model, population MNM parameters (dash-delimited with distance & fraction), and replicate label
    # .geno file names add the population label
#     prefix = model_label+"_"+pop_label+"_mnm"+str(mnm_dist)+"-"+str(mnm_frac)+"_"+str(rep_label)
    prefix = model_label+"_"+str(rep_label)+"_mnm"+str(mnm_dist)+"-"+str(mnm_frac)
    
    geno = np.zeros(200, dtype=np.int8)

    mnm_dict = {}
    with open(output_dir + prefix + ".snp", "w") as text_file:
        for variant in list(ts.variants()):
            print("\t".join(["1:"+str(round(variant.site.position)), 
                             "1", 
                             str(variant.site.position/10e6), 
                             str(round(variant.site.position)), 
                             "A", 
                             "G"]), file=text_file)
            geno = np.vstack([geno, variant.genotypes])
            
            random.seed(variant.site.position)
            if random.random() < mnm_frac:

                dist = random.randint(1,mnm_dist)
                mnm_cand = variant.site.position+dist 
                mnm_cand_r = str(round(mnm_cand))
                mnm_dict[mnm_cand_r] = [sum(variant.genotypes[0:100])/100, # allele freq in AFR
                                        sum(variant.genotypes[100:200])/100, # allele freq in EUR
                                        ts.genotype_matrix().shape[0]] # total number of variants before MNMs
                print("\t".join(["1:"+mnm_cand_r, 
                                 "1", 
                                 str(mnm_cand/10e6), 
                                 mnm_cand_r, 
                                 "A", 
                                 "G"]), file=text_file)
                geno = np.vstack([geno, variant.genotypes])

    # output MNM data
    mnm_df = pd.DataFrame(mnm_dict).T
    mnm_df.index.name = "MNM_POS"
    mnm_df = mnm_df.rename(columns={0: "AF_AFR", 
                                    1: "AF_EUR", 
                                    2: "N_TOT"})
    mnm_df.to_csv(output_dir + prefix + ".mnms", 
                  index=True,
                  sep="\t")
    
    # delete empty first row used to initialize genotype matrix            
    geno = np.delete(geno, (0), axis=0)
    
    geno_afr = geno[:,:100]
    geno_eur = geno[:,100:200]
    
    geno_pop = [geno_afr, geno_eur]
    
    for i, pop in enumerate(["afr", "eur"]):
        np.savetxt(output_dir + prefix + "_" + pop + ".geno", geno_pop[i], delimiter="", fmt='%i')
        
    return prefix

## Add MNMs

Loop through models/replicates and randomly generate MNMs.

Outputs the .mnm, .snp, and .geno files to `output/msprime/` then passes these as input to the `calc_stats_window_data.py` script from `ArchIE` to calculate the per-sample summary stats that act as features in the `ArchIE` inference method. These output files are then concatenated and passed as the test data to `ArchIE/train/train.R` to predict the fraction of archaic ancestry in each simulated sample.


In [None]:
# MNM simulation parameters
mnm_dist = 100
mnm_frac = 0.015

# change the run_archie flag to True to run ArchIE on the simulated data
run_archie = False

for model_label, model in model_dict.items():
    
    print("Simulating MNMs on " + model_label)
    
    for j, ts in enumerate(model):
        
        prefix = model_label + \
            "_mnm" + str(mnm_dist) + "-" + str(mnm_frac) + "_"
        
        no_mnm_prefix = no_mnms(ts,
               output_dir=msprime_dir,
               model_label=model_label,
               rep_label=str(j))
        
        # add MNMs and write out .snp and .geno files
        mnm_prefix = add_mnms(ts, 
                 output_dir=msprime_dir, 
                 model_label=model_label, 
                 mnm_dist=mnm_dist, 
                 mnm_frac=mnm_frac, 
                 rep_label=str(j))
        
        for prefix in [no_mnm_prefix, mnm_prefix]:
            for pop in ["afr", "eur"]:
                # write out separate .ind files per population.
                # columns indicate sample ID, sex (set as 'U'), and label (set as 'ADMIXED')
                ind_file = msprime_dir + prefix + pop + "_ADMIXED.ind"
                with open(ind_file, "w") as id_file:
                    for sample_id in range(0,100):
                        sample_name = model_label + \
                            "_mnm" + str(mnm_dist) + "-" + str(mnm_frac) + \
                            "_model_" + str(j) + "_" + pop + \
                            "_sample_" + str(sample_id)
                        print(sample_name + "\tU\tADMIXED",  file=id_file)

                # run archie on AFR and EUR simulated data
                if run_archie:
                    if pop == "afr":
                        ref_pop = "eur"
                    else:
                        ref_pop = "afr"
                    stats_pop_cmd = "python " + archie_src_dir + "data/calc_stats_window_data.py" + \
                        " -s " + msprime_dir + prefix + str(j) + ".snp" + \
                        " -i " + output_dir + prefix + pop + "_ADMIXED.ind" + \
                        " -a " + msprime_dir + prefix + str(j) + "_" + pop + ".geno" + \
                        " -r " + msprime_dir + prefix + str(j) + "_" + ref_pop + ".geno" + \
                        " -c 1 -b 0 -e 50000 -w 50000 -z 50000 " + \
                        " > " + archie_out_dir + prefix + str(j) + "_" + pop + ".txt"  
            #         print(stats_pop_cmd)
                    os.system(stats_pop_cmd)

            # combine .ind files into single file, for ANCESTRYMAP -> VCF format conversion
            cat_cmd = "cat " + msprime_dir + prefix + "afr_ADMIXED.ind " + \
                msprime_dir + prefix + "eur_ADMIXED.ind > " + msprime_dir + prefix + "combined.ind"
            os.system(cat_cmd)
        
        # add code for converting .snp/.geno/.ind data to 1 VCF per replicate
        # ...
        # ...
    
    # once analyses of all replicates for a given model are complete, combine and clean up ArchIE output
    if run_archie:
        for pop in ["afr", "eur"]:
            cat_pop_cmd = "cat " + \
                archie_out_dir + "*" + pop + ".txt" + \
                " > " + archie_out_dir + "combined/" + prefix + pop + ".txt" 
            print(cat_pop_cmd)
            os.system(cat_pop_cmd)

        # remove ArchIE output for individual replicates
        clean_cmd = "rm " + archie_out_dir + "*"
        print(clean_cmd)
        os.system(clean_cmd)

In [None]:
# ts_rep = msprime.simulate(
#     # first 100 samples from AFR, next 100 from EUR
#     samples=[msprime.Sample(0, 0)]*sample_size + [msprime.Sample(1, 0)]*sample_size,
#     length=length, 
#     mutation_rate=mu, 
#     recombination_rate=rr,
#     random_seed=seed,
#     num_replicates=replicates,
#     **model.asdict())

# ts_rep2 = msprime.simulate(
#     # first 100 samples from AFR, next 100 from EUR
#     samples=[msprime.Sample(0, 0)]*sample_size + [msprime.Sample(1, 0)]*sample_size,
#     length=length, 
#     mutation_rate=mu, 
#     recombination_rate=rr,
#     random_seed=40,
#     num_replicates=replicates,
#     **model.asdict())

# ts_list = [ts_rep, ts_rep2]

# for ts_repi in ts_list:
#     print(ts_repi)

In [20]:
# sum([0,1,2,3][2:4])/10
random.seed(10)

AttributeError: 'builtin_function_or_method' object has no attribute 'seed'

In [31]:
#         cat_afr_cmd = "cat " + archie_out_dir + "*afr.txt" + " > " + archie_out_dir + "combined/" + prefix + "afr.txt" 
#         print(cat_afr_cmd)
#         os.system(cat_afr_cmd)

#         cat_eur_cmd = "cat " + archie_out_dir + "*eur.txt" + " > " + archie_out_dir + "combined/" + prefix + "eur.txt" 
#         print(cat_eur_cmd)
#         os.system(cat_eur_cmd)

#         clean_cmd = "rm " + archie_out_dir + "*"
#         print(clean_cmd)
#         os.system(clean_cmd)

cat /mnt/norbert/home/jedidiah/projects/archanc/output/ArchIE/*afr.txt > /mnt/norbert/home/jedidiah/projects/archanc/output/ArchIE/combined/GutenkunstThreePop_mnm100-0.015_afr.txt
cat /mnt/norbert/home/jedidiah/projects/archanc/output/ArchIE/*eur.txt > /mnt/norbert/home/jedidiah/projects/archanc/output/ArchIE/combined/GutenkunstThreePop_mnm100-0.015_eur.txt
rm /mnt/norbert/home/jedidiah/projects/archanc/output/ArchIE/*


256

# Sandbox

In [None]:
# ts_afr = msprime.simulate(
#     samples=[msprime.Sample(0, 0)]*100,
#     length=50000, mutation_rate=1.25e-8, random_seed=30, recombination_rate=1e-8,
#     **model.asdict())

# ts_eur = msprime.simulate(
#     samples=[msprime.Sample(1, 0)]*100,
#     length=50000, mutation_rate=1.25e-8, random_seed=30, recombination_rate=1e-8,
#     **model.asdict())

# tree=ts.first()
print(ts_afr.first().draw(format="unicode"))

In [None]:
# # simulate MNMs in AFR
# afr_geno = np.zeros(100)
# # A_new = np.array([])
# # for variant in list(itertools.islice(ts_eur.variants(), 200)):
# with open("afr_gutenkunst_mnm_100.snp", "w") as text_file:
#     for variant in list(ts_eur.variants()):
#         print("\t".join(["1:"+str(round(variant.site.position)), "1", str(variant.site.position/10e6), str(round(variant.site.position)), "A", "G"]), file=text_file)
#         afr_geno = np.vstack([afr_geno, variant.genotypes])
#     #     variant.genotypes, 
#         if random() < 0.015:
#             dist = randint(1,100)
#             mnm_cand = variant.site.position+dist 

#             print("\t".join(["1:"+str(round(mnm_cand))+"*", "1", str(mnm_cand/10e6), str(round(mnm_cand)), "A", "G"]), file=text_file)
#             afr_geno = np.vstack([afr_geno, variant.genotypes])
            
# # delete empty first row used to initialize genotype matrix            
# afr_geno = np.delete(afr_geno, (0), axis=0)
# np.savetxt("afr_gutenkunst_mnm_100.geno", afr_geno, delimiter="\t")
    
# simulate MNMs in EUR
eur_geno = np.zeros(10, dtype=np.int8)
# A_new = np.array([])
# for variant in list(itertools.islice(ts_eur.variants(), 200)):
# with open("eur_gutenkunst_mnm_100.snp", "w") as text_file:
for variant in list(ts_eur.variants()):
#         print("\t".join(["1:"+str(round(variant.site.position)), "1", str(variant.site.position/10e6), str(round(variant.site.position)), "A", "G"]), file=text_file)
    eur_geno = np.vstack([eur_geno, variant.genotypes])
#     variant.genotypes, 
    if random() < 0.015:
        dist = randint(1,100)
        mnm_cand = variant.site.position+dist 

#             print("\t".join(["1:"+str(round(mnm_cand))+"*", "1", str(mnm_cand/10e6), str(round(mnm_cand)), "A", "G"]), file=text_file)
        eur_geno = np.vstack([eur_geno, variant.genotypes])

# delete empty first row used to initialize genotype matrix
eur_geno = np.delete(eur_geno, (0), axis=0)
np.savetxt("eur_gutenkunst_mnm_100.geno", eur_geno, delimiter="\t")

In [25]:
eur_geno = np.empty([10, 20], dtype=np.int8)
eur_geno[:,10:20]

array([[1, 1, 0, 1, 0, 1, 0, 0, 1, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 1, 0, 0, 1, 1, 1, 0, 1],
       [0, 1, 1, 1, 1, 0, 1, 0, 0, 0],
       [1, 1, 1, 0, 1, 0, 1, 0, 1, 0],
       [0, 0, 1, 0, 0, 1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 1, 0, 1],
       [0, 1, 1, 0, 1, 0, 0, 1, 0, 0],
       [1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0]], dtype=int8)

In [80]:
test_arr = np.array([[  0, -11,  -2, -36,  -9,  85,   0,   0,   1,   1],
       [  1,   1,   1,   0,   0,   1,   1,   0,   0,   0],
       [  0,   1,   0,   0,   1,   0,   0,   0,   0,   0],
       [  0,   0,   1,   0,   0,   0,   1,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,   1,   0,   0,   1],
       [  0,   0,   1,   0,   0,   0,   0,   1,   0,   1],
       [  0,   0,   0,   0,   0,   0,   1,   0,   0,   0],
       [  0,   1,   1,   0,   1,   0,   1,   0,   1,   1],
       [  1,   1,   1,   1,   0,   0,   1,   0,   0,   0]], dtype="int8")

In [83]:
test_arr.shape[0]

9

In [None]:
# ts_rep = msprime.simulate(
# #         samples=[msprime.Sample(i, 0)]*100,
#     samples=[msprime.Sample(gp, 0) for gp in range(2)]*100,
#     length=50000, 
#     mutation_rate=1.25e-8, 
#     recombination_rate=1e-8,
#     random_seed=30,
#     num_replicates=5,
#     **model.asdict())

# # np.repeat([msprime.Sample(gp, 0) for gp in range(2)], 100)

In [None]:
# for ts in ts_rep:
#     print(ts.genotype_matrix().shape)
# #     for node in ts.nodes():
# # #         print(1)
# # #         print(node.flags())
# #         if node.population==1:
# #             print(node)

In [None]:
for node in ts_eur.nodes():
    print(node)

In [None]:
ts_afr_rep = msprime.simulate(
    samples=[msprime.Sample(0, 0)]*100,
    length=50000, 
    mutation_rate=1.25e-8, 
    recombination_rate=1e-8,
    random_seed=30,
    num_replicates=5,
    **model.asdict())

for j, ts in enumerate(ts_afr_rep):
    add_mnms(ts, "gutenkunst_rep"+str(j), "afr", 100, 0.015)

    
# ts_eur_rep = msprime.simulate(
#     samples=[msprime.Sample(1, 0)]*100,
#     length=50000, 
#     mutation_rate=1.25e-8, 
#     recombination_rate=1e-8,
#     random_seed=30,
#     num_replicates=5,
#     **model.asdict())

## tweaking code from ArchIE

In [None]:
ref_geno = []
with open("/mnt/norbert/home/jedidiah/projects/primeval/gutenkunst_eur_mnm100-0.015_0.geno", 'r') as f:
        for line in f:
            ll = list(line)[:-1]
            li = [int(i) for i in ll]
            ref_geno.append(li)


In [None]:
len(ref_geno[1])

In [None]:
geno = []
with open("/mnt/norbert/home/jedidiah/projects/primeval/gutenkunst_afr_mnm100-0.015_0.geno", 'r') as f:
        for line in f:
            ll = list(line)[:-1]
            li = [int(i) for i in ll]
            geno.append(li)

In [None]:
import sklearn.metrics.pairwise
f = np.array(geno)[:,1]
t_r = np.transpose(np.array(ref_geno))
d = []
for r in t_r:
    r_np = np.array(r)
    arr = np.array([f, r_np])
    d.append(np.max(sklearn.metrics.pairwise.pairwise_distances(arr)))


In [None]:
arr

## Playing with recombination breakpoints

In [None]:
print(list(ts.breakpoints())[1:10])

In [None]:
from bisect import bisect_left

def takeClosest(myList, myNumber):
    """
    Assumes myList is sorted. Returns closest value to myNumber.

    If two numbers are equally close, return the smallest number.
    """
    pos = bisect_left(myList, myNumber)
    if pos == 0:
        return myList[0]
    if pos == len(myList):
        return myList[-1]
    before = myList[pos - 1]
    after = myList[pos]
    return after
#     if after - myNumber < myNumber - before:
#        return after
#     else:
#        return before

In [None]:
takeClosest(list(ts.breakpoints()), 2813)

In [7]:
import math
1032 / math.exp(-0.00307 * 23000/25)

17390.058059971307