# Simulations for evaluating archaic ancestry inference

## Set paths and import libraries

In [None]:
# All paths are derived from the current working directory, so this notebook
# has to be launched from the archanc root directory.
import os

proj_dir = os.getcwd()

# Every directory string ends with "/" because later cells build file paths
# by plain string concatenation.
msprime_dir, archie_src_dir, archie_out_dir = (
    proj_dir + subdir
    for subdir in ("/output/msprime/", "/src/ArchIE/", "/output/ArchIE/")
)

In [None]:
import sys
sys.path.insert(0, proj_dir + '/src/stdpopsim')

from stdpopsim import homo_sapiens, models
import msprime
import itertools
import random
import numpy as np
import pandas as pd
import math

## Define additional demographic models in `stdpopsim` model format

To-do:
- integrate into stdpopsim github repository

In [None]:
class TennessenTwoPopOutOfAfrica(models.Model):
    """
    Two-population out-of-Africa model with recent explosive growth.

    The model is derived from the Tennessen et al.
    `analysis <https://doi.org/10.1126/science.1219240>`_  of the jSFS from
    European Americans and African Americans.

    Model parameters are taken from Fig. S5 in
    `Fu et al. (2013) <https://doi.org/10.1038/nature11690>`_.

    Population index 0 is the African population (``AF``) and index 1 is the
    European population (``EU``). Going backwards in time the model has the
    following epochs:

    * present to ``T_EG``: both populations grow exponentially, no migration;
    * ``T_EG`` to ``T_EU0``: the African population is constant, the European
      population grows at the slower rate ``r_EU0``, symmetric migration at
      rate ``m_AF_EU``;
    * ``T_EU0`` to ``T_OOA``: the European population is the constant-size
      bottleneck population ``B``, symmetric migration at rate ``m_AF_B``;
    * ``T_OOA``: population 1 merges into population 0;
    * ``T_AF``: the single remaining population changes to the ancestral
      size ``N_A``.

    .. todo:: document this model, including the original publications.

    """

    def __init__(self):
        super().__init__()

        # Event times, converted from years to generations.
        generation_time = 25
        T_AF = 148e3 / generation_time
        T_OOA = 51e3 / generation_time
        T_EU0 = 23e3 / generation_time
        T_EG = 5115 / generation_time

        # Growth rates
        r_EU0 = 0.00307
        r_EU = 0.0195
        r_AF = 0.0166

        # population sizes
        N_A = 7310
        # African size before the recent growth phase. This needs its own
        # name: it is used both to derive the present-day size and to reset
        # the population at T_EG.
        N_AF0 = 14474
        N_B = 1861
        N_EU0 = 1032
        # European size at T_EG, i.e. after growing at r_EU0 since T_EU0.
        N_EU1 = N_EU0 / math.exp(-r_EU0 * (T_EU0-T_EG))

        # migration rates
        m_AF_B = 15e-5
        m_AF_EU = 2.5e-5

        # present Ne (sizes at time 0 after growing since T_EG)
        N_EU = N_EU1 / math.exp(-r_EU * T_EG)
        N_AF = N_AF0 / math.exp(-r_AF * T_EG)

        self.population_configurations = [
            msprime.PopulationConfiguration(initial_size=N_AF, growth_rate=r_AF),
            msprime.PopulationConfiguration(initial_size=N_EU, growth_rate=r_EU)
        ]

        self.migration_matrix = [
            [0, 0],
            [0, 0],
        ]

        # Events are listed in non-decreasing time order (backwards in time).
        self.demographic_events = [
            msprime.MigrationRateChange(
                time=T_EG, rate=m_AF_EU, matrix_index=(0, 1)),
            msprime.MigrationRateChange(
                time=T_EG, rate=m_AF_EU, matrix_index=(1, 0)),
            msprime.PopulationParametersChange(
                time=T_EG, growth_rate=r_EU0, initial_size=N_EU1, population_id=1),
            # Before T_EG the African population is constant at its
            # pre-growth size, not at the present-day size.
            msprime.PopulationParametersChange(
                time=T_EG, growth_rate=0, initial_size=N_AF0, population_id=0),
            msprime.MigrationRateChange(
                time=T_EU0, rate=m_AF_B, matrix_index=(0, 1)),
            msprime.MigrationRateChange(
                time=T_EU0, rate=m_AF_B, matrix_index=(1, 0)),
            msprime.PopulationParametersChange(
                time=T_EU0, initial_size=N_B, growth_rate=0, population_id=1),
            msprime.MassMigration(
                time=T_OOA, source=1, destination=0, proportion=1.0),
            # Population 1 no longer exists before the split, so switch all
            # migration off; otherwise lineages keep moving into it.
            msprime.MigrationRateChange(time=T_OOA, rate=0),
            msprime.PopulationParametersChange(
                time=T_AF, initial_size=N_A, population_id=0)
        ]

## Function for simulating MNMs on tree

Randomly select a fraction of variant sites to be multi-nucleotide mutations (MNMs). The second mutation in each MNM is placed at a random distance of 1–100 bp downstream of the first, and the genotypes of the first mutation are duplicated for it.

To-do: split the MNM simulation and the .snp/.geno file generation into separate functions.

In [None]:
def _snp_record(position):
    """Format one tab-delimited .snp line (with newline) for a site on chromosome 1."""
    rounded = str(round(position))
    return "\t".join(["1:" + rounded,
                      "1",
                      # NOTE(review): 10e6 is 1e7; confirm this is the intended
                      # scale for the genetic-position column.
                      str(position/10e6),
                      rounded,
                      "A",
                      "G"]) + "\n"


def add_mnms(ts, model_label, output_dir="./", rep_label=0, mnm_dist=100, mnm_frac=0.015,
             samples_per_pop=100):
    """
    Add simulated MNMs to a tree sequence's variants and write the result.

    Every variant in ``ts`` is written as one SNP. With probability
    ``mnm_frac`` a variant also gets a second site placed 1 to ``mnm_dist``
    bp downstream that carries a copy of the same genotypes. The draw is
    seeded with the variant position, so it is reproducible, and it uses a
    private generator so the global ``random`` state is left untouched.

    Files written, all named ``<model_label>_mnm<mnm_dist>-<mnm_frac>_<rep_label>``
    plus a suffix, inside ``output_dir``:

    * ``.snp``  -- one line per site. An MNM site is written directly after
      its parent SNP, so lines are not guaranteed to be sorted by position.
    * ``.mnms`` -- one row per MNM position with the parent allele frequency
      in each population (``AF_AFR``, ``AF_EUR``) and the number of sites in
      ``ts`` before MNMs were added (``N_TOT``).
    * ``_afr.geno`` / ``_eur.geno`` -- one row per site, one undelimited
      digit per sample.

    :param ts: tree sequence whose samples are ordered as ``samples_per_pop``
        AFR samples followed by ``samples_per_pop`` EUR samples.
    :param model_label: model name used in the output file names.
    :param output_dir: directory that receives the output files.
    :param rep_label: replicate label used in the output file names.
    :param mnm_dist: maximum distance (bp) between the two sites of an MNM.
    :param mnm_frac: probability that a variant is turned into an MNM.
    :param samples_per_pop: number of samples in each of the two populations.
    :raises ValueError: if a variant does not have exactly
        ``2 * samples_per_pop`` genotypes.
    """
    # .snp file names are underscore-delimited with model, MNM parameters
    # (dash-delimited distance & fraction), and replicate label;
    # .geno file names add the population label
    prefix = model_label+"_mnm"+str(mnm_dist)+"-"+str(mnm_frac)+"_"+str(rep_label)
    out_base = os.path.join(output_dir, prefix)

    n_cols = 2 * samples_per_pop
    # total number of variants before MNMs; same for every MNM row
    n_sites = ts.num_sites

    geno_rows = []
    mnm_records = {}
    with open(out_base + ".snp", "w") as snp_file:
        for variant in ts.variants():
            position = variant.site.position
            # Copy so the stored row cannot change if the iterator reuses
            # its genotype buffer.
            genotypes = np.array(variant.genotypes)
            if genotypes.shape[0] != n_cols:
                raise ValueError(
                    "expected " + str(n_cols) + " samples per variant, got "
                    + str(genotypes.shape[0]))

            snp_file.write(_snp_record(position))
            geno_rows.append(genotypes)

            # Position-seeded private generator: same draws as seeding the
            # global generator, without the side effect.
            rng = random.Random(position)
            if rng.random() < mnm_frac:
                mnm_pos = position + rng.randint(1, mnm_dist)
                mnm_records[str(round(mnm_pos))] = [
                    genotypes[:samples_per_pop].sum() / samples_per_pop,  # allele freq in AFR
                    genotypes[samples_per_pop:n_cols].sum() / samples_per_pop,  # allele freq in EUR
                    n_sites]
                snp_file.write(_snp_record(mnm_pos))
                geno_rows.append(genotypes)

    # output MNM data; naming the columns up front keeps the header intact
    # when no MNM was drawn, and the float cast keeps the number formatting
    # of the previous output (N_TOT written as e.g. "2.0").
    mnm_df = pd.DataFrame.from_dict(mnm_records,
                                    orient="index",
                                    columns=["AF_AFR", "AF_EUR", "N_TOT"]).astype(float)
    mnm_df.index.name = "MNM_POS"
    mnm_df.to_csv(out_base + ".mnms",
                  index=True,
                  sep="\t")

    # Stack once at the end; an empty replicate yields empty .geno files
    # instead of an indexing error.
    if geno_rows:
        geno = np.vstack(geno_rows)
    else:
        geno = np.empty((0, n_cols), dtype=np.int8)

    geno_by_pop = (("afr", geno[:, :samples_per_pop]),
                   ("eur", geno[:, samples_per_pop:n_cols]))
    for pop, pop_geno in geno_by_pop:
        np.savetxt(out_base + "_" + pop + ".geno", pop_geno, delimiter="", fmt='%i')

## Simulate models

Simulate 200 samples (100 each of European and African ancestry) under each of the specified models (with 1000 replicates each)

In [None]:
# Coalescent simulation parameters, shared by every model simulated below.
sample_size = 100 # each
length = 50000
mu = 1.15e-8
rr = 1e-8
replicates = 1000
seed = 30


def _simulate_afr_eur(model):
    """Run ``msprime.simulate`` for ``model`` with the shared settings above.

    The sample list holds ``sample_size`` samples from population 0 (AFR)
    followed by ``sample_size`` samples from population 1 (EUR), all taken
    at time 0. The demographic parameters come from ``model.asdict()``.
    """
    afr_samples = [msprime.Sample(0, 0)] * sample_size
    eur_samples = [msprime.Sample(1, 0)] * sample_size
    return msprime.simulate(
        samples=afr_samples + eur_samples,
        length=length,
        mutation_rate=mu,
        recombination_rate=rr,
        random_seed=seed,
        num_replicates=replicates,
        **model.asdict())


# Gutenkunst 3-population model
GutenkunstThreePop_model = homo_sapiens.GutenkunstThreePopOutOfAfrica()
GutenkunstThreePop_ts = _simulate_afr_eur(GutenkunstThreePop_model)

# Tennessen 2-population model
TennessenTwoPop_model = TennessenTwoPopOutOfAfrica()
TennessenTwoPop_ts = _simulate_afr_eur(TennessenTwoPop_model)

#-------------------------------------------------------
# define other models here and add to model_dict below
#-------------------------------------------------------

# Placeholder: modify demographic parameters to include archaic branches
# GutenkunstThreePopArchaic_model = homo_sapiens.GutenkunstThreePopArchaic()
# GutenkunstThreePopArchaic_ts = _simulate_afr_eur(GutenkunstThreePopArchaic_model)

# Maps each model label (used in output file names) to its simulated replicates.
model_dict = {
    "GutenkunstThreePop": GutenkunstThreePop_ts,
    "TennessenTwoPop": TennessenTwoPop_ts,
}

## Add MNMs

Loop through models/replicates and randomly generate MNMs.

Outputs the .mnms, .snp, and .geno files to `output/msprime/`, then passes the .snp and .geno files (together with the generated .ind files) as input to the `calc_stats_window_data.py` script from `ArchIE` to calculate the per-sample summary statistics that act as features in the `ArchIE` inference method. The resulting output files are then concatenated and passed as the test data to `ArchIE/train/train.R` to predict the fraction of archaic ancestry in each simulated sample.


In [None]:
import glob
import shutil
import subprocess

# MNM simulation parameters
mnm_dist = 100
mnm_frac = 0.015

# change the run_archie flag to True to run ArchIE on the simulated data
run_archie = False

for model_label, model in model_dict.items():

    print("Simulating MNMs on " + model_label)

    # Shared file-name prefix for this model. It does not depend on the
    # replicate, and binding it here keeps it defined for the post-loop
    # steps even if the model yields no replicates.
    prefix = model_label + "_mnm" + str(mnm_dist) + "-" + str(mnm_frac) + "_"

    for j, ts in enumerate(model):

        # add MNMs and write out .snp and .geno files
        add_mnms(ts,
                 output_dir=msprime_dir,
                 model_label=model_label,
                 mnm_dist=mnm_dist,
                 mnm_frac=mnm_frac,
                 rep_label=str(j))

        for pop in ["afr", "eur"]:

            # write out separate .ind files per population
            # (the file name has no replicate index, so each replicate
            # overwrites the previous one; only the sample IDs carry j)
            ind_path = msprime_dir + prefix + pop + "_ADMIXED.ind"
            with open(ind_path, "w") as id_file:
                for sampleid in range(0,100):
                    print(prefix + "model_" + str(j) + "_" + pop + "_sample_" + str(sampleid) + \
                         "\tU\tADMIXED",  file=id_file)

            # run archie on AFR and EUR simulated data
            if run_archie:
                # the other population serves as the reference panel
                ref_pop = "eur" if pop == "afr" else "afr"
                replicate_base = msprime_dir + prefix + str(j)
                # Argument list instead of a shell string, so paths with
                # spaces or shell metacharacters are passed through intact.
                stats_pop_cmd = [
                    "python", archie_src_dir + "data/calc_stats_window_data.py",
                    "-s", replicate_base + ".snp",
                    "-i", ind_path,
                    "-a", replicate_base + "_" + pop + ".geno",
                    "-r", replicate_base + "_" + ref_pop + ".geno",
                    "-c", "1", "-b", "0", "-e", "50000", "-w", "50000", "-z", "50000",
                ]
                stats_out_path = archie_out_dir + prefix + str(j) + "_" + pop + ".txt"
                with open(stats_out_path, "w") as stats_out:
                    result = subprocess.run(stats_pop_cmd, stdout=stats_out)
                if result.returncode != 0:
                    # keep going, as before, but make the failure visible
                    print("WARNING: ArchIE stats failed (exit " + str(result.returncode) +
                          ") for " + stats_out_path)

        # combine .ind files into single file, for ANCESTRYMAP -> VCF format conversion
        with open(msprime_dir + prefix + "combined.ind", "w") as combined_ind:
            for pop in ["afr", "eur"]:
                with open(msprime_dir + prefix + pop + "_ADMIXED.ind") as pop_ind:
                    shutil.copyfileobj(pop_ind, combined_ind)

        # add code for converting .snp/.geno/.ind data to 1 VCF per replicate
        # ...
        # ...

    # once analyses of all replicates for a given model are complete, combine and clean up ArchIE output
    if run_archie:
        combined_dir = archie_out_dir + "combined/"
        os.makedirs(combined_dir, exist_ok=True)

        for pop in ["afr", "eur"]:
            combined_path = combined_dir + prefix + pop + ".txt"
            # escape the directory part so only the "*" acts as a wildcard
            replicate_paths = sorted(glob.glob(glob.escape(archie_out_dir) + "*" + pop + ".txt"))
            print("Concatenating " + str(len(replicate_paths)) + " files into " + combined_path)
            with open(combined_path, "w") as combined_out:
                for replicate_path in replicate_paths:
                    with open(replicate_path) as replicate_in:
                        shutil.copyfileobj(replicate_in, combined_out)

        # remove ArchIE output for individual replicates; this is only
        # reached if the concatenation above did not raise, and it leaves
        # sub-directories such as combined/ in place
        stale_paths = [path for path in glob.glob(glob.escape(archie_out_dir) + "*")
                       if os.path.isfile(path)]
        print("Removing " + str(len(stale_paths)) + " per-replicate files from " + archie_out_dir)
        for stale_path in stale_paths:
            os.remove(stale_path)