# Simulations for evaluating archaic ancestry inference

## Set paths and import libraries

In [1]:
# run in archanc root directory
import os
proj_dir = os.getcwd()
msprime_dir = proj_dir +"/output/msprime/"
archie_src_dir = proj_dir + "/src/ArchIE/"
archie_out_dir = proj_dir + "/output/ArchIE/"

In [22]:
import sys
sys.path.insert(0, proj_dir + '/src/stdpopsim')

from stdpopsim import homo_sapiens, models
import msprime
import itertools
import random
import numpy as np
import pandas as pd
import math

## define additional demographic models in `stdpopsim` model format

To-do:
- integrate into stdpopsim github repository

In [36]:
class FuTwoPopOutOfAfrica(models.Model):
    def __init__(self):
        super().__init__()
        
        # Times are provided in years, so we convert into generations.
        generation_time = 25

        # 220kya:
        # African population constant with Ne~7300
        N_A = 7310

        # 148kya:
        # instantaneous growth to Ne~14000
        T_AF = 148e3 / generation_time
        N_AF = 14474

        # 51kya:
        # non-AFR pops migrate OOA; bottlenecks to Ne~1800
        # migration between AFR occurs
        N_B = 1861
        T_SPLIT = 51e3 / generation_time
        m_AF_B = 15e-5

        # 23kya:
        # 2nd EUR bottlenecks to Ne~1000 & starts growing with rate 0.307%
        # migration rate slows between AFR-EUR
        N_EU0 = 1032
        T_EU_B = 23e3 / generation_time
        m_AF_EU = 2.5e-5
        r_EU0 = 0.00307
        N_EU1 = N_EU0 / math.exp(-r_EU0 * T_EU_B)

        # 5.1kya:
        # explosive growth in both AFR & EUR
        # Fu 2013  
        T_EG = 5.1e3 / generation_time
        r_EU = 0.0195
        r_AF = 0.0166
        N_EU_start = N_EU1 / math.exp(-r_EU * T_EG)
        m_EG = 0
        N_AF_start = N_AF / math.exp(-r_AF * T_EG)

        # Chen 2015
#         T_EG = 7.26e3 / generation_time 
#         r_EU = 0.0149
#         r_AF = 0.00735
#         N_EU_start = N_EU1 / math.exp(-r_EU * T_EG)
#         m_EG = 0
#         N_AF_start = N_AF / math.exp(-r_AF * T_EG)

        # Gazave 2014
    #     T_EG = 3.52e3 / generation_time 
    #     r_EU = 0.034
    #     r_AF = 0.00735
    #     N_EU = N_EU1 / math.exp(-r_EU * T_EG)
    #     m_EG = 0
    #     N_AF1 = N_AF / math.exp(-r_AF * T_EG)

        # Population IDs correspond to their indexes in the population
        # configuration array. Therefore, we have 0=YRI, 1=CEU initially.
        self.population_configurations = [
            msprime.PopulationConfiguration(
                initial_size=N_AF_start, growth_rate=r_AF),
            msprime.PopulationConfiguration(
                initial_size=N_EU_start, growth_rate=r_EU)#,
        ]

        # up to 5.1kya, no migration
        self.migration_matrix = [
            [0, 0],
            [0, 0],
        ]

        self.demographic_events = [
            # at 5.1kya, change to slow growth rate in EUR & stop growth in AFR;
            # add migration rate
            msprime.MigrationRateChange(
                time=T_EG, rate=m_AF_EU, matrix_index=(0, 1)),
            msprime.MigrationRateChange(
                time=T_EG, rate=m_AF_EU, matrix_index=(1, 0)),
            msprime.PopulationParametersChange(
                time=T_EG, growth_rate=r_EU0, initial_size=N_EU1, population_id=1),
            msprime.PopulationParametersChange(
                time=T_EG, growth_rate=0, population_id=0),

            # at 23kya, EUR growth stops and migration rates increase
            msprime.MigrationRateChange(
                time=T_EU_B, rate=m_AF_B, matrix_index=(0, 1)),
            msprime.MigrationRateChange(
                time=T_EU_B, rate=m_AF_B, matrix_index=(1, 0)),
            msprime.PopulationParametersChange(
                time=T_EU_B, initial_size=N_EU0, growth_rate=0, population_id=1),

            # at 51kya, population B merges into AFR
            msprime.MassMigration(
                time=T_SPLIT, source=1, destination=0, proportion=1.0),
            msprime.PopulationParametersChange(
                time=T_SPLIT, initial_size=N_B, population_id=1),

            # At 148kya, instantaneous growth in AFR
            msprime.PopulationParametersChange(
                time=T_AF, initial_size=N_A, population_id=0)
        ]

## Function for simulating MNMs on tree

Randomly select a fraction of variant sites to be MNMs. The second mutation in each MNM is randomly placed from 1-100bp downstream of the first, and the genotypes of the first mutation are duplicated. 

To-do: separate functions for the MNM simulation from .snp and .geno file generation

In [31]:
def add_mnms(ts, model_label, output_dir="./", rep_label=0, mnm_dist=100, mnm_frac=0.015):
    
    # .snp file names are underscore-delimited with model, population MNM parameters (dash-delimited with distance & fraction), and replicate label
    # .geno file names add the population label
#     prefix = model_label+"_"+pop_label+"_mnm"+str(mnm_dist)+"-"+str(mnm_frac)+"_"+str(rep_label)
    prefix = model_label+"_mnm"+str(mnm_dist)+"-"+str(mnm_frac)+"_"+str(rep_label)
    
    geno = np.zeros(200, dtype=np.int8)
#     eur_geno = np.zeros(100, dtype=np.int8)
#     afr_geno = np.zeros(100, dtype=np.int8)

    mnm_dict = {}
    with open(output_dir + prefix + ".snp", "w") as text_file:
        for variant in list(ts.variants()):
            print("\t".join(["1:"+str(round(variant.site.position)), 
                             "1", 
                             str(variant.site.position/10e6), 
                             str(round(variant.site.position)), 
                             "A", 
                             "G"]), file=text_file)
            geno = np.vstack([geno, variant.genotypes])
            
            random.seed(variant.site.position)
            if random.random() < mnm_frac:

                dist = random.randint(1,mnm_dist)
                mnm_cand = variant.site.position+dist 
                mnm_cand_r = str(round(mnm_cand))
                mnm_dict[mnm_cand_r] = [sum(variant.genotypes[0:100])/100, # allele freq in AFR
                                        sum(variant.genotypes[100:200])/100, # allele freq in EUR
                                        ts.genotype_matrix().shape[0]] # total number of variants before MNMs
                print("\t".join(["1:"+mnm_cand_r, 
                                 "1", 
                                 str(mnm_cand/10e6), 
                                 mnm_cand_r, 
                                 "A", 
                                 "G"]), file=text_file)
                geno = np.vstack([geno, variant.genotypes])

    # output MNM data
    mnm_df = pd.DataFrame(mnm_dict).T
    mnm_df.index.name = "MNM_POS"
    mnm_df = mnm_df.rename(columns={0: "AF_AFR", 
                                    1: "AF_EUR", 
                                    2: "N_TOT"})
    mnm_df.to_csv(output_dir + prefix + ".mnms", 
                  index=True,
                  sep="\t")
    
    # delete empty first row used to initialize genotype matrix            
    geno = np.delete(geno, (0), axis=0)
    
    geno_afr = geno[:,:100]
    geno_eur = geno[:,100:200]
    
    geno_pop = [geno_afr, geno_eur]
    
    for i, pop in enumerate(["afr", "eur"]):
        np.savetxt(output_dir + prefix + "_" + pop + ".geno", geno_pop[i], delimiter="", fmt='%i')

## Define models and parameters

Set up msprime simulations for each of the models we will evaluate

In [37]:
# coalescent simulation parameters
sample_size = 100 # each
length = 50000
mu = 1.15e-8
rr = 1e-8
replicates = 1000
seed = 30

# MNM simulation parameters
mnm_dist = 100
mnm_frac = 0.015

# Gutenkunst 3-population model
GutenkunstThreePop_model = homo_sapiens.GutenkunstThreePopOutOfAfrica()
GutenkunstThreePop_ts = msprime.simulate(
    # first 100 samples from AFR, next 100 from EUR
    samples=[msprime.Sample(0, 0)]*sample_size + [msprime.Sample(1, 0)]*sample_size,
    length=length, 
    mutation_rate=mu, 
    recombination_rate=rr,
    random_seed=seed,
    num_replicates=replicates,
    **GutenkunstThreePop_model.asdict())

# Fu 2-population model
FuTwoPop_model = FuTwoPopOutOfAfrica()
FuTwoPop_ts = msprime.simulate(
    # first 100 samples from AFR, next 100 from EUR
    samples=[msprime.Sample(0, 0)]*sample_size + [msprime.Sample(1, 0)]*sample_size,
    length=length, 
    mutation_rate=mu, 
    recombination_rate=rr,
    random_seed=seed,
    num_replicates=replicates,
    **FuTwoPop_model.asdict())

#-------------------------------------------------------
# define other models here and add to model_dict below
#-------------------------------------------------------

# modify demographic parameters to include archaic branches
# GutenkunstThreePopArchaic_model = homo_sapiens.GutenkunstThreePopArchaic()

# GutenkunstThreePopArchaic_ts = msprime.simulate(
#     # first 100 samples from AFR, next 100 from EUR
#     samples=[msprime.Sample(0, 0)]*sample_size + [msprime.Sample(1, 0)]*sample_size,
#     length=length, 
#     mutation_rate=mu, 
#     recombination_rate=rr,
#     random_seed=seed,
#     num_replicates=replicates,
#     **GutenkunstThreePopArchaic_model.asdict())

model_dict = {"GutenkunstThreePop": GutenkunstThreePop_ts,
             "FuTwoPop": FuTwoPop_ts}

## Run simulations

Simulates 200 samples (100 each of European and African ancestry) under each of the specified models (with 1000 replicates each) and adds MNMs.

Outputs the .mnm, .snp, and .geno files to `output/msprime/` then passes these as input to the `calc_stats_window_data.py` script from `ArchIE` to calculate the per-sample summary stats that act as features in the `ArchIE` inference method. These output files are then concatenated and passed as the test data to `ArchIE/train/train.R` to predict the fraction of archaic ancestry in each simulated sample.


In [None]:
for model_label, model in model_dict.items():
    
    print("Simulating MNMs on " + model_label)
    
    for j, ts in enumerate(model):

        prefix = model_label + "_mnm" + str(mnm_dist) + "-" + str(mnm_frac) + "_"

        add_mnms(ts, 
                 output_dir=msprime_dir, 
                 model_label=model_label, 
                 mnm_dist=mnm_dist, 
                 mnm_frac=mnm_frac, 
                 rep_label=str(j))

        stats_afr_cmd = "python " + archie_src_dir + "data/calc_stats_window_data.py" + \
            " -s " + msprime_dir + prefix + str(j) + ".snp" + \
            " -i " + archie_src_dir + "simulations/out.ADMIXED.ind" + \
            " -a " + msprime_dir + prefix + str(j) + "_afr.geno" + \
            " -r " + msprime_dir + prefix + str(j) + "_eur.geno" + \
            " -c 1 -b 0 -e 50000 -w 50000 -z 50000 " + \
            " > " + archie_out_dir + prefix + str(j) + "_afr.txt"  
#         print(stats_afr_cmd)
        os.system(stats_afr_cmd)

        stats_eur_cmd = "python " + archie_src_dir + "data/calc_stats_window_data.py" + \
            " -s " + msprime_dir + prefix + str(j) + ".snp" + \
            " -i " + archie_src_dir + "simulations/out.ADMIXED.ind" + \
            " -a " + msprime_dir + prefix + str(j) + "_eur.geno" + \
            " -r " + msprime_dir + prefix + str(j) + "_afr.geno" + \
            " -c 1 -b 0 -e 50000 -w 50000 -z 50000 " + \
            " > " + archie_out_dir + prefix + str(j) + "_eur.txt"  
    #     print(stats_eur_cmd)
        os.system(stats_eur_cmd)
        
    cat_afr_cmd = "cat " + archie_out_dir + "*afr.txt" + " > " + archie_out_dir + "combined/" + prefix + "afr.txt" 
    print(cat_afr_cmd)
    os.system(cat_afr_cmd)

    cat_eur_cmd = "cat " + archie_out_dir + "*eur.txt" + " > " + archie_out_dir + "combined/" + prefix + "eur.txt" 
    print(cat_eur_cmd)
    os.system(cat_eur_cmd)

    clean_cmd = "rm " + archie_out_dir + "*"
    print(clean_cmd)
    os.system(clean_cmd)

Simulating MNMs on GutenkunstThreePop
cat /mnt/norbert/home/jedidiah/projects/archanc/output/ArchIE/*afr.txt > /mnt/norbert/home/jedidiah/projects/archanc/output/ArchIE/combined/GutenkunstThreePop_mnm100-0.015_afr.txt
cat /mnt/norbert/home/jedidiah/projects/archanc/output/ArchIE/*eur.txt > /mnt/norbert/home/jedidiah/projects/archanc/output/ArchIE/combined/GutenkunstThreePop_mnm100-0.015_eur.txt
rm /mnt/norbert/home/jedidiah/projects/archanc/output/ArchIE/*
Simulating MNMs on FuTwoPop
