# Exploring recombination through simulations
### by Jonathan Fischer

In [None]:
from datascience import *
import numpy as np
import random
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
from client.api.notebook import Notebook

# You don't need to edit anything in this cell!

# Here's a function that makes sequences for a set number of individuals and 2 populations. This function takes 
# the number of individuals, identity of individuals which are in population 2, and length of the sequence and 
# returns a list of arrays containing each individual's chromosome pairs.

def seq_maker(n_individuals, pop_2_indices, l = 1000):
    """Build one homozygous chromosome pair per individual.

    Parameters
    ----------
    n_individuals : number of individuals to create (indices 0 .. n-1).
    pop_2_indices : collection of indices whose individuals belong to
        population 2; every other index is assigned to population 1.
    l : length of each chromosome sequence.

    Returns
    -------
    A list with one entry per individual. Each entry is the result of
    ``make_array(seq, seq)``, i.e. two identical copies of that individual's
    population sequence ('1' repeated l times, or '2' repeated l times).
    """
    # Template chromosome for each population
    pop_1_seq = '1' * l
    pop_2_seq = '2' * l
    # Pick the template matching each individual's population
    sequences = [
        pop_2_seq if idx in pop_2_indices else pop_1_seq
        for idx in np.arange(n_individuals)
    ]
    # Every individual carries two copies of its population's sequence
    return [make_array(seq, seq) for seq in sequences]

# Here's a function that plots "painted" chromosomes for two populations.
# The input is a length 2 array containing a pair of chromosome sequences.

def paint_chroms(chroms):
    """Draw a pair of chromosomes as two rows of coloured points.

    ``chroms[0]`` is drawn at y = 1 and ``chroms[1]`` at y = -1. Positions
    holding '1' (population 1) are blue and positions holding '2'
    (population 2) are red. Any other symbol contributes no colour entry,
    exactly as in the if/elif form this replaces.
    """
    palette = {'1': 'blue', '2': 'red'}
    n_sites = len(chroms[0])
    # Build both colour lists before any plotting happens
    row_colors = [
        [palette[site] for site in chrom if site in palette]
        for chrom in (chroms[0], chroms[1])
    ]
    for height, colors in zip((1, -1), row_colors):
        plt.scatter(np.arange(n_sites), [height] * n_sites, color=colors)
    plt.ylim(-2, 2)
    plt.show()

## A function to simulate a basic model of recombination

In [None]:
# Complete the function that simulates the recombination of two chromosomes.

# Placeholder inputs: two 1000-character sequences built from distinct symbols
seq1, seq2 = 'A' * 1000, 'B' * 1000

# EXERCISE TEMPLATE: every `?` below is a blank to fill in, so this cell does
# not run until it has been completed.
#
# recombine(seq1, seq2, r = 8)
#   seq1, seq2 : the two sequences to recombine; len() and slicing are applied
#                to them (the examples in this cell pass strings).
#   r          : defaults to 8; its use is left blank below (presumably the
#                Poisson rate for the number of recombination points - confirm).
#   Returns make_array(out1, out2) when the length check passes; otherwise
#   returns the string "Sequence lengths don't match" (no exception is raised).
def recombine(seq1, seq2, r = 8):
    # Get the length of the sequences and make sure they match
    l1 = len(seq1)
    l2 = ?
    if ?:
        # Sample the number of recombination points
        # (a Poisson draw is >= 0, so adding 1 guarantees at least one point)
        n_sites = np.random.poisson(lam = ?) + 1
        # Sample the recombination point coordinates
        # (random.sample draws without replacement, so coordinates are distinct)
        sites = random.sample(list(np.arange(?, ?)), ?)
        # Segment boundaries: 0, the sorted recombination points, then l1-1
        # NOTE(review): slices are end-exclusive, so with l1-1 as the final
        # boundary the character at index l1-1 is never copied into the
        # outputs - confirm whether l1 was intended here.
        sites = np.r_[0, np.sort(?), l1-1]
        l_s = len(sites)
        # Initialize the outputs
        out1 = ''
        out2 = ''
        # Iterate through the coordinates and fill in the outputs
        for i in np.arange(?):
            if i % 2 == 0:
                # Even-numbered segments are copied straight through
                out1 += seq1[sites[i]:sites[i+1]]
                out2 += seq2[sites[i]:sites[i+1]]
            elif ?:
                # Remaining segments are left blank (presumably the two
                # sources are swapped here - confirm)
                out1 += ?
                out2 += ?
        return(make_array(out1, out2))
    else:
        # Mismatch is reported by returning a message string, not by raising
        return("Sequence lengths don't match")
    
# Try recombine on two 200-character sequences that differ only in their second half
seq1 = 'A' * 100 + 'B' * 100
seq2 = 'A' * 200

# Kept as the final expression of the cell
recombine(seq1, seq2)


## Simulating inheritance from grandparents

In [None]:
# Complete the function named inherited_chroms which takes two arrays of two chromosomes as arguments and yields
# a pair of chromosomes that are obtained via recombination. Output should be an array of length two which
# contains the two chromosomes.

# EXERCISE TEMPLATE: every `?` below is a blank to fill in.
# parent1 and parent2 are each indexed like a pair of chromosomes
# (parent1[0] is passed to recombine as a sequence).
def inherited_chroms(parent1, parent2):
    # Recombine each parent's chromosomes; p1 starts from parent1[0]
    p1 = recombine(parent1[0], ?)
    p2 = recombine(?, ?)
    
    # Two draws from {0, 1}; random.choices samples with replacement, so the
    # two picks are independent of each other
    to_inherit = random.choices([0,1], k = 2)
    # The draws index into the recombined outputs to pick the child's two chromosomes
    child_seqs = make_array(p1[to_inherit[?]], ?[?[?]])
    return(child_seqs)

# Write a function that models recombination from grandparents. The input should be a list of length-2 arrays,
# each of which contains a pair of chromosomes.

# EXERCISE TEMPLATE: every `?` below is a blank to fill in.
# gp_list is indexed per grandparent; the example cell that follows passes a
# list of four chromosome pairs built by seq_maker.
def grandparents_recomb(gp_list):
    # First parent: produced from grandparents 0 and 1
    parent_1 = inherited_chroms(gp_list[0], gp_list[1])
    # Second parent is left blank (presumably from the remaining grandparents - confirm)
    parent_2 = ?
    child = ?
    return(child)

In [None]:
# Example of the grandchild's chromosomes
# Four grandparents; the one at index 0 is from population 2, the rest from population 1
gp_list = seq_maker(4, [0])
gc_1 = grandparents_recomb(?)
# paint_chroms draws population 1 in blue and population 2 in red
paint_chroms(gc_1)

## What about great-grandparents?

In [None]:
# Model recombination from great-grandparents. Inputs are two lists of chromosomes from grandparents.

# EXERCISE TEMPLATE: every `?` below is a blank to fill in.
# gp_list_1 and gp_list_2 are each passed on in the form grandparents_recomb
# accepts (a list of chromosome pairs).
def greatgrandparents_recomb(gp_list_1, gp_list_2):
    # One descendant simulated from the first list
    c_1 = grandparents_recomb(gp_list_1)
    # Left blank (presumably the matching descendant from gp_list_2 - confirm)
    c_2 = ?
    gc_1 = ?
    return(gc_1)

In [None]:
# Generate sequences for grandparents and great-grandparents
# gp_list_1: four individuals, index 0 from population 2
gp_list_1 = seq_maker(4, [0])
# gp_list_2: four individuals, all from population 1
gp_list_2 = seq_maker(4, [])

# Example of the great-grandchild's chromosomes
ggc_1 = greatgrandparents_recomb(?, ?)
# paint_chroms draws population 1 in blue and population 2 in red
paint_chroms(ggc_1)

## How do things change with different ancestries?

In [None]:
# After completing the previous code, run this cell to compare properties of genetic sequences 
# obtained by starting with 1/4 ancestry from a separate subpopulation, either at the great-grandparent (2/8)
# or grandparent (1/4) levels

# You don't need to edit anything in this cell!

def recomb_summary(chroms):
    """Summarise the population-2 content and switch count of a chromosome pair.

    Parameters
    ----------
    chroms : sequence of chromosome sequences whose symbols are compared
        against '2'.

    Returns
    -------
    A two-element list ``[proportion, switches]``:
      * proportion - number of '2' symbols over all chromosomes, halved and
        divided by the length of the first chromosome.
      * switches - number of positions, over all chromosomes, whose symbol
        differs from the symbol immediately before it.
    """
    pop_2_total = 0
    switch_total = 0
    for chrom in chroms:
        # Start from the first symbol so position 0 never counts as a switch
        previous = chrom[0]
        for symbol in chrom:
            if symbol == '2':
                pop_2_total += 1
            if symbol != previous:
                switch_total += 1
            previous = symbol
    proportion = .5 * pop_2_total / len(chroms[0])
    return [proportion, switch_total]

# Same share of Pop 2 ancestors in both settings, introduced at different depths:
#   gp_list               -> 1 of 4 grandparents is from Pop 2
#   gp_list_1 + gp_list_2 -> 2 of 8 great-grandparents are from Pop 2
gp_list = seq_maker(4, [0])
gp_list_1 = seq_maker(4, [0, 2])
gp_list_2 = seq_maker(4, [])

# One row per simulated descendant. Columns:
#   0: Pop 2 proportion (grandparent setting)
#   1: Pop 2 proportion (great-grandparent setting)
#   2: switch count (grandparent setting)
#   3: switch count (great-grandparent setting)
summary_mat = np.zeros((1000, 4))
for i in range(len(summary_mat)):
    r_gp = recomb_summary(grandparents_recomb(gp_list))
    r_ggp = recomb_summary(greatgrandparents_recomb(gp_list_1, gp_list_2))
    summary_mat[i] = [r_gp[0], r_ggp[0], r_gp[1], r_ggp[1]]
    
# Overlay the distributions of the Pop 2 proportion for the two settings:
# column 0 is admixture at the grandparents, column 1 at the great-grandparents
plt.hist(summary_mat[:, 0], bins=20, density=True, alpha=0.5)
plt.hist(summary_mat[:, 1], bins=20, density=True, alpha=0.5)
plt.title('Proportion of sequence inherited from Pop 2')
plt.xlabel('Proportion')
plt.ylabel('Density')
plt.legend(['n_gen = 2', 'n_gen = 3'])
plt.show()

# Overlay the distributions of the switch counts for the two settings:
# column 2 is admixture at the grandparents, column 3 at the great-grandparents
plt.hist(summary_mat[:, 2], bins=15, density=True, alpha=0.5)
plt.hist(summary_mat[:, 3], bins=20, density=True, alpha=0.5)
plt.title('Number of switches between populations')
plt.xlabel('Count')
plt.ylabel('Density')
plt.legend(['n_gen = 2', 'n_gen = 3'])
plt.show()

## What do you notice about the histograms in each respective plot? Does this make sense? Explain.

## To submit

In [None]:
# Notebook is imported from client.api.notebook at the top of this file.
# Presumably this loads the assignment configuration from the .ok file - confirm.
ok = Notebook('Lab04_Recombination.ok')
# Authenticate before submitting; the return value is discarded
_ = ok.auth(inline=True)

In [None]:
# Submit the assignment.
# Relies on the `ok` object created in the authentication cell; the return value is discarded
_ = ok.submit()