In [1]:
# Copyright 2019 The Authors. All Rights Reserved.
#
# GNU General Public License v3.0
# Permissions of this strongest copyleft license are conditioned on 
# making available complete source code of licensed works and modifications, 
# which include larger works using a licensed work, under the same license. 
# Copyright and license notices must be preserved. 
# Contributors provide an express grant of patent rights. 
# When a modified version is used to provide a service over a network, 
# the complete source code of the modified version must be made available.
# ==============================================================================

# Title: Simulating the evolution of CRISPR
# Author: Hyunjin Shim
# Date created: 20200727
# Email: jinenstar@gmail.com

# Simulating the evolution of CRISPR
## Step 1:

In [2]:
### packages

# Data
from typing import Dict, List, Tuple
from pathlib import Path 
import glob
import numpy as np

# Biopython
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

# Math
import random

# Plot
import matplotlib.pyplot as plt

In [3]:
### global variables

random.seed(20200727)
nt = ['A','T','G','C']

# mutation rate: based on bacteria
MUTATION_RATE = 1e-7

# generation: based on generation time of bacteria (1 hour per gen: 1e6/24/365=114 years)
GENERATION = 1000000

In [4]:
### functions

# extract information from fasta files
def extract_dataset_info(records: List[SeqRecord]) -> Dict:
    # contains info on SeqIO data
    seqs_id = [r.id for r in records]
    seqs_len = [len(r.seq) for r in records]
    seqs = [str(r.seq) for r in records]
#    int_seqs = [[nt_vocab[nt] for nt in s] for s in seqs]
    d = {"ID":seqs_id, "Seq":seqs, "Length":seqs_len} #"Int_Seq":int_seqs, 
    return d

# simulate mutation on a sequence
# Finite sites model: A model for the process of mutation acting on DNA sequences of finite length so that the same site may experience a mutation more than once
def sim_mutation(times, mu_rate, seq):
    gen = 0 # count generation
    seq_len = len(seq)
    for i in range(times):
        mu = np.random.binomial(seq_len, mu_rate)
        gen += 1
        if mu !=0:
            mu_nt = random.choice(nt)
            mu_position = random.choice(range(seq_len))
            while mu_nt == seq[mu_position]:
                #print("same nt: ", gen, mu_position, seq[mu_position], mu_nt)
                mu_nt = random.choice(nt)
            else:
                #print(gen, mu_position, seq[mu_position], mu_nt)
                string_seq = list(seq)
                string_seq[mu_position] = mu_nt
                seq = "".join(string_seq)
    return(seq)

In [5]:
### data

# location of raw data file
datapath = Path("/Users/jinenstar/Desktop/Data/CRISPR/CRISPR-Cas++/Repeat/Data_DR")

# data dictionary
repeat_data_info = {"TypeI": {}, "TypeII": {}}

# load test sequences
for f in datapath.glob("TypeI?.fa"):
  records = list(SeqIO.parse(str(f), "fasta"))
  key = f.stem
  repeat_data_info["TypeI"][key] = extract_dataset_info(records)

# load test sequences
for f in datapath.glob("TypeII?.fa"):
  records = list(SeqIO.parse(str(f), "fasta"))
  key = f.stem
  repeat_data_info["TypeII"][key] = extract_dataset_info(records)

In [6]:
#print(repeat_data_info.items())
#print(repeat_data_info.keys())
#print(repeat_data_info.values())
print(repeat_data_info["TypeI"].keys())
print(repeat_data_info["TypeII"].keys())

dict_keys(['TypeIB', 'TypeIF', 'TypeIC', 'TypeIU', 'TypeIV', 'TypeID', 'TypeIA', 'TypeIE'])
dict_keys(['TypeIIC', 'TypeIIB', 'TypeIIA'])


In [7]:
test = repeat_data_info["TypeI"]["TypeIA"]["Seq"][0]
m = sim_mutation(GENERATION, MUTATION_RATE, test)
print(test)
print(m)

GTGCTCAACGCCTTACGGCATCAGAGGTTATATCAC
GTGCTCAACGCCTTACGGCCTCAGAGCTTATATCAC


In [8]:
test_seq_evolved_2 = {}

for k in repeat_data_info["TypeII"].keys():
    dataset = repeat_data_info["TypeII"][k]
    test_seq = dataset["Seq"]

    test_seq_evolved_pertype_2 = []
    for i in test_seq:
        test_seq_evolved_pertype_2.append(sim_mutation(GENERATION, MUTATION_RATE, i))
    
    test_seq_evolved_2[k] = test_seq_evolved_pertype_2  

In [9]:
test = test_seq_evolved_2

for k in repeat_data_info["TypeII"].keys():
    with open('Simulated_' + str(k) + '.fa', 'w') as f:
        for i in range(len(test[k])):
            f.write(">" + str(repeat_data_info["TypeII"][k]["ID"][i]) + "\n" + str(test[k][i]) + "\n")      

In [None]:
test_seq_evolved_1 = {}

for k in repeat_data_info["TypeI"].keys():
    dataset = repeat_data_info["TypeI"][k]
    test_seq = dataset["Seq"]

    test_seq_evolved_pertype_1 = []
    for i in test_seq:
        test_seq_evolved_pertype_1.append(sim_mutation(GENERATION, MUTATION_RATE, i))
    
    test_seq_evolved_1[k] = test_seq_evolved_pertype_1
        

In [None]:
test = test_seq_evolved_1

for k in repeat_data_info["TypeI"].keys():
    with open('Simulated_' + str(k) + '.fa', 'w') as f:
        for i in range(len(test[k])):
            f.write(">" + str(repeat_data_info["TypeI"][k]["ID"][i]) + "\n" + str(test[k][i]) + "\n")      
