In [None]:
# Copyright 2021 The Authors. All Rights Reserved.
#
# GNU General Public License v3.0
# Permissions of this strongest copyleft license are conditioned on 
# making available complete source code of licensed works and modifications, 
# which include larger works using a licensed work, under the same license. 
# Copyright and license notices must be preserved. 
# Contributors provide an express grant of patent rights. 
# When a modified version is used to provide a service over a network, 
# the complete source code of the modified version must be made available.
# ==============================================================================

# Title: Simulating the evolution of CRISPR
# Author: Hyunjin Shim
# Date created: 20200727
# Email: jinenstar@gmail.com

# Simulating the evolution of CRISPR

In [85]:
### packages

# Data
from typing import Dict, List, Tuple
from pathlib import Path 
import glob
import numpy as np

# Biopython
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

# Math
import random

# Plot
import matplotlib.pyplot as plt

In [86]:
### global variables

random.seed(20200727)
nt = ['A','T','G','C']

# mutation rate: based on bacteria
MUTATION_RATE = 1e-5

# generation: based on generation time of bacteria (1 hour per gen: 1e6/24/365=114 years)
GENERATION = 10000

In [87]:
### functions

# extract information from fasta files
def extract_dataset_info(records: List[SeqRecord]) -> Dict:
    # contains info on SeqIO data
    seqs_id = [r.id for r in records]
    seqs_len = [len(r.seq) for r in records]
    seqs = [str(r.seq) for r in records]
#    int_seqs = [[nt_vocab[nt] for nt in s] for s in seqs]
    d = {"ID":seqs_id, "Seq":seqs, "Length":seqs_len} #"Int_Seq":int_seqs, 
    return d

# simulate mutation on a sequence
# Finite sites model: A model for the process of mutation acting on DNA sequences of finite length so that the same site may experience a mutation more than once
def sim_mutation(times, mu_rate, seq):
    gen = 0 # count generation
    seq_len = len(seq)
    for i in range(times):
        mu = np.random.binomial(seq_len, mu_rate)
        gen += 1
        if mu !=0:
            mu_nt = random.choice(nt)
            mu_position = random.choice(range(seq_len))
            while mu_nt == seq[mu_position]:
                #print("same nt: ", gen, mu_position, seq[mu_position], mu_nt)
                mu_nt = random.choice(nt)
            else:
                #print(gen, mu_position, seq[mu_position], mu_nt)
                string_seq = list(seq)
                string_seq[mu_position] = mu_nt
                seq = "".join(string_seq)
    return(seq)

In [88]:
### data

# location of raw data file
#datapath = Path("/Users/jinenstar/Desktop/Data/CRISPR/CRISPR-Cas++/Repeat/Data_DR")
datapath = Path("/Users/jinenstar/Desktop/Pro_AE_CRISPR/Data")

# data dictionary
repeat_data_info = {"Class1": {}, "Class2": {}}

# load test sequences
for f in datapath.glob("Class1*.fa"):
  records = list(SeqIO.parse(str(f), "fasta"))
  key = f.stem
  repeat_data_info["Class1"][key] = extract_dataset_info(records)

# load test sequences
for f in datapath.glob("Class2*.fa"):
  records = list(SeqIO.parse(str(f), "fasta"))
  key = f.stem
  repeat_data_info["Class2"][key] = extract_dataset_info(records)

In [89]:
#print(repeat_data_info.items())
#print(repeat_data_info.keys())
#print(repeat_data_info.values())
print(repeat_data_info["Class1"].keys())
print(repeat_data_info["Class2"].keys())

dict_keys(['Class1_5_tps_real_IE_curated copy 62', 'Class1_5_tps_real_IE_curated copy 33', 'Class1_5_tps_real_IE_curated copy 91', 'Class1_5_tps_real_IE_curated copy 56', 'Class1_5_tps_real_IE_curated copy 46', 'Class1_5_tps_real_IE_curated copy 17', 'Class1_5_tps_real_IE_curated copy 81', 'Class1_5_tps_real_IE_curated copy 23', 'Class1_5_tps_real_IE_curated copy 72', 'Class1_5_tps_real_IE_curated copy 52', 'Class1_5_tps_real_IE_curated copy 37', 'Class1_5_tps_real_IE_curated copy 66', 'Class1_5_tps_real_IE_curated copy 95', 'Class1_5_tps_real_IE_curated copy 85', 'Class1_5_tps_real_IE_curated copy 4', 'Class1_5_tps_real_IE_curated copy 76', 'Class1_5_tps_real_IE_curated copy 27', 'Class1_5_tps_real_IE_curated copy 13', 'Class1_5_tps_real_IE_curated copy 42', 'Class1_5_tps_real_IE_curated copy 53', 'Class1_5_tps_real_IE_curated copy 36', 'Class1_5_tps_real_IE_curated copy 67', 'Class1_5_tps_real_IE_curated copy 94', 'Class1_5_tps_real_IE_curated copy 84', 'Class1_5_tps_real_IE_curated 

In [90]:
test = repeat_data_info["Class1"]["Class1_5_tps_real_IE_curated"]["Seq"][0]
m = sim_mutation(GENERATION, MUTATION_RATE, test)
print(test)
print(m)

KeyError: 'Class1_5_tps_real_IE_curated'

In [91]:
test_seq_evolved_2 = {}

for k in repeat_data_info["Class2"].keys():
    dataset = repeat_data_info["Class2"][k]
    test_seq = dataset["Seq"]

    test_seq_evolved_pertype_2 = []
    for i in test_seq:
        test_seq_evolved_pertype_2.append(sim_mutation(GENERATION, MUTATION_RATE, i))
    
    test_seq_evolved_2[k] = test_seq_evolved_pertype_2  

KeyboardInterrupt: 

In [None]:
test = test_seq_evolved_2

for k in repeat_data_info["Class2"].keys():
    with open(str(k) + '.fa', 'w') as f:
        for i in range(len(test[k])):
            f.write(">" + str(repeat_data_info["Class2"][k]["ID"][i]) + "\n" + str(test[k][i]) + "\n")      

In [None]:
test_seq_evolved_1 = {}

for k in repeat_data_info["Class1"].keys():
    dataset = repeat_data_info["Class1"][k]
    test_seq = dataset["Seq"]

    test_seq_evolved_pertype_1 = []
    for i in test_seq:
        test_seq_evolved_pertype_1.append(sim_mutation(GENERATION, MUTATION_RATE, i))
    
    test_seq_evolved_1[k] = test_seq_evolved_pertype_1
        

In [None]:
test = test_seq_evolved_1

for k in repeat_data_info["Class1"].keys():
    with open(str(k) + '.fa', 'w') as f:
        for i in range(len(test[k])):
            f.write(">" + str(repeat_data_info["Class1"][k]["ID"][i]) + "\n" + str(test[k][i]) + "\n")      
