In [1]:
# Copyright 2019 The Authors. All Rights Reserved.
#
# GNU General Public License v3.0
# Permissions of this strongest copyleft license are conditioned on 
# making available complete source code of licensed works and modifications, 
# which include larger works using a licensed work, under the same license. 
# Copyright and license notices must be preserved. 
# Contributors provide an express grant of patent rights. 
# When a modified version is used to provide a service over a network, 
# the complete source code of the modified version must be made available.
# ==============================================================================

# Title: Simulating the evolution of CRISPR
# Author: Hyunjin Shim
# Date created: 20200727
# Email: jinenstar@gmail.com

# Simulating the evolution of CRISPR
## Step 1:

In [14]:
# packages

# Data
import glob
import os
from pathlib import Path
from typing import Dict, Iterable, List, Tuple

# Biopython
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

In [None]:
# global variables
# NOTE(review): presumably the bacterial mutation rate used by the simulation;
# its units (e.g. per site per generation) are not stated here — confirm.
BACTERIA_M = 1e-7

In [21]:
# functions

def extract_dataset_info(records: Iterable[SeqRecord]) -> Dict[str, list]:
    """Collect the ID, sequence string and length of each record.

    Args:
        records: Iterable of objects exposing ``id`` and ``seq`` attributes
            (e.g. ``SeqRecord``). It is traversed exactly once, so a one-shot
            iterator is accepted as well as a list.

    Returns:
        A dict with the keys ``"ID"``, ``"Seq"`` and ``"Length"``. The three
        lists are index-aligned and follow the input order; ``"Seq"`` holds
        ``str(record.seq)`` and ``"Length"`` holds ``len(record.seq)``.
        Empty input gives three empty lists.
    """
    seqs_id = []
    seqs = []
    seqs_len = []
    # Single pass on purpose: iterating `records` once per field would exhaust
    # an iterator after the first field and leave the other lists empty.
    for record in records:
        seqs_id.append(record.id)
        seqs.append(str(record.seq))
        seqs_len.append(len(record.seq))
    return {"ID": seqs_id, "Seq": seqs, "Length": seqs_len}

In [34]:
# data

# Location of the raw FASTA files. The directory can be overridden through the
# CRISPR_DATA_DIR environment variable so the notebook is not tied to one
# machine; when the variable is unset the original path is used.
datapath = Path(
    os.environ.get(
        "CRISPR_DATA_DIR",
        "/Users/jinenstar/Desktop/Data/CRISPR/CRISPR-Cas++/Repeat/Data_DR",
    )
)

# Globbing a missing directory finds no files, which would leave every
# per-type dictionary silently empty; fail loudly instead.
if not datapath.is_dir():
    raise FileNotFoundError(
        f"Data directory not found: {datapath} "
        "(set the CRISPR_DATA_DIR environment variable to override)"
    )

# Glob pattern per CRISPR type; "?" matches exactly one character after the
# type prefix.
# NOTE(review): "TypeI?.fa" also matches TypeIV.fa (it appears among the TypeI
# keys in the recorded output). That file may hold CRISPR Type IV repeats
# rather than a Type I subtype — confirm the intended grouping.
TYPE_PATTERNS = {"TypeI": "TypeI?.fa", "TypeII": "TypeII?.fa"}

# data dictionary: CRISPR type -> file stem -> extract_dataset_info() result
repeat_data_info = {crispr_type: {} for crispr_type in TYPE_PATTERNS}

# load test sequences (one loop for all types instead of one copy per type)
for crispr_type, pattern in TYPE_PATTERNS.items():
    for f in datapath.glob(pattern):
        records = list(SeqIO.parse(str(f), "fasta"))
        key = f.stem
        repeat_data_info[crispr_type][key] = extract_dataset_info(records)

In [36]:
# Show the file stems that were loaded under each CRISPR type, one line per type.
print(
    *(repeat_data_info[crispr_type].keys() for crispr_type in ("TypeI", "TypeII")),
    sep="\n",
)

dict_keys(['TypeIB', 'TypeIF', 'TypeIC', 'TypeIU', 'TypeIV', 'TypeID', 'TypeIA', 'TypeIE'])
dict_keys(['TypeIIC', 'TypeIIB', 'TypeIIA'])


In [None]:
# Finite sites model: a model of mutation acting on DNA sequences of finite
# length, in which the same site may experience a mutation more than once.

