In [1]:
# Copyright 2019 The Authors. All Rights Reserved.
#
# GNU General Public License v3.0
# Permissions of this strongest copyleft license are conditioned on 
# making available complete source code of licensed works and modifications, 
# which include larger works using a licensed work, under the same license. 
# Copyright and license notices must be preserved. 
# Contributors provide an express grant of patent rights. 
# When a modified version is used to provide a service over a network, 
# the complete source code of the modified version must be made available.
# ==============================================================================

# Title: Data labeling of spacers from public databases
# Author: Hyunjin Shim
# Date created: 20200218
# Email: jinenstar@gmail.com

# Processing CRISPR-Cas++ spacers

- Process all spacers from the classified database by matching accession names to their sequences

# Dataset description

- Classified: spacers classified by Type from CRISPR-Cas++

In [1]:
# Data
import os
import pandas as pd
import numpy as np

# Biopython
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

# Regular expression
import re

# Plot
from pylab import *
import pylab as pylab
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.ticker import NullFormatter
import matplotlib.patches as mpatches
%matplotlib inline
import matplotlib.pyplot as plt

# sklearn
from sklearn.model_selection import train_test_split

# Location of the raw data files.
# Set the CRISPR_SPACER_DATAPATH environment variable to run the notebook on
# another machine; when it is unset, the original local directory is used so
# existing behaviour is unchanged.
datapath = os.environ.get(
    'CRISPR_SPACER_DATAPATH',
    '/Users/jinenstar/Desktop/Data/CRISPR/CRISPR-Cas++/Spacer',
)
# All file names used below are relative to this directory.
os.chdir(datapath)

In [2]:
# Parse the spacer FASTA file with Biopython and split each record into
# parallel lists (description, ID, sequence, sequence length).
file_in = '20190618_spacer_34.fasta'

with open(file_in) as fasta_file:  # handle is closed when the block exits
    # SeqIO.parse is a generator, so materialise it while the file is open
    spacer_records = list(SeqIO.parse(fasta_file, 'fasta'))

# The description is stored as parsed; the ID prefix is not stripped from it.
all_des_spacer = [record.description for record in spacer_records]
all_ID_spacer = [record.id for record in spacer_records]
all_sequence_spacer = [record.seq for record in spacer_records]
all_length_spacer = [len(sequence) for sequence in all_sequence_spacer]

# Longest and shortest spacer in the file
max_length_spacer = max(all_length_spacer)
min_length_spacer = min(all_length_spacer)

In [3]:
# Inspect the parsed FASTA record IDs; a single ID can list several
# accessions joined by '|'.
all_ID_spacer

['CP007511.1',
 'CP001836.1',
 'CP004353.1',
 'CP030241.1',
 'LN890520.1|LN890522.1|LN890518.1|CP019410.1|CP014996.1',
 'LT618792.1|LT618781.1|LT618791.1|LT576033.1|LT618783.1|LT618782.1',
 'AP009552.1',
 'CP036455.1',
 'CP028858.1',
 'AP017295.1',
 'ABIZ01000001.1',
 'CP034831.1|CP037960.1|LR134158.1',
 'CP003537.1',
 'CP001804.1',
 'CP025541.2',
 'CP011996.1',
 'CM000955.1',
 'CP028299.1|CP028297.1|CP028295.1',
 'CP012943.1',
 'CP013292.1|CP010848.1',
 'AP014938.1',
 'CP025777.1|CP025706.1',
 'AP006878.1',
 'CP013614.1',
 'CP029033.1',
 'AP009044.1',
 'CP022656.1',
 'LT629764.1',
 'LR590464.1',
 'CP001804.1',
 'CP019698.1',
 'CP009961.1',
 'CP036514.1',
 'CP024278.1|CP024299.1|CP023349.1|CP027371.1|CP010122.1|CP024293.1|CP027449.1|CP024240.1',
 'CP000909.1|CP001364.1',
 'CP001337.1',
 'CP013217.1',
 'CP001338.1',
 'CP031460.1|CP033190.1|CP031464.1|CP013670.1|CP031698.1|CP018087.1|CP031458.1|CP033189.1|AP008229.1|CP031459.1|CP033188.1|CP031457.1|CP033191.1|CP031469.1|CP013678.1',
 'CP

In [4]:
# Accession lists: one ID per line, no header row, read into column 0.
# pandas 1.3+ rejects a '\n' separator with a ValueError, so a tab separator
# is used instead. This assumes the list files contain no tab characters, so
# every line still lands in a single column -- TODO confirm.
file_in = 'Cas1.txt'
data_df_Cas1 = pd.read_csv(file_in, sep='\t', header=None)

file_in = 'orphanCRISPR.txt'
data_df_orphan = pd.read_csv(file_in, sep='\t', header=None)
data_df_orphan

Unnamed: 0,0
0,AP018515.1
1,CP022699.1
2,CP017448.1
3,CP026328.1
4,CP005986.1
5,CP002573.1
6,LT841305.1
7,CP000481.1
8,CP028302.1
9,CP028287.1


In [5]:
# Type I accession lists: one ID per line, no header row, read into column 0.
# pandas 1.3+ rejects a '\n' separator with a ValueError, so a tab separator
# is used instead. This assumes the list files contain no tab characters, so
# every line still lands in a single column -- TODO confirm.
file_in = 'TypeIA_1.txt'
data_df_TypeIA = pd.read_csv(file_in, sep='\t', header=None)

file_in = 'TypeIB_1.txt'
data_df_TypeIB = pd.read_csv(file_in, sep='\t', header=None)

file_in = 'TypeIC_1.txt'
data_df_TypeIC = pd.read_csv(file_in, sep='\t', header=None)

file_in = 'TypeID_1.txt'
data_df_TypeID = pd.read_csv(file_in, sep='\t', header=None)

file_in = 'TypeIE_1.txt'
data_df_TypeIE = pd.read_csv(file_in, sep='\t', header=None)

file_in = 'TypeIF_1.txt'
data_df_TypeIF = pd.read_csv(file_in, sep='\t', header=None)

file_in = 'TypeIU_1.txt'
data_df_TypeIU = pd.read_csv(file_in, sep='\t', header=None)
data_df_TypeIU

Unnamed: 0,0
0,CP023005.1
1,CP007457.1
2,AP018907.1
3,FP565575.1
4,CP009246.1
5,CP013991.1
6,CP009211.1
7,LT906467.1
8,CP035299.1
9,LT671858.1


In [6]:
# Type II accession lists: one ID per line, no header row, read into column 0.
# pandas 1.3+ rejects a '\n' separator with a ValueError, so a tab separator
# is used instead. This assumes the list files contain no tab characters, so
# every line still lands in a single column -- TODO confirm.
file_in = 'TypeIIA_1.txt'
data_df_TypeIIA = pd.read_csv(file_in, sep='\t', header=None)

file_in = 'TypeIIB_1.txt'
data_df_TypeIIB = pd.read_csv(file_in, sep='\t', header=None)

file_in = 'TypeIIC_1.txt'
data_df_TypeIIC = pd.read_csv(file_in, sep='\t', header=None)
data_df_TypeIIC

Unnamed: 0,0
0,CP030040.1
1,FO681347.1
2,CP001392.1
3,CP038145.1
4,CP029206.1
5,CP009159.1
6,CP002449.1
7,LN554847.1
8,CP031219.1
9,CP035928.1


In [7]:
# Type III accession lists: one ID per line, no header row, read into column 0.
# pandas 1.3+ rejects a '\n' separator with a ValueError, so a tab separator
# is used instead. This assumes the list files contain no tab characters, so
# every line still lands in a single column -- TODO confirm.
file_in = 'TypeIIIA_1.txt'
data_df_TypeIIIA = pd.read_csv(file_in, sep='\t', header=None)

file_in = 'TypeIIIB_1.txt'
data_df_TypeIIIB = pd.read_csv(file_in, sep='\t', header=None)

file_in = 'TypeIIIC_1.txt'
data_df_TypeIIIC = pd.read_csv(file_in, sep='\t', header=None)

file_in = 'TypeIIID_1.txt'
data_df_TypeIIID = pd.read_csv(file_in, sep='\t', header=None)
data_df_TypeIIID

Unnamed: 0,0
0,CP008808.1
1,CP008810.1
2,CP006999.2
3,HG004426.1
4,LT907844.1
5,CP014944.1
6,AP014800.1


In [8]:
# Type IV accession list: one ID per line, no header row, read into column 0.
# pandas 1.3+ rejects a '\n' separator with a ValueError, so a tab separator
# is used instead. This assumes the list file contains no tab characters, so
# every line still lands in a single column -- TODO confirm.
file_in = 'TypeIV_1.txt'
data_df_TypeIV = pd.read_csv(file_in, sep='\t', header=None)
data_df_TypeIV

Unnamed: 0,0
0,CP018030.1
1,CP023721.1


In [9]:
# Type V-A accession list: one ID per line, no header row, read into column 0.
# pandas 1.3+ rejects a '\n' separator with a ValueError, so a tab separator
# is used instead. This assumes the list file contains no tab characters, so
# every line still lands in a single column -- TODO confirm.
file_in = 'TypeVA_1.txt'
data_df_TypeVA = pd.read_csv(file_in, sep='\t', header=None)
data_df_TypeVA

Unnamed: 0,0
0,CP010070.1
1,AP018536.1
2,CP011376.1
3,CP011377.1
4,CP011378.1
5,CP011379.1
6,CP011280.1


In [10]:
# Type VI-B accession lists: one ID per line, no header row, read into column 0.
# pandas 1.3+ rejects a '\n' separator with a ValueError, so a tab separator
# is used instead. This assumes the list files contain no tab characters, so
# every line still lands in a single column -- TODO confirm.
file_in = 'TypeVIB1_1.txt'
data_df_TypeVIB1 = pd.read_csv(file_in, sep='\t', header=None)

file_in = 'TypeVIB2_1.txt'
data_df_TypeVIB2 = pd.read_csv(file_in, sep='\t', header=None)
data_df_TypeVIB2

Unnamed: 0,0
0,CM001167.1


In [11]:
# The same "match accessions, then write a FASTA file" step was previously
# copy-pasted once per label (and twice for Cas1). It is expressed here as
# two helpers plus a single loop; each label still produces '<label>.fa'.


def collect_matching_spacers(accessions, record_ids, record_sequences):
    """Pair each labelled accession with every spacer record that contains it.

    Parameters
    ----------
    accessions : iterable of str
        Accession IDs belonging to one label.
    record_ids : sequence of str
        FASTA record IDs; a single ID may list several accessions joined
        by '|'.
    record_sequences : sequence
        Spacer sequences, index-aligned with ``record_ids``.

    Returns
    -------
    tuple of (list, list)
        The matched accession IDs and the corresponding sequences, in the
        order the matches are found (accession by accession, then in record
        order). An accession found in several records yields one entry per
        record.
    """
    matched_ids = []
    matched_sequences = []
    for accession in accessions:
        for record_id, sequence in zip(record_ids, record_sequences):
            # Substring test, equivalent to the original `find(...) != -1`.
            # NOTE(review): an accession that is a substring of a different
            # accession would also match; compare against
            # record_id.split('|') if exact matching is wanted.
            if accession in record_id:
                matched_ids.append(accession)
                matched_sequences.append(sequence)
    return matched_ids, matched_sequences


def write_fasta(path, ids, sequences):
    """Write ``>id`` / sequence line pairs to ``path``, overwriting the file."""
    with open(path, 'w') as f:
        for record_id, sequence in zip(ids, sequences):
            f.write(">" + record_id + "\n" + str(sequence) + "\n")


# (output label, accession table) pairs, in the original processing order.
labelled_tables = [
    ('TypeIA', data_df_TypeIA),
    ('TypeIB', data_df_TypeIB),
    ('TypeIC', data_df_TypeIC),
    ('TypeID', data_df_TypeID),
    ('TypeIE', data_df_TypeIE),
    ('TypeIF', data_df_TypeIF),
    ('TypeIIA', data_df_TypeIIA),
    ('TypeIIB', data_df_TypeIIB),
    ('TypeIIC', data_df_TypeIIC),
    ('TypeIIIA', data_df_TypeIIIA),
    ('TypeIIIB', data_df_TypeIIIB),
    ('TypeIIIC', data_df_TypeIIIC),
    ('TypeIIID', data_df_TypeIIID),
    ('TypeIU', data_df_TypeIU),
    ('TypeIV', data_df_TypeIV),
    ('TypeVA', data_df_TypeVA),
    ('TypeVIB1', data_df_TypeVIB1),
    ('TypeVIB2', data_df_TypeVIB2),
    ('Cas1', data_df_Cas1),
    ('orphan', data_df_orphan),
]

for data_df_name, label_table in labelled_tables:
    # Column 0 holds the accession IDs of this label.
    data_df = label_table[0]
    repeat_ID_list, repeat_sequence_list = collect_matching_spacers(
        data_df, all_ID_spacer, all_sequence_spacer
    )

    file_w = data_df_name + '.fa'
    write_fasta(file_w, repeat_ID_list, repeat_sequence_list)