In [1]:
# Copyright 2019 The Authors. All Rights Reserved.
#
# GNU General Public License v3.0
# Permissions of this strongest copyleft license are conditioned on 
# making available complete source code of licensed works and modifications, 
# which include larger works using a licensed work, under the same license. 
# Copyright and license notices must be preserved. 
# Contributors provide an express grant of patent rights. 
# When a modified version is used to provide a service over a network, 
# the complete source code of the modified version must be made available.
# ==============================================================================

# Title: Data labeling of repeats from public databases
# Author: Hyunjin Shim
# Date created: 20191226
# Email: jinenstar@gmail.com

# Processing PPV repeats 

- Goal: process all repeats from the PPV database (cleaning and labeling)

# Dataset description
- PPV: phage/plasmid/virus

# Step 1: Load and preprocess raw data

In [1]:
# Data
import os
import pandas as pd
import numpy as np

# Biopython
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

# Regular expression
import re

# Plot
from pylab import *
import pylab as pylab
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.ticker import NullFormatter
import matplotlib.patches as mpatches
%matplotlib inline
import matplotlib.pyplot as plt

# sklearn
from sklearn.model_selection import train_test_split

# location of raw data file
# NOTE(review): absolute, machine-specific path. os.chdir changes the working
# directory, so the relative file names used in later cells resolve against it.
datapath = '/Users/jinenstar/Desktop/Data/Progress'
os.chdir(datapath)

In [58]:
file_in = '20190903_PPV_CRISPR_repeats_edited.fa'

# with open(file_in) as f:
#     line_list = f.readlines()

# line_list

# Read the FASTA file as a two-column table by splitting every line on '>'.
# As the displayed output shows, header lines ('>name') end up with NaN in
# column 0 and the name in column 1, while sequence lines end up with the
# sequence in column 0 and NaN in column 1.
data_df = pd.read_csv(file_in, delimiter= '>', header=None)
data_df

Unnamed: 0,0,1
0,,scnpilot_expt_750_p_scaffold_1076_CRISPR_1_spa...
1,GTGACTCCAGATGCATCCTAGTGACTCCAGATG,
2,GTGACTCCAGATGCATCCTAGTGACTCCAGATG,
3,GTGACTCCAGATGCATCCTAGTGACTCCAGATG,
4,,S5_scaffold_2007_CRISPR_2_spacer_1
5,ACGCCCGGAATATATTTTAATCCTTC,
6,ATATTTGAAATATATTTTAATCCTTC,
7,ACGTCTGGAATATATTTTAATCCTTT,
8,GCATTCGGAATATATTTTAATCCTTC,
9,ACATTTGGAATATATTTTAATCCTTC,


In [59]:
# Column 1 holds the header names; rows that carry a sequence show NaN here.
data_df[1]

0        scnpilot_expt_750_p_scaffold_1076_CRISPR_1_spa...
1                                                      NaN
2                                                      NaN
3                                                      NaN
4                       S5_scaffold_2007_CRISPR_2_spacer_1
5                                                      NaN
6                                                      NaN
7                                                      NaN
8                                                      NaN
9                                                      NaN
10                                                     NaN
11                                                     NaN
12                    qh_9_scaffold_3436_CRISPR_3_spacer_1
13                                                     NaN
14                                                     NaN
15                                                     NaN
16                                                     N

In [63]:
# Collect the header names in file order: keep every entry of column 1 whose
# string form is not 'nan' (the NaN placeholders belong to sequence rows).
repeat_name_list = [name for name in data_df[1] if str(name) != 'nan']

In [65]:
# Spot-check the first collected header name.
repeat_name_list[0]

'scnpilot_expt_750_p_scaffold_1076_CRISPR_1_spacer_1'

In [69]:
# Re-write the repeats as a FASTA file in which every sequence carries the
# name of the header line that precedes it in the input.
repeat_name = None  # name taken from the most recent header row

with open('20190903_PPV_CRISPR_repeats_edited_processed.fa', 'w') as f:
    for sequence, name in zip(data_df[0], data_df[1]):
        if pd.isna(sequence):
            # Header row: column 0 is empty and column 1 holds the name.
            # Reading the name from the same row keeps names and sequences
            # aligned without depending on a separately built name list.
            repeat_name = name
            continue
        if pd.isna(repeat_name):
            # Either the file starts with a sequence or the preceding header
            # had no name; fail loudly instead of mislabeling the record.
            raise ValueError(
                "sequence %r has no named header line before it" % (sequence,)
            )
        f.write(">" + repeat_name + "\n" + sequence + "\n")


In [70]:
# Rows that are not header rows: total row count minus the number of header names.
len(data_df) - len(repeat_name_list)

12435

# Processing CRISPR-Cas++ repeats 

- Goal: process all repeats from the classified database (matching names to sequences)

# Dataset description

- Classified: repeats classified by Type from CRISPR-Cas++

In [2]:
# Data
import os
import pandas as pd
import numpy as np

# Biopython
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

# Regular expression
import re

# Plot
from pylab import *
import pylab as pylab
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.ticker import NullFormatter
import matplotlib.patches as mpatches
%matplotlib inline
import matplotlib.pyplot as plt

# sklearn
from sklearn.model_selection import train_test_split

# location of raw data file
# NOTE(review): absolute, machine-specific path. os.chdir changes the working
# directory, so the relative file names used in later cells resolve against it.
datapath = '/Users/jinenstar/Desktop/Data/CRISPR/CRISPR-Cas++/Class'
os.chdir(datapath)

In [3]:
# use Biopython to process and parse sequences
file_in ='20190618_dr_34.fasta'

# Materialise all records while the handle is open; it is closed on exit.
with open(file_in) as fasta_file:
    fasta_records = list(SeqIO.parse(fasta_file, 'fasta'))

# Parallel lists with one entry per FASTA record, in file order.
all_ID_spacer = [record.id for record in fasta_records]
all_sequence_spacer = [record.seq for record in fasta_records]
all_length_spacer = [len(record.seq) for record in fasta_records]
all_des_spacer = [record.description for record in fasta_records]

# longest and shortest sequence length
max_length_spacer = max(all_length_spacer)
min_length_spacer = min(all_length_spacer)

In [4]:
# Inspect the parsed record IDs; as the output shows, one ID can be several
# accession numbers joined by '|'.
all_ID_spacer

['CP022521.1',
 'CP017986.1|CP018313.1|CP018687.1|CP018714.1',
 'CP009803.1|CM002272.1|CP029339.1',
 'LT629804.1|CP002045.1|LS483427.1',
 'CP022737.1|LN813019.1',
 'LR026975.1',
 'CP035926.1|CP034309.1|CP035928.1',
 'CP027860.1',
 'CP002952.1|CP008887.1',
 'CP006577.1',
 'CP003276.1|CP003721.1',
 'CP001779.1|CP027400.1',
 'CP003253.1',
 'LR134378.1',
 'CP020917.1|CP033730.1',
 'LR134327.1',
 'CP021983.2',
 'CP014862.1',
 'CP000859.1',
 'FN545816.1|CM000441.2|CP011846.1|CP026596.1|LN614756.1|FN668944.1|CP026597.1|CP010888.1|CP019860.1|CP020380.1|CP025044.1|FN665654.1|CP019870.1|CM000661.1|CP028530.1|CM000287.4|CP025047.1|CP026599.1|CP025046.1|CP019469.1|CP012320.1|FN668941.1|CP020379.1|CP012321.1|CP027014.1|AM180355.1|CP022524.1|CM000637.1|CP011847.1|CP025045.1|FN665652.1|CP016106.1|CP026598.1|CP016318.1|CP010905.2|CP011968.1|CP016104.1|CP019858.1|CM000657.1|CP020378.1|CP028527.1|CM000658.1|ABHD02000050.1|CP028523.1|CP028526.1|CP026593.1|FN538970.1|CM000659.1|CP013196.1|CP028528.1|CP011

In [5]:
# Accession-ID lists for the Cas1 and orphan-CRISPR groups. Each file is read
# into a single column labelled 0 (assumes one ID per line and no commas in the
# IDs). No explicit delimiter is passed: recent pandas releases raise a
# ValueError when '\n' is given as the delimiter.
file_in = 'Cas1.txt'
data_df_Cas1 = pd.read_csv(file_in, header=None)

file_in = 'orphanCRISPR.txt'
data_df_orphan = pd.read_csv(file_in, header=None)
data_df_orphan

Unnamed: 0,0
0,CP000828.1
1,AP018515.1
2,CP022699.1
3,CP017448.1
4,CP026328.1
5,CP005986.1
6,CP002573.1
7,LT841305.1
8,CP000481.1
9,CP028302.1


In [104]:
# Accession-ID lists for the Type I files (IA, IB, IC, ID, IE, IF, IU). Each
# file is read into a single column labelled 0 (assumes one ID per line and no
# commas in the IDs). No explicit delimiter is passed: recent pandas releases
# raise a ValueError when '\n' is given as the delimiter.
file_in = 'TypeIA_1.txt'
data_df_TypeIA = pd.read_csv(file_in, header=None)

file_in = 'TypeIB_1.txt'
data_df_TypeIB = pd.read_csv(file_in, header=None)

file_in = 'TypeIC_1.txt'
data_df_TypeIC = pd.read_csv(file_in, header=None)

file_in = 'TypeID_1.txt'
data_df_TypeID = pd.read_csv(file_in, header=None)

file_in = 'TypeIE_1.txt'
data_df_TypeIE = pd.read_csv(file_in, header=None)

file_in = 'TypeIF_1.txt'
data_df_TypeIF = pd.read_csv(file_in, header=None)

file_in = 'TypeIU_1.txt'
data_df_TypeIU = pd.read_csv(file_in, header=None)
data_df_TypeIU

Unnamed: 0,0
0,CP023005.1
1,CP007457.1
2,AP018907.1
3,FP565575.1
4,CP009246.1
5,CP013991.1
6,CP009211.1
7,LT906467.1
8,CP035299.1
9,LT671858.1


In [105]:
# Accession-ID lists for the Type II files (IIA, IIB, IIC). Each file is read
# into a single column labelled 0 (assumes one ID per line and no commas in the
# IDs). No explicit delimiter is passed: recent pandas releases raise a
# ValueError when '\n' is given as the delimiter.
file_in = 'TypeIIA_1.txt'
data_df_TypeIIA = pd.read_csv(file_in, header=None)

file_in = 'TypeIIB_1.txt'
data_df_TypeIIB = pd.read_csv(file_in, header=None)

file_in = 'TypeIIC_1.txt'
data_df_TypeIIC = pd.read_csv(file_in, header=None)
data_df_TypeIIC

Unnamed: 0,0
0,CP030040.1
1,FO681347.1
2,CP001392.1
3,CP038145.1
4,CP029206.1
5,CP009159.1
6,CP002449.1
7,LN554847.1
8,CP031219.1
9,CP035928.1


In [106]:
# Accession-ID lists for the Type III files (IIIA, IIIB, IIIC, IIID). Each file
# is read into a single column labelled 0 (assumes one ID per line and no
# commas in the IDs). No explicit delimiter is passed: recent pandas releases
# raise a ValueError when '\n' is given as the delimiter.
file_in = 'TypeIIIA_1.txt'
data_df_TypeIIIA = pd.read_csv(file_in, header=None)

file_in = 'TypeIIIB_1.txt'
data_df_TypeIIIB = pd.read_csv(file_in, header=None)

file_in = 'TypeIIIC_1.txt'
data_df_TypeIIIC = pd.read_csv(file_in, header=None)

file_in = 'TypeIIID_1.txt'
data_df_TypeIIID = pd.read_csv(file_in, header=None)
data_df_TypeIIID

Unnamed: 0,0
0,CP008808.1
1,CP008810.1
2,CP006999.2
3,HG004426.1
4,LT907844.1
5,CP014944.1
6,AP014800.1


In [107]:
# Accession-ID list for the Type IV file, read into a single column labelled 0
# (assumes one ID per line and no commas in the IDs). No explicit delimiter is
# passed: recent pandas releases raise a ValueError when '\n' is given as the
# delimiter.
file_in = 'TypeIV_1.txt'
data_df_TypeIV = pd.read_csv(file_in, header=None)
data_df_TypeIV

Unnamed: 0,0
0,CP018030.1
1,CP023721.1


In [108]:
# Accession-ID list for the Type VA file, read into a single column labelled 0
# (assumes one ID per line and no commas in the IDs). No explicit delimiter is
# passed: recent pandas releases raise a ValueError when '\n' is given as the
# delimiter.
file_in = 'TypeVA_1.txt'
data_df_TypeVA = pd.read_csv(file_in, header=None)
data_df_TypeVA

Unnamed: 0,0
0,CP010070.1
1,AP018536.1
2,CP011376.1
3,CP011377.1
4,CP011378.1
5,CP011379.1
6,CP011280.1


In [109]:
# Accession-ID lists for the Type VIB1 and VIB2 files. Each file is read into a
# single column labelled 0 (assumes one ID per line and no commas in the IDs).
# No explicit delimiter is passed: recent pandas releases raise a ValueError
# when '\n' is given as the delimiter.
file_in = 'TypeVIB1_1.txt'
data_df_TypeVIB1 = pd.read_csv(file_in, header=None)

file_in = 'TypeVIB2_1.txt'
data_df_TypeVIB2 = pd.read_csv(file_in, header=None)
data_df_TypeVIB2

Unnamed: 0,0
0,CM001167.1


In [110]:
# Pair every accession ID in data_df_TypeIA with each repeat whose FASTA header
# lists that ID, then write the pairs to 'TypeIA.fa' (one record per pair).
data_df = data_df_TypeIA[0]
data_df_name = 'TypeIA'
repeat_ID_list = []
repeat_sequence_list = []

for accession in data_df:
    for header_ids, sequence in zip(all_ID_spacer, all_sequence_spacer):
        # A header is a '|'-joined list of accession IDs; compare whole IDs
        # rather than substrings so 'X.1' cannot match inside 'X.10'.
        if accession in header_ids.split('|'):
            repeat_ID_list.append(accession)
            repeat_sequence_list.append(sequence)

file_w = data_df_name + '.fa'

with open(file_w, 'w') as f:
    for accession, sequence in zip(repeat_ID_list, repeat_sequence_list):
        # str() because the stored sequences are Biopython Seq objects
        f.write(">" + accession + "\n" + str(sequence) + "\n")

In [111]:
# Pair every accession ID in data_df_TypeIB with each repeat whose FASTA header
# lists that ID, then write the pairs to 'TypeIB.fa' (one record per pair).
data_df = data_df_TypeIB[0]
data_df_name = 'TypeIB'
repeat_ID_list = []
repeat_sequence_list = []

for accession in data_df:
    for header_ids, sequence in zip(all_ID_spacer, all_sequence_spacer):
        # A header is a '|'-joined list of accession IDs; compare whole IDs
        # rather than substrings so 'X.1' cannot match inside 'X.10'.
        if accession in header_ids.split('|'):
            repeat_ID_list.append(accession)
            repeat_sequence_list.append(sequence)

file_w = data_df_name + '.fa'

with open(file_w, 'w') as f:
    for accession, sequence in zip(repeat_ID_list, repeat_sequence_list):
        # str() because the stored sequences are Biopython Seq objects
        f.write(">" + accession + "\n" + str(sequence) + "\n")

In [112]:
# Pair every accession ID in data_df_TypeIC with each repeat whose FASTA header
# lists that ID, then write the pairs to 'TypeIC.fa' (one record per pair).
data_df = data_df_TypeIC[0]
data_df_name = 'TypeIC'
repeat_ID_list = []
repeat_sequence_list = []

for accession in data_df:
    for header_ids, sequence in zip(all_ID_spacer, all_sequence_spacer):
        # A header is a '|'-joined list of accession IDs; compare whole IDs
        # rather than substrings so 'X.1' cannot match inside 'X.10'.
        if accession in header_ids.split('|'):
            repeat_ID_list.append(accession)
            repeat_sequence_list.append(sequence)

file_w = data_df_name + '.fa'

with open(file_w, 'w') as f:
    for accession, sequence in zip(repeat_ID_list, repeat_sequence_list):
        # str() because the stored sequences are Biopython Seq objects
        f.write(">" + accession + "\n" + str(sequence) + "\n")

In [113]:
# Pair every accession ID in data_df_TypeID with each repeat whose FASTA header
# lists that ID, then write the pairs to 'TypeID.fa' (one record per pair).
data_df = data_df_TypeID[0]
data_df_name = 'TypeID'
repeat_ID_list = []
repeat_sequence_list = []

for accession in data_df:
    for header_ids, sequence in zip(all_ID_spacer, all_sequence_spacer):
        # A header is a '|'-joined list of accession IDs; compare whole IDs
        # rather than substrings so 'X.1' cannot match inside 'X.10'.
        if accession in header_ids.split('|'):
            repeat_ID_list.append(accession)
            repeat_sequence_list.append(sequence)

file_w = data_df_name + '.fa'

with open(file_w, 'w') as f:
    for accession, sequence in zip(repeat_ID_list, repeat_sequence_list):
        # str() because the stored sequences are Biopython Seq objects
        f.write(">" + accession + "\n" + str(sequence) + "\n")

In [114]:
# Pair every accession ID in data_df_TypeIE with each repeat whose FASTA header
# lists that ID, then write the pairs to 'TypeIE.fa' (one record per pair).
data_df = data_df_TypeIE[0]
data_df_name = 'TypeIE'
repeat_ID_list = []
repeat_sequence_list = []

for accession in data_df:
    for header_ids, sequence in zip(all_ID_spacer, all_sequence_spacer):
        # A header is a '|'-joined list of accession IDs; compare whole IDs
        # rather than substrings so 'X.1' cannot match inside 'X.10'.
        if accession in header_ids.split('|'):
            repeat_ID_list.append(accession)
            repeat_sequence_list.append(sequence)

file_w = data_df_name + '.fa'

with open(file_w, 'w') as f:
    for accession, sequence in zip(repeat_ID_list, repeat_sequence_list):
        # str() because the stored sequences are Biopython Seq objects
        f.write(">" + accession + "\n" + str(sequence) + "\n")

In [115]:
# Pair every accession ID in data_df_TypeIF with each repeat whose FASTA header
# lists that ID, then write the pairs to 'TypeIF.fa' (one record per pair).
data_df = data_df_TypeIF[0]
data_df_name = 'TypeIF'
repeat_ID_list = []
repeat_sequence_list = []

for accession in data_df:
    for header_ids, sequence in zip(all_ID_spacer, all_sequence_spacer):
        # A header is a '|'-joined list of accession IDs; compare whole IDs
        # rather than substrings so 'X.1' cannot match inside 'X.10'.
        if accession in header_ids.split('|'):
            repeat_ID_list.append(accession)
            repeat_sequence_list.append(sequence)

file_w = data_df_name + '.fa'

with open(file_w, 'w') as f:
    for accession, sequence in zip(repeat_ID_list, repeat_sequence_list):
        # str() because the stored sequences are Biopython Seq objects
        f.write(">" + accession + "\n" + str(sequence) + "\n")

In [84]:
# Pair every accession ID in data_df_TypeIIA with each repeat whose FASTA header
# lists that ID, then write the pairs to 'TypeIIA.fa' (one record per pair).
data_df = data_df_TypeIIA[0]
data_df_name = 'TypeIIA'
repeat_ID_list = []
repeat_sequence_list = []

for accession in data_df:
    for header_ids, sequence in zip(all_ID_spacer, all_sequence_spacer):
        # A header is a '|'-joined list of accession IDs; compare whole IDs
        # rather than substrings so 'X.1' cannot match inside 'X.10'.
        if accession in header_ids.split('|'):
            repeat_ID_list.append(accession)
            repeat_sequence_list.append(sequence)

file_w = data_df_name + '.fa'

with open(file_w, 'w') as f:
    for accession, sequence in zip(repeat_ID_list, repeat_sequence_list):
        # str() because the stored sequences are Biopython Seq objects
        f.write(">" + accession + "\n" + str(sequence) + "\n")

In [85]:
# Pair every accession ID in data_df_TypeIIB with each repeat whose FASTA header
# lists that ID, then write the pairs to 'TypeIIB.fa' (one record per pair).
data_df = data_df_TypeIIB[0]
data_df_name = 'TypeIIB'
repeat_ID_list = []
repeat_sequence_list = []

for accession in data_df:
    for header_ids, sequence in zip(all_ID_spacer, all_sequence_spacer):
        # A header is a '|'-joined list of accession IDs; compare whole IDs
        # rather than substrings so 'X.1' cannot match inside 'X.10'.
        if accession in header_ids.split('|'):
            repeat_ID_list.append(accession)
            repeat_sequence_list.append(sequence)

file_w = data_df_name + '.fa'

with open(file_w, 'w') as f:
    for accession, sequence in zip(repeat_ID_list, repeat_sequence_list):
        # str() because the stored sequences are Biopython Seq objects
        f.write(">" + accession + "\n" + str(sequence) + "\n")

In [86]:
# Pair every accession ID in data_df_TypeIIC with each repeat whose FASTA header
# lists that ID, then write the pairs to 'TypeIIC.fa' (one record per pair).
data_df = data_df_TypeIIC[0]
data_df_name = 'TypeIIC'
repeat_ID_list = []
repeat_sequence_list = []

for accession in data_df:
    for header_ids, sequence in zip(all_ID_spacer, all_sequence_spacer):
        # A header is a '|'-joined list of accession IDs; compare whole IDs
        # rather than substrings so 'X.1' cannot match inside 'X.10'.
        if accession in header_ids.split('|'):
            repeat_ID_list.append(accession)
            repeat_sequence_list.append(sequence)

file_w = data_df_name + '.fa'

with open(file_w, 'w') as f:
    for accession, sequence in zip(repeat_ID_list, repeat_sequence_list):
        # str() because the stored sequences are Biopython Seq objects
        f.write(">" + accession + "\n" + str(sequence) + "\n")

In [87]:
# Pair every accession ID in data_df_TypeIIIA with each repeat whose FASTA
# header lists that ID, then write the pairs to 'TypeIIIA.fa' (one record per
# pair).
data_df = data_df_TypeIIIA[0]
data_df_name = 'TypeIIIA'
repeat_ID_list = []
repeat_sequence_list = []

for accession in data_df:
    for header_ids, sequence in zip(all_ID_spacer, all_sequence_spacer):
        # A header is a '|'-joined list of accession IDs; compare whole IDs
        # rather than substrings so 'X.1' cannot match inside 'X.10'.
        if accession in header_ids.split('|'):
            repeat_ID_list.append(accession)
            repeat_sequence_list.append(sequence)

file_w = data_df_name + '.fa'

with open(file_w, 'w') as f:
    for accession, sequence in zip(repeat_ID_list, repeat_sequence_list):
        # str() because the stored sequences are Biopython Seq objects
        f.write(">" + accession + "\n" + str(sequence) + "\n")

In [88]:
# Pair every accession ID in data_df_TypeIIIB with each repeat whose FASTA
# header lists that ID, then write the pairs to 'TypeIIIB.fa' (one record per
# pair).
data_df = data_df_TypeIIIB[0]
data_df_name = 'TypeIIIB'
repeat_ID_list = []
repeat_sequence_list = []

for accession in data_df:
    for header_ids, sequence in zip(all_ID_spacer, all_sequence_spacer):
        # A header is a '|'-joined list of accession IDs; compare whole IDs
        # rather than substrings so 'X.1' cannot match inside 'X.10'.
        if accession in header_ids.split('|'):
            repeat_ID_list.append(accession)
            repeat_sequence_list.append(sequence)

file_w = data_df_name + '.fa'

with open(file_w, 'w') as f:
    for accession, sequence in zip(repeat_ID_list, repeat_sequence_list):
        # str() because the stored sequences are Biopython Seq objects
        f.write(">" + accession + "\n" + str(sequence) + "\n")

In [89]:
# Pair every accession ID in data_df_TypeIIIC with each repeat whose FASTA
# header lists that ID, then write the pairs to 'TypeIIIC.fa' (one record per
# pair).
data_df = data_df_TypeIIIC[0]
data_df_name = 'TypeIIIC'
repeat_ID_list = []
repeat_sequence_list = []

for accession in data_df:
    for header_ids, sequence in zip(all_ID_spacer, all_sequence_spacer):
        # A header is a '|'-joined list of accession IDs; compare whole IDs
        # rather than substrings so 'X.1' cannot match inside 'X.10'.
        if accession in header_ids.split('|'):
            repeat_ID_list.append(accession)
            repeat_sequence_list.append(sequence)

file_w = data_df_name + '.fa'

with open(file_w, 'w') as f:
    for accession, sequence in zip(repeat_ID_list, repeat_sequence_list):
        # str() because the stored sequences are Biopython Seq objects
        f.write(">" + accession + "\n" + str(sequence) + "\n")

In [90]:
# Pair every accession ID in data_df_TypeIIID with each repeat whose FASTA
# header lists that ID, then write the pairs to 'TypeIIID.fa' (one record per
# pair).
data_df = data_df_TypeIIID[0]
data_df_name = 'TypeIIID'
repeat_ID_list = []
repeat_sequence_list = []

for accession in data_df:
    for header_ids, sequence in zip(all_ID_spacer, all_sequence_spacer):
        # A header is a '|'-joined list of accession IDs; compare whole IDs
        # rather than substrings so 'X.1' cannot match inside 'X.10'.
        if accession in header_ids.split('|'):
            repeat_ID_list.append(accession)
            repeat_sequence_list.append(sequence)

file_w = data_df_name + '.fa'

with open(file_w, 'w') as f:
    for accession, sequence in zip(repeat_ID_list, repeat_sequence_list):
        # str() because the stored sequences are Biopython Seq objects
        f.write(">" + accession + "\n" + str(sequence) + "\n")

In [94]:
# Pair every accession ID in data_df_TypeIU with each repeat whose FASTA header
# lists that ID, then write the pairs to 'TypeIU.fa' (one record per pair).
data_df = data_df_TypeIU[0]
data_df_name = 'TypeIU'
repeat_ID_list = []
repeat_sequence_list = []

for accession in data_df:
    for header_ids, sequence in zip(all_ID_spacer, all_sequence_spacer):
        # A header is a '|'-joined list of accession IDs; compare whole IDs
        # rather than substrings so 'X.1' cannot match inside 'X.10'.
        if accession in header_ids.split('|'):
            repeat_ID_list.append(accession)
            repeat_sequence_list.append(sequence)

file_w = data_df_name + '.fa'

with open(file_w, 'w') as f:
    for accession, sequence in zip(repeat_ID_list, repeat_sequence_list):
        # str() because the stored sequences are Biopython Seq objects
        f.write(">" + accession + "\n" + str(sequence) + "\n")

In [95]:
# Pair every accession ID in data_df_TypeIV with each repeat whose FASTA header
# lists that ID, then write the pairs to 'TypeIV.fa' (one record per pair).
data_df = data_df_TypeIV[0]
data_df_name = 'TypeIV'
repeat_ID_list = []
repeat_sequence_list = []

for accession in data_df:
    for header_ids, sequence in zip(all_ID_spacer, all_sequence_spacer):
        # A header is a '|'-joined list of accession IDs; compare whole IDs
        # rather than substrings so 'X.1' cannot match inside 'X.10'.
        if accession in header_ids.split('|'):
            repeat_ID_list.append(accession)
            repeat_sequence_list.append(sequence)

file_w = data_df_name + '.fa'

with open(file_w, 'w') as f:
    for accession, sequence in zip(repeat_ID_list, repeat_sequence_list):
        # str() because the stored sequences are Biopython Seq objects
        f.write(">" + accession + "\n" + str(sequence) + "\n")

In [96]:
# Pair every accession ID in data_df_TypeVA with each repeat whose FASTA header
# lists that ID, then write the pairs to 'TypeVA.fa' (one record per pair).
data_df = data_df_TypeVA[0]
data_df_name = 'TypeVA'
repeat_ID_list = []
repeat_sequence_list = []

for accession in data_df:
    for header_ids, sequence in zip(all_ID_spacer, all_sequence_spacer):
        # A header is a '|'-joined list of accession IDs; compare whole IDs
        # rather than substrings so 'X.1' cannot match inside 'X.10'.
        if accession in header_ids.split('|'):
            repeat_ID_list.append(accession)
            repeat_sequence_list.append(sequence)

file_w = data_df_name + '.fa'

with open(file_w, 'w') as f:
    for accession, sequence in zip(repeat_ID_list, repeat_sequence_list):
        # str() because the stored sequences are Biopython Seq objects
        f.write(">" + accession + "\n" + str(sequence) + "\n")

In [97]:
# Pair every accession ID in data_df_TypeVIB1 with each repeat whose FASTA
# header lists that ID, then write the pairs to 'TypeVIB1.fa' (one record per
# pair).
data_df = data_df_TypeVIB1[0]
data_df_name = 'TypeVIB1'
repeat_ID_list = []
repeat_sequence_list = []

for accession in data_df:
    for header_ids, sequence in zip(all_ID_spacer, all_sequence_spacer):
        # A header is a '|'-joined list of accession IDs; compare whole IDs
        # rather than substrings so 'X.1' cannot match inside 'X.10'.
        if accession in header_ids.split('|'):
            repeat_ID_list.append(accession)
            repeat_sequence_list.append(sequence)

file_w = data_df_name + '.fa'

with open(file_w, 'w') as f:
    for accession, sequence in zip(repeat_ID_list, repeat_sequence_list):
        # str() because the stored sequences are Biopython Seq objects
        f.write(">" + accession + "\n" + str(sequence) + "\n")

In [98]:
# Pair every accession ID in data_df_TypeVIB2 with each repeat whose FASTA
# header lists that ID, then write the pairs to 'TypeVIB2.fa' (one record per
# pair).
data_df = data_df_TypeVIB2[0]
data_df_name = 'TypeVIB2'
repeat_ID_list = []
repeat_sequence_list = []

for accession in data_df:
    for header_ids, sequence in zip(all_ID_spacer, all_sequence_spacer):
        # A header is a '|'-joined list of accession IDs; compare whole IDs
        # rather than substrings so 'X.1' cannot match inside 'X.10'.
        if accession in header_ids.split('|'):
            repeat_ID_list.append(accession)
            repeat_sequence_list.append(sequence)

file_w = data_df_name + '.fa'

with open(file_w, 'w') as f:
    for accession, sequence in zip(repeat_ID_list, repeat_sequence_list):
        # str() because the stored sequences are Biopython Seq objects
        f.write(">" + accession + "\n" + str(sequence) + "\n")

In [99]:
# Pair every accession ID in data_df_Cas1 with each repeat whose FASTA header
# lists that ID, then write the pairs to 'Cas1.fa' (one record per pair).
data_df = data_df_Cas1[0]
data_df_name = 'Cas1'
repeat_ID_list = []
repeat_sequence_list = []

for accession in data_df:
    for header_ids, sequence in zip(all_ID_spacer, all_sequence_spacer):
        # A header is a '|'-joined list of accession IDs; compare whole IDs
        # rather than substrings so 'X.1' cannot match inside 'X.10'.
        if accession in header_ids.split('|'):
            repeat_ID_list.append(accession)
            repeat_sequence_list.append(sequence)

file_w = data_df_name + '.fa'

with open(file_w, 'w') as f:
    for accession, sequence in zip(repeat_ID_list, repeat_sequence_list):
        # str() because the stored sequences are Biopython Seq objects
        f.write(">" + accession + "\n" + str(sequence) + "\n")

In [None]:
# NOTE(review): this cell processes the same input (data_df_Cas1) and writes
# the same output file ('Cas1.fa') as the cell directly before it; it looks
# like a leftover copy and can probably be deleted.
#
# Pair every accession ID in data_df_Cas1 with each repeat whose FASTA header
# lists that ID, then write the pairs to 'Cas1.fa' (one record per pair).
data_df = data_df_Cas1[0]
data_df_name = 'Cas1'
repeat_ID_list = []
repeat_sequence_list = []

for accession in data_df:
    for header_ids, sequence in zip(all_ID_spacer, all_sequence_spacer):
        # A header is a '|'-joined list of accession IDs; compare whole IDs
        # rather than substrings so 'X.1' cannot match inside 'X.10'.
        if accession in header_ids.split('|'):
            repeat_ID_list.append(accession)
            repeat_sequence_list.append(sequence)

file_w = data_df_name + '.fa'

with open(file_w, 'w') as f:
    for accession, sequence in zip(repeat_ID_list, repeat_sequence_list):
        # str() because the stored sequences are Biopython Seq objects
        f.write(">" + accession + "\n" + str(sequence) + "\n")

In [6]:
# Pair every accession ID in data_df_orphan with each repeat whose FASTA header
# lists that ID, then write the pairs to 'orphan.fa' (one record per pair).
data_df = data_df_orphan[0]
data_df_name = 'orphan'
repeat_ID_list = []
repeat_sequence_list = []

for accession in data_df:
    for header_ids, sequence in zip(all_ID_spacer, all_sequence_spacer):
        # A header is a '|'-joined list of accession IDs; compare whole IDs
        # rather than substrings so 'X.1' cannot match inside 'X.10'.
        if accession in header_ids.split('|'):
            repeat_ID_list.append(accession)
            repeat_sequence_list.append(sequence)

file_w = data_df_name + '.fa'

with open(file_w, 'w') as f:
    for accession, sequence in zip(repeat_ID_list, repeat_sequence_list):
        # str() because the stored sequences are Biopython Seq objects
        f.write(">" + accession + "\n" + str(sequence) + "\n")