In [21]:
import numpy as np
import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq

In [6]:
def parse_fasta(filepath):
    """Read a FASTA file into a dict mapping record id to sequence string.

    Parameters
    ----------
    filepath : str or path-like
        Location of the FASTA file; handed straight to ``open``.

    Returns
    -------
    dict
        ``{record.id: str(record.seq)}`` in file order. When two records
        share an id, the later record overwrites the earlier one.
    """
    with open(filepath) as fasta_file:  # context manager closes the handle
        # SeqIO.parse is a generator, so it must be consumed while the
        # file is still open; the comprehension does that before returning.
        return {
            seq_record.id: str(seq_record.seq)
            for seq_record in SeqIO.parse(fasta_file, 'fasta')
        }

In [9]:
# "Base" FASTA files, one per target (GFP, CD46, CD55, CD71); looked up
# relative to the notebook's working directory.
gfp_base = "PreProcessingMetaData_data_GFP.fa"
cd46_base = "PreProcessingMetaData_data_CD46_ENST00000367042.1.fa"
cd55_base = "PreProcessingMetaData_data_CD55_ENST00000367064.3.fa"
cd71_base = "PreProcessingMetaData_data_CD71_ENST00000360110.4.fa"

# Matching "chunks" (library) FASTA files. Adjacent string literals are
# joined by the parser, which keeps the shared directory visually separate
# from the file name without changing the resulting path.
gfp_chunks = (
    'cas13-master-TilingScreens-data/TilingScreens/data/'
    'PreProcessingMetaData_data_Cas13d_GFP_library.final.fa'
)
cd46_chunks = (
    'cas13-master-TilingScreens-data/TilingScreens/data/'
    'CD46_library_final.fa'
)
cd55_chunks = (
    'cas13-master-TilingScreens-data/TilingScreens/data/'
    'CD55_library_final.fa'
)
cd71_chunks = (
    'cas13-master-TilingScreens-data/TilingScreens/data/'
    'CD71_library_final.fa'
)

In [34]:
# Show the sequence of the first record in the GFP base FASTA.
tuple(parse_fasta(gfp_base).values())[0]

'GTGAGCAAGGGCGAGGAGCTGTTCACCGGGGTGGTGCCCATCCTGGTCGAGCTGGACGGCGACGTAAACGGCCACAAGTTCAGCGTGTCCGGCGAGGGCGAGGGCGATGCCACCTACGGCAAGCTGACCCTGAAGTTCATCTGCACCACCGGCAAGCTGCCCGTGCCCTGGCCCACCCTCGTGACCACCCTGACCTACGGCGTGCAGTGCTTCAGCCGCTACCCCGACCACATGAAGCAGCACGACTTCTTCAAGTCCGCCATGCCCGAAGGCTACGTCCAGGAGCGCACCATCTTCTTCAAGGACGACGGCAACTACAAGACCCGCGCCGAGGTGAAGTTCGAGGGCGACACCCTGGTGAACCGCATCGAGCTGAAGGGCATCGACTTCAAGGAGGACGGCAACATCCTGGGGCACAAGCTGGAGTACAACTACAACAGCCACAACGTCTATATCATGGCCGACAAGCAGAAGAACGGCATCAAGGTGAACTTCAAGATCCGCCACAACATCGAGGACGGCAGCGTGCAGCTCGCCGACCACTACCAGCAGAACACCCCCATCGGCGACGGCCCCGTGCTGCTGCCCGACAACCACTACCTGAGCACCCAGTCCGCCCTGAGCAAAGACCCCAACGAGAAGCGCGATCACATGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAG'

In [12]:
# Display the whole {record id: sequence} dict parsed from the GFP chunks
# FASTA. NOTE(review): this prints every record; the saved output is truncated.
parse_fasta(gfp_chunks)

{'rc_000001': 'GTATCCCATAGTCCTTAAATTGGGTCG',
 'rc_000002': 'CCTGTGCTTCCTTTGATCTGGGGCATG',
 'rc_000003': 'GACGATATATACCAAGCGAACGTCTGA',
 'rc_000004': 'ACATGGCAAATTTGCCGGTGTCAAACC',
 'rc_000005': 'TATGGTGAATCGTACATACTGTCGCTG',
 'rc_000006': 'ATAGGACAACTGTCTTTCCCAATATAC',
 'rc_000007': 'CCTGTCCTCCACCGTGAAGGCGGCTTG',
 'rc_000008': 'TTTAACGTTATCTGCAGGCGCAGGGGA',
 'rc_000009': 'TGTCAAACAATCTACACAGTAAGTCGC',
 'rc_000010': 'TGATCCCCGGCACTAAGGTGCTGATTC',
 'rc_000011': 'AGCGTAGATCAACCAGCAAGTGCAGCG',
 'rc_000012': 'TGATGCAGGGATGCGTCGGTGAATCCT',
 'rc_000013': 'GAATGCCAACGGGAATCAGCTCCGACT',
 'rc_000014': 'GGGATAACAGGGTCGATGCCGCCCTTA',
 'rc_000016': 'GAGGACGGAATGTATCTACACCATTGG',
 'rc_000017': 'GTATCCTAGAATATGTTCCTCCCAGGT',
 'rc_000018': 'CACACTACCAACGACGCACACCGTTTA',
 'rc_000019': 'GTTCCGACGGATACCGGTGTCTACCGC',
 'rc_000020': 'AATGGTCAGTCCCGTTGATAGACTGTG',
 'rc_000021': 'ACGACTGTCGCACACGCCGTTTATCTC',
 'rc_000022': 'TACTGTTTCAACGAATCACTGCACAAC',
 'rc_000023': 'ATATTACAATAAACGATGGATCGTTTC',
 'rc_00002

In [None]:
# Parse the GFP base FASTA via the shared path constant instead of
# repeating the file-name literal.
parse_fasta(gfp_base)

In [65]:
def convert_base(base, chunks, window_sizes = [0,1,5,10,20,40,100]):
    """Build a table of RNA windows for every "_FirstOrder_" chunk.

    For each key of ``chunks`` that contains ``"_FirstOrder_"``, the segment
    after the first ``":"`` (and before ``"_FirstOrder_"``) is read as
    ``start-end``. The reverse complement of the chunk sequence replaces
    ``base_string[start-1:end]``, and for every window size that span,
    widened by ``size`` positions on each side and clipped to the sequence
    ends, is reverse-complemented, transcribed and stored as a string.

    Parameters
    ----------
    base : dict
        Mapping of id -> sequence; only the first value is used.
    chunks : dict
        Mapping of chunk name -> sequence string. Keys without
        ``"_FirstOrder_"`` are skipped.
    window_sizes : sequence of int
        Flank sizes; one ``window_<size>`` column is produced per entry.
        The default list is only read, never mutated.

    Returns
    -------
    pandas.DataFrame
        Columns ``name`` followed by ``window_<size>``; one row per
        matching chunk, in the iteration order of ``chunks``.
    """
    columns = ["name"] + ["window_{}".format(size) for size in window_sizes]
    base_string = list(base.values())[0]

    # Rows are gathered in a list and the frame is built once at the end:
    # DataFrame.append was removed in pandas 2.0 and, where it existed,
    # copied the whole frame on every call.
    records = []
    for key in chunks:
        if "_FirstOrder_" not in key:
            continue

        bounds = key.split("_FirstOrder_")[0].split(":")[1].split("-")
        start_pos, end_pos = int(bounds[0]), int(bounds[1])

        # Swap the span [start_pos-1, end_pos) of the base sequence for the
        # reverse complement of the chunk.
        replacement = str(Seq(chunks[key]).reverse_complement())
        mod_string = base_string[:start_pos - 1] + replacement + base_string[end_pos:]

        record = {"name": key}
        for size in window_sizes:
            # NOTE(review): the window reuses the original span coordinates,
            # which lines up with the replacement only when the chunk is as
            # long as the span it replaces -- confirm against the library.
            start = max(0, start_pos - 1 - size)
            end = min(len(mod_string), end_pos + size)
            window = Seq(mod_string[start:end])
            record["window_{}".format(size)] = str(window.reverse_complement().transcribe())
        records.append(record)

    return pd.DataFrame(records, columns=columns)

In [66]:
# Build the GFP window table from the parsed base and chunk FASTA files.
data = convert_base(
    base=parse_fasta(gfp_base),
    chunks=parse_fasta(gfp_chunks),
)

In [67]:
# Preview the first five rows (5 is also the pandas default for head()).
data.head(n=5)

Unnamed: 0,name,window_0,window_1,window_5,window_10,window_20,window_40,window_100
0,crRNA004:7-33_FirstOrder_01,AACCCCGGUGAACAGCUCCUCGCCCUU,CAACCCCGGUGAACAGCUCCUCGCCCUUG,GGCACAACCCCGGUGAACAGCUCCUCGCCCUUGCUCA,GGAUGGGCACAACCCCGGUGAACAGCUCCUCGCCCUUGCUCAC,AGCUCGACCAGGAUGGGCACAACCCCGGUGAACAGCUCCUCGCCCU...,GGCCGUUUACGUCGCCGUCCAGCUCGACCAGGAUGGGCACAACCCC...,UCAGGGUCAGCUUGCCGUAGGUGGCAUCGCCCUCGCCCUCGCCGGA...
1,crRNA004:7-33_FirstOrder_02,CGCCCCGGUGAACAGCUCCUCGCCCUU,CCGCCCCGGUGAACAGCUCCUCGCCCUUG,GGCACCGCCCCGGUGAACAGCUCCUCGCCCUUGCUCA,GGAUGGGCACCGCCCCGGUGAACAGCUCCUCGCCCUUGCUCAC,AGCUCGACCAGGAUGGGCACCGCCCCGGUGAACAGCUCCUCGCCCU...,GGCCGUUUACGUCGCCGUCCAGCUCGACCAGGAUGGGCACCGCCCC...,UCAGGGUCAGCUUGCCGUAGGUGGCAUCGCCCUCGCCCUCGCCGGA...
2,crRNA004:7-33_FirstOrder_03,CAACCCGGUGAACAGCUCCUCGCCCUU,CCAACCCGGUGAACAGCUCCUCGCCCUUG,GGCACCAACCCGGUGAACAGCUCCUCGCCCUUGCUCA,GGAUGGGCACCAACCCGGUGAACAGCUCCUCGCCCUUGCUCAC,AGCUCGACCAGGAUGGGCACCAACCCGGUGAACAGCUCCUCGCCCU...,GGCCGUUUACGUCGCCGUCCAGCUCGACCAGGAUGGGCACCAACCC...,UCAGGGUCAGCUUGCCGUAGGUGGCAUCGCCCUCGCCCUCGCCGGA...
3,crRNA004:7-33_FirstOrder_04,CACACCGGUGAACAGCUCCUCGCCCUU,CCACACCGGUGAACAGCUCCUCGCCCUUG,GGCACCACACCGGUGAACAGCUCCUCGCCCUUGCUCA,GGAUGGGCACCACACCGGUGAACAGCUCCUCGCCCUUGCUCAC,AGCUCGACCAGGAUGGGCACCACACCGGUGAACAGCUCCUCGCCCU...,GGCCGUUUACGUCGCCGUCCAGCUCGACCAGGAUGGGCACCACACC...,UCAGGGUCAGCUUGCCGUAGGUGGCAUCGCCCUCGCCCUCGCCGGA...
4,crRNA004:7-33_FirstOrder_05,CACCACGGUGAACAGCUCCUCGCCCUU,CCACCACGGUGAACAGCUCCUCGCCCUUG,GGCACCACCACGGUGAACAGCUCCUCGCCCUUGCUCA,GGAUGGGCACCACCACGGUGAACAGCUCCUCGCCCUUGCUCAC,AGCUCGACCAGGAUGGGCACCACCACGGUGAACAGCUCCUCGCCCU...,GGCCGUUUACGUCGCCGUCCAGCUCGACCAGGAUGGGCACCACCAC...,UCAGGGUCAGCUUGCCGUAGGUGGCAUCGCCCUCGCCCUCGCCGGA...


In [68]:
# Persist the window table. index=True is the pandas default, spelled out
# here because it writes the row index as an extra first column.
data.to_csv("gfp_windows.csv", index=True)

In [None]:
GTGAGCAAGGGCGAGGAGCTGTTCACCGGGGTTGTGCCCATCCTGGTCGAGCTGGACGGCGACGTAAACGGCCACAAGTTCAGCGTGTCCGGCGAGGGCGAGGGCGATGCCACCTACGGCAAGCTGACCCTGAAGTTCATCTGCACCACCGGCAAGCTGCCCGTGCCCTGGCCCACCCTCGTGACCACCCTGACCTACGGCGTGCAGTGCTTCAGCCGCTACCCCGACCACATGAAGCAGCACGACTTCTTCAAGTCCGCCATGCCCGAAGGCTACGTCCAGGAGCGCACCATCTTCTTCAAGGACGACGGCAACTACAAGACCCGCGCCGAGGTGAAGTTCGAGGGCGACACCCTGGTGAACCGCATCGAGCTGAAGGGCATCGACTTCAAGGAGGACGGCAACATCCTGGGGCACAAGCTGGAGTACAACTACAACAGCCACAACGTCTATATCATGGCCGACAAGCAGAAGAACGGCATCAAGGTGAACTTCAAGATCCGCCACAACATCGAGGACGGCAGCGTGCAGCTCGCCGACCACTACCAGCAGAACACCCCCATCGGCGACGGCCCCGTGCTGCTGCCCGACAACCACTACCTGAGCACCCAGTCCGCCCTGAGCAAAGACCCCAACGAGAAGCGCGATCACATGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAG
GTGAGCAAGGGCGAGGAGCTGTTCACCGGGGTGGTGCCCATCCTGGTCGAGCTGGACGGCGACGTAAACGGCCACAAGTTCAGCGTGTCCGGCGAGGGCGAGGGCGATGCCACCTACGGCAAGCTGACCCTGAAGTTCATCTGCACCACCGGCAAGCTGCCCGTGCCCTGGCCCACCCTCGTGACCACCCTGACCTACGGCGTGCAGTGCTTCAGCCGCTACCCCGACCACATGAAGCAGCACGACTTCTTCAAGTCCGCCATGCCCGAAGGCTACGTCCAGGAGCGCACCATCTTCTTCAAGGACGACGGCAACTACAAGACCCGCGCCGAGGTGAAGTTCGAGGGCGACACCCTGGTGAACCGCATCGAGCTGAAGGGCATCGACTTCAAGGAGGACGGCAACATCCTGGGGCACAAGCTGGAGTACAACTACAACAGCCACAACGTCTATATCATGGCCGACAAGCAGAAGAACGGCATCAAGGTGAACTTCAAGATCCGCCACAACATCGAGGACGGCAGCGTGCAGCTCGCCGACCACTACCAGCAGAACACCCCCATCGGCGACGGCCCCGTGCTGCTGCCCGACAACCACTACCTGAGCACCCAGTCCGCCCTGAGCAAAGACCCCAACGAGAAGCGCGATCACATGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAG

In [37]:
# Flank sizes used to name and build the window_<size> columns.
window_sizes = [0, 1, 5, 10, 20, 40, 100]

In [38]:
# Column labels derived from the window sizes, e.g. 'window_0', 'window_1'.
list(map("window_{}".format, window_sizes))

['window_0',
 'window_1',
 'window_5',
 'window_10',
 'window_20',
 'window_40',
 'window_100']

In [42]:
# Empty frame with a "name" column followed by one column per window size.
pd.DataFrame(columns=["name", *map("window_{}".format, window_sizes)])

Unnamed: 0,name,window_0,window_1,window_5,window_10,window_20,window_40,window_100
