In [1]:
import pandas as pd
import numpy as np
import scipy
import scipy.sparse
import scipy.stats
import os
import scipy.io as sio
import regex as re
from collections import Counter, defaultdict
import sys
import gzip

def distance(astring, bstring) :
    """Count the positions at which two strings differ.

    Characters are compared pairwise over the length of the shorter string.
    Every character by which one string is longer than the other counts as
    one additional difference.

    Parameters:
        astring: first string.
        bstring: second string.

    Returns:
        int: number of mismatching positions plus the length difference.
    """
    # zip() stops at the shorter input, i.e. at the shared length.
    mismatches = sum(1 for a_char, b_char in zip(astring, bstring) if a_char != b_char)
    length_gap = abs(len(astring) - len(bstring))
    return mismatches + length_gap

In [3]:
# Gzip-compressed FASTQ inputs: the RNA read file and the index read file
# (the index read's sequence line is later written out as the UMI).
r1_rna = 'Undetermined_S0_R1_001.fastq.gz'
#r2_rna = 'Undetermined_S0_R2_001.fastq.gz'
r_indx = 'Undetermined_S0_I1_001.fastq.gz'

# NOTE: `re` is the third-party `regex` package (see the imports cell). The
# `{s<=N}` suffix is its fuzzy-matching syntax: the preceding group may match
# with at most N substitutions.

# Flank patterns: two fixed anchor sequences around a fixed-width variable
# region ([ACGTN]{40} upstream, [ACGTN]{20} downstream).
upstream_regex = re.compile(r"(CAATTCTGCT){s<=2}[ACGTN]{40}(CTAAAATATA){s<=2}")
downstream_regex = re.compile(r"(AGTATGAAAC){s<=2}[ACGTN]{20}(ACCCTTATCC){s<=2}")

# Full-read pattern: the upstream flank, anything in between, then the
# downstream flank. Built from the two patterns above so the three cannot
# drift apart.
seq_regex = re.compile(upstream_regex.pattern + r".*" + downstream_regex.pattern)

wildtype_downstream_regex = re.compile(r"(GATGTCTCGTGATCTGGTGT){s<=2}")

# Poly-A detectors: a fuzzy run of A's, and a variant whose leading "AAA"
# must match exactly while only the remainder is fuzzy.
proximal_regex = re.compile(r"(AAAAAAAAAAAAAAAAAAAA){s<=3}")
proximal_regex_prefix = re.compile(r"(AAA)(AAAAAAAAAAAAAAAAA){s<=3}")

In [4]:
# Stream the RNA read file and the index read file in lockstep (one FASTQ
# record = four lines from each file). For every read that contains the
# upstream flank, write one CSV row holding the matched flanks, the index
# read sequence (used as the UMI) and the poly-A positions.
# NOTE(review): assumes both files hold the same number of records in the
# same order; a shorter index file would yield empty UMIs -- confirm upstream.

count = 0

total_proximal_rna_count = 0
num_upstream_region_extractions = 0
num_downstream_region_extractions = 0

print('Processing RNA reads.')

# Context managers guarantee that the inputs are closed and the CSV is
# flushed/closed even if this long-running loop is interrupted or raises.
with gzip.open(r1_rna, 'rt') as f1, \
     gzip.open(r_indx, 'rt') as i1, \
     open('tomm5_rna_polyatail_3errors_test1.csv', 'w') as out:
    out.write('upstream_seq,downstream_seq,seq,umi,polya,polya_prefixed,is_proximal\n')

    while True:
        # RNA read record: header, sequence, separator, quality.
        # Only the sequence is used; the quality line doubles as EOF probe.
        f1.readline()
        seq = f1.readline().rstrip('\n')
        f1.readline()
        quality_line = f1.readline()

        # Matching index read record; only its sequence line is used.
        i1.readline()
        seqi = i1.readline().rstrip('\n')
        i1.readline()
        i1.readline()

        # readline() returns '' only at end of file (a blank line inside the
        # file comes back as '\n'), so test the raw line, not a stripped one.
        if not quality_line:
            break # End of File

        upstream_flank = upstream_regex.search(seq)

        if upstream_flank is not None:
            num_upstream_region_extractions += 1
            upstream_flank_seq = upstream_flank.group()

            umi = seqi

            # Defaults for reads where neither the downstream flank nor a
            # poly-A run is found.
            polya_pos = -1
            polya_pos_prefixed = -1
            downstream_flank_seq = ''
            is_prox = 0

            # The downstream flank is only looked for in read positions
            # 70..219. All remaining searches are done here, after the
            # upstream check, because their results are only ever used for
            # reads that pass it.
            downstream_flank = downstream_regex.search(seq[70:220])
            if downstream_flank is not None :
                num_downstream_region_extractions += 1
                downstream_flank_seq = downstream_flank.group()
            else :
                # No downstream flank: test for a poly-A run anywhere in the
                # read; positions are offsets into the full read.
                proximal_test_outcome = proximal_regex.search(seq)
                if proximal_test_outcome is not None :
                    total_proximal_rna_count += 1
                    polya_pos = proximal_test_outcome.start()
                    is_prox = 1

                    prefixed_test_outcome = proximal_regex_prefix.search(seq)
                    if prefixed_test_outcome is not None :
                        polya_pos_prefixed = prefixed_test_outcome.start()

            # seq_regex begins with the upstream pattern, so it can only
            # match reads that already passed the upstream check above.
            both_flank_seq = ''
            both_flank = seq_regex.search(seq)
            if both_flank is not None :
                both_flank_seq = both_flank.group()

            row = (
                upstream_flank_seq,
                downstream_flank_seq,
                both_flank_seq,
                umi,
                str(polya_pos),
                str(polya_pos_prefixed),
                str(is_prox),
            )
            out.write(','.join(row) + '\n')

        # Progress report every million records (including record 0).
        if count % 1000000 == 0:
            print('Count: ' + str(count))
            print('Number of upstream regions extracted: ' + str(num_upstream_region_extractions))
            print('Number of downstream regions extracted: ' + str(num_downstream_region_extractions))
            print(str(total_proximal_rna_count) + ' proximal RNA reads')
        count += 1

print('COMPLETE')
print('Number of upstream regions extracted: ' + str(num_upstream_region_extractions))
print('Number of downstream regions extracted: ' + str(num_downstream_region_extractions))
print(str(total_proximal_rna_count) + ' proximal RNA reads')


Processing RNA reads.
Count: 0
Number of upstream regions extracted: 0
Number of downstream regions extracted: 0
0 proximal RNA reads
TTTGCNTGTTTTGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTGTTAAGAACAAGTTTGGCTGATTCATGCCCTATTGGCGTCTAAAATATAAAACTATTTGGGAAGTATGCAAAAAAAAAAAAAAAAAAAAAAAGGAAGGGGGCCGG
TGATTNAGTATGGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTATAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTGTTAAGAACAAGTTTGGCTCCCTAGGCAGCGCAATGTGACTAAAATATAAAACTATTTGGGAAGTATGTAAAAAAAAAAAAAAAAAAAGTAGGCCGCGGGGCGGG
TTGGCNGTAATTGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTGTTAAGAACAAGTTGTTGGTTCTAAACTGACCTTCAAAGCTAAAATATAAAACTATTTGGGAAGTACGAAAAAAAAAAAAAAAAAAAAAAAAACGGGGGGGGCGG
AACTGNTCTCCAGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCATTCTGCTTGTTAAGAACAAGTTTGGCTAACTAGTCATCACCTTGCGATTAAAATATAAAACTATTTGGGAAGTATGCAAAAAAAAAAAAAAAAAAAA

AAATGTAAGGAGGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTGTTAAGAACAAGTTTAACTGAGCCAACTGACCTTCAAAGCTAAAATATAAAACTATTTGGGAAGTACGAAAAAAAAAAAAAAAAAAAAAGAAGGGACGGGCCCGG
AACCGGCTCAAGGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTGTTAAGAACAAGTTTGGCTCTCATCAGAGGAGTATGTTTCTAAAATATAAAACTATTTGGGAAGCAAAAAAAAAAAAAAAAAAAAAAAAAGGACGGGGGGCCGGG
TAAACCTTTGTAGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTGTTAAGAACAAGTTAGATATGCACAACTGACCTTCAAAGCTAAAAAAAAAAAAAAAAAAAAAAACGAAAGACCCCGGGGCGGCAGAGCAGGAACCCGGGCGCCGG
TCTCTGCGCTTTGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTCATTATACCTGTCTTCAGTCCTGGTAACTGACCTTCAAAGCTAAAATATAAAACTATTTGGGAAGTATGGAAAAAAAAAAAAAAAAAAAAGATCGGAGGGGCCCGG
TTATGTCCGTTTGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTG

ATTTGATGGGGCGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTGTTAAGAACAAGTTTGGCTGTTACTCCAATCATGGTGAGCTAAAATATAAAACTATTTGGGAAACAAAAAAAAAAAAAAAAAAAAAAAAGGAAGCGGGGGCGGGG
TCGGAATAGGTAGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTGTTAAGAACAAGTTTGGCTAGGTCTGTCGCTTATGAGGGCTAAAATATAAAACTATTTGGGAAGTATCAAAAAAAAAAAAAAAAAAAAAGAAGGGCTGGGGGCGG
TTTTTTTCTGGTGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTTTGAGTCACACTGTTTTGTCTGGTAACTGACCTTCAAAGCTAAAATATAAAACTCTTCGGGAAGAAAAAAAAAAAAAAAAAAAAAAAAAACTAAGCGGCGGCCTG
GTTGAGGTTGTAGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCTATTCTGCTTGTTAAGAACAAGTTTGGCTGTTGAACCGTTCCGCATCATCTAAAATATAAAACTATTTGGGAAAAAAAAAAAAAAAAAAAATGAACGGAGGGGCCCGGGGGGGGG
CAATCGTTCTTTGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTGCAGGGACTGATGGTGAGGCCCATTGCCTGCGGCCGCAATTCTGCTTG

TCAAATTAACATGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTGTTAAGAACAAGTTTGGCTGTAGAGTTGACCCAACGTGCCTAAAATATAAAACTATTTGGGAAGTATGAAAAAAAAAAAAAAAAAAAAAAAAGGACGGGGGCCCG
TTAGTAGTGTTCGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCACAATTCTGCTCACTTATTGACGAATCTCATCTGGTAACTGACCTTCAAAGCTAAAATATAAAACTATTTGGGAATAAGAAAAAAAAAAAAAAAAAAAAAATCGAACGGTCGGCCGG
CCCCGTGGATTAGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTTTGAATAGTTGTGCGAGCACTGGTAACTGACCTTCAAAGCTAAAATATAAAACTATTTGGGAAGTATGGAAAAAAAAAAAAAAAAAAAAGAAACGCGGGGGCCGG
CTGATTGGCAGTGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTGTTAAGAACAAGTTTGGCTAATAGATTTGCTGTATTTCGCTAAAATATAAAACTATTTGGGAAGTATAAAAAAAAAAAAAAAAAAAAAAAAAACGGGGGGGGCGG
CGCCACAAACTTGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTAA

TTCTGTGTTTCTGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCGCAATTCTGCTTGTTAAGAACAAGTTTGGCTGTATATTCCGATAGGGACCACTAAAATATAAAACTATTTGGGAAGTATAAAAAAAAAAAAAAAAAAAAAAGAACGGACGGGGCCCGG
TATATCAAAGATGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTGGGGTCAACCTTGTGATCCACTGGTAACTGACCTTCAAAGCTAAAATATAAAACTATTTGGGAAGTACAAAAAAAAAAAAAAAAAAAAAAAAAAGCCGGGCGCCGG
CGTAAGGCATAAGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTGTTAAGAACAAGTTTGGCTATGCATTAATTATGCATGTTCTAAAATATAAAACTATTTGGGAAGTATAAAAAAAAAAAAAAAAAAAAGATAGAGCCGGGGGCCGG
ATTGCCTTTATAGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTGTTAAGAACAAGTTTGGCTGTACGTTGCCGTTTAAAGCACTAAAATATAAAACTATTTGGGAAGTACGAAAAAAAAAAAAAAAAAAAAAATCGGAAGAGGGCACG
AATTTTATCCTGGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGTCTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTG

TACTATTTTTCCGGTCCTGCTGGAGTTAGTGACCGCCGCCGGGATCACTCTAGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTCTAGCTCGCGTGTGCTCATCTGGTAACTGACCTTCAAAGCTAAAATATAAAACTATTTGGGAAGTATAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGAAGAAGA
TGACGGAGACGGGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTGTTAAGAACAAGTTTGGCTTGGAGCTATGTAAGGGTGTTCTAAAATATAAAACTATTTGGGAAGTACGAAAAAAAAAAAAAAAAAAAAAAAAACCACGGGCCCGG
ATGTGAAAATGTGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTGTTAAGAACAAGTTTGGCTCGCCAACTATCTCTTGTTACCTAAAATATAAAACTATTTGGGAAGTACAAAAAAAAAAAAAAAAAAAAAAATAAGAAGGGGCCCGG
CTCTTCCTTCACGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTGTTAAGAACAAGTTTGGCTAATGAGAGAAATTAGCTTTTCTAAAATATAAAACTATTTGGGAAGTACGAAAAAAAAAAAAAAAAAAAAAATAGGAAGGAGCCGGC
TGCAATGGTCTTGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCATTCTGCTTGT

CACAATTTTTGTGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTGTTAAGAACAAGTTTATTTACTGTAACTGACCTTCAAAGCTAAAATATAAAACTATTTGGGAATATCAAAAAAAAAAAAAAAAAAAGAAAGGACGCGGGGGCGGG
TATGTAAATTTTGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTGTTAAGAACAAGTTTGGCTTATTTCGTCTAAAATTATTTCTAAAATATAAAACTATTTGGGAAGTACAAAAAAAAAAAAAAAAAAAAAATAGGAGGGGGGCCCGG
TAATAGTGCTCGGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTGTTAAGAACAAGTTGGTAGAGCGCAACTGACCTTCAAAGCTAAAATATAAAACTATTTGGGAAGTAGAAAAAAAAAAAAAAAAAAAAAAAAAGCCCTGGGCCCGG
CTATAACCGTTAGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGAAAGGCCCATTACCTGCGGCCTCAATTCTGCTTGTTAAGAACAAGTTTGGCTCTTAATGCGTAACTTTGACACTAAAATATAAAACTATTTGGGAAGTACGAAAAAAAAAAAAAAAAAAAAAAAAGGAAGGGGCCCGG
TATCACGCATTTGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTG

ATTATAACGCTTGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCACAATTCTGCTTGTTAAGAACAAGTTTGGCTGATGTGCGTACAGAGTTGCACTAAAATATAAAACTATTTGGGAAGTAAAAAAAAAAAAAAAAAAAAAGAAGGGACGCGGGGCCGGG
CTTTTGAGAATTGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCACAATTCTGCTCAACATGTGTGATAATACGACTGGTAACTGACCTTCAAAGCTAAAATATAAAACTATTTGGGAAATAAAAAAAAAAAAAAAAAAAAATCGGAAACGGGGGGGGGGG
TTAGTTACGATCGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCGATTACCTGCGGCCGCAATTCTGCTCGTGTGAAATCACATTTTGTCTGGTATCTGACCTTCAAAGCTAAAATATAAAACTATTTGGGAAGTAAAAAAAAAAAAAAAAAAAAAGATAGAAGGCGGGGGGGGG
GCTCGTATATCTGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTCATTTTCAAGCGACGTCGAGCTGGTAACTGACCTTCAAAGCTAAAATATAAAACTATTTGGGAAGTACAAAAAAAAAAAAAAAAAAAAAAACGGAAGGGGGCCCGG
ATAGAATTTTTTGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTG

TTATTCAAGAAGGGTCCTGCTGGAGTTAGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTGTTAAGAACAAGTTTGGCTAACTTTGGACGGTCTGGGCTCTAAAATATAAAACTATTTGGGAAGTATGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
ATATAGTACTCAGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTGTTAAGAACAAGTTTGGCTGAATAAGGGAGTGCTGTGGCCTAAAATATAAAACTATTTGGGAAGTATAAAAAAAAAAAAAAAAAAAAAAAACGGGCTCGGGCGGG
CTGTCAGAGGCAGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTGTTAAGAACAAGTTTGGCTATACATCCCGAGGCGTCCGCCTAAAATATAAAACTATGTGGGAAGTATTAAAAAAAAAAAAAAAAAAAAAGAAGCGGGGGGGCCGG
CGGTATTATTGTGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCACAATTCTGCTTGTTAAGAACAAGTTTGGCTTTGATAATTGTGATGTAAAGCTAAAATATAAAACTATTTGGGAAGTATGGAAAAAAAAAAAGAAAAAAAGGTGGGGGGGGGGGGGG
CGGATGGGCTGCGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTG

TGGTAGTTAGTGGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCACAATTCTGCTTGTTAAGAACAAGTTATACCTTGGCAACTGACCTTCAAAGCTAAAATATAAAACTATTTGGGAACTAAAAAAAAAAAAAAAAAAAAAAAAGGAACGCGGGGCCGGG
AGGTGGGTCACGGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCACAATTCTGCTTGTTAAGAACAAGTTTGGCTCCGTGAAACATAAAGCAGCACTAAAATATAAAACTATTTTGGGAAGTCAAAAAAAAAAAAAAAAAAAAATAAAGCCGGGGGGGCGG
TGGAAAAATAGTGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTGTTAAGAACAAGTTTGGCTTAAATGTTGCGTATAGCAGCCTAAAATATAAAACTATTTGGGAAGTATAAAAAAAAAAAAAAAAAAAAAGAACAGACGCGGCCGCG
TTCGGTCCTAATGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTCTCTTAGGGTATGGGTCTCGCTGGTAACTGACCTTCAAAGCTAAAATATAAAACTATTTGGGAAGTATGAAAAAAAAAAAAAAAAAAAAAAAAACGGGGGGCGCGG
CCTCTGTATTGTGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTG

GTCCAATGCTGTGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTCAATTTGTTTGTACCCAGCCTGGTAACTGACCTTCAAAGCTAAAATATAAAACTATTTGGGAAGTACAAAAAAAAAAATAAAAAAAAGAAAGACACGGGGCCCGG
GATTTATCCTGAGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTGTTAAGAACAAGTTTGGCTCCTATGTAGACCAGGGGGATCTAAAATATAAAACTATTTGGGAAGTATGAAAACAAAAAAAAAAAAAAAAAAAAACCAGGGGACGG
ACGTTATGTGCTGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTGGTTCGCTTCAGCGTATTACCTGGTAACTGACCTTCAAAGCTAAAATATAAAACTATTTGGGAAGTATGACAAAAAAAAAAAAAAAAAAAAAAAAACGGGGGCCGG
ATTTCTTGCAAAGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTGTTAAGAACAAGTTCGTACTTTGTAACTGCCCTTCAAAGCTAAAATATAAAACTATTTGGGAAGTATGTAAAAAAAAAAAAAAAAAAAAAGAAGGCGTGTGCCGG
ATCTTGGATTGGGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTG

GATAATTGCGCCGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTGTTAAGAACAAGTTCATTATTAAGAACTGACCTCCAAAGCTAAAATATAAAACTATTTGGGAAGTATGCAAAAAAAAAAAAAAAAAAAAAAGAGCGGGGGGGGGG
TTCAATTGCGAGGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTGTTAAGAACAAGTTTTCCATATCAAACTGACCTTCAAAGTTAAAATATAAAACTATTTGGGAACTAAAAAAAAAAAAAAAAAAAAAAAAGGAACCGGGGGGGGGG
ATATGAATTCGTGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCACAATTCTGCTTGTTAAGAACAAGTTTGGCTTGTTCATTCAAAGCGGGAGACTAAAATAAAACTATTTGGGAAGTATGGAAAAAAAAAAAAAAAAAAAAAAAAGGGAGGGGGGGGGG
GATTTTGTGAAGGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCCGCAATTCTGCTTTGTTCGGTTCATCCGTGGCCTGGTAACTGACCTTCAAAGCTAAAATATACAACTATTTGGGAAGTATAAAAAAAAAAAAAAAAAAAAAATCGGAAGCCGCGGGGGCGG
AATCACTTATGGGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTG

ATCAAAGGCTCGGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTCGGGTGTAGTGAATCTCAGTCTGGTAACTGACCTTCAAAGCTAAAAAAAAAAAAAAAAAAAAAGATAGGACGCGGGGGCGGGGGGGGGGGGGGGGCGGGGGGCGGG
ATTATCTCTGACGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTGTTAAGAACAAGTTTGGCTCGAATCTGCTAATGAAAGGTCTAAAATATAAAACTATTTGGGAAGTAAAAAAAAAAAAAAAAAAAAATAAGGAAACGGGGGCGCGG
TGTCTCGTTGTTGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTACGTTATATTGTCCAAAGCTCTGGTAACTGACCTTCAAAGCTAAAATATAAAACTATTTGGGAAGTTAAAAAAAAAAAAAAAAAAAAAAGAGGGGCGTGGGCCGGG
AATACATTTCTAGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTGTTAAGAACAAGTTTTGCTCGGGGAACTGACCTTCAAAGCTAAGATATAAAACTATTTGGGAAGTATGAAAAAAAAAAAAAAAAAAAAAAAAAAACTAATAAAGA
TGTGTACAGGTGGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTG

ATGCAATCTGATGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTCGTAGTAGATGCTTAAATTACTGGTAACTGACCTTCAAAGCTAAAATATAAAACTATTTGGGAAGTATAAAAAAAAAAAAAAAAAAAAAAAAAGGACGGGGCCCGG
TACCTAGGATATGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGCACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTCGAGTACCTAATAGCAAGTGCTGGTAACTGACCTTCAAAGCTAAAATATAAAACTATTTGGGAAGCAAAAAAAAAAAAAAAAAAAAAAAACGGACGCGGGGCCCGG
TCACCCGCTAGTGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTGTTAAGAACAAGTTTGGCTGATGTTTCTTTTTCTGTGTTCTAAAATATAAAACTATTTGGGAAGTACGAAAAAAAAAAAAAAAAAAAAAAAAAGGAGGGGCCCGG
AATTGTCTAGTTGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTGTTAAGAACAAGTTCATGGATCGTAACTGACCTTCAAAGCTAAAATAAAAACTATTTGGGAAGTATGAAAAAAAAAAAAAAAAAAAGAACGGAACCGGGGCCGGG
ACTTAAGAAATGGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTG

ACTTAGCGACCTGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTGTTAAGAACAAGTTGTCAAGCTAGAACTGACCTTCAAAGCTAAAATAAAAACTATTTGGGAAGTACAAAAAAAAAAAAAAAAAAAAAAACGGACGCGGGGGGGGG
TCTGAGATCATTGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTGTTAAGAACAAGTTATCGAGAAGCAACTGACCTTCAAAGCTAAAATATAAAACTATTTGGGAAGTAATAAAAAAAAAAAAAAAAAAAAAAGAACGGGTGGCGCGG
TTAATTCTACTTGGTCCTGCTGGAGTTAGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCACAATTCTGCTTGTTAAGAACAAGTTTGGCTAGTCGTGTGATACATTTTACCTAAAATATAAAACTATTTGGGAAGTATCAAAAAAAAACAAATAAAAAAAGAAGGGCGGGGGCGGG
ACGTGATGTCTTGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTGTTAAGAACAAGTTTGACTGATCTAACTGACCTTCAAAGCTAAAATATAAAACTATTTGGGAAGTATGAAAAAACAAAAAAAAAAAAAAAAAAAAAGGTGGGCGG
ACCTCAAGTAAGGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTC

AAGATGGGAGATGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTGTGTAAACAACAACATTGGGCTGGTAACTGACCTTCAAAGCTAAAATATAAAACTATTTGGGAAGTAGAAAAAAAAAAAAAAAAAAAAAAAAAACGAGGGGGCCCG
GAGTGTGGTAAGGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTCTAAGCATGGATAGCAATGCTGGTAACTGACCTTCAAAGCTAAAATATAAAACTATTTGGGAAGTATAAAAAAAAAAAAAAAAAAAAGAAAAGGCCGGGGCGCGG
ATGTTTGTGTTTGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTGTTAAGAACAAGTTTGGCTGTGAAAAACCGTTCGTTTCGCTAAAATATAAAACTATTTGGGAAGTATAAAAAAAAAAAAAAAAAAAAAATAGAAACGGGGCCCGG
ATCTTTGGGATCGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCATTCTGCTCGGTTGAAGCAGTACGTTCACTGGTAACTGACCTTCAAAGCTAAAATATAAAACTATTTGGGAAGTAACAAAAAAAAAAAAAAAAAAAAAAAAAACCCGGGGCCCGG
ATATATTGTAGTGGTCCTGCTGGAGTTCGTGACTGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTG

TATTTTCACTGTGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTGTTAAGAACAAGTTCAATTGTTGCAACTGACCTTCAAAGCTAAAATATAAAACTATTTGGGAAGTATAAAAAAAAAAAAAAAAAAAGAAAGGAAGCGGGGGCGGG
TTGCCGTTTAGTGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTGTTAAGAACAAGTTTGGCTTCTTACCCCTTTATAAAGCACTAAAATATAAAACTATTTGGGAAGTATGAAAAAAAAAAAAAAAAAAAAAAAAAGCAATGTGCCGG
AGATTAAACCATGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTGTTAAGAACAAGTTAAGTAAAAGTAACTGACCTTCAAAACTAAAAAAAAAAAAAAAAAAAAAAATAGGAAGCGGCGCCGGGGGAAGGGGAGGGCCGGTGGGCGGG
TCGGAGGGGTTTGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTTATGATGGCCGATTACTTTCTGGTAACTGACCTTCAAAGCTAAAATATAAAACTATTTGGGAAGTACAAAAAAAAAAAAAAAAAAAAAAAAGACAAGGGGGCCGG
AGATAATTTGCGGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTG

TTGGTCCGCATAGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTATGAACCTAAGTCGCGGTCCCTGGTAACTGACCTTCAAAGCTAAAATATAAAACTATTTGGGAAGTAAAAAAAAAAAAAAAAAAAAAAAATCGACGGGGGGGCGGG
CCAAACGAATTCGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTGTTAAGAACAAGTTTGGCTCACAATTATATCAGTCTTTTCTAAAATATAAAACTATTTGGGAAGTGAAAAAAAAAAAAAAAAAAAAAGAACGACGCGGGGGCCGG
TGCTCCAGTTTGGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTGTTAAGAACAAGTTTCCGGGTTAAAACTGACCTTCAAAGCTAAAATATAAAACTATTTGGGAAGTATGAAACAAAAAAAAAAAAAAAAAAAAAATCGTAGGCCGG
TACTCTTATGAAGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTGGGTTGGGTTGTTTAGAAACTGGTAACTGACCTTCAAAGCTAAAATATAAAACTATTTGGGAAGTACAAAAAAAAAAAAAAAAAAAAAAAAAAGGAGGGGCGCGG
TTCTCTGCCGATGGTCCTGCTGGAGTTAGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTG

TCTAATATGAGTGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTCTACGTTTCGGTGATGTATTCTGGTAACTGACCTTCAAAGCTAAAATATAAAACTATTTGGGAAAAAAAAAAAAAAAAAAAAAGAACGGAAGAGCCGGGGGGGGGG
CATTAGTCAATGGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTCAGTGAGAAAGATGTGGTAGCTGGTAACTGACCTTCAAAGCTAAAATATAAAACTATTTGGGAAGTACGAAAAAAAAAAAAAAAAAAAAAGGGGGGCGGGGGCCGG
TCATACGTTGGGGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGACCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTGTTAAGAACAAGTTTGGCTTCATCCGGGTTCGGGATTGACTAAAATATAAAACTATTTGGGAAGTATGAAAAAGAAAAAAAAAAAAAAAAAAAAACGGGGGCCGG
CCTCTTGACTCAGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTAAGTGGTCGGTGGAGCCTACTGGTAACTGACCTTCAAAGCTAAAATATAAAACTATTTGTGAAAAAAAAAAAAAAAAAAAGAACGGAAGAGAACCCGGGGGCGGG
TTAATTGGTCAAGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTG

AATCATAGATAAGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTGTAGTCACCTGTGAATGGTTCTGGTAACTGACCTTCAAAGCTAAAATATAAAACTATTTGGGAAGCAAAAAAAAAAAAAAAAAAAAAAAAAGGAAGCGGGGGCGGG
CTTTAATAACAGGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTATCTCTTGGTTGTTGTTACCCTGGTAACTGACCTTCAAAGCTAAAATATAAAACTATTTGGGAAGTCAAAAAAAAAAAAAAAAAAAAAAATAGGACGCGGGCCGGG
TTTTCGTGTTCTGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTGTTAAGAACAAGTTTGGCTCAGCAAAAATTCGAGGTGATCTAAAATATAAAACTATTTGGGAAGTATGGAAAAAAAAAAAAAAAAAAAAAGTACGGGGGGGGGGG
CCTTTGTGCTTTGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTGTTAAGAACAAGTTCTGTGGGTGCAACTGACCTTCAAAGCTAAAATATAAAACTATTTGGGAAGTATGAAAAAAAAAAAAAAAAAAAAAAAGCCAACGGCCCCGG
CTGTAAGTAAGAGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCTCAATTCTGCTTG

TGTCGGCGTCCGGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTGTTAAGAACAAGTTTGGGTGCTCCATTCAACGCTAGCGCCTAAAATATAAAACTATTTGGGAAGTATTAAAAAAAAAAAAAAAAAAAAAAGGAGGGGGGGGGGGG
CCATGAGTTCCGGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTGTTAAGAACAAGTTATGTTAAACTAACTGACCTTCAAAGCTAAAATATAAAACTATTTGGGAAGTATAAAAAAAAAAAAAAAAAAAAGAAGGGACGGGGGGCGGG
AACTCGAAGTTAGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTGTTAAGAACAAGTTGCTCTAGGTTAACTGACCTTCAAAGCTAAAATATAAAACTATTTGGGAAGTATGGAAAAAAAAAAAAAAAAAAAAAGAAGGCGGGGGGCGG
CAATATTTGTGTGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTGTTAAGAACAAGTTGCTTGGTGTCAACTGACCTTCAAAGCTAAAATATAAAACTATTTGGGAAGTAAAAAAAAAAAAAAAAAAAAATTGGGAGCGGCGGGGCGGG
TAAAAATTGATCGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCACAATTCTGCTTG

ATGTTGGCAGACGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTGTTAAGAACAAGTTTATCGACTTAAACTGACCTTCAAAGCTAAAATATAAAACTATTTGGGAAGTATGAAACGAAAATAAAAAAAAAAAAAAAAAAAAGGGTCGG
AAAAATTCTTATGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCGCAATTCTGCTTGTTAAGAACAAGTTTGCTATTTTAAACTGACCTTCAAAGCTAAAATATAAAACTATTTGGGAAGTAGAAAAAAAAAAAAAAAAAAAAAAAGGGCACGGGGGCGGG
TTTCGATCCTGAGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGACTGATAGTAAGGCCCATTACCTGCGGCCACAATTCTGCTTGTTAAGAACAAGTTTGGCTACATGGCAGCTCGTCTTAACCTAAAATATAAAACTATTTGGGAAGTATGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGGGAGCGG


KeyboardInterrupt: 

In [4]:
# Sanity check of the poly-A detector on one hand-picked read: print the
# offset at which the first fuzzy match starts.
proximal_regex = re.compile(r"(AAAAAAAAAAAAAAAAAAAA){s<=3}")

test_re = proximal_regex.search(
    'TTTAAGTTTTTTTGATAGTAAGGCCCATTACCTGAGGCCGCAATTCTGCTTGTTAAGAACAATCCCAGTTCTGGTAACTGACCTTCAAAGCTAAAATATAAAACTATTTGGGAAGTATGAAAAAAAAAAAAAAAAAAAAACCGGTTTCCGGATGGGGAGGGCGCCCGGGGGGGGGGCGGGCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG'
)

print(test_re.start())

115


In [7]:
# Sanity check of the prefixed poly-A detector on one hand-picked read:
# print the offset at which the first match starts.
#
# This deliberately uses the pipeline's own `proximal_regex_prefix` instead
# of rebinding `proximal_regex`: overwriting that global with the prefixed
# pattern would silently change the `polya` / `is_proximal` columns if the
# processing cell were re-run after this one.
test_re = proximal_regex_prefix.search(
    'TTTAAGTTTTTTTGATAGTAAGGCCCATTACCTGAGGCCGCAATTCTGCTTGTTAAGAACAATCCCAGTTCTGGTAACTGACCTTCAAAGCTAAAATATAAAACTATTTGGGAAGTATGAAAAAAAAAAAAAAAAAAAAACCGGTTTCCGGATGGGGAGGGCGCCCGGGGGGGGGGCGGGCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG'
)

# Fail with a clear message rather than an AttributeError on None.
assert test_re is not None, 'prefixed poly-A pattern did not match the test read'

print(test_re.start())

119
