# Guide RNA design  

In [1]:
from Bio import Seq
import re
from itertools import product 
from ipywidgets import *  
from IPython.display import display

In [2]:
exampleSequence = "TGAGAGAAGAGTTTCATATTTGCAAGGTCTCAGACATGCCTTTAAAATTTCATAATACTTCTTCTTGTGTTTCCCATGCTTGATGGGGCTCAGAATTGACAGTGTACATTTTCAGTAATACAGAGATGAGAAGGGTCAGAGAGGAAGTAAAGTGGCTAGGACTGGCTATGTAAATTGCCAAGGGGTGCAGTTCAAATGAAAATGTGAGACCTCTAATATAAAATTTCATGATGGGGATAATGGATCATGAAACTAATCGTGGGGCTTTGTGCGACACCACGGGACACTTGCCTACAAAGCCATCCCTACTCCCTGCTTAAACGGAGACAAGAACACATTGGTGATTCTCAAATGCAAACTGTCCCTTTACTGAAAACCTAATGAGTTTAATGCTTACTATACTGTAGTCCCTTATGTGCTGATTACATTGTGGAATGCTGCAGGGAGAAAAACAAATTCACCTAATGATGCTAAAGAACCATTAGGAGACTTTACTAGTTTAGGTCACAAGTGCCTAGAAATACAGAAATGATCTTTGACCTTTCCTCCATTTTTCAAAAGGACGCTATTTCTGTGAACCTCATTGACCTCTCCAGACAAAGTCCCAGATTGCCTTTGCCAAAGTTAACAACCCATCTTGGCTTCTTGCCATTTCTCATGTATTCACTAGATCAGTGCATGACTGCTGCTCTCAACCCACATCATGGGGCCAGAGCCTTCACTGGTTAATATTTATAAAAAAATACTTTGAAGATTAAATCCTTGGTCAGTAGAGAAAACACTAGACATGGATAGAACAAGAAATTGTGGGGTCTGGGCCTCCTCCCAGAACTGCCACCACCAGACAATGTTATCTTTGACAGATTTGTGGTATCTGGGTGGCTGTACTTTTCTTTTGGTGAAATAGCAAAAGCCAAAAAAAGAGACTGTAACATCTGATGGGATTTTGCCCAGCTCCAGTACTGAATTCCTGGGTTGCCTTAGAGGAAAGGTATTGACTGTTTCATCTCTTTGGCCCTCAGTATACTCCCCCAACCCAAATCCACCCCTCCGTCATACACACATCTTACTCTGGAAATGATTCTTAGCAGGAAAACATACTATTATGAGGAACATACTTAGCAGTGCTCTCATAACCCTGCCAAGGGCTGGCATTTTGATTTTGTTTGCACACAGAAACCTGTTTGACTGAAATCTGGAAAAAAAAAAAAAAAAAAAAGAAAAACCAGACACGGGAGAAAAATAAAAATCACAGTAGTTTTATACATACATTTGATTTCTAAGGCCAGATAACTGATTTTGTTACCATTTTCTATCCTGTTATTCTGGTGCTGCGAATGCACTAGCTAGAAGTAAAAGTTGTGGGGAAAGGAAGAAGTTGGTTCTGAGTTTATTTTCCCCAGGGTTTCTGAGGGAGTTCCACCCTCTCCCCTGAAAGGGAGTTGGGGCAAATCTATGTTTCTAATTCCACATCAAGTGTAACTATAGCCTGTTCAGAATATTCCTCTTTGAAACAGCACTTTGAGAACAGGGAGCTGCAGCAGTGCAAGACAGCTTGTGGTTGGCTTGGGACTTTAGCCTCAACCCAAATTTGGAACTGGGAACATCTCCAGTCAGCTCTATGGCTAGTCTCACCTTCCTGATCAGCCACTTGGGCGGGCTGTAAGCCTGGCTAAACCAGGTCACCTCAACATTTGTGTCAGGGCAAAGAGTGGAGGAGAGAAGGTGTTTCTTCAACTCCTGGCTTTTAGCTGTGCCTGGATGCCTGCATACTGTTAAAATTCAGCTGCCAGCCTTATCTTATCCGTAATCAGTTGAATTTATTACAATAATCTGATTTTAGAAGCCCCTGAGGCATGGCTTGAGGTCCTATATTGACTTAGGCCCTTGGCTCTGAGGCATTAGAGTCCTGAATTGACTTTAAAGAGGCATAGAAACATAGATGTCTACCAAAGGCTTCAAACTGGCCAACTCTCAGTCT
ATTTCAACATGCCCTACTGAGCCAATCACTCTGTGCCAAGCCAGTACTAAGAACTGGGGGGACAAAGTGGACTGAGTCTACTCCCTGCTCTTCAGAAGCTCAAAGGCTGTTTATGAAACCAGAGAAGTAAATATATGCTTTCAATATAATGTGATTAATTATAGGCTAGAAATGAGATTGAATGCCTTGAAAGCACTGAGGGTGAGGATCCAACACAACCTAGGGGCATCGGGGAAGACTTCCTGTAAAAGGGATCGCCAGGTAGGGTTGAAAGTAGTTGGGGGGAAACAGTGAGAAATAAATGCCAAAGAAGAAGCAGAAAAAGCAAAGGCAAGGAGGTCTCTTACAGCACTCTATGCTCAGGAAATTATAATGAGTCAGGATAAGTAAACCT"

In [3]:
# Function to expand ambiguous IUPAC letters. It generates every concrete
# sequence encoded by an ambiguous DNA string, expanding the characters defined
# by the IUPAC standard, except 'N' (expanding it would create a huge overhead).
def extend_ambiguous_expectN_dna(seq):
    """Return a list of all sequences encoded by an ambiguous DNA string.

    Every letter of ``seq`` is replaced by each base it stands for in
    Biopython's ``ambiguous_dna_values`` table, and all combinations are
    produced. 'N' is kept as a literal 'N' instead of being expanded.

    Args:
        seq: upper-case string of IUPAC DNA letters.

    Returns:
        List of strings, each the same length as ``seq``. An empty ``seq``
        gives ``['']``.

    Raises:
        KeyError: if ``seq`` contains a character missing from the table.
    """
    # Imported from Bio.Data directly so the function does not depend on
    # Bio.Alphabet (reached via Seq.IUPAC), which newer Biopython no longer has.
    from Bio.Data import IUPACData

    # Work on a copy: assigning 'N' into the library's own dictionary would
    # change it for every other user of the table in this session.
    table = dict(IUPACData.ambiguous_dna_values)
    table['N'] = 'N'  # here we exclude expansion of N
    return ["".join(combo) for combo in product(*(table[base] for base in seq))]

In [4]:
def _bed_lines_for_strand(motif, text, idx1, idx2, chromName, offset, namePrefix, strand):
    """Return one BED line per (possibly overlapping) match of ``motif`` in ``text``.

    ``motif`` may contain IUPAC ambiguity letters: each is expanded to its
    concrete bases, except 'N', which becomes the regex wildcard '.'.
    ``idx1:idx2`` is the slice of every match that is reported. Matches are
    numbered from 1 across all expansions of the motif.
    """
    bedLines = []
    hit = 0
    for expanded in extend_ambiguous_expectN_dna(motif):
        # Zero-width lookahead with a capture group: plain finditer would skip
        # matches that overlap a previous one.
        pattern = re.compile(r'(?=(' + expanded.replace('N', '.') + '))')
        for match in pattern.finditer(text):
            hit += 1
            # NOTE(review): start/end are written as offset+index and
            # offset+index+length-1. BED expects a 0-based start and an
            # exclusive end - confirm whether ChrStart is 0- or 1-based.
            first = offset + match.start() + idx1
            last = offset + match.start() + idx2 - 1
            name = namePrefix + '_' + str(hit) + '_' + match.group(1)[idx1:idx2]
            bedLines.append('\t'.join([chromName, str(first), str(last), name, '0', strand]) + '\n')
    return bedLines


def on_button_clicked(b):
    """Search the sequence for every configured motif and fill the results box.

    Click handler of the Start button. Reads the ``sequence``, ``motifBox``,
    ``chrom`` and ``chrPos`` widgets and writes UCSC custom-track text into
    ``resultBox.value``: a ``browser position`` line, a ``track`` line, then
    one BED line per match.

    Each non-blank line of ``motifBox`` has the form
    ``MotifID,motif,start,length``; ``start`` (1-based) and ``length`` select
    the part of each match that is reported. Every motif is searched as given
    ('+' strand) and as its reverse complement ('-' strand).

    Args:
        b: the clicked button (unused).

    Raises:
        IndexError, ValueError: if a non-blank motif line has fewer than four
            comma-separated fields or non-integer start/length.
    """
    # The IUPAC ambiguity letters are kept in the sequence (so positions do not
    # shift) but are not expanded; only a motif 'N' can match them.
    cleanSequence = re.sub(r"[^ACTGRYSWKMBDHV]", '', sequence.value.upper())
    chromName = chrom.value
    offset = chrPos.value

    out = [
        'browser position ' + str(chromName) + ':' + str(offset) + '-' + str(offset + len(cleanSequence)) + '\n',
        'track name="ColorByStrandDemo" description="Color by strand demonstration" ' + 'visibility=2 colorByStrand="255,0,0 0,0,255"\n',
    ]

    # We have the sequence, iterate through the motifs.
    for motifLine in motifBox.value.split('\n'):
        if not motifLine.strip():
            # Blank lines (e.g. a trailing newline in the text area) carry no motif.
            continue
        motifDesc = motifLine.split(',')
        motifName = motifDesc[0]
        motif = motifDesc[1]
        motifStart = int(motifDesc[2])
        motifLen = int(motifDesc[3])
        cleanMotif = re.sub(r"[^ACTGRYSWKMBDHVN]", '', motif.upper())
        # No alphabet argument: reverse_complement() handles ambiguity codes
        # without it, and the alphabet objects are absent from newer Biopython.
        cleanMotif_rc = str(Seq.Seq(cleanMotif).reverse_complement())

        # Forward strand: the reported part is counted from the motif start.
        idx1 = motifStart - 1
        idx2 = idx1 + motifLen
        out.extend(_bed_lines_for_strand(cleanMotif, cleanSequence, idx1, idx2,
                                         chromName, offset, motifName + '_' + 'gRNA_', '+'))

        # Reverse strand: the motif is mirrored, so the reported part is
        # counted from the end of the match.
        idx2 = len(cleanMotif_rc) - (motifStart - 1)
        idx1 = idx2 - motifLen
        # The name carries the bases as they appear in the input sequence, not
        # their reverse complement.
        out.extend(_bed_lines_for_strand(cleanMotif_rc, cleanSequence, idx1, idx2,
                                         chromName, offset, motifName + '_' + 'gRNArc_', '-'))

    resultBox.value = ''.join(out)
    
def handle_motif(m):
    """Placeholder motif handler: accepts ``m``, does nothing, returns ``None``."""
    return None


In [5]:
# Default content of the motif box, one motif per line in the form
# MotifID,motif,start,length.
motifText = '\n'.join([
    'SpCas9,GNNNNNNNNNNNNNNNNNNNNGG,1,20',
    'Sa23Cas9,GNNNNNNNNNNNNNNNNNNNNNNNNGRRT,1,23',
    'Sa22Cas9,GNNNNNNNNNNNNNNNNNNNNNNNGRRT,1,22',
    'Sa21Cas9,GNNNNNNNNNNNNNNNNNNNNNNGRRT,1,21',
    'AsCpf1,TTTNNNNNNNNNNNNNNNNNNNN,5,19',
])

In [6]:
# Widget sizes are given through Layout objects (as motifBox already does):
# on current ipywidgets releases width/height are not widget traits, so
# passing them as constructor keywords does not size the widget.
tBox_layout = Layout(height='400px', width='800px')
smallBox_layout = Layout(width='200px')
miniBox_layout = Layout(width='40px')

# Motif table, one "MotifID,motif,start,length" entry per line.
motifBox = widgets.Textarea(description='', value=motifText, layout=tBox_layout)

# Chromosome name and start position used to place the matches.
chrom = widgets.Text(description='Chr:', value='chrX', layout=Layout(width='20%'))
chrPos = widgets.BoundedIntText(description='ChrStart:', min=1, max=999999999, value=67523514,
                                layout=Layout(width='20%'))

startButton = widgets.Button(description='Start', button_style='info')
startButton.on_click(on_button_clicked)

# The accordions get a width only. The original height=400 is not carried
# over: their contents (400px / 500px text areas) are taller, so a fixed
# 400px height would clip them.
page0 = widgets.Box(children=[widgets.HBox(children=[motifBox]), widgets.HBox(children=[chrom, chrPos])])
accord0 = widgets.Accordion(children=[page0], layout=Layout(width='800px'))
accord0.set_title(0, 'MotifID,motif,start,length')

sequence = widgets.Textarea(description='', value=exampleSequence,
                            layout=Layout(width='90%', height='500px'))
page1 = widgets.Box(children=[sequence])
accord1 = widgets.Accordion(children=[page1], layout=Layout(width='800px'))
accord1.set_title(0, 'Sequence')

resultBox = widgets.Textarea(description='', value='',
                             layout=Layout(width='90%', height='500px'))
page2 = widgets.Box(children=[resultBox])
accord2 = widgets.Accordion(children=[page2], layout=Layout(width='800px'))
accord2.set_title(0, 'Results')

display(startButton, accord0, accord1, accord2)



This tool takes a genetic _Sequence_ and looks for patterns which match the given _Motif_. In the motif, the 'N' character matches any letter in the sequence. The user can truncate the reported matching sequences by setting the _start_ and _length_ fields (the third and fourth values of each `MotifID,motif,start,length` line).   
After pressing the _Start_ button, the truncated matching sequences and their positions are listed in the _Results_ box.  

- IUPAC ambiguity characters (other than ACGT) are expanded in Motif, but not in Sequence, where they can only be matched by an 'N' in the motif
- all lower case characters in the sequence are converted to upper case
- all characters except 'A','C','G','T' and the IUPAC ambiguity characters RYSWKMBDHV will be removed from Sequence (this includes 'N', whitespace and line breaks)
- all characters except 'A','C','G','T' and IUPAC characters: RYSWKMBDHV, and 'N' - which matches any base will be removed from Motif
- !!! output BED files use 0-based indexing!!! See: https://genome.ucsc.edu/FAQ/FAQformat.html#format1
