# Guide RNA design based on dbSNP rs numbers  

In [1]:
from IPython.core.display import HTML
# Page styling, code-cell indentation and the code toggle button are bundled
# into ONE HTML object. A cell only renders the value of its last expression,
# so separate bare HTML(...) expressions would silently drop all but the last
# one (i.e. the two <style> blocks would never reach the page).
HTML('''
<style>
    div.prompt {display:none}
    div.cell{
        width:100%;
        margin-left:1%;
        margin-right:1%;
    }
</style>
<style>
    div.input{
        width:100%;
        padding-left:2em;
        padding-right:0em;
    }
</style>
<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').show();
 } else {
 $('div.input').hide();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="See usage information at the end! Click here to toggle on/off the raw code. "></form>''')

In [2]:
from Bio import Seq
import re
from itertools import product
import pandas as pd
#  pip3.4  install sqlalchemy --user
# pip3.4  install pymysql --user 
from sqlalchemy import create_engine
import requests
import xml.etree.ElementTree as ET

In [3]:
# Function to expand ambiguous IUPAC letters. It generates all possible sequences by
# expanding the characters defined by the IUPAC standard, except 'N' (since that would
# create a huge overhead).
def extend_ambiguous_expectN_dna(seq):
    """Return the list of all sequences encoded by an ambiguous DNA string.

    Every character of ``seq`` is looked up in Biopython's
    ``ambiguous_dna_values`` table and replaced by each base it stands for;
    the Cartesian product of these choices is returned. 'N' is deliberately
    left as a literal 'N' in the output instead of being expanded.

    Parameters
    ----------
    seq : str
        DNA string whose characters are keys of the IUPAC table.

    Returns
    -------
    list of str
        All expansions, in ``itertools.product`` order. An empty ``seq``
        yields ``['']``.

    Raises
    ------
    KeyError
        If ``seq`` contains a character that is not a key of the table.
    """
    # Import the table module directly instead of reaching it through the
    # namespace of Bio.Seq.
    from Bio.Data import IUPACData

    # Work on a copy: ambiguous_dna_values is a module-level dict, so writing
    # the 'N' override into it would change it for every other user of the
    # table in this process.
    expansions = dict(IUPACData.ambiguous_dna_values)
    expansions['N'] = 'N'  # here we exclude expansion of N
    return ["".join(combo) for combo in product(*(expansions[base] for base in seq))]

In [4]:
# Handle Start button click. Runs the motif match and updates the Results.
def _motif_hits_as_bed(motif_seq, target_seq, chrom_name, seq_origin, cut_from, cut_to, label, strand):
    """Return one BED line per (possibly overlapping) motif hit in ``target_seq``.

    ``motif_seq`` is expanded with ``extend_ambiguous_expectN_dna``; every
    expansion is searched in ``target_seq``. For each hit only the slice
    ``[cut_from:cut_to]`` of the matched text is reported. ``seq_origin`` is
    added to the position inside ``target_seq`` to obtain the coordinates
    written to the line; hits are numbered from 1 and named
    ``<label>_<number>_<slice>``.
    """
    bed_lines = []
    hit_no = 1
    for expanded in extend_ambiguous_expectN_dna(motif_seq):
        # 'N' stays unexpanded and is matched as a regex wildcard. Wrapping the
        # motif in a lookahead makes finditer report overlapping matches too.
        pattern = r'(?=(' + re.sub(r"N", '.', expanded) + '))'
        for match in re.finditer(pattern, target_seq):
            hit = match.group(1)
            offset = seq_origin + match.start()
            # NOTE(review): the third column is the last *included* base
            # (cut_to - 1); BED's chromEnd is normally exclusive — confirm the
            # intended coordinate convention before changing it.
            bed_lines.append(
                chrom_name + '\t' + str(offset + cut_from) + '\t' + str(offset + cut_to - 1)
                + '\t' + label + '_' + str(hit_no) + '_' + str(hit[cut_from:cut_to])
                + '\t0\t' + strand + '\n'
            )
            hit_no += 1
    return bed_lines


def on_button_clicked(b):
    """Search the sequence for the motif on both strands and show the hits.

    Reads the ``motif``, ``sequence``, ``outStart``, ``outLength``, ``chrom``
    and ``chrPos`` widgets and writes the result text (three UCSC
    browser/track header lines followed by one line per hit) into
    ``resultBox``. Forward-strand hits are listed first ('+', names
    ``gRNA_<n>_...``), then hits of the reverse-complemented motif ('-',
    names ``gRNArc_<n>_...``).

    Parameters
    ----------
    b : object
        The clicked button as passed by ``on_click``; not used.
    """
    cleanMotif = re.sub(r"[^ACTGRYSWKMBDHVN]", '', motif.value.upper())
    # No alphabet argument is passed: the cleaned motif only contains DNA
    # IUPAC letters, and this avoids depending on Seq.IUPAC being available.
    cleanMotif_rc = str(Seq.Seq(cleanMotif).reverse_complement())
    # the IUPAC letters are allowed, but not matched in Sequence
    cleanSequence = re.sub(r"[^ACTGRYSWKMBDHVN]", '', sequence.value.upper())

    # We need only a certain part of each matched sequence, defined by
    # outStart (1-based in the UI) and outLength; convert to 0-based slicing.
    fwdFrom = outStart.value - 1
    fwdTo = fwdFrom + outLength.value
    # On the reverse strand the same part is counted from the other end of the
    # motif (all expansions have the same length as the motif itself).
    motifLen = len(cleanMotif_rc)
    revFrom = motifLen - fwdTo
    revTo = motifLen - fwdFrom

    header = [
        'browser position ' + str(chrom.value) + ':' + str(chrPos.value) + '-'
        + str(chrPos.value + len(cleanSequence)) + '\n',
        'browser hide all\n',
        'track name="ColorByStrandDemo" description="Color by strand demonstration" '
        + 'visibility=2 colorByStrand="255,0,0 0,0,255"\n',
    ]
    forwardHits = _motif_hits_as_bed(cleanMotif, cleanSequence, chrom.value, chrPos.value,
                                     fwdFrom, fwdTo, 'gRNA', '+')
    reverseHits = _motif_hits_as_bed(cleanMotif_rc, cleanSequence, chrom.value, chrPos.value,
                                     revFrom, revTo, 'gRNArc', '-')

    # Assigning .value is what updates the widget; no further call is needed.
    resultBox.value = ''.join(header + forwardHits + reverseHits)
    
def handle_motifDropDown(m):
    """Load the preset that belongs to the current drop-down selection.

    Uses ``motifDropDown.value`` as an index into the preset table and copies
    the selected pattern, start position and length into the ``motif``,
    ``outStart`` and ``outLength`` widgets.

    Parameters
    ----------
    m : object
        Argument supplied by the trait-change notification; not used.
    """
    # (motif pattern, value for the "From" field, value for the "Length" field)
    presets = (
        ('GNNNNNNNNNNNNNNNNNNNNGG', 1, 20),
        ('GNNNNNNNNNNNNNNNNNNNNNNNNGRRT', 1, 23),
        ('TTTNNNNNNNNNNNNNNNNNNNN', 5, 19),
    )
    pattern, start, length = presets[motifDropDown.value]
    # Assigning .value is what updates each widget; no further call is needed.
    motif.value = pattern
    outStart.value = start
    outLength.value = length

In [5]:
# Handle getRsPosButton button click. Get genomic position from UCSC based on dbSNP rs number.
def getDbSnpPosition(b):
    """Look up the rs number in UCSC's MySQL server and fill in its position.

    Queries table ``hg38.snp142`` for the name typed into ``rsID`` and copies
    chrom / chromStart / chromEnd of the first returned row into the
    ``chromRS``, ``chromRSStart`` and ``chromRSEnd`` widgets.

    Parameters
    ----------
    b : object
        The clicked button as passed by ``on_click``; not used.

    Raises
    ------
    ValueError
        If the query returns no row. The three position widgets are reset
        first so that coordinates of a previous lookup are not left behind.
    """
    engine = create_engine("mysql+pymysql://genome@genome-mysql.cse.ucsc.edu")

    # UCSC dbSNP table schema: http://ucscbrowser.genap.ca/cgi-bin/hgTables?db=hg38&hgta_group=varRep&hgta_track=snp142Common&hgta_table=snp142Common&hgta_doSchema=describe+table+schema

    # The rs number is free text typed by the user, so it is passed as a bound
    # parameter instead of being concatenated into the SQL statement.
    rsName = rsID.value.strip()
    query = "select chrom, chromStart, chromEnd, name from hg38.snp142 WHERE name = %(rs_name)s"
    dbSNP_data = pd.read_sql_query(query, engine, params={'rs_name': rsName})

    if dbSNP_data.empty:
        chromRS.value = ''
        chromRSStart.value = 0
        chromRSEnd.value = 0
        raise ValueError("dbSNP rs number not found in hg38.snp142: " + repr(rsName))

    # Positional access to the first row; columns follow the SELECT order.
    chromRS.value = str(dbSNP_data.iloc[0, 0])        # chrom
    chromRSStart.value = int(dbSNP_data.iloc[0, 1])   # chromStart
    chromRSEnd.value = int(dbSNP_data.iloc[0, 2])     # chromEnd

In [6]:
# Handle getSequence button click. Get sequence from UCSC based on genomic coordinates of the SNP.
# Downloads +/- 'window' nucleotides around the chromRSStart position.
def getSequenceFromUCSC(b):
    """Download the sequence around the SNP position and show it.

    Requests ``chromRSStart`` +/- ``seqWindow`` bases on ``chromRS`` from the
    UCSC DAS server, using the part of ``dbSNPname`` before the first '.' as
    genome version. The cleaned, upper-cased sequence is written to the
    ``sequence`` widget, and ``chrom`` / ``chrPos`` are set to the chromosome
    and to the first requested position for the later BED output.

    If ``chromRS`` does not contain 'chr', or the reply holds no sequence,
    a message is written to ``sequence`` instead and nothing else is changed.

    Parameters
    ----------
    b : object
        The clicked button as passed by ``on_click``; not used.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    if chromRS.value.find('chr') == -1:
        sequence.value = 'Invalid dbSNP position! Run the above cell first!'
        return

    # http://genome.ucsc.edu/cgi-bin/das/hg19/dna?segment=chr1:100000,200000
    genomeVersion = str(dbSNPname.value).split('.')[0]
    windowLen = seqWindow.value
    # Keep the window inside the chromosome when the SNP is closer to its
    # start than the window size.
    # NOTE(review): assumes DAS segment coordinates are 1-based — confirm.
    segStart = max(1, chromRSStart.value - windowLen)
    segEnd = chromRSStart.value + windowLen
    url = 'http://genome.ucsc.edu/cgi-bin/das/' + genomeVersion \
        + '/dna?segment=' + str(chromRS.value) + ':' + str(segStart) + ',' \
        + str(segEnd)

    # A timeout keeps the callback from hanging forever; HTTP errors are
    # raised here instead of surfacing later as an XML parsing failure.
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    dnaNode = ET.fromstring(response.text).find('SEQUENCE/DNA')
    if dnaNode is None or dnaNode.text is None:
        sequence.value = 'No sequence returned by UCSC for the requested position!'
        return

    sequence.value = re.sub(r"[^ACTGRYSWKMBDHVN]", '', dnaNode.text.upper())
    # update also the reference coordinates to be used for generating BED
    chrom.value = chromRS.value
    chrPos.value = segStart

In [7]:
#%matplotlib inline
#import pandas as pd  
#import matplotlib.pyplot as plt  
from ipywidgets import *  
from IPython.display import display  

def _style_action_button(button):
    """Give an action button the look shared by all buttons of this notebook."""
    button.background_color = "cornflowerblue"
    button.button_style = 'Info'
    button.font_weight = 'bold'
    button.font_size = '14pt'


# ------ Motif

# drop-down menu (name + general formula)
#1. SpCas9 - G-N(19)-NGG
#2. SaCas9 G-N(22)-NNGRRT
#3. AsCpf1 - TTTN-N(19)

# box (sequence + from-to)
#1. GNNNNNNNNNNNNNNNNNNNNGG (1-20)
#2. GNNNNNNNNNNNNNNNNNNNNNNNNGRRT (1-23)
#3. TTTNNNNNNNNNNNNNNNNNNNN (5-23)

motifDropDown = widgets.Dropdown(
    options={'SpCas9: G-N(19)-NGG': 0, 'SaCas9: G-N(22)-NNGRRT': 1, 'AsCpf1: TTTN-N(19)': 2},
    value=0,
    description='Motif:',
)
# Registered without a trait name, so the handler runs on any trait change of the drop-down.
motifDropDown.on_trait_change(handle_motifDropDown)

motif = widgets.Text(description='pattern:', value='GNNNNNNNNNNNNNNNNNNNNGG',width='400px')

#TODO: use: max=len(motif.value), but it is not updated when motif is changed 
outStart = widgets.BoundedIntText(description='From:', min=1, max=999999, value=1,width='40px')
outLength = widgets.BoundedIntText(description='Length:', min=1, max=999999, value=20,width='40px')
chrom = widgets.Text(description='Chr:',width='60px')
chrPos = widgets.BoundedIntText(description='ChrStart:', min=0, max=999999999, width='90px')

startButton = widgets.Button(description='Start', 
                             width='100px',height='40px', border_width=10, border_radius = 15, margin=10)
startButton.on_click(on_button_clicked) 
_style_action_button(startButton)
# --------


# ------------
# Genomic coordinates from RS number
chromRS = widgets.Text(description='Chr:',width='60px')
chromRSStart = widgets.BoundedIntText(description='Start:', min=0, max=999999999, width='90px')
chromRSEnd = widgets.BoundedIntText(description='End:', min=0, max=999999999,  width='90px')
rsID = widgets.Text(description='rs number:', value='rs339331',width='200px', margin=8)
dbSNPname = widgets.Text(description='dbSNP version:', value='hg38.snp142',width='100px', margin=8)
getRsPosButton = widgets.Button(description='Get SNP position', 
                             width='200px',height='40px', border_radius = 15)
getRsPosButton.on_click(getDbSnpPosition) 
_style_action_button(getRsPosButton)

pageRS = widgets.Box(children=[widgets.HBox(children=[rsID,dbSNPname,getRsPosButton]),widgets.HBox(children=[chromRS,chromRSStart,chromRSEnd])])
#chrom,chrPos
accordRS = widgets.Accordion(children=[pageRS], width=800)
accordRS.set_title(0, 'Genomic position based on dbSNP rs name')
# ------------


page0 = widgets.Box(children=[widgets.HBox(children=[motifDropDown, motif,outStart,outLength],margin=8),widgets.HBox(children=[chrom,chrPos],margin=8)])
#chrom,chrPos
accord0 = widgets.Accordion(children=[page0], width=800)
accord0.set_title(0, 'Motif')

# ------------
sequence = widgets.Textarea(description='', width=800,height=400)
seqWindow = widgets.BoundedIntText(description='+/- window size:', min=1, max=999999, value=20,width='60px', margin=8)
getSeq = widgets.Button(description='Get sequence', 
                             width='200px',height='40px', border_radius = 15)
getSeq.on_click(getSequenceFromUCSC) 
_style_action_button(getSeq)
page1 = widgets.Box(children=[widgets.HBox(children=[seqWindow,getSeq]),sequence])
accord1 = widgets.Accordion(children=[page1], width=400)
accord1.set_title(0, 'Sequence around SNP position')
# ------------

resultBox = widgets.Textarea(description='', width=800,height=400)
accord2 = widgets.Accordion(children=[resultBox], width=400)
accord2.set_title(0, 'Results')

# Displayed in the order the user works through them: lookup, sequence, motif, start, results.
display(accordRS,accord1,accord0,startButton,accord2)



This tool takes a genetic _Sequence_ and looks for patterns that match the given _Motif_. In the motif, the 'N' character matches any letter in the sequence. The user can truncate the displayed matching sequences by setting the _From_ and _Length_ fields.  
After pressing the _Start_ button, the truncated matching sequences and their positions are listed in the output.  

- IUPAC characters, other than ACGT are handled in Motif, but not in Sequence
- all lower case characters in the sequence are converted to upper case
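- all characters except 'A','C','G','T' and the IUPAC ambiguity codes (RYSWKMBDHV and 'N'), including whitespace and line breaks, will be removed from Sequence; ambiguity codes left in the Sequence are matched only by 'N' in the Motif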
- all characters except 'A','C','G','T' and IUPAC characters: RYSWKMBDHV, and 'N' - which matches any base will be removed from Motif
- !!! output BED files use 0-based indexing!!! See: https://genome.ucsc.edu/FAQ/FAQformat.html#format1
