1. Learning sgRNA predictors from empirical data
    * Load scripts and empirical data
    * Generate TSS annotation using FANTOM dataset
    * Calculate parameters for empirical sgRNAs
    * Fit parameters
2. Applying machine learning model to predict sgRNA activity
    * Find all sgRNAs in genomic regions of interest 
    * Predicting sgRNA activity
3. Construct sgRNA libraries
    * Score sgRNAs for off-target potential
    * Pick the top sgRNAs for a library, given predicted activity scores and off-target filtering

# 1. Learning sgRNA predictors from empirical data
## Load scripts and empirical data

In [None]:
#Set LARGE_FILE_DIR below to the base directory that contains the large_data_files directory (the large genome data files).
#LARGE_FILE_DIR= '../../../../data/'
LARGE_FILE_DIR= './'

FASTA_FILE_OF_GENOME= LARGE_FILE_DIR + 'large_data_files/hg19.fa'
GTF_FILE_FROM_GENCODE = LARGE_FILE_DIR + 'large_data_files/gencode.v19.annotation.gtf'

TSS_TABLE_PATH='data_files/human_tssTable.txt'
P1P2_TABLE_PATH='data_files/human_p1p2Table.txt'

FANTOM_TSS_ANNOTATION_BED= LARGE_FILE_DIR + 'large_data_files/TSS_human.sorted.bed.gz'
HGNC_SYMBOL_LOOKUP_TABLE= LARGE_FILE_DIR + 'large_data_files/hgnc_complete_set_2020-08-01.txt'

#spreadsheet containing the lab experiment data to train the model
IGEM_EXCEL_FILE= 'data_files/Sam_Perli_qRT_pcr_data_per_gene.xlsx'


PICKLE_FILE = 'igem_v1_estimator'
TRANSFORMED_PARAM_HEADER='igem_v1_transformed_param_header'

PREDICTED_SCORE_TABLE='igem_v1_predicted_score_table'
TEMP_FASTQ_FILE='igem_v1_temp_fastq_file'


%run sgRNA_learning.py

In [None]:
#read the experiment data
#passing the path (rather than an open() handle that is never closed) lets pandas open and close the workbook itself
df = pd.read_excel(IGEM_EXCEL_FILE, sheet_name='Sheet1')
#rows without an sgID cannot be used as index entries below, so drop them
df = df.dropna(subset=['sgID'])

#same data keyed by sgID, used later to compare lab measurements with predictions
df_research = df.set_index('sgID')


In [None]:
df_research

In [None]:
#Keep only the columns that describe each guide, keyed by sgID and renamed to 'sequence' / 'transcripts'
libraryTable_research = df[['sgID','gene','transcript','protospacer sequence']]
#.loc[1:,] is a label-based slice that keeps the rows whose index label is 1 or later
#NOTE(review): this drops any row labelled 0 - confirm that excluding it is intended
libraryTable_research = libraryTable_research.loc[1:,].set_index('sgID').rename(columns={'protospacer sequence':'sequence','transcript':'transcripts'})

#Exclude rows relating to the following gene so that we can calculate scores for it once we train our model
GENE_TO_EXCLUDE = 'EIF4G1'

#Select every guide except those of the excluded gene
libraryTable_subset = libraryTable_research[libraryTable_research['gene'] != GENE_TO_EXCLUDE]
#Read sgID
sgInfoTable = parseAllSgIds(libraryTable_subset)

#Read scores measured in the lab, with the same row slice and gene filter as the library table
normedScores = df[['sgID','gene','Measured by Dr. Perli']].loc[1:,]
#.copy() makes the filtered frame independent, so the column update below does not raise SettingWithCopyWarning
normedScores = normedScores[normedScores['gene'] != GENE_TO_EXCLUDE].copy()

#For the CRISPRi scores measured in Dr. Perli's lab with qRT-PCR, the lower the score the better the guide,
#but our model predicts in such a way that the higher the score the better the guide,
#so subtract the scores measured in the lab from 1
normedScores['Measured by Dr. Perli'] = 1 - normedScores['Measured by Dr. Perli']
#alternative transformation, kept for reference:
#normedScores['Measured by Dr. Perli']= np.log10(1/normedScores['Measured by Dr. Perli'])

#single score column indexed by sgID; the column is renamed to the empty string
normedScores = normedScores[['sgID','Measured by Dr. Perli']].set_index('sgID').rename(columns={'Measured by Dr. Perli':''})


In [None]:
sgInfoTable

In [None]:
normedScores

In [None]:
#Load genome data
genomeDict = loadGenomeAsDict(FASTA_FILE_OF_GENOME)
gencodeData = loadGencodeData(GTF_FILE_FROM_GENCODE)

## Read TSS annotation generated using FANTOM dataset

In [None]:
import ast

tssTable = pd.read_csv(TSS_TABLE_PATH,sep='\t', index_col=[0,1])
p1p2Table = pd.read_csv(P1P2_TABLE_PATH,sep='\t', header=0, index_col=[0,1], converters={"primary TSS": ast.literal_eval, "secondary TSS": ast.literal_eval})

p1p2Table.head()



## Calculate parameters for empirical sgRNAs

In [None]:
#Load bigwig files for any chromatin data of interest
bwhandleDict = {'dnase':BigWigFile(open(LARGE_FILE_DIR + 'large_data_files/wgEncodeOpenChromDnaseK562BaseOverlapSignalV2.bigWig','rb')),
'faire':BigWigFile(open(LARGE_FILE_DIR + 'large_data_files/wgEncodeOpenChromFaireK562Sig.bigWig','rb')),
'mnase':BigWigFile(open(LARGE_FILE_DIR + 'large_data_files/wgEncodeSydhNsomeK562Sig.bigWig','rb'))}

In [None]:
paramTable_trainingGuides = generateTypicalParamTableEx(libraryTable_subset,sgInfoTable, tssTable, p1p2Table, genomeDict, bwhandleDict)

In [None]:
paramTable_trainingGuides.head()

In [None]:
paramTable_trainingGuides.describe()

## Fit parameters

In [None]:
#populate table of fitting parameters: one 'type' and one 'params' entry per column of
#paramTable_trainingGuides, listed in column order
MIN_EDGE_DATA=10

#transformation type of every parameter column
typeList = (['binnable_onehot']
            + ['continuous']*8
            + ['binnable_onehot']*11
            + ['binary']
            + ['binary']*160
            + ['binary']*(16*38)
            + ['binnable_onehot']*3
            + ['binnable_onehot']*2
            + ['binary']*18)
fitTable = pd.DataFrame(typeList, index=paramTable_trainingGuides.columns, columns=['type'])

#settings matching typeList position by position:
# *binnable_onehot columns get a bin width, a minimum edge count and np.median as the bin function
# *continuous columns get a C/gamma grid (presumably searched inside fitParams - confirm)
# *binary columns get an empty dict
fitparams = [{'bin width':1, 'min edge data':MIN_EDGE_DATA, 'bin function':np.median}]
fitparams += [{'C':[.01,.05, .1,.5], 'gamma':[.000001, .00005,.0001,.0005]} for _ in range(8)]
fitparams += [{'bin width':1, 'min edge data':MIN_EDGE_DATA, 'bin function':np.median} for _ in range(4)]
fitparams += [{'bin width':.1, 'min edge data':MIN_EDGE_DATA, 'bin function':np.median} for _ in range(7)]
fitparams += [dict()]
fitparams += [dict()]*160
fitparams += [dict()]*(16*38)
fitparams += [{'bin width':.15, 'min edge data':MIN_EDGE_DATA, 'bin function':np.median} for _ in range(3)]
fitparams += [{'bin width':2, 'min edge data':MIN_EDGE_DATA, 'bin function':np.median} for _ in range(2)]
fitparams += [dict()]*18
fitTable['params'] = fitparams

In [None]:
#Generate random gene folds containing training and test data. We will select the best estimator generated from these random sets
geneFoldList = getGeneFoldsEx(libraryTable_subset, 17, transcripts=False)

In [None]:
#for each fold, fit parameters to training folds and measure R^2 of test scores
#Use Elastic Net CV linear regression
coefs = []
metricTups = []

#the guides that have a measured score are the same for every fold, so select those rows once
#instead of repeating dropna()/.loc four times per fold
scoredSgIds = normedScores.dropna().index
paramTable_scored = paramTable_trainingGuides.loc[scoredSgIds]
normedScores_scored = normedScores.loc[scoredSgIds]

for geneFold_train, geneFold_test in geneFoldList:

    #fit the per-parameter transformations on the training rows, then apply the fitted estimators to the test rows
    transformedParams_train, estimators = fitParams(paramTable_scored.iloc[geneFold_train], normedScores_scored.iloc[geneFold_train], fitTable)

    transformedParams_test = transformParams(paramTable_scored.iloc[geneFold_test], fitTable, estimators)

    reg = linear_model.ElasticNetCV(l1_ratio=[.5, .75, .9, .99,1], n_jobs=16, max_iter=2000)

    #the scaler is fitted on the training rows only; the test rows are scaled once and reused for predict and score
    scaler = preprocessing.StandardScaler()
    reg.fit(scaler.fit_transform(transformedParams_train), normedScores_scored.iloc[geneFold_train])
    scaledParams_test = scaler.transform(transformedParams_test)
    predictedScores = pd.Series(reg.predict(scaledParams_test), index=transformedParams_test.index)
    testScores = normedScores_scored.iloc[geneFold_test]

#    print ('Prediction AUC-ROC:', metrics.roc_auc_score((testScores >= .75).values, np.array(predictedScores.values,dtype='float64')))
    R2Score = reg.score(scaledParams_test, testScores)
    print ('Prediction R^2:', R2Score )
    print ('Regression parameters:', reg.l1_ratio_, reg.alpha_)
    #absolute and signed coefficient of every transformed parameter in this fold
    coefs.append(pd.DataFrame({'abs': abs(reg.coef_), 'true': reg.coef_}, index = transformedParams_test.columns, columns=['abs','true']))
    #a coefficient below 1e-11 in magnitude counts as unused
    numFeatures = len(coefs[-1]) - sum(coefs[-1]['abs'] < .00000000001)
    print ('Number of features used:', numFeatures)

    #everything needed later to pick the best fold and reuse its model
    metricTups.append((R2Score,numFeatures,geneFold_train,reg,scaler,(testScores,predictedScores),geneFold_test))
    
    

In [None]:
transformedParams_train.head()


In [None]:
#rank the folds by test R^2, then by number of features used, highest first; the first entry becomes our model
metricTupsSorted = sorted(metricTups, key=lambda foldMetrics: foldMetrics[:2], reverse=True)

#each entry is (R2Score, numFeatures, geneFold_train, reg, scaler, (testScores, predictedScores), geneFold_test)
bestFold = metricTupsSorted[0]
geneFold_train, reg, scaler, scoreTups = bestFold[2:6]
geneFold_test = bestFold[6]

#the per-parameter estimators are not stored in metricTups, so refit them (and transformedParams_train)
#on the training rows of the selected fold
scoredIndex = normedScores.dropna().index
transformedParams_train, estimators = fitParams(paramTable_trainingGuides.loc[scoredIndex].iloc[geneFold_train],
                                                normedScores.loc[scoredIndex].iloc[geneFold_train],
                                                fitTable)




In [None]:
print ('R^2 coefficient value selected: ', metricTupsSorted[0][0])
print ('Number of Features: ', metricTupsSorted[0][1])

In [None]:
#print the features utilized by model
print(transformedParams_train.iloc[:,reg.coef_!=0].columns)

In [None]:
#print the coefficients of the features calculated by regression algorithm
result= filter(lambda x: x != 0, reg.coef_)
print (list(result))

In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import math

#box plot of every transformed training parameter selected by filter(like='TSS'), three plots per row
dfPlot = transformedParams_train.filter(like='TSS')

items_per_row = 3
total_items = len(dfPlot.columns)
total_rows = math.ceil(total_items / items_per_row)
fig = make_subplots(rows=total_rows, cols=items_per_row)
for index, column in enumerate(dfPlot.columns):
    #fill the grid left to right, top to bottom; subplot rows and columns are numbered from 1
    cur_row = index // items_per_row + 1
    cur_col = index % items_per_row + 1
    fig.add_trace(go.Box(y=dfPlot[column], name=str(column)), row=cur_row, col=cur_col)

fig.update_layout(height=1000, width=1000,  showlegend=False)
fig.show() 


In [None]:
#subplot titles: the string form of every plotted column label
titles = [str(label) for label in dfPlot.columns.values]
#scores of the training rows of the selected fold, taken from the guides that have a score
normedScoresPlot = normedScores.loc[normedScores.dropna().index].iloc[geneFold_train]


In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import math
import numpy as np

#scatter every plotted parameter against the training scores and overlay a first-degree least-squares fit in red
items_per_row = 3
total_items = len(dfPlot.columns)
total_rows = math.ceil(total_items / items_per_row)
fig = make_subplots(rows=total_rows, cols=items_per_row, subplot_titles=titles)
for index, column in enumerate(dfPlot.columns):
    #fill the grid left to right, top to bottom; subplot rows and columns are numbered from 1
    cur_row = index // items_per_row + 1
    cur_col = index % items_per_row + 1

    paramValues = dfPlot[column]
    scoreValues = normedScoresPlot.iloc[:,0]
    fig.add_trace(go.Scattergl(x=paramValues,
                               y=scoreValues,
                               mode="markers",
                               marker=dict(size=3)),
                  row=cur_row,
                  col=cur_col)

    #fit score = a*value + b and evaluate the fitted line at each distinct parameter value
    distinctValues = np.unique(paramValues)
    fittedLine = np.poly1d(np.polyfit(paramValues, scoreValues, 1))

    fig.add_trace(go.Scatter(x=distinctValues,
                             y=fittedLine(distinctValues),
                             line=dict(color='red', width=1)),
                  row=cur_row,
                  col=cur_col)

fig.update_layout(height=1000, width=1000, showlegend=False)
print("X axis: Distance of guide from TSS, Y axis: Score of the guide")
fig.show()


In [None]:
#save the model of the selected fold
#the pickling of the scikit-learn estimators/regressors will allow the model to be reloaded for prediction of other guide designs, 
#   but will not be compatible across scikit-learn versions, so it is important to preserve the training data and training/test folds

#use the public pickle module (it picks the C implementation automatically) instead of the private _pickle module
import pickle as cPickle

#everything needed to transform and score new guides, plus the folds the model was trained and tested on
with open(PICKLE_FILE,'wb') as outfile:
    cPickle.dump((fitTable, estimators, scaler, reg, (geneFold_train, geneFold_test)), outfile)

#also save the transformed parameters as these can slightly differ based on the automated binning strategy
#only the head is stored: the prediction step uses it for its column labels
with open(TRANSFORMED_PARAM_HEADER,'wb') as paramfile:
    cPickle.dump(transformedParams_train.head(), paramfile)

# 2. Applying machine learning model to predict sgRNA activity

In [None]:
#starting from a new session for demonstration purposes:
%run sgRNA_learning.py
import _pickle as cPickle
#import cPickle

#load tssTable, p1p2Table, genome sequence, chromatin data
#both tables are tab separated and indexed by their first two columns
tssTable = pd.read_csv(TSS_TABLE_PATH,sep='\t', index_col=[0,1])
p1p2Table = pd.read_csv(P1P2_TABLE_PATH,sep='\t', header=0, index_col=[0,1])

def _parseTssTuple(tupString):
    #turn the text form of a pair, e.g. '(123.0, 456.0)', into a tuple of two ints
    fields = tupString.strip('()').split(', ')
    return (int(float(fields[0])), int(float(fields[1])))

#the TSS columns are read as text, so convert each of them back to integer pairs
for tssColumn in ('primary TSS', 'secondary TSS'):
    p1p2Table[tssColumn] = p1p2Table[tssColumn].apply(_parseTssTuple)

genomeDict = loadGenomeAsDict(FASTA_FILE_OF_GENOME)

bwhandleDict = {'dnase':BigWigFile(open(LARGE_FILE_DIR + 'large_data_files/wgEncodeOpenChromDnaseK562BaseOverlapSignalV2.bigWig','rb')),
'faire':BigWigFile(open(LARGE_FILE_DIR + 'large_data_files/wgEncodeOpenChromFaireK562Sig.bigWig','rb')),
'mnase':BigWigFile(open(LARGE_FILE_DIR + 'large_data_files/wgEncodeSydhNsomeK562Sig.bigWig','rb'))}

#load sgRNA prediction model saved after the parameter fitting step
#WARNING: unpickling executes code stored in the file, so only load pickle files that you created yourself
#the tuple layout must match the one written by the saving cell:
#(fitTable, estimators, scaler, reg, (geneFold_train, geneFold_test))
with open(PICKLE_FILE,'rb') as infile:
    fitTable, estimators, scaler, reg, (geneFold_train, geneFold_test) = cPickle.load(infile)
    
#iGEM: the transformed parameter header is stored as a pickle (binary) file, not as a tab separated table,
#so it is loaded with cPickle rather than pd.read_csv

with open(TRANSFORMED_PARAM_HEADER,'rb') as paraminfile:
    transformedParamHeader = cPickle.load(paraminfile)

#expose the header under the name the prediction cell uses to look up the training column labels
transformedParams_train = transformedParamHeader

## Find all sgRNAs in genomic regions of interest 

In [None]:
#For iGEM, for now score the gene that we excluded from the training data set
#NOTE: libraryTable_research and GENE_TO_EXCLUDE are created in section 1, so that cell must have been run in this session

#Select the guides of the gene excluded from the training set
libraryTable_new = libraryTable_research[libraryTable_research['gene'] == GENE_TO_EXCLUDE]

#Read sgID of the guides to be scored
sgInfoTable_new = parseAllSgIds(libraryTable_new)

In [None]:
libraryTable_new.head()


## Predicting sgRNA activity

In [None]:
#calculate parameters for new sgRNAs
paramTable_new = generateTypicalParamTableEx(libraryTable_new, sgInfoTable_new, tssTable, p1p2Table, genomeDict, bwhandleDict)

In [None]:
#transform and predict scores according to sgRNA prediction model
transformedParams_new = transformParams(paramTable_new, fitTable, estimators)

#reconcile any differences in column headers generated by automated binning:
#rebuild the two-level column labels with the second level converted to a string
#DataFrame.iteritems() was removed in pandas 2.0; only the labels are needed, so iterate the columns directly
colTups = [(l1, str(l2)) for (l1, l2) in transformedParams_new.columns]
transformedParams_new.columns = pd.MultiIndex.from_tuples(colTups)

#iGEM: in python 3, .loc with missing column headers raises, so reindex is used instead: it puts the columns in
#the training order and creates any column missing from the new table, which fillna then sets to 0
#transformedParams_train can be used if running sequentially, otherwise use transformedParamHeader after running the loading step
alignedParams_new = transformedParams_new.reindex(columns=transformedParams_train.columns).fillna(0)
predictedScores_new = pd.Series(reg.predict(scaler.transform(alignedParams_new.values)), index=transformedParams_new.index)

In [None]:
predictedScores_new.head()

In [None]:
#Run this cell to create a comparison matrix of the scores measured in the lab: Weissman v iGEM model
#restrict the lab data to the guides that were scored
#.copy() keeps df_research untouched and avoids SettingWithCopyWarning on the column assignments below
compData = df_research.loc[predictedScores_new.index].copy()
#flip the lab measurement so that a higher value means a better guide, as was done for the training scores
compData['Measured by Dr. Perli'] = 1 - compData['Measured by Dr. Perli']
#rank 1 is the best guide for both the lab measurement and the iGEM prediction
compData['Dr. Perli rank'] = compData['Measured by Dr. Perli'].rank(ascending=False)
compData['iGem score'] = predictedScores_new
compData['iGem rank'] = compData['iGem score'].rank(ascending=False)
#keep only the gene, rank, score and lab-measurement columns
compData = compData.filter(regex='gene|.*rank|.*score|Me.*|.*rank')
compData


In [None]:
from sklearn.metrics import mean_tweedie_deviance

#compare both sets of predictions with the lab measurements of the scored guides
lab_scores =compData['Measured by Dr. Perli'].values
#NOTE(review): 'predicted score' is presumably the Weissman-lab prediction column carried over from the spreadsheet - confirm
weissman_scores = compData['predicted score'].values
iGem_scores = compData['iGem score'].values


#with power=0 the Tweedie deviance is the Normal-distribution deviance, i.e. the mean squared error
print ('Compare Deviance of Weissman and iGem From Lab Scores Lower the value, better the model\n')
print ('Weismann: ', mean_tweedie_deviance(lab_scores, weissman_scores,power=0))
print ('iGEM:     ', mean_tweedie_deviance(lab_scores, iGem_scores,power=0))


In [None]:
predictedScores_new.to_csv(PREDICTED_SCORE_TABLE, sep='\t')

# 3. Construct sgRNA libraries
## Score sgRNAs for off-target potential

In [None]:
#There are many ways to score sgRNAs for off-target potential; below is one method that is simple and flexible,
#but it ignores gapped alignments and alternate PAMs, and it uses bowtie, which may not be maximally sensitive in all cases

In [None]:
#iGEM: the sequence can be longer than 22 nt, so the phred string was extended with extra '+' characters and is cut to the length of each written sequence.
#iGEM: revisit the phred length handling.

#output all sequences to a temporary FASTQ file for running bowtie alignment
def outputTempBowtieFastq(libraryTable, outputFileName):
    """Write one FASTQ record per row of libraryTable to outputFileName.

    libraryTable -- table whose index holds the sgRNA names (strings) and whose
                    'sequence' column holds the sgRNA sequences
    outputFileName -- path of the FASTQ file to create (overwritten if present)

    Each read is 'CCN' followed by the reverse complement of the sequence
    without its first base. The quality line always has the same length as
    the read, as the FASTQ format requires.
    """
    phredString = 'I4!=======44444++++++++++++++++' #weighting for how impactful mismatches are along sgRNA sequence 
    with open(outputFileName,'w') as outfile:
        for name, row in libraryTable.iterrows():
            readSeq = 'CCN' + str(Seq.Seq(row['sequence'][1:]).reverse_complement())
            #reads longer than phredString used to get a quality line shorter than the read;
            #pad with '+', the weight phredString already uses for its trailing positions
            qualityString = phredString[:len(readSeq)].ljust(len(readSeq), '+')
            outfile.write('@' + name + '\n')
            outfile.write(readSeq + '\n')
            outfile.write('+\n')
            outfile.write(qualityString + '\n')
            
outputTempBowtieFastq(libraryTable_new, TEMP_FASTQ_FILE)

In [None]:
import subprocess
fqFile = TEMP_FASTQ_FILE

#specifying a list of parameters to run bowtie with
#each tuple contains
# *the mismatch threshold below which a site is considered a potential off-target (higher is more stringent)
# *the number of sites allowed (1 is minimum since each sgRNA should have one true site in genome)
# *the genome index against which to align the sgRNA sequences; these can be custom built to only consider sites near TSSs
# *a name for the bowtie run to create appropriately named output files

#iGEM: the original list read its indices from ~/indices/ and ran '21_genome' against hg19.maskChrMandPAR.
#      ChrM and PAR are a very small part of hg19, so that run now aligns against the entire genome.
alignmentList = [(39,1,LARGE_FILE_DIR+'large_data_files/indices/hg19.ensemblTSSflank500b','39_nearTSS'),
                (31,1,LARGE_FILE_DIR+'large_data_files/indices/hg19.ensemblTSSflank500b','31_nearTSS'),
                (21,1,LARGE_FILE_DIR+'large_data_files/indices/hg19','21_genome'),
                (31,2,LARGE_FILE_DIR+'large_data_files/indices/hg19.ensemblTSSflank500b','31_2_nearTSS'),
                (31,3,LARGE_FILE_DIR+'large_data_files/indices/hg19.ensemblTSSflank500b','31_3_nearTSS')]
import os
import errno
#all bowtie output files are written to this directory
os.makedirs('bowtie_output', exist_ok=True)

alignmentColumns = []
for btThreshold, mflag, bowtieIndex, runname in alignmentList:

    alignedFile = 'bowtie_output/' + runname + '_aligned.txt'
    unalignedFile = 'bowtie_output/' + runname + '_unaligned.fq'
    maxFile = 'bowtie_output/' + runname + '_max.fq'
    
    bowtieString = 'bowtie -n 3 -l 15 -e '+str(btThreshold)+' -m ' + str(mflag) + ' --nomaqround -a --tryhard -p 16 --chunkmbs 256 ' + bowtieIndex + ' --suppress 5,6,7 --un ' + unalignedFile + ' --max ' + maxFile + ' '+ ' -q '+fqFile+' '+ alignedFile
    print (bowtieString)
    returnCode = subprocess.call(bowtieString, shell=True)
    print (returnCode)
    if returnCode != 0:
        #keep going as before, but make the failure visible instead of printing only a bare number
        print ('WARNING: bowtie run ' + runname + ' exited with status ' + str(returnCode) + '; its off-target results may be missing or stale')

    #parse through the file of sgRNAs that exceeded "m", the maximum allowable alignments, and collect their ids
    sgsAligning = set()
    try:
        with open(maxFile) as infile:
            for i, line in enumerate(infile):
                if i%4 == 0: #id line
                    sgsAligning.add(line.strip()[1:]) #drop the leading '@'
    except OSError as e:
        #a missing max file is treated as "no sgRNA exceeded the limit"; any other error is re-raised
        if e.errno != errno.ENOENT:
            raise

    #True for every sgRNA that exceeded the limit in this run; isin also works when the table has no rows
    alignmentColumns.append(pd.Series(libraryTable_new.index.isin(sgsAligning), index=libraryTable_new.index))

#collate results into a table with one column per run name, and flip the boolean values to yield the sgRNAs that passed filter as True
alignmentTable = pd.concat(alignmentColumns,axis=1, keys=[alignment[3] for alignment in alignmentList]).ne(True)

## Pick the top sgRNAs for a library, given predicted activity scores and off-target filtering

In [None]:
#combine all generated data into one master table
predictedScores_new.name = 'predicted score'
v2Table = pd.concat((libraryTable_new, predictedScores_new, alignmentTable, sgInfoTable_new), axis=1, keys=['library table v2', 'predicted score', 'off-target filters', 'sgRNA info'])

In [None]:
v2Table

In [None]:
import re
#for our pCRISPRi/a-v2 vector, we append flanking sequences to each sgRNA sequence for cloning and require the oligo to contain
#exactly 1 BstXI and BlpI site each for cloning, and exactly 0 SbfI sites for sequencing sample preparation
restrictionSites = {re.compile('CCA......TGG'):1,
                   re.compile('GCT.AGC'):1,
                   re.compile('CCTGCAGG'):0}

def matchREsites(sequence, REdict):
    """Return True if sequence has the expected number of matches for every site in REdict.

    sequence -- oligo sequence to check; it is upper-cased before matching
    REdict -- maps compiled regular expressions to the exact number of
              (non-overlapping) matches required for that pattern

    Returns False as soon as one pattern has a different number of matches.
    """
    seq = sequence.upper()
    #iterate the dictionary that was passed in; the global restrictionSites was used here by mistake
    for resite, numMatchesExpected in REdict.items():
        if len(resite.findall(seq)) != numMatchesExpected:
            return False

    return True

def checkOverlaps(leftPosition, acceptedLeftPositions, nonoverlapMin):
    """Return True if leftPosition is at least nonoverlapMin away from every accepted position.

    leftPosition -- position of the candidate
    acceptedLeftPositions -- positions that were already accepted
    nonoverlapMin -- smallest allowed distance between two positions
    """
    tooClose = (abs(accepted - leftPosition) < nonoverlapMin for accepted in acceptedLeftPositions)
    return not any(tooClose)

In [None]:
#flanking sequences
upstreamConstant = 'CCACCTTGTTG'
downstreamConstant = 'GTTTAAGAGCTAAGCTG'

#minimum distance between the left positions of two sgRNAs picked for the same TSS
nonoverlapMin = 3

#number of sgRNAs to pick per gene/TSS
sgRNAsToPick = 10

#list of off-target filter (or combinations of filters) levels, matching the names in the alignment table above
#the levels are tried in this order; an sgRNA must pass every filter named in the current level
offTargetLevels = [['31_nearTSS', '21_genome'],
                  ['31_nearTSS'],
                  ['21_genome'],
                  ['31_2_nearTSS'],
                  ['31_3_nearTSS']]

#for each gene/TSS, go through each sgRNA in descending order of predicted score
#if an sgRNA passes the restriction site, overlap, and off-target filters, accept it into the library
#if the number of sgRNAs accepted is less than sgRNAsToPick, reduce off-target stringency by one and continue
v2Groups = v2Table.groupby([('library table v2','gene'),('library table v2','transcripts')])
newSgIds = []
unfinishedTss = []
for (gene, transcript), group in v2Groups:
    geneSgIds = []
    geneLeftPositions = []

    #the ranking does not depend on the stringency level, so sort once per gene/TSS
    rankedGroup = group.sort_values(('predicted score','predicted score'), ascending=False)

    stringency = 0
    while len(geneSgIds) < sgRNAsToPick and stringency < len(offTargetLevels):
        for sgId_v2, row in rankedGroup.iterrows():
            if len(geneSgIds) >= sgRNAsToPick:
                break #quota reached; the remaining rows could not be accepted anyway

            oligoSeq = upstreamConstant + row[('library table v2','sequence')] + downstreamConstant
            #NOTE(review): 23 is presumably the protospacer + PAM length used to reach the left end on the '-' strand - confirm
            leftPos = row[('sgRNA info', 'position')] - (23 if row[('sgRNA info', 'strand')] == '-' else 0)
            #an sgRNA accepted at an earlier level is not accepted again: it is at distance 0 from its own
            #accepted position, so checkOverlaps rejects it (this relies on nonoverlapMin being greater than 0)
            if row['off-target filters'].loc[offTargetLevels[stringency]].all() \
                and matchREsites(oligoSeq, restrictionSites) \
                and checkOverlaps(leftPos, geneLeftPositions, nonoverlapMin):
                #np.nan fills the 'empirical score' column; the last field records the level at which the sgRNA was accepted
                geneSgIds.append((sgId_v2,
                                  gene,transcript,
                                  row[('library table v2','sequence')], oligoSeq,
                                  row[('predicted score','predicted score')], np.nan,
                                 stringency))
                geneLeftPositions.append(leftPos)
                
        stringency += 1
            
    if len(geneSgIds) < sgRNAsToPick:
        unfinishedTss.append((gene, transcript)) #if the number of accepted sgRNAs is still less than sgRNAsToPick, discard gene
    else:
        newSgIds.extend(geneSgIds)
        
libraryTable_complete = pd.DataFrame(newSgIds, columns = ['sgID', 'gene', 'transcript','protospacer sequence', 'oligo sequence',
 'predicted score', 'empirical score', 'off-target stringency']).set_index('sgID')

In [None]:
unfinishedTss

In [None]:
#number of sgRNAs accepted at each stringency level
#iGEM newLibraryTable is not defined
#newLibraryTable.groupby('off-target stringency').agg(len).iloc[:,0]
libraryTable_complete.groupby('off-target stringency').agg(len).iloc[:,0]

In [None]:
#number of TSSs with fewer than required number of sgRNAs (and thus not included in the library)
print (len(unfinishedTss))

In [None]:
libraryTable_complete