1. Learning sgRNA predictors from empirical data
    * Load scripts and empirical data
    * Generate TSS annotation using FANTOM dataset
    * Calculate parameters for empirical sgRNAs
    * Fit parameters
2. Applying machine learning model to predict sgRNA activity
    * Find all sgRNAs in genomic regions of interest 
    * Predicting sgRNA activity
3. Construct sgRNA libraries
    * Score sgRNAs for off-target potential
    * Pick the top sgRNAs for a library, given predicted activity scores and off-target filtering
    * Design negative controls matching the base composition of the library
    * Finalizing library design

# 1. Learning sgRNA predictors from empirical data
## Load scripts and empirical data

In [None]:
#genes whose sgRNAs are kept for the lab-vs-model comparison
GENES_LIST_TO_COMPARE = [
    'AP2A2', 'EIF4G1', 'EIF4G2', 'EIF4G3', 'FMR1', 'FXR1',
    'FXR2', 'GNB2L1', 'IDE', 'METTL3', 'PRRC2A', 'PRRC2B',
    'PRRC2C', 'SFPQ', 'YTHDF1', 'YTHDF2', 'YTHDF3', 'ZCCHC6',
]

#modify this variable to point to the base directory containing large genome data files
LARGE_FILE_DIR = '../../../../data/'

#genome sequence and gene annotation
FASTA_FILE_OF_GENOME = f'{LARGE_FILE_DIR}large_data_files/hg19.fa'
GTF_FILE_FROM_GENCODE = f'{LARGE_FILE_DIR}large_data_files/gencode.v19.annotation.gtf'

#TSS tables (paths relative to the notebook)
TSS_TABLE_PATH = 'data_files/human_tssTable.txt'
P1P2_TABLE_PATH = 'data_files/human_p1p2Table.txt'

#FANTOM TSS annotation and HGNC symbol lookup
FANTOM_TSS_ANNOTATION_BED = f'{LARGE_FILE_DIR}large_data_files/TSS_human.sorted.bed.gz'
HGNC_SYMBOL_LOOKUP_TABLE = f'{LARGE_FILE_DIR}large_data_files/hgnc_complete_set_2020-08-01.txt'

#spreadsheet containing the lab experiment data to train the model
IGEM_EXCEL_FILE = f'{LARGE_FILE_DIR}large_data_files/Sam_Perli_qRT_pcr_data_per_gene.xlsx'

#file names for the pickled estimator and the transformed-parameter header
PICKLE_FILE = 'igem_v1_estimator'
TRANSFORMED_PARAM_HEADER = 'igem_v1_transformed_param_header'

#file names for the predicted-score table and the temporary fastq file
PREDICTED_SCORE_TABLE = 'igem_v1_predicted_score_table'
TEMP_FASTQ_FILE = 'igem_v1_temp_fastq_file'


#execute the companion script so that its names become available in this notebook.
#NOTE(review): the cells below use pd, np, parseAllSgIds, loadGenomeAsDict, BigWigFile,
#generateTypicalParamTableEx and transformParams without importing them, so they are
#presumably provided by this script -- confirm against sgRNA_learning.py
%run sgRNA_learning.py

In [None]:
#read the experiment data
#hand the path straight to pandas so it opens and closes the workbook itself;
#wrapping it in open(...) left a file handle that was never closed
df = pd.read_excel(IGEM_EXCEL_FILE, sheet_name='Sheet1')
#discard rows that carry no sgRNA identifier
df = df.dropna(subset=['sgID'])

#second name for the same frame (an alias, not a copy); used later to build the comparison table
df_research = df


In [None]:
#library table: one row per sgID holding gene, transcripts and protospacer sequence
libraryTable_research = df[['sgID','gene','transcript','protospacer sequence']]
#.loc[1:] keeps the rows from index label 1 onwards, i.e. the row labelled 0 is skipped
libraryTable_research = (
    libraryTable_research.loc[1:]
    .set_index('sgID')
    .rename(columns={'protospacer sequence':'sequence','transcript':'transcripts'})
)

#keep only the sgRNAs whose gene is listed in GENES_LIST_TO_COMPARE
libraryTable_subset = libraryTable_research.loc[libraryTable_research['gene'].isin(GENES_LIST_TO_COMPARE)]
#Read sgID
sgInfoTable = parseAllSgIds(libraryTable_subset)

#Read scores measured in the lab, restricted to the same rows and genes as above
normedScores = df[['sgID','gene','Measured by Dr. Perli']].loc[1:]
#take an explicit copy: the column assignment below must write to an independent frame,
#otherwise pandas warns (SettingWithCopyWarning) that it may be writing to a temporary slice
normedScores = normedScores.loc[normedScores['gene'].isin(GENES_LIST_TO_COMPARE)].copy()

#alternative transform that is currently disabled:
#normedScores['Measured by Dr. Perli']= np.log10(1/normedScores['Measured by Dr. Perli'])
#For the CRISPRi scores measured in Dr. Perli's lab with qRT-PCR, the lower the score the better the guide,
#but the model is set up so that a higher score means a better guide.
#Subtract the scores measured in lab from 1
normedScores['Measured by Dr. Perli'] = 1 - normedScores['Measured by Dr. Perli']
#index by sgID; the single score column is renamed to the empty string
normedScores = normedScores[['sgID','Measured by Dr. Perli']].set_index('sgID').rename(columns={'Measured by Dr. Perli':''})


# 2. Applying machine learning model to predict sgRNA activity

In [None]:
#starting from a new session for demonstration purposes:
#Here we load our trained model
#use the public pickle module (it uses the C accelerator automatically when available)
#instead of importing the private _pickle implementation module directly
import pickle as cPickle


def _parseTssTuple(tupString):
    """Turn a string such as '(123.0, 456.0)' into a tuple of two ints.

    Only the first two comma-separated fields are used; each goes through
    float() first so that values written with a decimal point are accepted.
    """
    first, second = tupString.strip('()').split(', ')[:2]
    return (int(float(first)), int(float(second)))


#load tssTable, p1p2Table, genome sequence, chromatin data
tssTable = pd.read_csv(TSS_TABLE_PATH,sep='\t', index_col=[0,1])

p1p2Table = pd.read_csv(P1P2_TABLE_PATH,sep='\t', header=0, index_col=[0,1])
#the TSS columns are stored as text and are converted back to (int, int) tuples
p1p2Table['primary TSS'] = p1p2Table['primary TSS'].apply(_parseTssTuple)
p1p2Table['secondary TSS'] = p1p2Table['secondary TSS'].apply(_parseTssTuple)

genomeDict = loadGenomeAsDict(FASTA_FILE_OF_GENOME)

#the bigWig file handles are deliberately left open: the BigWigFile objects are passed on
#to the parameter calculation later (presumably they read from the handle on demand -- TODO confirm)
bwhandleDict = {'dnase':BigWigFile(open(LARGE_FILE_DIR + 'large_data_files/wgEncodeOpenChromDnaseK562BaseOverlapSignalV2.bigWig','rb')),
'faire':BigWigFile(open(LARGE_FILE_DIR + 'large_data_files/wgEncodeOpenChromFaireK562Sig.bigWig','rb')),
'mnase':BigWigFile(open(LARGE_FILE_DIR + 'large_data_files/wgEncodeSydhNsomeK562Sig.bigWig','rb'))}

#load sgRNA prediction model saved after the parameter fitting step
#NOTE: unpickling can execute arbitrary code -- only load pickle files you produced yourself
with open(PICKLE_FILE,'rb') as infile:
    fitTable, estimators, scaler, reg, (geneFold_train, geneFold_test) = cPickle.load(infile)

#iGEM: the transformed-parameter header is stored as a binary pickle, so it is unpickled
#rather than read as a tab-separated table:
#transformedParamHeader = pd.read_csv(TRANSFORMED_PARAM_HEADER,sep='\t')
with open(TRANSFORMED_PARAM_HEADER,'rb') as paraminfile:
    transformedParamHeader = cPickle.load(paraminfile)

#the prediction cell refers to the training columns under this name
transformedParams_train = transformedParamHeader

## Find all sgRNAs in genomic regions of interest 

In [None]:
#For IGEM for now try to score the training data itself.

#Reuse the gene subset selected earlier as the library to score (same object, not a copy)
libraryTable_new = libraryTable_subset
#libraryTable_subset = libraryTable_research
sgInfoTable_new = parseAllSgIds(libraryTable_new)

In [None]:
#preview the first rows of the library table that will be scored
libraryTable_new.head()


## Predicting sgRNA activity

In [None]:
#calculate the parameter table for the new sgRNAs from the library and sgID tables,
#the two TSS tables, the genome sequence and the chromatin bigWig handles
paramTable_new = generateTypicalParamTableEx(
    libraryTable_new,
    sgInfoTable_new,
    tssTable,
    p1p2Table,
    genomeDict,
    bwhandleDict,
)

In [None]:
#transform and predict scores according to sgRNA prediction model
transformedParams_new = transformParams(paramTable_new, fitTable, estimators)

#reconcile any differences in column headers generated by automated binning:
#the second level of every column label is cast to str.
#The labels are read from .columns directly; DataFrame.iteritems() was removed in pandas 2.0
#and the column data it yielded was never used here.
transformedParams_new.columns = pd.MultiIndex.from_tuples(
    [(l1, str(l2)) for l1, l2 in transformedParams_new.columns]
)

#iGEM: in python 3, .loc with missing column headers raises, so reindex is used instead;
#columns absent from the new table come back as NaN and are filled with 0.
#transformedParams_train can be used when running sequentially, otherwise use
#transformedParamHeader after running the model-loading step.
alignedParams_new = transformedParams_new.reindex(columns=transformedParams_train.columns).fillna(0)
predictedScores_new = pd.Series(
    reg.predict(scaler.transform(alignedParams_new.values)),
    index=transformedParams_new.index,
)

In [None]:
#preview the first predicted scores (indexed like transformedParams_new)
predictedScores_new.head()

In [None]:
#Run this cell to create a comparison matrix of the scores measured in the lab: Weissman v iGEM model
#set_index returns a new frame, so the edits below do not touch df_research
compData = df_research.set_index('sgID')

#invert the lab measurement so that, like the model scores, higher means a better guide
compData['Measured by Dr. Perli'] = 1 - compData['Measured by Dr. Perli']

#where an empirical score is available (> 0) it replaces the predicted score
compData['predicted score'] = np.where(compData['empirical score']>0, compData['empirical score'], compData['predicted score'])

#keep only the sgRNAs that were scored by the model; copy so the new columns are added to an independent frame
compData = compData.loc[predictedScores_new.index].copy()
#both ranks are computed per gene with rank 1 = best guide. The lab score was inverted above
#(higher = better), so it must be ranked descending just like the iGem score; ranking it
#ascending would give rank 1 to the worst guide and make the two rank columns incomparable.
compData['Dr. Perli rank'] = compData.groupby(['gene'])['Measured by Dr. Perli'].rank(ascending=False)
compData['iGem score'] = predictedScores_new
compData['iGem rank'] = compData.groupby(['gene'])['iGem score'].rank(ascending=False)
#keep the gene column plus every rank/score column and the lab measurement
compData = compData.filter(regex='gene|.*rank|.*score|Me.*|.*rank')
compData


In [None]:
from sklearn.metrics import mean_tweedie_deviance

print ('Compare Deviance of Weissman and iGem From Lab Scores Lower the value, better the model\n')

#one row per gene: deviance of the Weissman ('predicted score') and iGEM ('iGem score')
#predictions against the lab measurement. With power=0 the Tweedie deviance is the
#mean squared error.
#Rows are collected in a list and the frame is built once: DataFrame.append was removed
#in pandas 2.0 and re-copied the whole frame on every iteration.
compRows = []
for gene in GENES_LIST_TO_COMPARE:
    compGroupData = compData.loc[compData['gene'] == gene]
    lab_scores = compGroupData['Measured by Dr. Perli'].values
    weissman_scores = compGroupData['predicted score'].values
    iGem_scores = compGroupData['iGem score'].values
    compRows.append({
        'Gene': gene,
        'Weissman': mean_tweedie_deviance(lab_scores, weissman_scores, power=0),
        'iGEM': mean_tweedie_deviance(lab_scores, iGem_scores, power=0),
    })

#explicit column list keeps the column order (and the columns themselves when there are no rows)
compStudy = pd.DataFrame(compRows, columns=['Gene','Weissman','iGEM'])

print(compStudy)
    


In [None]:
#display the per-gene deviance table built in the previous cell
compStudy