# Optimization of sequence to expression

## Introduction
The previous notebooks guide the development of ML tools that predict the expression strength of a given sequence. For practical bioengineering purposes, the reverse is desirable: predicting a sequence that yields a given target expression.

## System initiation
Loading all required libraries

In [1]:
import os
import numpy as np
import pandas as pd
import joblib
import pickle
import time

from Exp2Ipynb import init_Exp2, Data_Src_Load, ExtractRefSeq, GeneOptimizer, toLetter, SequenceSinglePredFull, evaluation


### Variable setting

We load the naming conventions from the configuration file (here 'config_Pput.txt').

In [2]:
# Read the run configuration into a dictionary of naming conventions.
Name_Dict = init_Exp2('config_Pput.txt')

# Data file name up to its first dot, and the data folder derived from it.
File_Base = Name_Dict['Data_File'].partition('.')[0]
Data_Folder = 'data-' + File_Base
ML_Date = Name_Dict['ML_Date']
# The configured regressor tag is split into everything but its last
# character (ML_Regressor) and the last character itself (ML_Type).
ML_Regressor, ML_Type = Name_Dict['ML_Regressor'][:-1], Name_Dict['ML_Regressor'][-1]
# NOTE(review): eval() executes whatever expression the config file contains.
# Only use configuration files you trust; ast.literal_eval would be a safer
# choice if the entries are plain literals -- TODO confirm.
Y_Col_Name = eval(Name_Dict['Y_Col_Name'])
Response_Value = eval(Name_Dict['Response_Value'])
Measure_Numb = int(Name_Dict['Library_Expression'])

Already existent data directory  data-Example1-Pput .


## Loading training data and ML files

In [3]:
# Load the training data once; regressors and their data-preparation parameters
# are loaded per expression measurement inside the loop below.
SeqDat = Data_Src_Load(Name_Dict)
myRegr = dict()
myParams = dict()
# Containers for the optional standard scalers. They must exist before the loop
# assigns into them by index (only happens when Response_Value == 0).
Expr_Scaler = dict()
Scaler_DictName = dict()

for Meas_Idx in range(Measure_Numb):
    Measure_Name = '{}_ML'.format(Y_Col_Name[Meas_Idx])
    # Regressor and parameter files share one stem following the default naming
    # scheme; spaces in the measurement name are replaced by dashes.
    File_Stem = '{}_{}_{}_{}{}'.format(ML_Date, File_Base, Measure_Name.replace(' ','-'), ML_Regressor, Response_Value)
    Regressor_File = os.path.join(Data_Folder, '{}-Regressor.pkl'.format(File_Stem))
    Parameter_File = os.path.join(Data_Folder, '{}-Params.pkl'.format(File_Stem))

    try:
        # NOTE(review): joblib.load / pickle.load can execute arbitrary code while
        # deserializing. Only load files that were produced by your own runs.
        myRegr[Meas_Idx] = joblib.load(Regressor_File)
        # The parameters are assumed to have been generated in the same run as the
        # regressor and to sit in the same directory under the default naming scheme.
        with open(Parameter_File, 'rb') as Param_Handle:
            myParams = pickle.load(Param_Handle)
        # Positions that were removed because of insufficient information content.
        # NOTE(review): this (like myParams and nNucleotides) is overwritten on every
        # iteration, so later cells see the values of the last measurement only.
        Positions_removed = myParams['Positions_removed']
        # If the data was standardized we load the corresponding scaler.
        if Response_Value == 0:
            # NOTE(review): the scaler file name is built from today's date, whereas
            # the regressor and parameter files use ML_Date. Confirm against the
            # notebook that writes the scaler; this may have to be ML_Date as well.
            Scaler_File = os.path.join(Data_Folder, '{}_{}_{}-Scaler.pkl'.format(time.strftime('%Y%m%d'), File_Base, Name_Dict['ML_Regressor']))
            with open(Scaler_File, 'rb') as Scaler_Handle:
                Expr_Scaler[Meas_Idx] = pickle.load(Scaler_Handle)
            # The standard scaler default name is the name of the expression
            # measurement column with suffix '_Scaler'.
            Scaler_DictName[Meas_Idx] = '{}_Scaler'.format(Y_Col_Name[Meas_Idx])
    except FileNotFoundError as Load_Error:
        print('Regressor file not found. Check parameter "ML_Date" in "config.txt"')
        print('Missing file: {}'.format(Load_Error.filename))
        # Stop here: continuing would only fail later with a misleading KeyError /
        # NameError, or silently reuse values from a previous iteration.
        raise

    # Save the number of nucleotide inputs the regressor uses; this is required to
    # specify the number of optimization variables.
    # One input column is subtracted -- presumably a non-sequence feature; TODO confirm.
    if Name_Dict['ML_Regressor'] in ('SVC', 'SVR'):
        nNucleotides = myRegr[Meas_Idx].support_vectors_.shape[1] - 1
    else:
        # Newer scikit-learn releases expose n_features_in_ and no longer provide
        # n_features_; fall back to the old attribute for regressors that lack it.
        Regr_Features = getattr(myRegr[Meas_Idx], 'n_features_in_', None)
        if Regr_Features is None:
            Regr_Features = myRegr[Meas_Idx].n_features_
        nNucleotides = Regr_Features - 1

# Number of inputs divided by four (four inputs per sequence position).
nPositions = int(nNucleotides/4)

Following outliers were detected: ID: ['BGSPL14g_19_a'], Value: [[50.13234789]]
Categorization of expression.
The expression values were sorted into the following bins: [ 0.2178722  16.5660553  24.76999231 35.15239853]


In [None]:
# Scratch check: is a candidate sequence already contained in the library once
# the removed positions have been dropped?
MyRefs = ExtractRefSeq(SeqDat, Name_Dict, .05, 3)
# Stack the encoded sequences into one 2-D array, delete the columns listed in
# Positions_removed and convert back to nested lists for the membership test.
mytmp = np.delete(
    np.vstack(SeqDat['Sequence_label-encrypted'].values),
    Positions_removed,
    axis=1,
).tolist()
# Candidate: the first library sequence itself, so 'exists' is the expected outcome.
mytst = mytmp[0]
# mytst = list(np.ones(len(mytmp[0]), dtype=int))
print('exists' if mytst in mytmp else 'new')
mytst
# SeqDat['Sequence_label-encrypted'].drop_duplicates().apply(tuple)
# tuple()
# from Exp2Ipynb import distance, GeneOptimizer
# go = GeneOptimizer()
# go
# myseq = np.array(go._reference_sequences, ndmin=2, dtype=int)
# distance(go._reference_sequences[0], go._reference_sequences)
# go._reference_sequences

In [4]:
# Target expression values per measurement index, and the matching value handed
# to ExtractRefSeq for choosing the reference sequences.
ExpressGoal = {0:[1]} #{0:[1], 1:[2]}
ExpressFine = {0:[20]} #{0:[.03], 1:[.04]}
ReferenceNumber = 2   # number of reference sequences requested from ExtractRefSeq
TargetNumber = 3      # hall-of-fame size requested from the optimizer
Csv_ID = 'Predicted-Target-Promoter'
# Empty template that fixes the leading column order of every result table.
Final_Template = pd.DataFrame({'Idx-Original':[], 'Strain-ID':[], 'Sequence':[], 'target':[]})

# NOTE(review): Positions_removed and nNucleotides stem from the last measurement
# loaded in the previous cell and are applied to every Meas_Idx here -- confirm
# that all measurements share them.
# NOTE(review): no random seed is set in this notebook, so the optimizer output
# presumably differs between runs -- TODO confirm and seed if reproducibility matters.
for Meas_Idx in ExpressGoal.keys():
    # Collect the frames of this measurement only, so that each CSV holds just the
    # references and predictions that belong to its own measurement column.
    Meas_Frames = []
    # Reference sequences are extracted with respect to the ExpressFine input.
    for iGoal, iFine in zip(ExpressGoal[Meas_Idx], ExpressFine[Meas_Idx]):
        MyRefs = ExtractRefSeq(SeqDat, Name_Dict, iFine, ReferenceNumber)
        go = GeneOptimizer()
        myHOF, _ = go.optimize(myRegr[Meas_Idx], ML_Type, MyRefs, SeqDat, Positions_removed, nNucleotides, target_expr=iGoal, hof_size=TargetNumber)

        # Convert the optimized candidates to letter strings, rebuild the full-length
        # sequences and evaluate each candidate with the regressor.
        myOptSeq = [''.join(SSeq) for SSeq in np.array([toLetter(Hofi) for Hofi in myHOF])]
        myOptSeqFull = [SequenceSinglePredFull(OptSeq_i, MyRefs['Sequence'], Positions_removed) for OptSeq_i in myOptSeq]
        myOptExpFull = [evaluation(hof_i, myRegr[Meas_Idx], nNucleotides) for hof_i in myHOF]
        # Size the ID list by the candidates actually returned; using TargetNumber
        # would break the DataFrame construction if fewer were delivered.
        myID = ['predicted:{}-{}'.format(iGoal, myCount) for myCount in range(1, len(myHOF)+1)]
        TarDict = {'Strain-ID':myID, 'Sequence':myOptSeqFull, 'target':myOptExpFull}

        Meas_Frames.extend([MyRefs, pd.DataFrame(TarDict)])

    # Build the table once per measurement instead of growing it inside the loop.
    MyFinal = pd.concat([Final_Template] + Meas_Frames)
    TarCsv_File = os.path.join(Data_Folder, '{}_{}_{}_{}.csv'.format(time.strftime('%Y%m%d'), File_Base, Csv_ID, Y_Col_Name[Meas_Idx].replace(' ','-')))
    MyFinal.to_csv(TarCsv_File, index=False)
    print('Target sequences saved as: {}'.format(TarCsv_File))

gen	nevals	avg   	std    	min	max 
0  	300   	827.11	373.235	15 	1000
1  	178   	559.6 	486.883	15 	1000
2  	159   	200.457	378.834	10 	1000
3  	195   	65.03  	206.877	10 	1000
4  	183   	46.34  	167.73 	9  	1000
5  	194   	38     	148.709	9  	1000
6  	176   	29.5833	126.358	6  	1000
7  	193   	34.6067	149.232	5  	1000
8  	205   	26.3067	126.782	4  	1000
9  	193   	37.92  	169.203	4  	1000
10 	171   	46.58  	194.623	3  	1000
11 	165   	65.3267	236.144	3  	1000
12 	170   	81.2667	264.739	3  	1000
13 	191   	84.0467	270.102	3  	1000
14 	148   	70.42  	248.442	3  	1000
15 	193   	96.77  	289.797	3  	1000
16 	175   	63.2233	236.674	3  	1000
17 	171   	53.04  	217.248	3  	1000
18 	181   	46.3767	202.96 	3  	1000
19 	168   	29.7167	160.604	3  	1000
20 	161   	43.0633	195.336	3  	1000
21 	185   	43.0467	195.338	3  	1000
22 	175   	13.13  	99.186 	3  	1000
23 	167   	26.4867	150.475	3  	1000
24 	174   	33.0867	170.046	3  	1000
25 	155   	29.7633	160.596	3  	1000
26 	189   	49.6533	210.264	3  	