# Optimization of sequence to expression

## Introduction
The previous notebooks guide the development of ML tools that predict the expression strength of a sequence. For practical bioengineering purposes, the inverse task is desirable: predicting a sequence that yields a given target expression.

## System initiation
Loading all required libraries

In [None]:
import os
import numpy as np
import pandas as pd
import joblib
import pickle
import time
import sys
sys.path.append('..')
from exp2ipynb import *


### Variable setting

We load the naming conventions from the configuration file (here `config_Pput.txt`).

In [None]:
# Read the notebook configuration into a dictionary of settings.
Name_Dict = init_Exp2('config_Pput.txt')

# Everything before the first '.' of the data file name serves as the base
# name for the data folder and for the result files.
File_Base = Name_Dict['Data_File'].split('.')[0]
Data_Folder = 'data-' + File_Base
ML_Date = Name_Dict['ML_Date']

# The regressor entry is split into its leading part (regressor name) and
# its last character (passed to the optimizer as ML_Type further below).
Regressor_Entry = Name_Dict['ML_Regressor']
ML_Regressor, ML_Type = Regressor_Entry[:-1], Regressor_Entry[-1]

# NOTE(review): eval() executes arbitrary code taken from the configuration
# file. Only run this notebook with a trusted config; if the entries are plain
# Python literals, ast.literal_eval would be a safer drop-in -- confirm first.
Y_Col_Name = eval(Name_Dict['Y_Col_Name'])
Response_Value = eval(Name_Dict['Response_Value'])

# Number of expression measurements (libraries) to process.
Measure_Numb = int(Name_Dict['Library_Expression'])

## Loading training data and ML files

The code cell below loads the training data and the estimators. The estimator file to load is determined by the regressor settings in the configuration file and by the `ML_Date` variable.

In [None]:
# Load the training data and, for every expression measurement, the trained
# regressor together with the data-preparation parameters stored next to it.
SeqDat = Data_Src_Load(Name_Dict)
myRegr = dict()
myParams = dict()
# Containers for the optional standard scalers; they have to exist before the
# loop because they are filled by index inside it.
Expr_Scaler = dict()
Scaler_DictName = dict()

for Meas_Idx in range(Measure_Numb):
    Measure_Name = '{}_ML'.format(Y_Col_Name[Meas_Idx])
    # Regressor and parameter file share the same stem and differ only in the suffix.
    File_Stem = '{}_{}_{}_{}{}'.format(ML_Date, File_Base, Measure_Name.replace(' ','-'), ML_Regressor, Response_Value)
    Regressor_File = os.path.join(Data_Folder, '{}-Regressor.pkl'.format(File_Stem))
    Parameter_File = os.path.join(Data_Folder, '{}-Params.pkl'.format(File_Stem))

    # NOTE(review): joblib/pickle execute code embedded in the file while
    # loading. Only load files that were produced by your own training run.
    try:
        myRegr[Meas_Idx] = joblib.load(Regressor_File)
        # The parameters are assumed to have been generated in the same run as
        # the regressor and to follow the default naming scheme.
        with open(Parameter_File, 'rb') as Param_Handle:
            myParams = pickle.load(Param_Handle)
    except FileNotFoundError as Load_Error:
        # Without regressor and parameters nothing below can work, so report
        # which file is missing and stop here instead of failing later with an
        # unrelated KeyError/NameError.
        print('Required file not found: {}. Check parameter "ML_Date" in "config.txt"'.format(Load_Error.filename))
        raise

    # Positions that were removed because of insufficient information content.
    Positions_removed = myParams['Positions_removed']

    # If the data was standardized, the corresponding scaler is loaded as well.
    if Response_Value == 0:
        # The scaler is looked up with the training date (ML_Date), like the
        # regressor; using the current date would only work on the training day.
        # NOTE(review): assumes the scaler was saved with the training date -- confirm.
        Scaler_File = os.path.join(Data_Folder, '{}_{}_{}-Scaler.pkl'.format(ML_Date, File_Base, Name_Dict['ML_Regressor']))
        try:
            with open(Scaler_File, 'rb') as Scaler_Handle:
                Expr_Scaler[Meas_Idx] = pickle.load(Scaler_Handle)
        except FileNotFoundError:
            # A missing scaler is reported but does not abort the cell.
            print('Scaler file not found: {}. Check parameter "ML_Date" in "config.txt"'.format(Scaler_File))
        else:
            # The standard scaler default name is the name of the expression
            # measurement column with suffix: '_Scaler'
            Scaler_DictName[Meas_Idx] = '{}_Scaler'.format(Y_Col_Name[Meas_Idx])

    # Save the number of nucleotides the regressor uses as input; this is
    # required to specify the number of optimization variables. One input
    # feature is not a nucleotide feature, hence the '- 1'.
    if Name_Dict['ML_Regressor'] in ('SVC', 'SVR'):
        nFeatures = myRegr[Meas_Idx].support_vectors_.shape[1]
    else:
        # 'n_features_in_' replaces the deprecated 'n_features_' in recent
        # scikit-learn releases; fall back for estimators from older versions.
        nFeatures = getattr(myRegr[Meas_Idx], 'n_features_in_', None)
        if nFeatures is None:
            nFeatures = myRegr[Meas_Idx].n_features_
    nNucleotides = nFeatures - 1

# Four one-hot encoded nucleotide features per sequence position.
nPositions = nNucleotides // 4

## Genetic optimization for sequence search

**Classification**<br>
Finding new sequences based on a classifier cannot be performed by optimizing expression, because the expression is binned and multiple sequences have identical class membership. Therefore, the sequence distance, i.e., the number of nucleotide changes, to a number of reference promoters is minimized.<br>
`ExpressGoal`: dictionary
 - keys: index of library (`0` if one library and reporter is measured)
 - values: target bin (e.g., `1` for medium expression with three bins)

`ExpressFine`: dictionary
 - keys: index of library (`0` if one library and reporter is measured)
 - values: optimal expression of measured promoters


The number of reference promoters used for the distance calculation is defined in the variable `ReferenceNumber`. The number of new sequences is defined by the variable `TargetNumber`.

In [None]:
# Optimization targets per library index.
ExpressGoal = {0:[1]} # example for two libraries: {0:[1], 1:[2]}
ExpressFine = {0:[20]} # example for two libraries: {0:[.03], 1:[.04]}
ReferenceNumber = 4
TargetNumber = 3
# Empty frame that fixes the leading column order of the result table.
MyFinal = pd.DataFrame({'Idx-Original':[], 'Strain-ID':[], 'Sequence':[], 'target':[]})

for Meas_Idx in ExpressGoal:
    # Every target in ExpressGoal needs its counterpart in ExpressFine; zip()
    # would otherwise silently drop the surplus entries.
    if len(ExpressGoal[Meas_Idx]) != len(ExpressFine[Meas_Idx]):
        raise ValueError('ExpressGoal and ExpressFine need the same number of entries for library {}'.format(Meas_Idx))

    for iGoal, iFine in zip(ExpressGoal[Meas_Idx], ExpressFine[Meas_Idx]):
        # Extraction of reference sequences, closest expression with respect
        # to the ExpressFine input.
        MyRefs = ExtractRefSeq(SeqDat, Name_Dict, iFine, ReferenceNumber)
        go = GeneOptimizer()
        myHOF, _ = go.optimize(myRegr[Meas_Idx], ML_Type, MyRefs, SeqDat, Positions_removed, nNucleotides, target_expr=iGoal, hof_size=TargetNumber)

        # Convert each optimized individual to a letter string, rebuild the
        # full-length sequence and evaluate it with the regressor.
        myOptSeq = [''.join(SSeq) for SSeq in np.array([toLetter(Hofi) for Hofi in myHOF])]
        myOptSeqFull = [SequenceSinglePredFull(OptSeq_i, MyRefs['Sequence'], Positions_removed) for OptSeq_i in myOptSeq]
        myOptExpFull = [evaluation(hof_i, myRegr[Meas_Idx], nNucleotides) for hof_i in myHOF]
        # One ID per returned sequence; taking the count from the result keeps
        # the columns equally long even if fewer than TargetNumber are returned.
        myID = ['predicted:{}-{}'.format(iGoal, myCount) for myCount in range(1, len(myOptSeqFull) + 1)]
        TarDict = {'Strain-ID':myID, 'Sequence':myOptSeqFull, 'target':myOptExpFull}

        MyFinal = pd.concat([MyFinal, MyRefs, pd.DataFrame(TarDict)])

    # NOTE(review): MyFinal keeps accumulating across libraries, so the file of
    # a later library also contains the rows of the earlier ones -- confirm
    # whether that is intended.
    Csv_ID = 'Predicted-Target-Promoter'
    TarCsv_Name = '{}_{}_{}_{}.csv'.format(time.strftime('%Y%m%d'), File_Base, Csv_ID, Y_Col_Name[Meas_Idx].replace(' ','-'))
    TarCsv_File = os.path.join(Data_Folder, TarCsv_Name)
    MyFinal.to_csv(TarCsv_File, index=False)
    print('Target sequences saved as: {}'.format(TarCsv_File))