# Optimization of sequence to expression

## Introduction
The previous notebooks guide the development of ML tools that predict the expression strength of a given sequence. For practical bioengineering purposes, the inverse problem is of interest: finding a sequence that yields a desired target expression.

## System initiation
Loading all required libraries

In [1]:
import os
import numpy as np
import pandas as pd
import joblib
import pickle

from ExpressionExpert_Functions import init_Exp2, Data_Src_Load, make_GAtoolbox


### Variable setting

We load the naming conventions from the configuration file (here 'config_Pput.txt').

In [2]:
# Read the project configuration; the names below are all derived from its entries.
Name_Dict = init_Exp2('config_Pput.txt')

# Text in front of the first '.' of the data file name; also used as suffix of the data directory.
File_Base = Name_Dict['Data_File'].split('.')[0]
Data_Folder = 'data-' + File_Base
ML_Date = Name_Dict['ML_Date']

# The regressor entry is split into everything but its last character and that last character.
ML_Regressor, ML_Type = Name_Dict['ML_Regressor'][:-1], Name_Dict['ML_Regressor'][-1]

# NOTE(review): eval() executes whatever text the config file contains. If these entries are
# always plain Python literals, ast.literal_eval would be the safer choice -- confirm first.
Y_Col_Name = eval(Name_Dict['Y_Col_Name'])
Response_Value = eval(Name_Dict['Response_Value'])

Already existent data directory  data-Example1-Pput .


## Loading training data

In [3]:
# Load the data set described by the configuration. Later cells index the result by
# column name and by row, so it is presumably a pandas DataFrame -- TODO confirm.
SeqDat = Data_Src_Load(Name_Dict)

Following outliers were detected: ID: ['BGSPL14g_19_a'], Value: [[50.13234789]]
Categorization of expression.
The expression values were sorted into the following bins: [ 0.2178722  16.5660553  24.76999231 35.15239853]


## Target expression parameters

In [4]:
# Target expression values, given on the scale of the measured expression column
Target_Express = np.array([10, 20, 30], dtype=int)

# Number of reference sequences for each target expression
RefNum = 5

# Position (within Y_Col_Name) of the expression column used for the selection
YCNum = 0
ML_TargetCol = '{}_ML'.format(Y_Col_Name[YCNum])
Seq_LetterCol = '{}_letter-encrypted'.format(Name_Dict['Sequence_column'])
Seq_CategoCol = '{}_label-encrypted'.format(Name_Dict['Sequence_column'])
Ref_Columns = [Name_Dict['ID_Col_Name'], Seq_LetterCol, Seq_CategoCol, Y_Col_Name[YCNum], ML_TargetCol]

# Use only the selected expression column. Taking all of Y_Col_Name would flatten a
# multi-column table and the resulting indices would no longer be row positions.
Measured_Express = np.ravel(np.asarray(SeqDat[Y_Col_Name[YCNum]]))

Ref_Frames = []
for ExpVal in Target_Express:
    # Row positions of the RefNum measurements closest to the target (unordered).
    # np.argpartition needs RefNum to be smaller than the number of rows.
    SeqIdx = np.argpartition(np.abs(Measured_Express - ExpVal), RefNum)[:RefNum]
    # argpartition yields positions, not index labels, hence .iloc instead of .loc
    Ref_Frames.append(SeqDat.iloc[SeqIdx][Ref_Columns])

# Assemble the reference table once; DataFrame.append no longer exists in pandas >= 2.0.
# Rows are ordered by target: the first RefNum rows belong to Target_Express[0], and so on.
if Ref_Frames:
    SeqRef = pd.concat(Ref_Frames, ignore_index=True)
else:
    SeqRef = pd.DataFrame(columns=Ref_Columns)

print('Extracted reference sequences')

Extracted reference sequences


In [5]:
Measure_Name = ML_TargetCol
# Regressor and data-preparation parameters share one file name stem and differ only in the suffix
File_Stem = '{}_{}_{}_{}{}'.format(ML_Date, File_Base, Measure_Name.replace(' ','-'), ML_Regressor, Response_Value)
Regressor_File = os.path.join(Data_Folder, File_Stem + '-Regressor.pkl')
Parameter_File = os.path.join(Data_Folder, File_Stem + '-Params.pkl')

# NOTE(review): joblib.load / pickle.load execute code stored in the file; only load files you created yourself.
try:
    myRegr = joblib.load(Regressor_File)
    # The parameters are assumed to have been generated in the same run as the regressor
    # and to sit in the same directory under the default naming scheme.
    with open(Parameter_File, 'rb') as Param_Handle:
        myParams = pickle.load(Param_Handle)
    # Positions that were removed because of insufficient information content
    Positions_removed = myParams['Positions_removed']
    # If the data was standardized we load the corresponding scaler
    if Response_Value == 0:
        # NOTE(review): the scaler is looked up under ML_Date, like the regressor. The previous
        # code used today's date via the never-imported `time` module. Confirm that the
        # training notebook stamps the scaler with the training date.
        Scaler_File = os.path.join(Data_Folder, '{}_{}_{}-Scaler.pkl'.format(ML_Date, File_Base, Name_Dict['ML_Regressor']))
        with open(Scaler_File, 'rb') as Scaler_Handle:
            Expr_Scaler = pickle.load(Scaler_Handle)
        # The scaler's default name is the expression measurement column with suffix '_Scaler'
        Scaler_DictName = '{}_Scaler'.format(Y_Col_Name[YCNum])
except FileNotFoundError:
    print('Regressor file not found. Check parameter "ML_Date" in "config.txt"')
    # Stop here: continuing would only fail below with a NameError on myRegr.
    raise

# Number of input features of the regressor minus one; this fixes the number of optimization variables.
# NOTE(review): the subtracted feature is presumably a non-nucleotide input -- TODO confirm.
if Name_Dict['ML_Regressor'] in ('SVC', 'SVR'):
    nNukleotides = myRegr.support_vectors_.shape[1] - 1
else:
    # scikit-learn >= 1.2 only provides n_features_in_; older fitted models may only carry n_features_
    if hasattr(myRegr, 'n_features_in_'):
        nFeatures = myRegr.n_features_in_
    else:
        nFeatures = myRegr.n_features_
    nNukleotides = nFeatures - 1

# Four feature columns per sequence position
nPositions = int(nNukleotides/4)
nPositions

15

In [6]:
# Remove the sequence positions listed in Positions_removed from the label-encoded
# reference sequences (one row per reference sequence, one column per position).
RefSeqs_Reduced = np.delete(np.array(list(SeqRef[Seq_CategoCol])), Positions_removed, axis=1)

# SeqRef holds RefNum rows per target expression in the order of Target_Express, so the
# last RefNum rows are the references for the last target. Previously hard-coded as 5.
myRefSeqs = list(RefSeqs_Reduced[-RefNum:])
hof, _, _ = make_GAtoolbox(nPositions, myRefSeqs, npop=1000, ngen=30)

NameError: name 'myRefSeqs' is not defined