# Workflow to feed the regressors with input

The input is originally derived from a nucleotide sequence with four possible bases (A, C, G, T) plus the GC content of the sequence, given as a fraction between 0 and 1. The sequence is converted into a one-hot encoding where each position is represented by a vector of size four, and a `1` at a specific position encodes the base. For example, `[1,0,0,0]` represents `A`, etc. For each sequence position there is a one-hot encoding, and all encodings are concatenated. Thus the input size is four times the original sequence length, plus one entry for the GC content.

In [None]:
import numpy as np
import joblib

In [None]:
def make_Input(SeqLen=17):
    """Generate one random regressor input: a one-hot sequence plus its GC content.

    A random base is drawn for each of the ``SeqLen`` positions and encoded
    as four consecutive entries containing a single ``1``. The slot order is
    A, C, G, T, matching the encoding used elsewhere in this notebook.
    The GC content of the drawn sequence, as a fraction between 0 and 1,
    is appended as the last feature.

    Parameters
    ----------
    SeqLen : int, optional
        Number of sequence positions to generate (default 17).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, 4 * SeqLen + 1)``: the concatenated one-hot
        encoding followed by the GC content.
    """
    import numpy as np
    import random

    # randint includes the upper limit, so every draw is one of 0, 1, 2, 3.
    base_choice = [random.randint(0, 3) for _ in range(SeqLen)]
    # Absolute index of the '1' for each position: four slots per position.
    hot_idx = np.asarray(
        [4 * pos + base for pos, base in enumerate(base_choice)], dtype=int
    )
    MyInp = np.zeros(SeqLen * 4)
    MyInp[hot_idx] = 1
    # The GC feature must describe the generated sequence itself; drawing it
    # independently would produce feature vectors that no real sequence can
    # have. Slots 1 and 2 are C and G in the A, C, G, T ordering.
    gc_hits = sum(1 for base in base_choice if base in (1, 2))
    # An empty sequence has no bases; report 0.0 instead of dividing by zero.
    gc_content = gc_hits / SeqLen if SeqLen else 0.0
    return np.array([np.append(MyInp, gc_content)])

In [None]:
# Pickle files read below with joblib: the regressor and the object from
# which the scaler is looked up by key.
# Alternative locations previously noted for these files:
#   'RegressorTests/SVR-Regressor-Standard.pkl' / 'SVR-Regressor-Standard.pkl'
#   'RegressorTests/SVR-Scaler.pkl' / 'SVR-Scaler.pkl'
Regressor_File = '../data-PromLib_EcolPtai/00000000_PromLib_EcolPtai_Ecol-Promoter-Activity_SVR-Regressor.pkl'
Scaler_File = '../data-PromLib_EcolPtai/00000000_PromLib_EcolPtai_Ecol-Promoter-Activity_SVR-Params.pkl'

# 100 random inputs of 22 sequence positions each, stacked as rows.
minp = np.vstack([make_Input(SeqLen=22) for _ in range(100)])

# Predict on the random inputs with the loaded regressor.
myRegr = joblib.load(Regressor_File)
myExpr = myRegr.predict(minp)

# The scaler is stored under a fixed key of the loaded object.
myScal = joblib.load(Scaler_File)
Scaler = myScal['Ecol Promoter Activity_Scaler']

In [None]:
import matplotlib.pyplot as plot
%matplotlib inline
# Histogram of the predictions stored in myExpr (no explicit bin count given).
plot.hist(myExpr)


In [None]:
# One-hot encoded sequence (four entries per position, 22 positions),
# without the GC-content feature.
optinp = np.array([
    0.0, 1.0, 0.0, 0.0,
    1.0, 0.0, 0.0, 0.0,
    0.0, 0.0, 0.0, 1.0,
    1.0, 0.0, 0.0, 0.0,
    0.0, 0.0, 0.0, 1.0,
    0.0, 1.0, 0.0, 0.0,
    0.0, 0.0, 0.0, 1.0,
    0.0, 0.0, 0.0, 1.0,
    0.0, 1.0, 0.0, 0.0,
    1.0, 0.0, 0.0, 0.0,
    0.0, 0.0, 0.0, 1.0,
    0.0, 0.0, 0.0, 1.0,
    0.0, 0.0, 1.0, 0.0,
    0.0, 0.0, 1.0, 0.0,
    0.0, 0.0, 0.0, 1.0,
    0.0, 0.0, 1.0, 0.0,
    1.0, 0.0, 0.0, 0.0,
    1.0, 0.0, 0.0, 0.0,
    0.0, 0.0, 0.0, 1.0,
    0.0, 0.0, 0.0, 1.0,
    1.0, 0.0, 0.0, 0.0,
    1.0, 0.0, 0.0, 0.0,
])
# One row of four entries per sequence position.
optmat = optinp.reshape(-1, 4)
# Base letters in the column order of the one-hot encoding.
Bases = np.array(['A','C','G','T'])
# Decode each row by picking the letter whose entry equals 1.
OptSeq = list(np.hstack([Bases[row == 1] for row in optmat]))
# Fraction of decoded positions that are G or C.
GC_content = sum(OptSeq.count(base) for base in ('G', 'C')) / len(OptSeq)
print('Sequence: {}, GC-content: {:.2f}'.format(OptSeq,GC_content))
# Full feature vector: one-hot entries followed by the GC content.
optall = np.append(optinp, GC_content)
OptExpr_stand = myRegr.predict([optall])
# NOTE(review): the prediction is passed to inverse_transform as returned by
# predict; some scaler classes require a 2-D array here - confirm against the
# type of the pickled scaler.
Scaler.inverse_transform(OptExpr_stand)

## Classification ML

Classification leads to more robust predictions than regression when the sample volume and data quality are insufficient.

In [None]:
# Pickled model used for the classification example.
Regressor_File = 'RFC_Ex.pkl'

# 1000 random inputs of 15 sequence positions each, stacked as rows.
minp = np.vstack([make_Input(SeqLen=15) for _ in range(1000)])

# Load the model, predict on the random inputs and plot the predictions
# in a three-bin histogram.
myRegr = joblib.load(Regressor_File)
myExpr = myRegr.predict(minp)
plot.hist(myExpr, bins=3)
