# Workflow to feed the regressors with input

The input is originally derived from a nucleotide sequence with four possible bases (A, C, G, T) plus a value of the GC content in percent. The sequence is converted into a one-hot encoding where each position is represented by a vector of size four, and a `1` at a specific position encodes the base. For example, `[1,0,0,0]` represents `A`, etc. For each sequence position there is a one-hot encoding, and all encodings are concatenated; the GC-content value is then appended as the final element. Thus the input size is four times the original sequence length, plus one.

In [None]:
def make_Input(SeqLen=17):
    """Build one random regressor input row for a sequence of ``SeqLen`` bases.

    A random base index (0..3) is drawn for every sequence position and
    written as a one-hot group of four values; the groups are concatenated
    and a random GC-content value is appended as the last element.

    Args:
        SeqLen: Number of sequence positions to encode.

    Returns:
        A float ``numpy.ndarray`` of shape ``(1, 4 * SeqLen + 1)``: the first
        ``4 * SeqLen`` entries are the one-hot encoding (exactly one ``1``
        per group of four), the last entry is the GC-content value.
    """
    # Import the module itself: both ``randint`` and ``gauss`` are needed.
    import random

    import numpy as np

    # random.randint includes the upper limit, so each draw is in 0..3.
    base_choice = [random.randint(0, 3) for _ in range(SeqLen)]

    # Absolute index of the '1' for each position: 4 * position + base index.
    # The explicit int dtype keeps this a valid index array even when empty.
    hot_idx = 4 * np.arange(SeqLen) + np.array(base_choice, dtype=int)

    one_hot = np.zeros(SeqLen * 4)
    one_hot[hot_idx] = 1

    # The GC value is drawn independently of the generated bases
    # (Gaussian, mean 0.5, sd 0.1, rounded to two decimals).
    gc_value = round(random.gauss(.5, .1), 2)

    # Wrap in an outer list so the result is a single-row 2-D array.
    return np.array([np.append(one_hot, gc_value)])

In [None]:
import joblib

# Build a single random input row for a 17-base sequence
# (shape (1, 69): 17 * 4 one-hot values plus one GC-content value).
minp = make_Input(17)

# Path of the serialized regressor, resolved relative to the working directory.
# NOTE(review): presumably a fitted scikit-learn SVR, judging by the file
# name - confirm. Its expected feature count must match the 69 columns above.
Regressor_File = 'SVR-StrainA.pkl'
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code - only load regressor files from a trusted source.
myRegr = joblib.load(Regressor_File)
# Trailing expression: the notebook displays the prediction for the one row.
myRegr.predict(minp)