In [1]:
import joblib
# import utils
import random
import numpy
import timeit

from deap import base, creator, tools, algorithms

# 0) User input

In [2]:
# Paths to the pickled model artifacts (relative paths). Both files are read with
# joblib.load in the "Load models" section.
#
# Alternative data set (PromLib_EcolPtai), kept for reference; uncomment to switch:
# Regressor_File = '../data-PromLib_EcolPtai/00000000_PromLib_EcolPtai_Ecol-Promoter-Activity_SVR-Regressor.pkl' #'SVR-Regressor-Standard.pkl'
# Scaler_File = '../data-PromLib_EcolPtai/00000000_PromLib_EcolPtai_Ecol-Promoter-Activity_SVR-Params.pkl'#'SVR-Scaler.pkl'

# Parameter pickle (loaded into `myScaler`)
Scaler_File = '../data-Example1-Pput/00000000_Example1-Pput_Promoter-Activity_ML_SV3-Params.pkl'
# Trained regressor pickle (loaded into `myRegr`)
Regressor_File = '../data-Example1-Pput/00000000_Example1-Pput_Promoter-Activity_ML_SV3-Regressor.pkl'


# 1) Load models

In [4]:
# Display the object loaded from Scaler_File.
# NOTE(review): this cell carries execution count 4 but sits above the cell that
# assigns `myScaler` (execution count 3), so a top-to-bottom run reaches it before
# the name exists.
myScaler

{'Positions_removed': array([ 0,  1,  2,  3,  4,  7,  8,  9, 11, 13, 15, 17, 19, 22, 23, 27, 28,
        29, 33, 34, 35, 36, 37, 38, 39])}

In [3]:
# Unpickle the regressor and the accompanying parameter dictionary.
# NOTE(review): joblib.load is pickle-based; only load files from a trusted source.
myRegr = joblib.load(Regressor_File)
myScaler = joblib.load(Scaler_File)

# Output scaler lookup, currently disabled; the dictionary key depends on the data set.
# NOTE(review): `evaluation` further down reads a global named `scaler`; while both
# assignments below stay commented out, this cell does not define that name.
# scaler = myScaler['Promoter Activity_ML_Scaler']
# scaler = myScaler['Ecol Promoter Activity_Scaler']




In [None]:
# Inspect the 'Positions_removed' entry of the loaded parameter dictionary.
# The key is spelled with a plural "Positions" in the dictionary shown above.
myScaler['Positions_removed']

In [None]:
# Number of one-hot nucleotide inputs the regressor uses: its feature count minus one,
# the extra feature being the GC share that `evaluation` appends to the one-hot vector.
# This is required to specify the number of optimization variables.
nNukleotides = myRegr.support_vectors_.shape[1] - 1
# Each position is one-hot encoded with four entries (see `decode`).
nPositions = int(nNukleotides/4)

# 2) Define the optimization problem and solve it with a genetic algorithm

## Define fitness function

In [None]:
# Translate the GA's integer nucleotide encoding into a flat one-hot encoding
def decode(individual):
    """Convert an integer-encoded individual into a flat one-hot list.

    Each entry equal to 0, 1, 2 or 3 contributes four values with a single 1
    at that index. Entries matching none of these codes contribute nothing,
    so the result can be shorter than ``4 * len(individual)``.
    """
    one_hot = []
    for code in individual:
        # Compare against the codes in ascending order and stop at the first match.
        for slot in range(4):
            if code == slot:
                one_hot.extend(int(k == slot) for k in range(4))
                break

    return one_hot

def evaluation(individual):
    """Fitness function for the GA: predicted expression of one individual.

    The individual is one-hot encoded with ``decode``, the GC share is appended
    as the last feature, and the regressor ``myRegr`` predicts the expression.
    If a global ``scaler`` is defined, its ``inverse_transform`` is applied to
    the prediction; otherwise the raw prediction is returned, so the function
    no longer fails with a NameError when no output scaler has been set up.

    Parameters
    ----------
    individual : sequence of int
        Nucleotide codes (0-3), one per position.

    Returns
    -------
    tuple
        One-element tuple with the fitness value, matching the single-entry
        ``weights`` tuple of the fitness class.
    """
    gene = decode(individual)

    # GC share: sum of the second and third one-hot entries (presumably the C and G
    # channels) over all positions, appended to the regressor input.
    # NOTE(review): the sum is divided by nNukleotides (four entries per position),
    # not by the number of positions - confirm this matches the feature the
    # regressor was trained with.
    gc_share = sum(gene[i + 1] + gene[i + 2] for i in range(0, nNukleotides, 4))
    gc_share /= nNukleotides

    regressor_input = gene + [gc_share]

    expression = myRegr.predict([regressor_input])

    # The output scaler is optional: its assignment in the loading cell may be
    # commented out, in which case the prediction is returned unscaled.
    output_scaler = globals().get('scaler')
    if output_scaler is not None:
        # NOTE(review): if this is a scikit-learn scaler it may require a 2D array -
        # confirm whether a reshape(-1, 1) is needed here.
        expression = output_scaler.inverse_transform(expression)

    return (expression[0],)

In [None]:
# Configure the DEAP toolbox: individual/population factories, the GA operators
# (selection, crossover, mutation, evaluation) and the per-generation statistics.
toolbox = base.Toolbox()

###################### Define individuals and population ##########################

# Define type of fitness function (weight=1 => maximization)
creator.create("FitnessMax", base.Fitness, weights=(1.0,))

# Define container that represents an individual (an individual is a list and carries the defined fitness)
creator.create("Individual", list, fitness=creator.FitnessMax)

# Define how an individual is created (the individual object is filled with nPositions random integers
# in 0..3 that represent the nucleotides)
toolbox.register("attr_int", random.randint, 0, 3)
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_int, nPositions)

# Define how the population is created (the population is a list of individuals)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

###################### Set functions for GA steps ##########################

# Set selection function (selTournament: randomly select tournsize individuals and select the best one as parent)
# The selection function is later repeated n times in each generation to generate n parents
toolbox.register("select", tools.selTournament, tournsize=3)

# Set mating function (cxUniform: takes two parents and transforms them into two children by iterating over the
# positions and swapping the nucleotides between the parents with a probability of indpb at each position)
toolbox.register("mate", tools.cxUniform, indpb=0.5)

# Set mutation function (mutUniformInt: mutate a child by iterating over its positions and, with probability
# indpb per position, drawing a new nucleotide code from low..up)
toolbox.register("mutate", tools.mutUniformInt, low=0, up=3, indpb=0.05)

# Set fitness function
toolbox.register("evaluate", evaluation)

###################### Define statistics to be evaluated at each generation ##########################
# The statistics are computed over the fitness values of the individuals.
stats = tools.Statistics(lambda ind: ind.fitness.values)
stats.register("avg", numpy.mean)
stats.register("std", numpy.std)
stats.register("min", numpy.min)
stats.register("max", numpy.max)

In [None]:
# Seed Python's `random` module so that a re-run of this cell reproduces the same
# result: the initial population is drawn with random.randint, and DEAP's built-in
# operators draw from the same module. Change or remove the seed to explore
# different runs.
random.seed(42)

# Create initial population
pop = toolbox.population(n=300)

# Create hall of fame object that keeps track of the best individual
hof = tools.HallOfFame(1)

start_time = timeit.default_timer()
# Perform GA
# cxpb: probability that two parents mate (if they do they are discarded and their children kept,
#       otherwise they are kept)
# mutpb: probability that a child is mutated
# ngen: number of generations (= iterations)
pop, log = algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.2, ngen=40,
                               stats=stats, halloffame=hof, verbose=True)

run_time = timeit.default_timer() - start_time

# Report the wall-clock time and the best individual found (one-hot encoded) with its fitness
print('Optimization run time: {:.0f} sec'.format(run_time))
print("\n\n")
print("Best gene sequence:", decode(hof[0]))
print("Expression:", evaluation(hof[0])[0])

In [None]:
# Sanity check: length of the one-hot encoding of the best individual
# (four entries per decoded position).
len(decode(hof[0]))