In [1]:
import joblib
import random
import numpy as np
import pandas as pd

from deap import base, creator, tools, algorithms

# 0) User input

In [2]:
# Pickled parameter object stored next to the trained model (read with joblib below;
# its 'Positions_removed' entry is used when shortening the sequences).
Params_File = '../data-Example1-Pput/00000000_Example1-Pput_Promoter-Activity_ML_RF3-Params.pkl'
# Pickled trained regressor (read with joblib below).
Regressor_File = '../data-Example1-Pput/00000000_Example1-Pput_Promoter-Activity_ML_RF3-Regressor.pkl'
# ';'-separated table of sequences; it must provide a 'Sequence' column.
Sequences_file = '../Example1-Pput.csv' # alternative input: 'ExamplePput_PromSelMid20.csv'

# 1) Load models

In [3]:


# Restore the parameter object and the trained regressor from disk.
# NOTE(review): joblib.load unpickles arbitrary Python objects - only load files
# that come from a trusted source.
myParams = joblib.load(Params_File)
myRegr = joblib.load(Regressor_File)

# Read the ';'-separated sequence table, upper-case every sequence and keep only
# the first occurrence of each distinct sequence.
# (Optional, currently disabled: sort by 'Promoter Activity' in descending order
#  before de-duplicating.)
mySequences = (
    pd.read_csv(Sequences_file, sep=';')
    .assign(Sequence=lambda table: table['Sequence'].str.upper())
    .drop_duplicates('Sequence')
)

In [4]:
# Split every sequence into single characters (one row per sequence) and drop the
# columns listed in myParams['Positions_removed'].
shortSeqs = np.delete(
    np.array([list(seq) for seq in mySequences['Sequence']]),
    myParams['Positions_removed'],
    axis=1,
)

# Keep an untouched letter copy before the array is re-coded in place.
LetterSeqsShort = shortSeqs.copy()

# Re-code the letters as A=0, C=1, G=2, T=3. The array still holds strings at this
# point; the conversion to integers happens below via astype(int).
for code, letter in enumerate('ACGT'):
    shortSeqs[shortSeqs == letter] = code

# Store both representations of the shortened sequence in the table. The encoded
# form is converted to tuples so it is hashable and can be de-duplicated.
mySequences['Sequence_short_Letter'] = list(map(''.join, LetterSeqsShort.tolist()))
mySequences['Sequence_short_encoded'] = shortSeqs.astype(int).tolist()
mySequences['Sequence_short_encoded'] = mySequences['Sequence_short_encoded'].apply(tuple)

# Sequences that differ only at removed positions collapse to one row.
mySequences = mySequences.drop_duplicates('Sequence_short_encoded')

# The first RefNum rows of the table serve as reference sequences.
RefNum = 6
referenceRows = mySequences.iloc[:RefNum]
myRefSeqs = referenceRows['Sequence_short_encoded'].tolist()
RefFull = referenceRows['Sequence'].tolist()
myRefLets = referenceRows['Sequence_short_Letter']

In [19]:
# Number of one-hot nucleotide inputs the regressor expects. This fixes the number
# of optimization variables. One feature is subtracted because the last input is
# the GC share that `evaluation` appends to the one-hot vector.
# For a support-vector regressor the equivalent would be:
#     nNukleotides = myRegr.support_vectors_.shape[1] - 1
#
# `n_features_` was deprecated in scikit-learn 1.0 and removed in 1.2, so prefer
# `n_features_in_` and only fall back to the legacy attribute on old versions.
nFeatures = getattr(myRegr, 'n_features_in_', None)
if nFeatures is None:
    nFeatures = myRegr.n_features_

nNukleotides = nFeatures - 1
# Four one-hot slots (A, C, G, T) per sequence position.
nPositions = nNukleotides // 4

# Display the total feature count as the cell output.
nFeatures

61

# 2) Define and solve optimization problem and algorithm

## Define fitness function

In [8]:
# Map the integer (categorical) nucleotide codes back to their letters
def toLetter(myList):
    """Translate integer nucleotide codes into letters.

    Codes 0, 1, 2 and 3 become 'A', 'C', 'G' and 'T'. Any other element is
    passed through unchanged.

    Returns a new list; the input is not modified.
    """
    code_to_letter = dict(enumerate('ACGT'))

    translated = []
    for code in myList:
        translated.append(code_to_letter.get(code, code))
    return translated

def toCat(myLetters):
    """Translate nucleotide letters into integer codes.

    'A', 'C', 'G' and 'T' become 0, 1, 2 and 3. Any other element (including
    lower-case letters) is passed through unchanged.

    Returns a new list with one entry per element of ``myLetters``.
    """
    letter_to_code = {letter: code for code, letter in enumerate('ACGT')}
    return list(map(lambda letter: letter_to_code.get(letter, letter), myLetters))

# Expand the integer nucleotide codes used by the GA into a flat one-hot vector
def decode(individual):
    """Return the concatenated one-hot encoding of ``individual``.

    Each code 0..3 contributes four entries with a single 1 at the index equal
    to the code. Elements that do not compare equal to 0, 1, 2 or 3 contribute
    nothing, so the result is shorter than ``4 * len(individual)`` in that case.
    """
    one_hot_rows = (
        [1, 0, 0, 0],  # code 0
        [0, 1, 0, 0],  # code 1
        [0, 0, 1, 0],  # code 2
        [0, 0, 0, 1],  # code 3
    )

    gene = []
    for code in individual:
        for value, row in enumerate(one_hot_rows):
            if code == value:
                gene.extend(row)
                break

    return gene


def evaluation(individual):
    """Return the regressor's prediction for one integer-encoded individual.

    The individual is one-hot encoded with ``decode``, a GC-share feature is
    appended, and the resulting vector is passed to ``myRegr.predict`` as a
    single sample. The first (only) predicted value is returned.
    """
    one_hot = decode(individual)

    # Slots 1 and 2 of every 4-wide one-hot group are the C and G indicators.
    gc_count = sum(
        one_hot[start + 1] + one_hot[start + 2]
        for start in range(0, nNukleotides, 4)
    )
    # NOTE(review): the count is divided by the one-hot width (nNukleotides), not
    # by the number of positions - presumably this mirrors how the feature was
    # built for training; confirm against the training code.
    gc_share = gc_count / nNukleotides

    prediction = myRegr.predict([one_hot + [gc_share]])
    return prediction[0]

def feasible(individual):
    """Feasibility test used by the GA's penalty decorator.

    An individual is feasible only if both conditions hold:

    * its encoded sequence is not already present in
      ``mySequences['Sequence_short_encoded']``, and
    * ``evaluation(individual)`` equals 1.

    Returns:
        bool: True when the individual is feasible, otherwise False.
    """
    ######## Check if individual is already known ########
    if tuple(individual) in list(mySequences['Sequence_short_encoded']):
        return False

    ######## Check if individual has the required predicted expression ########
    # `evaluation` does the one-hot decoding itself, so no separate decode() call
    # is needed here. bool() keeps the return type a plain Python bool.
    # NOTE(review): presumably 1 is the target expression level/class of the
    # model - confirm against the training setup.
    return bool(evaluation(individual) == 1)

def distance(individual, RefSeqs):
    """Count position-wise mismatches between ``individual`` and the references.

    ``RefSeqs`` may be a single sequence or a list of sequences. The individual
    is compared element by element against every reference and the total number
    of differing positions over all references is returned.

    Returns:
        tuple: a 1-tuple ``(mismatch_count,)``, the fitness format DEAP expects.
    """
    # ndmin=2 makes a single reference count as one row.
    n_refs = np.array(RefSeqs, ndmin=2).shape[0]
    stacked = [individual for _ in range(n_refs)]
    mismatches = np.not_equal(stacked, RefSeqs)
    return (np.sum(mismatches),)

def SequenceSinglePredFull(SeqPred, RefFull, Positions_removed):
    '''
    Rebuild a full-length sequence from an optimized feature-only sequence.

    The optimization yields letters only for the positions that were used as
    features in the prediction. The positions that were removed (because of
    insufficient diversity) are filled in from the reference sequence that is
    closest to the predicted one.

    Parameters
    ----------
    SeqPred : str or sequence of str
        Predicted letters for the feature positions, in order.
    RefFull : list of str
        Full-length reference sequences, all of the same length.
    Positions_removed : array-like of int
        Indices (into the full-length sequence) of the positions that are not
        features. This argument is used for every step; the function does not
        read any notebook-level variable.

    Returns
    -------
    str
        The closest reference sequence with its feature positions replaced by
        the letters of ``SeqPred``.

    Raises
    ------
    ValueError
        If ``SeqPred`` does not have one letter per feature position.
    '''
    pred_letters = np.array(list(SeqPred))

    # One row per reference sequence, one column per position.
    RefFull_ar = np.array([list(Seq) for Seq in RefFull])
    # Reference letters restricted to the feature positions.
    SeqRef = np.delete(RefFull_ar, Positions_removed, axis=1)

    if pred_letters.shape[0] != SeqRef.shape[1]:
        raise ValueError(
            'SeqPred has %d letters but %d feature positions remain after '
            'removing Positions_removed' % (pred_letters.shape[0], SeqRef.shape[1])
        )

    # Take the reference with the fewest mismatches at the feature positions
    # (on ties the first one wins).
    mismatches = np.sum(SeqRef != pred_letters, axis=1)
    closest = np.argmin(mismatches)

    # Start from the closest reference so the removed positions keep its letters.
    PredSeqTemp = np.array(list(RefFull[closest]))

    # Indices of the feature positions within the full-length sequence.
    Pos_Test = np.delete(np.arange(RefFull_ar.shape[1]), Positions_removed)

    # Write the predicted letters into the feature positions.
    PredSeqTemp[Pos_Test] = pred_letters
    return ''.join(PredSeqTemp)

In [9]:
toolbox = base.Toolbox()

###################### Define individuals and population ##########################

# Fitness type with a single objective. The weight of -1.0 means the objective is
# minimized (the class is nevertheless registered under the name "FitnessMax").
creator.create("FitnessMax", base.Fitness, weights=(-1.0,))

# An individual is a plain list that carries the fitness type defined above.
creator.create("Individual", list, fitness=creator.FitnessMax)

# Each gene is a random integer in 0..3 (one nucleotide); an individual consists
# of nPositions such genes.
toolbox.register("attr_int", random.randint, 0, 3)
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_int, nPositions)

# A population is a list of individuals.
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

###################### Set functions for GA steps ##########################

# Selection: tournaments of 3 randomly drawn individuals, the best one becomes a
# parent. The algorithm repeats this once per parent needed in each generation.
toolbox.register("select", tools.selTournament, tournsize=3)

# Crossover: uniform crossover; at every position the two parents swap their
# nucleotides with probability indpb.
toolbox.register("mate", tools.cxUniform, indpb=0.5)

# Mutation: every position is replaced by a new random nucleotide in 0..3 with
# probability indpb.
toolbox.register("mutate", tools.mutUniformInt, low=0, up=3, indpb=0.1)

# Fitness: total number of mismatches against the reference sequences.
toolbox.register("evaluate", distance, RefSeqs=myRefSeqs)
# Constraint handling: individuals rejected by `feasible` get the fixed penalty
# value 1000.0 instead of their distance.
toolbox.decorate("evaluate", tools.DeltaPenalty(feasible, 1000.0))

###################### Define statistics to be evaluated at each generation ##########################
stats = tools.Statistics(lambda member: member.fitness.values)
for label, reducer in (("avg", np.mean), ("std", np.std), ("min", np.min), ("max", np.max)):
    stats.register(label, reducer)

In [10]:
# Random initial population of 1000 individuals.
# NOTE(review): no random seed is set, so every run yields different results.
pop = toolbox.population(n=1000)

# Hall of fame that keeps the 5 best individuals seen during the run.
hof = tools.HallOfFame(5)

# Run the simple evolutionary algorithm.
#   cxpb:  probability that two individuals are mated
#   mutpb: probability that an individual is mutated
#   ngen:  number of generations (= iterations)
pop, log = algorithms.eaSimple(
    pop,
    toolbox,
    cxpb=0.5,
    mutpb=0.5,
    ngen=30,
    stats=stats,
    halloffame=hof,
    verbose=True,
)

# Translate the hall-of-fame individuals from integer codes to letter strings.
hofLetters = np.array([toLetter(member) for member in hof])
myOptSeq = [''.join(letters) for letters in hofLetters]

print("\n\n")
print("Best gene sequence features: ", myOptSeq[0])
print('Best promoter sequence total: ', SequenceSinglePredFull(myOptSeq[0], RefFull, myParams['Positions_removed']))
print("Expression:", evaluation(hof[0]))


gen	nevals	avg    	std    	min	max 
0  	1000  	814.943	372.467	39 	1000
1  	739   	585.318	465.032	32 	1000
2  	740   	313.722	416.389	29 	1000
3  	771   	170.666	307.821	29 	1000
4  	752   	146.274	286.258	20 	1000
5  	757   	133.37 	275.974	23 	1000
6  	754   	141.44 	294.171	17 	1000
7  	732   	105.387	249.302	14 	1000
8  	726   	105.68 	256.598	11 	1000
9  	741   	127.64 	294.091	11 	1000
10 	732   	120.103	288.463	11 	1000
11 	782   	135.737	311.609	11 	1000
12 	767   	141.005	320.25 	11 	1000
13 	746   	161.123	342.718	11 	1000
14 	755   	156.508	338.965	11 	1000
15 	750   	160.355	344.43 	11 	1000
16 	739   	161.653	346.684	11 	1000
17 	759   	146.891	332.728	11 	1000
18 	760   	151.072	338.281	11 	1000
19 	763   	134.005	319.839	11 	1000
20 	749   	150.49 	338.515	11 	1000
21 	729   	116.615	299.391	11 	1000
22 	760   	144.195	332.317	11 	1000
23 	757   	151.928	340.789	11 	1000
24 	746   	108.472	288.895	11 	1000
25 	720   	123.176	308.299	11 	1000
26 	770   	117.045	300.858	1

In [11]:
# Rebuild the full-length sequence for every hall-of-fame candidate and show the list.
PredSeq = [
    SequenceSinglePredFull(featureSeq, RefFull, myParams['Positions_removed'])
    for featureSeq in myOptSeq
]
PredSeq



['GCCCATTGACAACGCTCTCGCGGCCAGGTATAATTGCACG',
 'GCCCATTGACGAGGCTCTCGCGGCCAGGTATAATTGCACG',
 'GCCCATTGACAAGGCTCTCGCGGCCAGGTATTATTGCACG',
 'GCCCATTGACAAGGTTCTCGCGGCCAGGTATAATTGCACG',
 'GCCCATTGACAAGGCTCTCGCGGCTAGGTATAATTGCACG']