In [1]:
import numpy as np
import sklearn.model_selection as skms
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

def mlr_r2(X,y):
    """Fit an ordinary least-squares model and score it on its own training data.

    :param X: predictor table; must expose ``shape[1]`` (number of predictors)
    :param y: observed responses, one per row of ``X``
    :return: tuple ``(r_squared, adjusted_r_squared)``
    """
    regressor = LinearRegression()
    regressor.fit(X, y)
    predicted = regressor.predict(X)

    # Textbook sums of squares: residual (unexplained) and total.
    ss_res = sum((y - predicted) ** 2)
    ss_tot = sum((y - np.mean(y)) ** 2)
    r_squared = 1 - (float(ss_res)) / ss_tot

    n_obs = len(y)
    n_predictors = X.shape[1]
    # Penalise R^2 for the number of predictors used.
    adjusted_r_squared = 1 - (1 - r_squared) * (n_obs - 1) / (n_obs - n_predictors - 1)
    return r_squared, adjusted_r_squared


def mlr(x, y):
    """Multiple linear regression with an intercept, solved by ``np.linalg.lstsq``.

    :param x: predictors (DataFrame or array), one row per observation
    :param y: responses (Series, single-column DataFrame or array)
    :return: tuple ``(lstsq, rank, r2, r2adj, RMSE)`` where ``lstsq`` holds the
        coefficients with the intercept last and ``rank`` is the rank of the
        design matrix (predictors plus the intercept column). ``r2``, ``r2adj``
        and ``RMSE`` are length-1 arrays for a single response, as before.
    """
    y_arr = np.asarray(y, dtype=float)
    n_obs = y_arr.shape[0]
    design = np.column_stack([np.asarray(x, dtype=float), np.ones(n_obs, float)])
    lstsq, residuals, rank, _singular_values = np.linalg.lstsq(design, y_arr, rcond=-1)

    # lstsq returns an empty residual array for rank-deficient or
    # under-determined systems; compute the residual sum of squares directly.
    if residuals.size == 0:
        residuals = np.atleast_1d(np.sum((y_arr - design @ lstsq) ** 2, axis=0))

    # Total sum of squares computed explicitly: ``y.size * y.var()`` is only
    # correct for ndarrays, because pandas' ``var`` defaults to ddof=1.
    ss_total = np.sum((y_arr - y_arr.mean(axis=0)) ** 2, axis=0)
    r2 = 1 - residuals / ss_total

    # ``rank`` already counts the intercept column, so n - rank == n - p - 1,
    # which is the same denominator mlr_r2 uses.
    r2adj = 1 - (1 - r2) * (n_obs - 1) / (n_obs - rank)

    # sqrt(1 - r2) * std(y) reduces to sqrt(SS_res / n).
    RMSE = np.sqrt(residuals / n_obs)

    return lstsq, rank, r2, r2adj, RMSE



def mlrr(x, y):
    '''
    Get the multiple linear regression coefficients by building a numpy
    design matrix (predictors plus a trailing column of ones) and solving it
    with np.linalg.lstsq.

    Accepts pandas objects as well as plain numpy arrays; the k-fold helpers
    in this notebook pass ndarrays, which have no ``.values`` attribute.

    :param x: predictors, one row per observation
    :param y: responses, one per row of ``x``
    :return: tuple ``(coefficients, rank, n_columns)`` where the intercept is
        the last coefficient and ``n_columns`` is the width of the design
        matrix (number of predictors + 1)
    '''
    predictors = np.asarray(x, dtype=float)
    response = np.asarray(y, dtype=float)
    design = np.column_stack([predictors, np.ones(len(predictors), float)])
    lstsq, _residuals, rank, _singular_values = np.linalg.lstsq(design, response, rcond=-1)
    return lstsq, rank, design.shape[1]
def pmlr(x, y):
    """Least-squares coefficients obtained through the pseudo-inverse.

    A column of ones is appended to ``x`` so the last returned coefficient is
    the intercept.

    :param x: predictors, one row per observation
    :param y: responses, one per row of ``x``
    :return: coefficient vector ``pinv([x, 1]) . y``
    """
    intercept_column = np.ones(len(y), float)
    design = np.column_stack([x, intercept_column])
    pseudo_inverse = np.linalg.pinv(design)
    return np.dot(pseudo_inverse, y)


def kfoldmlr(xi, yi, **kwargs):
    '''Give the held-out y-hats for a cross-validated (q2) calculation.

    For every fold a linear model with intercept is fitted on the training
    rows and used to predict the rows that were left out. Previously the rank
    of the training design matrix was collected instead of predictions, and
    the fit itself failed because ndarrays were handed to ``mlrr``.

    :param xi: predictors (DataFrame or array), one row per observation
    :param yi: responses (Series, DataFrame or array)
    :param nfolds: (keyword, required) number of folds; use the number of
        observations for leave-one-out
    :param mean: (keyword, optional, default False) return the mean of the
        predictions instead of the predictions themselves
    :return: array of held-out predictions in the original row order
        (KFold is used without shuffling), or their mean when ``mean`` is set
    '''
    x = np.asarray(xi, dtype=float)
    y = np.asarray(yi, dtype=float)
    nfolds = kwargs["nfolds"]
    mean = kwargs.get("mean", False)
    kf = skms.KFold(n_splits=nfolds)  # shuffle=False, so folds are contiguous
    print(kf)

    y_hats = []
    for train_index, test_index in kf.split(x):
        design_train = np.column_stack([x[train_index], np.ones(len(train_index), float)])
        design_test = np.column_stack([x[test_index], np.ones(len(test_index), float)])
        # Fit once per fold; the intercept is the last coefficient.
        coefficients = np.linalg.lstsq(design_train, y[train_index], rcond=-1)[0]
        y_hats.append(design_test @ coefficients)

    # Folds may differ in size, so concatenate rather than stack.
    stack = np.concatenate(y_hats)
    if mean:
        return np.mean(stack)
    return stack


def kfoldmlrplot(xi, yi, **kwargs):
    '''Plot, for each k-fold split, the training points and the fitted line.

    Only a single descriptor is supported because the plot draws
    ``slope * x + intercept``. The fit is done inline: the earlier version
    handed ndarrays to ``mlrr`` (which needed ``.values``) and fitted twice.

    :param xi: one descriptor (Series, single-column DataFrame or array)
    :param yi: responses, one per row of ``xi``
    :param nfolds: (keyword, required) number of folds
    :raises ValueError: if ``xi`` has more than one column
    '''
    x = np.asarray(xi, dtype=float)
    if x.ndim > 1 and x.shape[1] != 1:
        raise ValueError("kfoldmlrplot can only draw a fitted line for a single descriptor")
    x = x.reshape(len(x))
    y = np.asarray(yi, dtype=float).reshape(len(x))

    nfolds = kwargs["nfolds"]
    kf = skms.KFold(n_splits=nfolds)  # shuffle=False
    print(kf)
    for train_index, _test_index in kf.split(x):
        x_train, y_train = x[train_index], y[train_index]
        design = np.column_stack([x_train, np.ones(len(x_train), float)])
        slope, intercept = np.linalg.lstsq(design, y_train, rcond=-1)[0]
        plt.plot(x_train, y_train, 'o', label='Original data', markersize=5)
        plt.plot(x_train, slope * x_train + intercept, 'r', label='Fitted line')
        plt.legend()
        plt.show()


In [2]:
#mlr(x,y)

In [3]:
# NOTE(review): `x` and `y` are only assigned in a later cell, so this call
# raises NameError when the notebook is run top to bottom (see output below).
mlrr(x,y)

NameError: name 'x' is not defined

In [4]:
import copy as cp
import functools
import itertools as itert
import multiprocessing
import random
import numpy as np
from deap import creator, base, tools, algorithms
from numpy.random import RandomState



def hash_ind_list(i):
    """Hash a list-like individual by content so it can be used as ``__hash__``."""
    frozen = tuple(i)
    return hash(frozen)


# NOTE(review): `randomnum` is not referenced anywhere else in the visible notebook.
randomnum = np.random.uniform(1, 100, 1)
# GAdescsel arguments: ngen, basetable, y, popsize, indsize, crossover rate,
# mutation probability, evaluation function, selection function
# Module-level DEAP toolbox; the GAdescsel methods register their operators on it.
toolbox = base.Toolbox()


class GAdescsel():
    """Genetic-algorithm descriptor selection built on DEAP.

    An individual is a list of column names of ``basetable``; its fitness is
    the quality of the linear model built from those columns. All operators
    are registered on the module-level ``toolbox``.

    NOTE(review): ``evalq2loo``, ``printq2fitness`` and ``get_df`` rely on a
    ``q2loo_mlr`` function that is not defined in the visible notebook.
    """

    def __init__(self, basetable, y, ngen=1000, popsize=100, indsize=5, cx=.5, mut=.05, seed=int(12345)):
        '''
        initialize GA object, giving ngen, popsize, indsize,
        crossover rate, mutation rate, and random seed initialization

        :param basetable: pandas DataFrame of candidate descriptors
        :param y: response values aligned with the rows of ``basetable``
        :param ngen: number of generations
        :param popsize: number of individuals per generation
        :param indsize: number of descriptors per individual
        :param cx: crossover probability
        :param mut: mutation probability
        :param seed: base seed used by ``mkeindseed``
        '''
        # DEAP overwrites (and warns about) creator classes that already exist,
        # so build them only the first time an instance is created.
        if not hasattr(creator, "Fitness"):
            creator.create("Fitness", base.Fitness, weights=(1.0,))
        if not hasattr(creator, "Individual"):
            creator.create("Individual", list, fitness=creator.Fitness, __hash__=hash_ind_list)
        if not hasattr(creator, "Population"):
            creator.create("Population", list, fitness=creator.Fitness, __hash__=hash_ind_list)

        self.seed = seed
        self.basetable = basetable
        self.y = y
        self.ngen = ngen
        self.popsize = popsize
        self.indsize = indsize
        self.cx = cx
        self.mut = mut
        # The multiprocessing.Pool that used to be created here was never used
        # or closed, which only leaked worker processes.

    def ct_calls(func):
        '''
        Decorator that counts the calls to the wrapped function in ``.count``
        so that mkeindseed can use a different seed on each call.
        The counter lives on the function, so it is shared by all instances.
        :return: the wrapped function with a ``count`` attribute
        '''

        @functools.wraps(func)
        def decor(*args, **kwargs):
            decor.count += 1
            return func(*args, **kwargs)

        decor.count = 0
        return decor

    def mkeindrand(self, desc_in_ind=5):
        '''
        :param desc_in_ind: number of descriptors in model ("individual" in deap)
        :return: a random sample (list) of distinct column names
        :raises TypeError: if the descriptor table is not a pandas DataFrame
        :raises ValueError: if ``desc_in_ind`` cannot be converted to int
        '''
        import pandas as pd  # local import: pandas is not imported in this cell

        if not isinstance(self.basetable, pd.DataFrame):
            raise TypeError("The type of descriptor table should be a Pandas dataframe.")
        if not isinstance(desc_in_ind, int):
            try:
                desc_in_ind = int(desc_in_ind)
            except (TypeError, ValueError):
                raise ValueError("The number of descriptors per individual should be of type int")
        # Sample from a list: random.sample rejects sets on Python 3.11+, and
        # set ordering made the draw irreproducible even with a fixed seed.
        return random.sample(list(self.basetable.columns), desc_in_ind)

    @ct_calls
    def mkeindseed(self, desc_in_ind=5):
        '''
        Seeded variant of mkeindrand. The seed is ``self.seed`` plus the call
        count; past 100 calls the count wraps modulo 100.
        :param desc_in_ind: number of descriptors in the individual
        :return: list of distinct column names
        '''
        calls = self.mkeindseed.count
        offset = calls if calls <= 100 else calls % 100
        prng = RandomState(self.seed + int(offset))
        smple = prng.choice(self.basetable.columns, size=desc_in_ind, replace=False)
        return list(smple)

    def mutaRan(self, ind):
        '''
        Replace each descriptor, with probability ``self.mut``, by a column
        that is not already part of the individual (in place).
        :param ind: individual (list of column names)
        :return: one-element tuple holding the mutated individual, as DEAP expects
        '''
        # Iterate by position: ind.index(descriptor) picks the wrong slot when
        # crossover has produced duplicate descriptors.
        for position in range(len(ind)):
            if np.random.binomial(1, self.mut, 1) == 1:
                choices = [col for col in self.basetable.columns if col not in ind]
                if choices:  # nothing to swap in when every column is already used
                    ind[position] = random.choice(choices)
        return ind,

    def evalr2(self, ind):
        '''Fitness: R^2 of the model built from the descriptors in ``ind``.'''
        return mlr_r2(self.basetable[ind], self.y)[0],

    def evalr2adj(self, ind):
        '''Fitness: adjusted R^2 of the model built from the descriptors in ``ind``.'''
        return mlr(self.basetable[ind], self.y)[3].astype(float),

    def evalq2loo(self, ind):
        '''Fitness: value returned by ``q2loo_mlr`` for the descriptors in ``ind``.'''
        return q2loo_mlr(self.basetable[ind], self.y),

    def printq2fitness(self, pop):
        '''Return the q2loo score of every individual in ``pop`` as a list.'''
        # Reuses evalq2loo instead of the un-imported IQSAR.mlr3 path.
        return [self.evalq2loo(ind)[0] for ind in pop]

    def _register_population(self):
        '''Register the individual and population generators on the toolbox.'''
        toolbox.register("genind", self.mkeindrand, self.indsize)
        toolbox.register("individual", tools.initIterate, creator.Individual, toolbox.genind)
        toolbox.register("population", tools.initRepeat, list, toolbox.individual, n=self.popsize)

    def _results(self, origpop, population):
        '''Bundle populations and scores; scores are lists, not one-shot map objects.'''
        return [origpop, list(toolbox.map(toolbox.evaluate, origpop)),
                population, list(toolbox.map(toolbox.evaluate, population))]

    def evolve(self, evalfunc="q2loo"):
        '''
        Run the genetic algorithm.
        :param evalfunc: fitness to maximise: "q2loo", "r2" or "r2adj"
        :return: list ``[original population, its scores, final population, its scores]``;
            the partial result is returned on KeyboardInterrupt/SystemExit
        :raises ValueError: for an unknown ``evalfunc``
        '''
        evaluators = {"q2loo": self.evalq2loo, "r2": self.evalr2, "r2adj": self.evalr2adj}
        if evalfunc not in evaluators:
            raise ValueError("not a valid evaluation function specified; use r2adj, r2, or q2loo")

        self._register_population()
        toolbox.register("evaluate", evaluators[evalfunc])
        toolbox.register("mate", tools.cxOnePoint)
        toolbox.register("mutate", self.mutaRan)
        toolbox.register("select", tools.selBest)

        origpop = toolbox.population()
        population = cp.deepcopy(origpop)
        # Score the starting population.
        for fit, ind in zip(toolbox.map(toolbox.evaluate, population), population):
            ind.fitness.values = fit

        for gen in range(self.ngen):
            try:
                offspring = algorithms.varOr(population, toolbox, lambda_=self.popsize,
                                             cxpb=self.cx, mutpb=self.mut)
                for fit, ind in zip(toolbox.map(toolbox.evaluate, offspring), offspring):
                    ind.fitness.values = fit
                # Keep the population at its configured size (was hard-coded to 100).
                population = toolbox.select(offspring + population, k=self.popsize)
            except (KeyboardInterrupt, SystemExit):
                return self._results(origpop, population)
        return self._results(origpop, population)

    def get_df(self, chosenind):
        '''
        Print fit statistics for the chosen descriptors and return their columns.
        :param chosenind: list of column names
        :return: ``self.basetable`` restricted to ``chosenind``
        '''
        btt = self.basetable[chosenind]
        # mlr is a function in this notebook, not a module; fit once and reuse.
        fit = mlr(btt, self.y)
        print("r2 is: ", fit[2],
            "r2adj is: ", fit[3],
            "q2loo is: ", q2loo_mlr(btt, self.y))
        print("coefficients are:", fit[0])
        return btt

    def debug_eval(self):
        '''Evaluate one population and one batch of offspring, printing each offspring and its fitness.'''
        if not hasattr(toolbox, "population"):
            self._register_population()
        toolbox.register("evaluate", self.evalr2)
        toolbox.register("mate", tools.cxOnePoint)
        toolbox.register("mutate", self.mutaRan)
        toolbox.register("select", tools.selBest)
        population = toolbox.population()
        fits = toolbox.map(toolbox.evaluate, population)

        for fit, ind in zip(fits, population):
            ind.fitness.values = fit
        offspring = algorithms.varOr(population, toolbox, lambda_=100, cxpb=.5, mutpb=.05)
        for ind in offspring:
            ind.fitness.values = toolbox.evaluate(ind)
            print(ind)
            print(ind.fitness.values)

In [None]:

import sys

try:
    # clear_output is part of the public IPython.display API.
    from IPython.display import clear_output
    have_ipython = True
except ImportError:
    have_ipython = False

class ProgressBar:
    """Text progress bar that reports completed generations.

    ``animate(i)`` redraws the bar for iteration index ``i`` (0-based); it is
    bound to the IPython or the plain-terminal implementation at construction.
    """

    def __init__(self, iterations):
        """
        :param iterations: total number of iterations (generations) expected
        """
        self.iterations = iterations
        self.prog_bar = '[]'
        self.fill_char = '*'
        self.width = 40
        self.__update_amount(0)
        if have_ipython:
            self.animate = self.animate_ipython
        else:
            self.animate = self.animate_noipython

    def animate_ipython(self, iter):
        """Redraw the bar in a notebook cell, then advance it to ``iter + 1``."""
        try:
            clear_output()
        except Exception:
            # terminal IPython has no clear_output
            pass
        print('\r', self)
        sys.stdout.flush()
        self.update_iteration(iter + 1)

    def update_iteration(self, elapsed_iter):
        """Set the bar to ``elapsed_iter`` of ``self.iterations`` and append the counter text."""
        self.__update_amount((elapsed_iter / float(self.iterations)) * 100.0)
        self.prog_bar += '  %dGEN of %sGEN complete' % (elapsed_iter, self.iterations)

    def __update_amount(self, new_amount):
        """Rebuild ``self.prog_bar`` for ``new_amount`` percent complete."""
        percent_done = int(round((new_amount / 100.0) * 100.0))
        all_full = self.width - 2
        num_hashes = int(round((percent_done / 100.0) * all_full))
        self.prog_bar = '[' + self.fill_char * num_hashes + ' ' * (all_full - num_hashes) + ']'
        # Floor division: a float slice index raises TypeError on Python 3.
        pct_place = (len(self.prog_bar) // 2) - len(str(percent_done))
        pct_string = '%d%%' % percent_done
        self.prog_bar = self.prog_bar[0:pct_place] + \
            (pct_string + self.prog_bar[pct_place + len(pct_string):])

    def animate_noipython(self, iter=0):
        """Redraw the bar on a plain terminal, then advance it to ``iter + 1``.

        Takes the same iteration argument as animate_ipython so that
        ``self.animate(i)`` works in both environments. The previous body
        depended on ``duration``, ``score``, ``update_time`` and ``time``,
        none of which exist on this class.
        """
        print(self, end='\r')
        sys.stdout.flush()
        self.update_iteration(iter + 1)

    def __update_amount_old(self, new_amount,newscore):
        """Older variant of __update_amount that also appends a score.

        NOTE(review): unused, and it relies on ``self.update_score`` and
        ``self.score``, which are not defined in this class.
        """
        percent_done = int(round((new_amount / 100.0) * 100.0))
        all_full = self.width - 2
        num_hashes = int(round((percent_done / 100.0) * all_full))
        self.prog_bar = '[' + self.fill_char * num_hashes + ' ' * (all_full - num_hashes) + ']'
        #self.score=newscore
        pct_place = (len(self.prog_bar) // 2) - len(str(percent_done))
        pct_string = '%d%%' % percent_done
        self.prog_bar = self.prog_bar[0:pct_place] + \
            (pct_string + self.prog_bar[pct_place + len(pct_string):]+str(self.update_score(self.score)))

    def __str__(self):
        return str(self.prog_bar)

In [101]:
#old code
import copy as cp
import functools
import itertools as itert
import multiprocessing
import random
import numpy as np
from deap import creator, base, tools, algorithms
from numpy.random import RandomState


def hash_ind_list(i):
    """Return a content-based hash for a list-like individual."""
    contents = tuple(i)
    return hash(contents)


# NOTE(review): `randomnum` is not referenced anywhere else in the visible notebook.
randomnum = np.random.uniform(1, 100, 1)
# GAdescsel arguments: ngen, basetable, y, popsize, indsize, crossover rate,
# mutation probability, evaluation function, selection function
# Re-creates the module-level DEAP toolbox, replacing the one from the earlier cell.
toolbox = base.Toolbox()


class GAdescsel():
    """Genetic-algorithm descriptor selection built on DEAP (seeded variant).

    An individual is a list of column names of ``basetable``; its fitness is
    the quality of the linear model built from those columns. All operators
    are registered on the module-level ``toolbox``.

    NOTE(review): ``evalq2loo``, ``printq2fitness`` and ``get_df`` rely on a
    ``q2loo_mlr`` function that is not defined in the visible notebook.
    """

    def __init__(self, basetable, y, ngen=1000, popsize=100, indsize=5, cx=.5, mut=.05, seed=int(12345)):
        '''
        initialize GA object, giving ngen, popsize, indsize,
        crossover rate, mutation rate, and random seed initialization

        :param basetable: pandas DataFrame of candidate descriptors
        :param y: response values aligned with the rows of ``basetable``
        :param ngen: number of generations
        :param popsize: number of individuals per generation
        :param indsize: number of descriptors per individual
        :param cx: crossover probability
        :param mut: mutation probability
        :param seed: base seed used by ``mkeindseed``
        '''
        # DEAP overwrites (and warns about) creator classes that already exist,
        # so build them only the first time an instance is created.
        if not hasattr(creator, "Fitness"):
            creator.create("Fitness", base.Fitness, weights=(1.0,))
        if not hasattr(creator, "Individual"):
            creator.create("Individual", list, fitness=creator.Fitness, __hash__=hash_ind_list)
        if not hasattr(creator, "Population"):
            creator.create("Population", list, fitness=creator.Fitness, __hash__=hash_ind_list)

        self.seed = seed
        self.basetable = basetable
        self.y = y
        self.ngen = ngen
        self.popsize = popsize
        self.indsize = indsize
        self.cx = cx
        self.mut = mut
        # The multiprocessing.Pool that used to be created here was never used
        # or closed, which only leaked worker processes.

    def ct_calls(func):
        '''
        Decorator that counts the calls to the wrapped function in ``.count``
        so that mkeindseed can use a different seed on each call.
        The counter lives on the function, so it is shared by all instances.
        :return: the wrapped function with a ``count`` attribute
        '''

        @functools.wraps(func)
        def decor(*args, **kwargs):
            decor.count += 1
            return func(*args, **kwargs)

        decor.count = 0
        return decor

    def mkeindrand(self, desc_in_ind=5):
        '''
        :param desc_in_ind: number of descriptors in model ("individual" in deap)
        :return: a random sample (list) of distinct column names
        :raises TypeError: if the descriptor table is not a pandas DataFrame
        :raises ValueError: if ``desc_in_ind`` cannot be converted to int
        '''
        import pandas as pd  # local import: pandas is only imported in a later cell

        if not isinstance(self.basetable, pd.DataFrame):
            raise TypeError("The type of descriptor table should be a Pandas dataframe.")
        if not isinstance(desc_in_ind, int):
            try:
                desc_in_ind = int(desc_in_ind)
            except (TypeError, ValueError):
                raise ValueError("The number of descriptors per individual should be of type int")
        # Sample from a list: random.sample rejects sets on Python 3.11+, and
        # set ordering made the draw irreproducible even with a fixed seed.
        return random.sample(list(self.basetable.columns), desc_in_ind)

    @ct_calls
    def mkeindseed(self, desc_in_ind=5):
        '''
        Seeded variant of mkeindrand. The seed is ``self.seed`` plus the call
        count; past 100 calls the count wraps modulo 100.
        :param desc_in_ind: number of descriptors in the individual
        :return: list of distinct column names
        '''
        calls = self.mkeindseed.count
        offset = calls if calls <= 100 else calls % 100
        prng = RandomState(self.seed + int(offset))
        smple = prng.choice(self.basetable.columns, size=desc_in_ind, replace=False)
        return list(smple)

    def mutaRan(self, ind):
        '''
        Replace each descriptor, with probability ``self.mut``, by a column
        that is not already part of the individual (in place).
        :param ind: individual (list of column names)
        :return: one-element tuple holding the mutated individual, as DEAP expects
        '''
        # Iterate by position: ind.index(descriptor) picks the wrong slot when
        # crossover has produced duplicate descriptors.
        for position in range(len(ind)):
            if np.random.binomial(1, self.mut, 1) == 1:
                choices = [col for col in self.basetable.columns if col not in ind]
                if choices:  # nothing to swap in when every column is already used
                    ind[position] = random.choice(choices)
        return ind,

    def evalr2(self, ind):
        '''Fitness: R^2 of the model built from the descriptors in ``ind``.'''
        return mlr_r2(self.basetable[ind], self.y)[0],

    def evalr2adj(self, ind):
        '''Fitness: adjusted R^2 of the model built from the descriptors in ``ind``.'''
        return mlr(self.basetable[ind], self.y)[3].astype(float),

    def evalq2loo(self, ind):
        '''Fitness: value returned by ``q2loo_mlr`` for the descriptors in ``ind``.'''
        return q2loo_mlr(self.basetable[ind], self.y),

    def printq2fitness(self, pop):
        '''Return the q2loo score of every individual in ``pop`` as a list.'''
        # Reuses evalq2loo instead of the un-imported IQSAR.mlr3 path.
        return [self.evalq2loo(ind)[0] for ind in pop]

    def pretty_print(self,evolveobj):
        '''
        Turn an evolve() result into two DataFrames (one row per individual,
        plus a "scores" column) and keep them as ``self.evo_o`` / ``self.evo_f``.
        :param evolveobj: ``[original population, its scores, final population, its scores]``
        :return: tuple ``(origdf, finaldf)``
        '''
        import pandas as pd  # local import: pandas is only imported in a later cell

        origdf = pd.DataFrame.from_records(evolveobj[0])
        # Fitness values are 1-tuples; store the bare number.
        origdf["scores"] = [fit[0] for fit in evolveobj[1]]
        finaldf = pd.DataFrame.from_records(evolveobj[2])
        finaldf["scores"] = [fit[0] for fit in evolveobj[3]]
        self.evo_o = origdf
        self.evo_f = finaldf
        return origdf, finaldf

    def _register_population(self):
        '''Register the seeded individual and population generators on the toolbox.'''
        toolbox.register("genind", self.mkeindseed, self.indsize)
        toolbox.register("individual", tools.initIterate, creator.Individual, toolbox.genind)
        toolbox.register("population", tools.initRepeat, list, toolbox.individual, n=self.popsize)

    def _results(self, origpop, population):
        '''Bundle populations and scores; scores are lists, not one-shot map objects.'''
        return [origpop, list(toolbox.map(toolbox.evaluate, origpop)),
                population, list(toolbox.map(toolbox.evaluate, population))]

    def evolve(self,evalfunc="q2loo"):
        '''
        Run the genetic algorithm, de-duplicating individuals before selection.

        The result is also passed to pretty_print, so ``self.evo_o`` and
        ``self.evo_f`` are available afterwards.

        :param evalfunc: fitness to maximise: "q2loo", "q2lmo", "r2" or "r2adj"
        :return: list ``[original population, its scores, final population, its scores]``;
            a partial result is returned if the run is interrupted or a
            generation fails
        :raises ValueError: for an unknown ``evalfunc``
        :raises AttributeError: for "q2lmo" while no ``evalq2lmo`` method exists
        '''
        evaluator_names = {"q2loo": "evalq2loo", "q2lmo": "evalq2lmo",
                           "r2": "evalr2", "r2adj": "evalr2adj"}
        if evalfunc not in evaluator_names:
            raise ValueError("not a valid evaluation function specified; use r2adj, r2, or q2loo")
        evaluate = getattr(self, evaluator_names[evalfunc], None)
        if evaluate is None:
            raise AttributeError("GAdescsel has no %s method; %r cannot be used as evalfunc"
                                 % (evaluator_names[evalfunc], evalfunc))

        self._register_population()
        toolbox.register("evaluate", evaluate)
        toolbox.register("mate", tools.cxOnePoint)
        toolbox.register("mutate", self.mutaRan)
        toolbox.register("select", tools.selBest)

        origpop = toolbox.population()
        population = cp.deepcopy(origpop)
        for fit, ind in zip(toolbox.map(toolbox.evaluate, population), population):
            ind.fitness.values = fit

        for gen in range(self.ngen):
            try:
                offspring = algorithms.varOr(population, toolbox, lambda_=self.popsize,
                                             cxpb=self.cx, mutpb=self.mut)
                for ind in offspring:
                    ind.fitness.values = toolbox.evaluate(ind)
                # Drop identical individuals, then keep the configured population
                # size (was hard-coded to 100).
                unique = [k for k, _ in itert.groupby(sorted(offspring + population))]
                population = toolbox.select(unique, k=self.popsize)
            except (KeyboardInterrupt, SystemExit):
                break
            except Exception as err:
                # Report and stop instead of silently failing on every
                # remaining generation.
                print("evolve stopped at generation %d: %r" % (gen, err))
                break

        returnobj = self._results(origpop, population)
        self.pretty_print(returnobj)
        return returnobj

    def get_df(self, chosenind):
        '''
        Print fit statistics for the chosen descriptors and return their columns.
        :param chosenind: list of column names
        :return: ``self.basetable`` restricted to ``chosenind``
        '''
        btt = self.basetable[chosenind]
        # mlr is a function in this notebook, not a module; fit once and reuse.
        fit = mlr(btt, self.y)
        print("r2 is: ", fit[2],
            "r2adj is: ", fit[3],
            "q2loo is: ", q2loo_mlr(btt, self.y))
        print("coefficients are:", fit[0])
        return btt

    def debug_eval(self):
        '''Evaluate one population and one batch of offspring, printing each offspring and its fitness.'''
        if not hasattr(toolbox, "population"):
            self._register_population()
        toolbox.register("evaluate", self.evalr2)
        toolbox.register("mate", tools.cxOnePoint)
        toolbox.register("mutate", self.mutaRan)
        toolbox.register("select", tools.selBest)
        population = toolbox.population()
        fits = toolbox.map(toolbox.evaluate, population)

        for fit, ind in zip(fits, population):
            ind.fitness.values = fit
        offspring = algorithms.varOr(population, toolbox, lambda_=100, cxpb=.5, mutpb=.05)
        for ind in offspring:
            ind.fitness.values = toolbox.evaluate(ind)
            print(ind)
            print(ind.fitness.values)

In [102]:
import pandas as pd

In [103]:
# Synthetic test data: 1000x1000 predictors and a 1000x1 response of uniform random numbers.
# NOTE(review): no seed is set, so the values differ on every run.
x=pd.DataFrame(np.random.rand(1000,1000))
y=pd.DataFrame(np.random.rand(1000,1))

In [104]:
# Load the response table (CSV) and the descriptor table (tab-separated);
# the first column of each file becomes the index.
yt=pd.read_csv("../datasets/liuthyroid/liu_thrb_orig.csv", index_col=0)
xt=pd.read_csv("../datasets/liuthyroid/liu_thrb_lb_dragon6.txt", sep="\t", index_col=0)

In [105]:
# Keep the "Y Exp." column as the response and drop missing values; drop
# descriptor rows containing any missing value, then remove the NAME column.
# NOTE(review): the two dropna calls are independent, so xt and yt may end up
# with different rows — confirm they are still aligned before fitting.
# NOTE(review): this cell overwrites xt/yt, so re-running it fails on the
# second pass ("Y Exp." and "NAME" are already gone).
yt=yt["Y Exp."].dropna()
xt=xt.dropna()
xt=xt.drop(columns=["NAME"])

In [106]:
#xt[["MW", "Sv"]]

In [107]:
#xt[["MW", "Sv"]]

In [108]:
# R^2 of the linear model that uses only the MW and Sv columns.
mlr_r2(xt[["MW", "Sv"]], yt)[0]

0.3018546655738966

In [109]:
# Configure the GA: 1000 generations, 100 individuals of 5 descriptors each,
# crossover probability 0.5, mutation probability 0.05, base seed 12345.
ga_obj1=GAdescsel(xt, yt, ngen=1000, popsize=100, indsize=5, cx=.5, mut=.05, seed=int(12345))

In [110]:
# Run the GA with R^2 as the fitness function and keep whatever evolve() returns.
new_pop=ga_obj1.evolve(evalfunc = "r2")

In [112]:
# Display the object returned by evolve().
new_pop

In [99]:
def pretty_print_test(evolveobj):
    """Tabulate the original population of an evolve() result with its scores.

    :param evolveobj: sequence whose element 0 holds the individuals and
        element 1 their fitness records
    :return: DataFrame with one row per individual plus a "scores" column
    """
    individuals, fitness_records = evolveobj[0], evolveobj[1]
    table = pd.DataFrame.from_records(individuals)
    score_frame = pd.DataFrame.from_records(list(fitness_records))
    table["scores"] = score_frame
    return table

In [100]:
# Tabulate the original population and its scores from the evolve() result.
pretty_print_test(new_pop)

           0
0   0.469547
1   0.412672
2   0.293461
3   0.387415
4   0.339056
5   0.294888
6   0.307686
7   0.349121
8   0.311896
9   0.314409
10  0.395741
11  0.350265
12  0.149129
13  0.336482
14  0.311667
15  0.452725
16  0.342212
17  0.339820
18  0.330948
19  0.441527
20  0.268712
21  0.165445
22  0.528662
23  0.338904
24  0.274047
25  0.166738
26  0.230101
27  0.356405
28  0.301144
29  0.511916
..       ...
70  0.255767
71  0.315888
72  0.288968
73  0.427972
74  0.330248
75  0.113559
76  0.383075
77  0.178797
78  0.414805
79  0.343746
80  0.191641
81  0.405340
82  0.402911
83  0.308515
84  0.309983
85  0.089137
86  0.255593
87  0.217417
88  0.260178
89  0.419840
90  0.201812
91  0.211632
92  0.278086
93  0.384621
94  0.484823
95  0.332563
96  0.417129
97  0.389907
98  0.221251
99  0.283517

[100 rows x 1 columns]


Unnamed: 0,0,1,2,3,4
0,Eig03_AEA(dm),RDF055m,GATS5e,RDF075m,nArCONR2
1,RDF055m,SpMin1_Bh(v),Mor16m,R1s+,R5m
2,MAXDP,ATSC7e,O%,Mor28p,JGI1
3,H4s,SpMaxA_AEA(dm),SpMax8_Bh(s),RDF090u,Eig09_AEA(dm)
4,HATS1u,GATS5i,RDF155v,T(O..Cl),R8v
5,MATS8m,F02[O-O],F07[C-I],Mor21p,CATS2D_09_AA
6,R1v+,G1u,SpMin6_Bh(m),CATS2D_05_LL,DELS
7,P_VSA_LogP_5,SpMAD_AEA(ed),T(O..Cl),R6u+,RDF155s
8,H5u,RDF115u,RDF100u,L3s,R7i+
9,Eig08_AEA(dm),RDF125v,CATS2D_02_NL,B10[O-O],Eig06_EA(dm)


In [33]:
# Check whether the original population (element 0) equals the final one (element 2).
list(new_pop[0])==list(new_pop[2])

False

In [35]:
# Element 2 of the evolve() result: the final population.
new_pop[2]

[['R7p', 'HATS6p', 'SpMax2_Bh(p)', 'E2s', 'SpMaxA_EA(dm)'],
 ['T(N..I)', 'HATS6p', 'SpMax2_Bh(p)', 'E2s', 'SpMaxA_EA(dm)'],
 ['RDF055u', 'SM11_AEA(bo)', 'R8i+', 'SpDiam_EA(dm)', 'TDB05v'],
 ['B05[O-S]', 'HATS6p', 'SpMax2_Bh(p)', 'E2s', 'SpMaxA_EA(dm)'],
 ['Eta_B_A', 'RDF095u', 'J_G', 'B08[N-Br]', 'B07[O-O]'],
 ['SM14_AEA(bo)', 'E3v', 'SpMax1_Bh(v)', 'SpMax4_Bh(s)', 'RDF020v'],
 ['Mor22s', 'TDB03s', 'GATS4s', 'GGI6', 'Mor22m'],
 ['Mor22s', 'TDB03s', 'GATS4s', 'GGI6', 'E2e'],
 ['Mor20s', 'SpMax4_Bh(m)', 'RDF040m', 'RDF065p', 'VE1_D/Dt'],
 ['RDF055u', 'SM11_AEA(bo)', 'R8i+', 'SpDiam_EA(dm)', 'VE2_A'],
 ['SpMax1_Bh(v)', 'SM11_AEA(bo)', 'MATS6s', 'SpMin1_Bh(v)', 'TDB07e'],
 ['Eig03_AEA(dm)', 'RDF055m', 'GATS5e', 'RDF075m', 'nArCONR2'],
 ['SM14_AEA(bo)', 'E3v', 'TDB05v', 'Mor28m', 'Eig05_AEA(dm)'],
 ['F10[C-O]', 'Mor25v', 'SpMax4_Bh(p)', 'SPI', 'R5m'],
 ['TDB06v', 'SM6_B(p)', 'MATS3m', 'Gu', 'F09[C-I]'],
 ['Chi1_EA(dm)', 'TDB02s', 'F04[N-O]', 'E2s', 'SpMaxA_EA(dm)'],
 ['H8m', 'Mor25v', 'SpMa

In [34]:
# Element 0 of the evolve() result: the original population.
new_pop[0]

[['Eig03_AEA(dm)', 'RDF055m', 'GATS5e', 'RDF075m', 'nArCONR2'],
 ['RDF055m', 'SpMin1_Bh(v)', 'Mor16m', 'R1s+', 'R5m'],
 ['MAXDP', 'ATSC7e', 'O%', 'Mor28p', 'JGI1'],
 ['H4s', 'SpMaxA_AEA(dm)', 'SpMax8_Bh(s)', 'RDF090u', 'Eig09_AEA(dm)'],
 ['HATS1u', 'GATS5i', 'RDF155v', 'T(O..Cl)', 'R8v'],
 ['MATS8m', 'F02[O-O]', 'F07[C-I]', 'Mor21p', 'CATS2D_09_AA'],
 ['R1v+', 'G1u', 'SpMin6_Bh(m)', 'CATS2D_05_LL', 'DELS'],
 ['P_VSA_LogP_5', 'SpMAD_AEA(ed)', 'T(O..Cl)', 'R6u+', 'RDF155s'],
 ['H5u', 'RDF115u', 'RDF100u', 'L3s', 'R7i+'],
 ['Eig08_AEA(dm)', 'RDF125v', 'CATS2D_02_NL', 'B10[O-O]', 'Eig06_EA(dm)'],
 ['Mor13m', 'E1s', 'R4p', 'RDF025m', 'SpMax6_Bh(s)'],
 ['SpMin2_Bh(e)', 'N-068', 'HATS8v', 'R8e+', 'H8p'],
 ['TDB05p', 'E1v', 'CATS2D_09_LL', 'CATS2D_06_AA', 'HATS6m'],
 ['E3u', 'F04[O-O]', 'GATS1i', 'B10[C-Cl]', 'B02[O-O]'],
 ['C-005', 'R3p', 'MATS6m', 'SpMAD_B(p)', 'GATS2p'],
 ['H8m', 'Mor25v', 'SpMax4_Bh(p)', 'SPI', 'R5m'],
 ['Mor32s', 'RDF020v', 'R7v', 'RDF135m', 'ARR'],
 ['SdssC', 'Xt', 'TDB1