Merge branch 'master' into lexicase_survival

lacava · Aug 1, 2017 · 2b6b9a4 · 2b6b9a4
2 parents 439bf05 + bee67c4
commit 2b6b9a4
Show file tree

Hide file tree

Showing 13 changed files with 348 additions and 296 deletions.
diff --git a/.gitignore b/.gitignore
@@ -11,3 +11,4 @@ analysis/
 *.ipynb
 *.so
 *.o
+*.cpp
diff --git a/README.md b/README.md
@@ -5,7 +5,7 @@
 
 # Few
 
-**Few** is a **Feature Engineering Wrapper** for sci-kitlearn. Few looks for a set of feature transformations that work best with a specified machine learning algorithm in order to improve model estimation and prediction. In doing so, Few is able to provide the user with a set of concise, engineered features that describe their data.
+**Few** is a **Feature Engineering Wrapper** for scikit-learn. Few looks for a set of feature transformations that work best with a specified machine learning algorithm in order to improve model estimation and prediction. In doing so, Few is able to provide the user with a set of concise, engineered features that describe their data.
 
 Few uses genetic programming to generate, search and update engineered features. It incorporates feedback from the ML process to select important features, while also scoring them internally. 
 

diff --git a/few/_version.py b/few/_version.py
@@ -6,4 +6,4 @@
 
 """
 
-__version__ = '0.0.37'
+__version__ = '0.0.44'
diff --git a/few/evaluation.py b/few/evaluation.py
@@ -106,7 +106,7 @@ class EvaluationMixin(object):
     'separation': lambda y,yhat: 1 - separation(yhat,y),
     'fisher': lambda y,yhat: 1 - fisher(yhat,y),
     'accuracy': lambda y,yhat: 1 - accuracy_score(yhat,y),
-    'random': lambda y,yhat: np.random.rand(),
+    'random': lambda y,yhat: self.random_state.rand(),
     'roc_auc': lambda y,yhat: 1 - roc_auc_score(y,yhat)
     # 'relief': lambda y,yhat: 1-ReliefF(n_jobs=-1).fit(yhat.reshape(-1,1),y).feature_importances_
     }
@@ -122,7 +122,7 @@ class EvaluationMixin(object):
     'separation': lambda y,yhat: 1 - separation(yhat,y,samples=True),
     'fisher': lambda y,yhat: 1 - fisher(yhat,y,samples=True),
     'accuracy': lambda y,yhat: 1 - np.sum(yhat==y)/y.shape[0], # this looks wrong, CHECK
-    'random': lambda y,yhat: np.random.rand(len(y)),
+    'random': lambda y,yhat: self.random_state.rand(len(y)),
     # 'relief': lambda y,yhat: 1-ReliefF(n_jobs=-1,sample_scores=True).fit(yhat.reshape(-1,1),y).feature_importances_
     }
 
@@ -137,7 +137,7 @@ class EvaluationMixin(object):
     # 'separation':  1 - separation(yhat,y,samples=True),
     # 'fisher':  1 - fisher(yhat,y,samples=True),
     # 'accuracy':  1 - np.sum(yhat==y)/y.shape[0],
-    # 'random':  np.random.rand(len(y)),
+    # 'random':  self.random_state.rand(len(y)),
     # # 'relief':  1-ReliefF(n_jobs=-1,sample_scores=True).fit(yhat.reshape(-1,1),y).feature_importances_
     # }
 

diff --git a/few/few.py b/few/few.py
diff --git a/few/lib/evaluation.h b/few/lib/evaluation.h
@@ -0,0 +1,46 @@
+/* evaluation c++ code
+Copyright 2016 William La Cava
+
+This file is part of the FEW library.
+
+The FEW library is free software: you can redistribute it and/or
+modify it under the terms of the GNU General Public License as published by the
+Free Software Foundation, either version 3 of the License, or (at your option)
+any later version.
+
+The FEW library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+You should have received a copy of the GNU General Public License along with
+the FEW library. If not, see http://www.gnu.org/licenses/.
+*/
+#include <iostream>
+#include "Eigen/Dense"
+#include <Python.h>
+using namespace Eigen;
+using namespace std;
+
+/*  DEFINE Custom Type Names to make code more readable
+    ExtMat :  2-dim matrix/array externally defined (in Python)
+*/
+typedef Map<ArrayXXd> ExtMat;
+typedef ArrayXXd Mat;
+typedef ArrayXd Vec;
+
+
+void evaluate(node n, ExtMat& features, vector<Vec> stack_float, vector<Vec> stack_bool)
+{
+  // evaluate a program node on a given set of data.
+  ExtMat F (features, n, d);
+
+  vector<float> stack_float;
+  vector<float> stack_bool;
+  for (auto n: program){
+    // evaluate program nodes on stack
+    evaluate(n,F,stack_float,stack_bool);
+  }
+}
+
+void out(vector<node> program, ExtMat& features, char otype){
+  // evaluate program output.
+}
diff --git a/few/lib/few_lib.pyx b/few/lib/few_lib.pyx
@@ -1,4 +1,4 @@
-# distutils: language = c++
+# distutils: language=c++
 from eigency.core cimport *
 cimport numpy as np
 from libcpp.vector cimport vector

diff --git a/few/population.py b/few/population.py
@@ -11,6 +11,7 @@
 import uuid
 from mdr import MDR
 from collections import defaultdict
+import itertools as it
 
 eqn_dict = {
     '+': lambda n,stack_eqn: '(' + stack_eqn.pop() + '+' + stack_eqn.pop() + ')',
@@ -130,7 +131,7 @@ def __init__(self,fitness = -1.0,stack = None):
 
 class Pop(object):
     """class representing population"""
-    def __init__(self,pop_size=100,n_samples=1, fit = None):
+    def __init__(self,pop_size=100, fit = None):
         """initializes population of inds of size pop_size"""
 
         self.individuals = []
@@ -141,46 +142,120 @@ def __init__(self,pop_size=100,n_samples=1, fit = None):
             else:
                 self.individuals.append(Ind(fitness = fit))
 
-def stacks_2_eqns(stacks):
-    """returns equation strings from stacks"""
-    if stacks:
-        return list(map(lambda p: stack_2_eqn(p), stacks))
-    else:
+class PopMixin(object):
+    """methods for constructing features."""
+    ######################################################## printing equations
+    def eval_eqn(self,n,stack_eqn):
+        if len(stack_eqn) >= n.arity['f']+n.arity['b']:
+            stack_eqn.append(eqn_dict[n.name](n,stack_eqn))
+
+    def stack_2_eqn(self,p):
+        """returns equation string for program stack"""
+        stack_eqn = []
+        if p: # if stack is not empty
+            for n in p.stack:
+                self.eval_eqn(n,stack_eqn)
+            return stack_eqn[-1]
         return []
 
-def stack_2_eqn(p):
-    """returns equation string for program stack"""
-    stack_eqn = []
-    if p: # if stack is not empty
-        for n in p.stack:
-            eval_eqn(n,stack_eqn)
-        return stack_eqn[-1]
-    return []
-
-def eval_eqn(n,stack_eqn):
-    if len(stack_eqn) >= n.arity['f']+n.arity['b']:
-        stack_eqn.append(eqn_dict[n.name](n,stack_eqn))
-        # if any(np.isnan(stack_eqn[-1])) or any(np.isinf(stack_eqn[-1])):
-        #     print("problem operator:",n)
-
-def make_program(stack,func_set,term_set,max_d,ntype):
-    """makes a program stack"""
-    # print("stack:",stack,"max d:",max_d)
-    if max_d == 0: #or np.random.rand() < float(len(term_set))/(len(term_set)+len(func_set)):
-        ts = [t for t in term_set if t.out_type==ntype]
-
-        if not ts:
-            raise ValueError('no ts. ntype:'+ntype+'. term_set out_types:'+','.join([t.out_type for t in term_set]))
-
-        stack.append(ts[np.random.choice(len(ts))])
-    else:
-        fs = [f for f in func_set if (f.out_type==ntype and (f.in_type=='f' or max_d>1))]
-        if len(fs)==0:
-            print('ntype:',ntype,'\nfunc_set:',[f.name for f in func_set])
-        stack.append(fs[np.random.choice(len(fs))])
-        tmp = copy.copy(stack[-1])
-
-        for i in np.arange(tmp.arity['f']):
-            make_program(stack,func_set,term_set,max_d-1,'f')
-        for i in np.arange(tmp.arity['b']):
-            make_program(stack,func_set,term_set,max_d-1,'b')
+    def stacks_2_eqns(self,stacks):
+        """returns equation strings from stacks"""
+        if stacks:
+            return list(map(lambda p: self.stack_2_eqn(p), stacks))
+        else:
+            return []
+
+    ########################################################### making programs
+    def make_program(self,stack,func_set,term_set,max_d,ntype):
+        """makes a program stack"""
+        # print("stack:",stack,"max d:",max_d)
+        if max_d == 0:
+            ts = [t for t in term_set if t.out_type==ntype]
+
+            if not ts:
+                raise ValueError('no ts. ntype:'+ntype+'. term_set out_types:'+
+                                 ','.join([t.out_type for t in term_set]))
+
+            stack.append(ts[np.random.choice(len(ts))])
+        else:
+            fs = [f for f in func_set if (f.out_type==ntype
+                                          and (f.in_type=='f' or max_d>1))]
+            if len(fs)==0:
+                print('ntype:',ntype,'\nfunc_set:',[f.name for f in func_set])
+            stack.append(fs[np.random.choice(len(fs))])
+            tmp = copy.copy(stack[-1])
+
+            for i in np.arange(tmp.arity['f']):
+                self.make_program(stack,func_set,term_set,max_d-1,'f')
+            for i in np.arange(tmp.arity['b']):
+                self.make_program(stack,func_set,term_set,max_d-1,'b')
+
+    def init_pop(self):
+        """initializes population of features as GP stacks."""
+        pop = Pop(self.population_size)
+        seed_with_raw_features = False
+        # make programs
+        if self.seed_with_ml:
+            # initial population is the components of the default ml model
+            if (self.ml_type == 'SVC' or self.ml_type == 'SVR'):
+                # this is needed because SVM has a bug that throws ValueError
+                # on attribute check
+                seed_with_raw_features=True
+            elif (hasattr(self.ml.named_steps['ml'],'coef_') or
+                  hasattr(self.ml.named_steps['ml'],'feature_importances_')):
+                # add model components with non-zero coefficients to initial
+                # population, in order of coefficient size
+                coef = (self.ml.named_steps['ml'].coef_ if
+                        hasattr(self.ml.named_steps['ml'],'coef_') else
+                        self.ml.named_steps['ml'].feature_importances_)
+                # compress multiple coefficients for each feature into single
+                # numbers (occurs with multiclass classification)
+                if len(coef.shape)>1:
+                    coef = [np.mean(abs(c)) for c in coef.transpose()]
+
+                # remove zeros
+                coef = [c for c in coef if c!=0]
+                # sort feature locations based on importance/coefficient
+                locs = np.arange(len(coef))
+                locs = locs[np.argsort(np.abs(coef))[::-1]]
+                for i,p in enumerate(pop.individuals):
+                    if i < len(locs):
+                        p.stack = [node('x',loc=locs[i])]
+                    else:
+                        # make program if pop is bigger than len(locs)
+                        self.make_program(p.stack,self.func_set,self.term_set,
+                                     self.random_state.randint(self.min_depth,
+                                                       self.max_depth+1),
+                                     self.otype)
+                        p.stack = list(reversed(p.stack))
+            else:
+                seed_with_raw_features = True
+            # seed with randomly chosen raw features if no importance info available
+            if seed_with_raw_features:
+                for i,p in enumerate(pop.individuals):
+                    if i < self.n_features:
+                        p.stack = [node('x',
+                                        loc=np.random.randint(self.n_features))]
+                    else:
+                        # make program if pop is bigger than n_features
+                        self.make_program(p.stack,self.func_set,self.term_set,
+                                     self.random_state.randint(self.min_depth,
+                                                       self.max_depth+1),
+                                     self.otype)
+                        p.stack = list(reversed(p.stack))
+
+            # print initial population
+            if self.verbosity > 2:
+                print("seeded initial population:",
+                      self.stacks_2_eqns(pop.individuals))
+
+        else: # don't seed with ML
+            for I in pop.individuals:
+                depth = self.random_state.randint(self.min_depth,
+                                                  self.max_depth+1)
+                self.make_program(I.stack,self.func_set,self.term_set,depth,
+                             self.otype)
+                # print(I.stack)
+                I.stack = list(reversed(I.stack))
+
+        return pop
diff --git a/few/selection.py b/few/selection.py
@@ -9,7 +9,6 @@
 import copy
 import pdb
 from sklearn.metrics import r2_score
-from .population import stacks_2_eqns
 from few_lib import ep_lex
 # from profilehooks import profile
 
@@ -36,7 +35,7 @@ def survival(self,parents,offspring,elite=None,elite_index=None,X=None,F=None,F_
             survivors, survivor_index = self.deterministic_crowding(parents,offspring,X,X_offspring)
         elif self.sel == 'random':
             # pdb.set_trace()
-            survivor_index = np.random.permutation(np.arange(2*len(parents)))[:len(parents)]
+            survivor_index = self.random_state.permutation(np.arange(2*len(parents)))[:len(parents)]
             survivors = [(parents + offspring)[s] for s in survivor_index]
         # elitism
         if self.elitism:
@@ -58,7 +57,7 @@ def tournament(self,individuals,tourn_size, num_selections=None):
 
         for i in np.arange(num_selections):
             # sample pool with replacement
-            pool_i = np.random.choice(len(individuals),size=tourn_size)
+            pool_i = self.random_state.choice(len(individuals),size=tourn_size)
             pool = []
             for i in pool_i:
                 pool.append(np.mean(individuals[i].fitness))
@@ -92,7 +91,7 @@ def lexicase(self,individuals, num_selections=None, epsilon = False, survival =
             candidates = individuals
             can_locs = range(len(individuals))
             cases = list(np.arange(len(individuals[0].fitness_vec)))
-            np.random.shuffle(cases)
+            self.random_state.shuffle(cases)
             # pdb.set_trace()
             while len(cases) > 0 and len(candidates) > 1:
                 # get best fitness for case among candidates
@@ -105,7 +104,7 @@ def lexicase(self,individuals, num_selections=None, epsilon = False, survival =
                 candidates,can_locs = zip(*((x,l) for x,l in zip(candidates,can_locs) if x.fitness_vec[cases[0]] == best_val_for_case))
                 cases.pop(0)
 
-            choice = np.random.randint(len(candidates))
+            choice = self.random_state.randint(len(candidates))
             winners.append(copy.deepcopy(candidates[choice]))
             locs.append(can_locs[choice])
             if survival: # filter out winners from remaining selection pool
@@ -137,7 +136,7 @@ def epsilon_lexicase(self, F, sizes, num_selections=None, survival = False):
 
                 can_locs = individual_locs
                 cases = list(np.arange(F.shape[1]))
-                np.random.shuffle(cases)
+                self.random_state.shuffle(cases)
                 # pdb.set_trace()
                 while len(cases) > 0 and len(can_locs) > 1:
                     # get best fitness for case among candidates
@@ -146,7 +145,7 @@ def epsilon_lexicase(self, F, sizes, num_selections=None, survival = False):
                     can_locs = [l for l in can_locs if F[l,cases[0]] <= best_val_for_case + mad_for_case[cases[0]]]
                     cases.pop(0)
 
-                choice = np.random.randint(len(can_locs))
+                choice = self.random_state.randint(len(can_locs))
                 locs.append(can_locs[choice])
                 if survival: # filter out winners from remaining selection pool
                     individual_locs = [i for i in individual_locs if i != can_locs[choice]]

diff --git a/few/tests/test_evaluation.py b/few/tests/test_evaluation.py
@@ -42,7 +42,8 @@ def test_out_shapes():
     pop_size = 5;
     few = FEW(population_size=pop_size,seed_with_ml=False)
     few.term_set = term_set
-    pop = few.init_pop(n_features)
+    few.n_features = n_features
+    pop = few.init_pop()
 
     pop.X = np.asarray(list(map(lambda I: few.out(I,boston.data), pop.individuals)))
 
@@ -113,7 +114,8 @@ def test_calc_fitness_shape():
     pop_size = 5;
     few = FEW(population_size=pop_size,seed_with_ml=False)
     few.term_set = term_set
-    pop = few.init_pop(n_features)
+    few.n_features = n_features
+    pop = few.init_pop()
 
     pop.X = np.asarray(list(map(lambda I: few.out(I,boston.data), pop.individuals)))