Skip to content

Commit

Permalink
Merge branch 'master' into lexicase_survival
Browse files Browse the repository at this point in the history
  • Loading branch information
lacava committed Aug 1, 2017
2 parents 439bf05 + bee67c4 commit 2b6b9a4
Show file tree
Hide file tree
Showing 13 changed files with 348 additions and 296 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,4 @@ analysis/
*.ipynb
*.so
*.o
*.cpp
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

# Few

**Few** is a **Feature Engineering Wrapper** for sci-kitlearn. Few looks for a set of feature transformations that work best with a specified machine learning algorithm in order to improve model estimation and prediction. In doing so, Few is able to provide the user with a set of concise, engineered features that describe their data.
**Few** is a **Feature Engineering Wrapper** for scikit-learn. Few looks for a set of feature transformations that work best with a specified machine learning algorithm in order to improve model estimation and prediction. In doing so, Few is able to provide the user with a set of concise, engineered features that describe their data.

Few uses genetic programming to generate, search and update engineered features. It incorporates feedback from the ML process to select important features, while also scoring them internally.

Expand Down
2 changes: 1 addition & 1 deletion few/_version.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@
"""

__version__ = '0.0.37'
__version__ = '0.0.44'
6 changes: 3 additions & 3 deletions few/evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ class EvaluationMixin(object):
'separation': lambda y,yhat: 1 - separation(yhat,y),
'fisher': lambda y,yhat: 1 - fisher(yhat,y),
'accuracy': lambda y,yhat: 1 - accuracy_score(yhat,y),
'random': lambda y,yhat: np.random.rand(),
'random': lambda y,yhat: self.random_state.rand(),
'roc_auc': lambda y,yhat: 1 - roc_auc_score(y,yhat)
# 'relief': lambda y,yhat: 1-ReliefF(n_jobs=-1).fit(yhat.reshape(-1,1),y).feature_importances_
}
Expand All @@ -122,7 +122,7 @@ class EvaluationMixin(object):
'separation': lambda y,yhat: 1 - separation(yhat,y,samples=True),
'fisher': lambda y,yhat: 1 - fisher(yhat,y,samples=True),
'accuracy': lambda y,yhat: 1 - np.sum(yhat==y)/y.shape[0], # this looks wrong, CHECK
'random': lambda y,yhat: np.random.rand(len(y)),
'random': lambda y,yhat: self.random_state.rand(len(y)),
# 'relief': lambda y,yhat: 1-ReliefF(n_jobs=-1,sample_scores=True).fit(yhat.reshape(-1,1),y).feature_importances_
}

Expand All @@ -137,7 +137,7 @@ class EvaluationMixin(object):
# 'separation': 1 - separation(yhat,y,samples=True),
# 'fisher': 1 - fisher(yhat,y,samples=True),
# 'accuracy': 1 - np.sum(yhat==y)/y.shape[0],
# 'random': np.random.rand(len(y)),
# 'random': self.random_state.rand(len(y)),
# # 'relief': 1-ReliefF(n_jobs=-1,sample_scores=True).fit(yhat.reshape(-1,1),y).feature_importances_
# }

Expand Down
317 changes: 112 additions & 205 deletions few/few.py

Large diffs are not rendered by default.

46 changes: 46 additions & 0 deletions few/lib/evaluation.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
/* evaluation c++ code
Copyright 2016 William La Cava
This file is part of the FEW library.
The FEW library is free software: you can redistribute it and/or
modify it under the terms of the GNU General Public License as published by the
Free Software Foundation, either version 3 of the License, or (at your option)
any later version.
The FEW library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along with
the FEW library. If not, see http://www.gnu.org/licenses/.
*/
#include <iostream>
#include "Eigen/Dense"
#include <Python.h>
using namespace Eigen;
using namespace std;

/* DEFINE Custom Type Names to make code more readable
ExtMat : 2-dim matrix/array externally defined (in Python)
*/
typedef Map<ArrayXXd> ExtMat;
typedef ArrayXXd Mat;
typedef ArrayXd Vec;


// Evaluate a program node against the feature data.
//
// Parameters (as declared):
//   n            - the program node to evaluate
//   features     - externally owned feature array (ExtMat = Map<ArrayXXd>)
//   stack_float  - vector of Vec, passed by value
//   stack_bool   - vector of Vec, passed by value
//
// NOTE(review): this looks like an unfinished stub and probably does not
// compile as written:
//   * `node`, `d` and `program` are not declared anywhere in the visible part
//     of this header -- confirm where they are meant to come from.
//   * `stack_float` / `stack_bool` are declared again below as vector<float>,
//     which clashes with the parameters of the same name.
//   * the recursive call passes those vector<float> locals where the
//     signature expects vector<Vec>.
//   * the stacks are taken by value, so anything a callee pushes would not be
//     visible to the caller -- presumably references were intended; TODO confirm.
void evaluate(node n, ExtMat& features, vector<Vec> stack_float, vector<Vec> stack_bool)
{
// evaluate a program node on a given set of data.
ExtMat F (features, n, d);

vector<float> stack_float;
vector<float> stack_bool;
// the loop variable `n` shadows the parameter `n` inside the loop body
for (auto n: program){
// evaluate each node of the program in order, using the stacks
evaluate(n,F,stack_float,stack_bool);
}
}

// Placeholder for computing the output of a whole program.
//
// Parameters (as declared):
//   program  - vector of program nodes, passed by value
//   features - externally owned feature array (ExtMat = Map<ArrayXXd>)
//   otype    - single character; presumably selects the output type
//              -- TODO confirm against the Python caller
//
// NOTE(review): the body is empty, so the function currently does nothing
// and returns nothing.
void out(vector<node> program, ExtMat& features, char otype){
// evaluate program output.
}
2 changes: 1 addition & 1 deletion few/lib/few_lib.pyx
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# distutils: language = c++
# distutils: language=c++
from eigency.core cimport *
cimport numpy as np
from libcpp.vector cimport vector
Expand Down
159 changes: 117 additions & 42 deletions few/population.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import uuid
from mdr import MDR
from collections import defaultdict
import itertools as it

eqn_dict = {
'+': lambda n,stack_eqn: '(' + stack_eqn.pop() + '+' + stack_eqn.pop() + ')',
Expand Down Expand Up @@ -130,7 +131,7 @@ def __init__(self,fitness = -1.0,stack = None):

class Pop(object):
"""class representing population"""
def __init__(self,pop_size=100,n_samples=1, fit = None):
def __init__(self,pop_size=100, fit = None):
"""initializes population of inds of size pop_size"""

self.individuals = []
Expand All @@ -141,46 +142,120 @@ def __init__(self,pop_size=100,n_samples=1, fit = None):
else:
self.individuals.append(Ind(fitness = fit))

def stacks_2_eqns(stacks):
"""returns equation strings from stacks"""
if stacks:
return list(map(lambda p: stack_2_eqn(p), stacks))
else:
class PopMixin(object):
"""methods for constructing features."""
######################################################## printing equations
def eval_eqn(self,n,stack_eqn):
    """Apply node ``n`` to the equation stack, if enough operands exist.

    The number of operands required is the sum of the node's ``'f'`` and
    ``'b'`` arities. When the stack holds at least that many entries, the
    string builder registered for ``n.name`` in ``eqn_dict`` is called and
    its result is pushed onto ``stack_eqn``; otherwise the stack is left
    untouched.
    """
    operands_needed = n.arity['f'] + n.arity['b']
    if len(stack_eqn) < operands_needed:
        # not enough operands available: skip this node
        return
    build = eqn_dict[n.name]
    stack_eqn.append(build(n, stack_eqn))

def stack_2_eqn(self,p):
    """Return the equation string for the program stored in ``p.stack``.

    Each node of ``p.stack`` is fed, in order, to ``self.eval_eqn``, which
    pushes partial equation strings onto a working stack; the last entry
    pushed is the equation for the whole program.

    Parameters
    ----------
    p : object with a ``stack`` attribute
        The individual whose program is rendered.

    Returns
    -------
    str or list
        The equation string, or an empty list when ``p`` is falsy or when
        the program produced no equation (e.g. ``p.stack`` is empty).
    """
    if not p:
        return []

    stack_eqn = []
    for n in p.stack:
        self.eval_eqn(n, stack_eqn)

    # An empty or malformed program leaves the working stack empty; return
    # the same "no equation" value as above instead of raising IndexError.
    return stack_eqn[-1] if stack_eqn else []

# Module-level variant: walks p.stack in order, letting eval_eqn push partial
# equation strings, and returns the last string pushed.
# NOTE(review): the check below tests the truthiness of `p` itself, not of
# `p.stack`; if no equation is pushed (e.g. p.stack is empty) the
# `stack_eqn[-1]` lookup raises IndexError.
def stack_2_eqn(p):
"""returns equation string for program stack"""
stack_eqn = []
if p: # if stack is not empty
for n in p.stack:
eval_eqn(n,stack_eqn)
return stack_eqn[-1]
# falsy p: an empty list (not a string) signals "no equation"
return []

# Module-level variant: pushes the equation string for node `n` onto
# `stack_eqn`, using the builder registered under n.name in eqn_dict.
# Nothing is pushed when the stack holds fewer entries than the node's
# combined 'f' + 'b' arity.
def eval_eqn(n,stack_eqn):
if len(stack_eqn) >= n.arity['f']+n.arity['b']:
stack_eqn.append(eqn_dict[n.name](n,stack_eqn))
# if any(np.isnan(stack_eqn[-1])) or any(np.isinf(stack_eqn[-1])):
# print("problem operator:",n)

# Module-level variant: recursively appends a random program to `stack`,
# operator first and then the sub-programs for each of its arguments.
#   stack    - list extended in place
#   func_set - candidate operator nodes (out_type, in_type, arity, name are read)
#   term_set - candidate terminal nodes (out_type is read)
#   max_d    - remaining depth; a terminal is emitted when it reaches 0
#   ntype    - required out_type of the node chosen at this level
# Raises ValueError when no terminal has the requested out_type.
# Draws from the global np.random state.
def make_program(stack,func_set,term_set,max_d,ntype):
"""makes a program stack"""
# print("stack:",stack,"max d:",max_d)
if max_d == 0: #or np.random.rand() < float(len(term_set))/(len(term_set)+len(func_set)):
# depth exhausted: pick a terminal whose output type matches
ts = [t for t in term_set if t.out_type==ntype]

if not ts:
raise ValueError('no ts. ntype:'+ntype+'. term_set out_types:'+','.join([t.out_type for t in term_set]))

stack.append(ts[np.random.choice(len(ts))])
else:
# operators with in_type other than 'f' are only allowed while max_d > 1
fs = [f for f in func_set if (f.out_type==ntype and (f.in_type=='f' or max_d>1))]
# NOTE(review): an empty `fs` is only printed here; the choice call on the
# next line is still made with len(fs) == 0.
if len(fs)==0:
print('ntype:',ntype,'\nfunc_set:',[f.name for f in func_set])
stack.append(fs[np.random.choice(len(fs))])
# keep a handle on the chosen operator; stack[-1] changes during recursion
tmp = copy.copy(stack[-1])

# one sub-program per 'f' argument, then one per 'b' argument
for i in np.arange(tmp.arity['f']):
make_program(stack,func_set,term_set,max_d-1,'f')
for i in np.arange(tmp.arity['b']):
make_program(stack,func_set,term_set,max_d-1,'b')
def stacks_2_eqns(self,stacks):
    """Return the equation for every program in ``stacks``.

    Each element is rendered with ``self.stack_2_eqn``; a falsy ``stacks``
    (``None`` or an empty sequence) yields an empty list.
    """
    if not stacks:
        return []
    return [self.stack_2_eqn(program) for program in stacks]

########################################################### making programs
def make_program(self,stack,func_set,term_set,max_d,ntype):
    """Recursively append a randomly generated program to ``stack``.

    Nodes are appended in prefix order: the chosen operator first, followed
    by the sub-programs for each of its ``'f'`` arguments and then each of
    its ``'b'`` arguments.

    All random draws use ``self.random_state`` so that a seeded estimator
    produces reproducible programs.

    Parameters
    ----------
    stack : list
        Node list, extended in place.
    func_set : iterable
        Candidate operator nodes; ``out_type``, ``in_type``, ``arity`` and
        ``name`` are read from each.
    term_set : iterable
        Candidate terminal nodes; ``out_type`` is read from each.
    max_d : int
        Remaining depth. A terminal is emitted when it reaches 0.
    ntype : str
        Required ``out_type`` of the node chosen at this level.

    Raises
    ------
    ValueError
        If no terminal (at depth 0) or no operator (otherwise) with the
        requested output type is available.
    """
    rng = self.random_state

    if max_d == 0:
        terminals = [t for t in term_set if t.out_type == ntype]
        if not terminals:
            raise ValueError('no ts. ntype:' + ntype + '. term_set out_types:' +
                             ','.join([t.out_type for t in term_set]))
        stack.append(terminals[rng.choice(len(terminals))])
        return

    # operators whose in_type is not 'f' are only allowed while max_d > 1
    functions = [f for f in func_set
                 if f.out_type == ntype and (f.in_type == 'f' or max_d > 1)]
    if not functions:
        # fail with a descriptive message instead of an opaque error from
        # the random draw on an empty pool
        raise ValueError('no fs. ntype:' + ntype + '. func_set:' +
                         ','.join([f.name for f in func_set]))

    # keep a handle on the operator: stack[-1] changes during the recursion
    chosen = functions[rng.choice(len(functions))]
    stack.append(chosen)

    for _ in range(chosen.arity['f']):
        self.make_program(stack, func_set, term_set, max_d - 1, 'f')
    for _ in range(chosen.arity['b']):
        self.make_program(stack, func_set, term_set, max_d - 1, 'b')

def init_pop(self):
    """Initialize the population of features as GP stacks.

    Reads ``population_size``, ``seed_with_ml``, ``ml_type``, ``ml``,
    ``n_features``, ``func_set``, ``term_set``, ``min_depth``,
    ``max_depth``, ``otype``, ``verbosity`` and ``random_state`` from
    ``self``.

    * ``seed_with_ml`` false: every individual gets a random program.
    * ``seed_with_ml`` true and the fitted model exposes ``coef_`` or
      ``feature_importances_``: the first individuals are the raw features
      with non-zero weight, most important first; any remaining individuals
      get random programs.
    * ``seed_with_ml`` true but no importance information is usable (SVC /
      SVR, or neither attribute present): the first ``n_features``
      individuals are randomly drawn raw features, the rest random programs.

    All random draws go through ``self.random_state``.

    Returns
    -------
    Pop
        The initialized population.
    """
    pop = Pop(self.population_size)

    def grow(ind):
        # Random program of random depth. make_program emits operator-first
        # order, so the stack is reversed to put arguments before operators.
        depth = self.random_state.randint(self.min_depth, self.max_depth + 1)
        self.make_program(ind.stack, self.func_set, self.term_set, depth,
                          self.otype)
        ind.stack = list(reversed(ind.stack))

    if not self.seed_with_ml:
        for ind in pop.individuals:
            grow(ind)
        return pop

    # initial population is the components of the default ml model
    coef = None
    # svm has a bug that throws ValueError on the attribute check, so the
    # model is not inspected at all for SVC / SVR
    if self.ml_type not in ('SVC', 'SVR'):
        estimator = self.ml.named_steps['ml']
        if hasattr(estimator, 'coef_'):
            coef = estimator.coef_
        elif hasattr(estimator, 'feature_importances_'):
            coef = estimator.feature_importances_

    if coef is not None:
        # compress multiple coefficients for each feature into single
        # numbers (occurs with multiclass classification)
        if len(coef.shape) > 1:
            coef = [np.mean(abs(c)) for c in coef.transpose()]
        magnitude = np.abs(coef)
        # Rank the ORIGINAL feature indices by importance and only then drop
        # the zero-weight ones, so every loc still names the right column.
        order = np.argsort(magnitude)[::-1]
        locs = [loc for loc in order if magnitude[loc] != 0]
        for i, ind in enumerate(pop.individuals):
            if i < len(locs):
                ind.stack = [node('x', loc=locs[i])]
            else:
                # make program if pop is bigger than the number of seeds
                grow(ind)
    else:
        # seed with random features if no importance info available
        for i, ind in enumerate(pop.individuals):
            if i < self.n_features:
                ind.stack = [node('x',
                                  loc=self.random_state.randint(self.n_features))]
            else:
                # make program if pop is bigger than n_features
                grow(ind)

    # print initial population
    if self.verbosity > 2:
        print("seeded initial population:",
              self.stacks_2_eqns(pop.individuals))

    return pop
13 changes: 6 additions & 7 deletions few/selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
import copy
import pdb
from sklearn.metrics import r2_score
from .population import stacks_2_eqns
from few_lib import ep_lex
# from profilehooks import profile

Expand All @@ -36,7 +35,7 @@ def survival(self,parents,offspring,elite=None,elite_index=None,X=None,F=None,F_
survivors, survivor_index = self.deterministic_crowding(parents,offspring,X,X_offspring)
elif self.sel == 'random':
# pdb.set_trace()
survivor_index = np.random.permutation(np.arange(2*len(parents)))[:len(parents)]
survivor_index = self.random_state.permutation(np.arange(2*len(parents)))[:len(parents)]
survivors = [(parents + offspring)[s] for s in survivor_index]
# elitism
if self.elitism:
Expand All @@ -58,7 +57,7 @@ def tournament(self,individuals,tourn_size, num_selections=None):

for i in np.arange(num_selections):
# sample pool with replacement
pool_i = np.random.choice(len(individuals),size=tourn_size)
pool_i = self.random_state.choice(len(individuals),size=tourn_size)
pool = []
for i in pool_i:
pool.append(np.mean(individuals[i].fitness))
Expand Down Expand Up @@ -92,7 +91,7 @@ def lexicase(self,individuals, num_selections=None, epsilon = False, survival =
candidates = individuals
can_locs = range(len(individuals))
cases = list(np.arange(len(individuals[0].fitness_vec)))
np.random.shuffle(cases)
self.random_state.shuffle(cases)
# pdb.set_trace()
while len(cases) > 0 and len(candidates) > 1:
# get best fitness for case among candidates
Expand All @@ -105,7 +104,7 @@ def lexicase(self,individuals, num_selections=None, epsilon = False, survival =
candidates,can_locs = zip(*((x,l) for x,l in zip(candidates,can_locs) if x.fitness_vec[cases[0]] == best_val_for_case))
cases.pop(0)

choice = np.random.randint(len(candidates))
choice = self.random_state.randint(len(candidates))
winners.append(copy.deepcopy(candidates[choice]))
locs.append(can_locs[choice])
if survival: # filter out winners from remaining selection pool
Expand Down Expand Up @@ -137,7 +136,7 @@ def epsilon_lexicase(self, F, sizes, num_selections=None, survival = False):

can_locs = individual_locs
cases = list(np.arange(F.shape[1]))
np.random.shuffle(cases)
self.random_state.shuffle(cases)
# pdb.set_trace()
while len(cases) > 0 and len(can_locs) > 1:
# get best fitness for case among candidates
Expand All @@ -146,7 +145,7 @@ def epsilon_lexicase(self, F, sizes, num_selections=None, survival = False):
can_locs = [l for l in can_locs if F[l,cases[0]] <= best_val_for_case + mad_for_case[cases[0]]]
cases.pop(0)

choice = np.random.randint(len(can_locs))
choice = self.random_state.randint(len(can_locs))
locs.append(can_locs[choice])
if survival: # filter out winners from remaining selection pool
individual_locs = [i for i in individual_locs if i != can_locs[choice]]
Expand Down
6 changes: 4 additions & 2 deletions few/tests/test_evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,8 @@ def test_out_shapes():
pop_size = 5;
few = FEW(population_size=pop_size,seed_with_ml=False)
few.term_set = term_set
pop = few.init_pop(n_features)
few.n_features = n_features
pop = few.init_pop()

pop.X = np.asarray(list(map(lambda I: few.out(I,boston.data), pop.individuals)))

Expand Down Expand Up @@ -113,7 +114,8 @@ def test_calc_fitness_shape():
pop_size = 5;
few = FEW(population_size=pop_size,seed_with_ml=False)
few.term_set = term_set
pop = few.init_pop(n_features)
few.n_features = n_features
pop = few.init_pop()

pop.X = np.asarray(list(map(lambda I: few.out(I,boston.data), pop.individuals)))

Expand Down
Loading

0 comments on commit 2b6b9a4

Please sign in to comment.