Merge pull request #32 from lacava/pipeline_fix
Pipeline fix
lacava committed Sep 12, 2017
2 parents ee6da54 + 13a333f commit 0351c6a
Showing 5 changed files with 40 additions and 28 deletions.
2 changes: 1 addition & 1 deletion few/_version.py
@@ -6,4 +6,4 @@
"""

__version__ = '0.0.45'
__version__ = '0.0.46'
2 changes: 2 additions & 0 deletions few/evaluation.py
@@ -72,6 +72,8 @@ class EvaluationMixin(object):
'^2': lambda n,features,stack_float,stack_bool,labels: stack_float.pop()**2,
'^3': lambda n,features,stack_float,stack_bool,labels: stack_float.pop()**3,
'sqrt': lambda n,features,stack_float,stack_bool,labels: np.sqrt(np.abs(stack_float.pop())),
#'gauss': lambda n,features,stack_float,stack_bool,labels: np.exp(-stack_float.pop()**2),
#'gauss2': lambda n,features,stack_float,stack_bool,labels: np.exp(-(stack_float.pop()**2+stack_float.pop()**2)),
# 'rbf': lambda n,features,stack_float,stack_bool,labels: np.exp(-(np.norm(stack_float.pop()-stack_float.pop())**2)/2)
# bool operations
'!': lambda n,features,stack_float,stack_bool,labels: np.logical_not(stack_bool.pop()),
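
Note: each entry in this operator dictionary is a lambda that pops its operands from a value stack during program evaluation and returns the result. Below is a minimal standalone sketch of that convention with a simplified signature and made-up inputs; it is illustrative only, not FEW's internal API.

    import numpy as np

    # Simplified stack-based operators in the style of EvaluationMixin.
    # Each lambda consumes operands from stack_float and returns the new value.
    eval_dict = {
        'sqrt':  lambda stack_float: np.sqrt(np.abs(stack_float.pop())),
        # the 'gauss' operator added (commented out) in this commit would behave like:
        'gauss': lambda stack_float: np.exp(-stack_float.pop()**2),
    }

    x = np.array([0.0, 1.0, 2.0])            # one feature column
    stack_float = [x]                        # the evaluation stack
    print(eval_dict['gauss'](stack_float))   # element-wise exp(-x**2)
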
42 changes: 26 additions & 16 deletions few/few.py
@@ -60,7 +60,7 @@ def __init__(self, population_size=50, generations=100,
scoring_function=None, disable_update_check=False,
elitism=True, boolean = False,classification=False,clean=False,
track_diversity=False,mdr=False,otype='f',c=True,
weight_parents=True,operators=None, lex_size=False):
weight_parents=True,operators=None, lex_size=False,normalize=True):
# sets up GP.

# Save params to be recalled later by get_params()
@@ -107,24 +107,25 @@ def __init__(self, population_size=50, generations=100,
self.boolean = boolean
self.classification = classification
self.clean = clean
self.ml = Pipeline([('standardScaler',StandardScaler()), ('ml', ml)])
self.ml_type = type(self.ml.named_steps['ml']).__name__
self.ml = ml
#self.pipeline = Pipeline([('standardScaler',StandardScaler()), ('ml', ml)])
self.ml_type = type(self.ml).__name__
self.track_diversity = track_diversity
self.mdr = mdr
self.otype = otype

self.normalize = normalize

# if otype is b, boolean functions must be turned on
if self.otype=='b':
self.boolean = True

# instantiate sklearn estimator according to specified machine learner
if self.ml.named_steps['ml'] is None:
if self.ml is None:
if self.classification:
self.ml = Pipeline([('standardScaler',StandardScaler()),
('ml',LogisticRegression(solver='sag'))])
self.ml = LogisticRegression(solver='sag')
else:
self.ml = Pipeline([('standardScaler',StandardScaler()),
('ml',LassoLarsCV())])
self.ml = LassoLarsCV()

if not self.scoring_function:
if self.classification:
self.scoring_function = accuracy_score
@@ -144,7 +145,7 @@ def __init__(self, population_size=50, generations=100,
#classification
type(DistanceClassifier()): 'silhouette',
})
self.fit_choice = tmp_dict[type(self.ml.named_steps['ml'])]
self.fit_choice = tmp_dict[type(self.ml)]

# Columns to always ignore when in an operator
self.non_feature_columns = ['label', 'group', 'guess']
@@ -188,12 +189,17 @@ def fit(self, features, labels):
print('{}\t=\t{}'.format(arg, self.get_params()[arg]))
print('')

# re-initialize pipeline (needs to be here rather than init for GridSearchCV)
if self.normalize:
self.pipeline = Pipeline([('standardScaler',StandardScaler()), ('ml', self.ml)])
else:
self.pipeline = Pipeline([('ml',self.ml)])
######################################################### initial model
# fit to original data
with warnings.catch_warnings():
warnings.simplefilter("ignore")
self._best_score = np.mean(
[self.ml.fit(features[train],labels[train]).
[self.pipeline.fit(features[train],labels[train]).
score(features[test],labels[test])
for train, test in KFold().split(features,
labels)])
@@ -242,7 +248,7 @@ def fit(self, features, labels):
# order = 'F')

# calculate fitness of individuals
# fitnesses = list(map(lambda I: fitness(I,labels,self.ml),X))
# fitnesses = list(map(lambda I: fitness(I,labels,self.pipeline),X))
self.F = self.calc_fitness(self.X,labels,self.fit_choice,self.sel)

#with Parallel(n_jobs=10) as parallel:
@@ -286,7 +292,7 @@ def fit(self, features, labels):
try:
if self.valid_loc():
tmp_score = np.mean(
[self.ml.fit(
[self.pipeline.fit(
self.X[self.valid_loc(),:].transpose()[train],
labels[train]).
score(self.X[self.valid_loc(),:].transpose()[test],
@@ -312,7 +318,7 @@ def fit(self, features, labels):

#################################################### save best model
if self.valid_loc() and tmp_score > self._best_score:
self._best_estimator = copy.deepcopy(self.ml)
self._best_estimator = copy.deepcopy(self.pipeline)
self._best_score = tmp_score
stall_count = 0;
self._best_inds = copy.deepcopy(self.valid())
@@ -374,7 +380,7 @@ def fit(self, features, labels):
# training data
with warnings.catch_warnings():
warnings.simplefilter("ignore")
self._best_estimator = self.ml.fit(features,labels)
self._best_estimator = self.pipeline.fit(features,labels)
else:
# fit final estimator to all the training data
with warnings.catch_warnings():
@@ -778,6 +784,9 @@ def main():
parser.add_argument('--mdr', action='store_true',dest='MDR',default=False,
help='Use MDR nodes.')

parser.add_argument('--nonorm', action='store_false',dest='NORMALIZE',default=True,
help='Disable standard scaler preprocessor.')

parser.add_argument('--diversity', action='store_true',
dest='TRACK_DIVERSITY', default=False,
help='Store diversity of feature transforms each gen.')
@@ -862,7 +871,8 @@ def main():
classification=args.CLASSIFICATION,clean = args.CLEAN,
track_diversity=args.TRACK_DIVERSITY,mdr=args.MDR,
otype=args.OTYPE,c=args.c, lex_size = args.LEX_SIZE,
weight_parents = args.WEIGHT_PARENTS,operators=args.OPS)
weight_parents = args.WEIGHT_PARENTS,operators=args.OPS,
normalize=args.NORMALIZE)

learner.fit(training_features, training_labels)
# pdb.set_trace()
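
Note: the few.py changes above follow a common scikit-learn pattern: store the raw learner and the new normalize flag untouched in __init__, and only assemble the StandardScaler-plus-estimator Pipeline inside fit(), so that get_params()/clone() (and therefore GridSearchCV) see the constructor arguments unchanged. Below is a minimal sketch of that pattern; the class name and interface are illustrative, not FEW's actual API.

    from sklearn.base import BaseEstimator
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.linear_model import LassoLarsCV

    class TinyEstimator(BaseEstimator):
        def __init__(self, ml=None, normalize=True):
            # the constructor only stores parameters, as sklearn's clone() expects
            self.ml = ml
            self.normalize = normalize

        def fit(self, X, y):
            ml = self.ml if self.ml is not None else LassoLarsCV()
            steps = [('standardScaler', StandardScaler())] if self.normalize else []
            steps.append(('ml', ml))
            self.pipeline = Pipeline(steps)   # rebuilt on every call to fit()
            self.pipeline.fit(X, y)
            return self

        def predict(self, X):
            return self.pipeline.predict(X)

With this layout, something like GridSearchCV(TinyEstimator(), {'normalize': [True, False]}) can clone and refit the estimator safely, which appears to be the motivation for the "re-initialize pipeline" comment in fit().
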
10 changes: 5 additions & 5 deletions few/population.py
@@ -201,13 +201,13 @@ def init_pop(self):
# this is needed because svm has a bug that throws valueerror
#on attribute check
seed_with_raw_features=True
elif (hasattr(self.ml.named_steps['ml'],'coef_') or
hasattr(self.ml.named_steps['ml'],'feature_importances_')):
elif (hasattr(self.pipeline.named_steps['ml'],'coef_') or
hasattr(self.pipeline.named_steps['ml'],'feature_importances_')):
# add model components with non-zero coefficients to initial
# population, in order of coefficient size
coef = (self.ml.named_steps['ml'].coef_ if
hasattr(self.ml.named_steps['ml'],'coef_') else
self.ml.named_steps['ml'].feature_importances_)
coef = (self.pipeline.named_steps['ml'].coef_ if
hasattr(self.pipeline.named_steps['ml'],'coef_') else
self.pipeline.named_steps['ml'].feature_importances_)
# compress multiple coefficients for each feature into single
# numbers (occurs with multiclass classification)
if len(coef.shape)>1:
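
Note: the seeding logic above ranks features by the magnitude of the fitted model's coefficients (or feature importances); for multiclass models, coef_ is a matrix and is first compressed to one number per feature. Below is a small self-contained sketch of that compression step with invented coefficient values; the mean-of-absolute-values reduction is an assumption based on the analogous code in variation.py.

    import numpy as np

    # coef_ from a 2-class model over 3 features (values invented for illustration)
    coef = np.array([[ 0.5, -0.1, 0.0],
                     [-0.3,  0.8, 0.0]])

    # compress multiple coefficients per feature into a single magnitude,
    # as init_pop does before seeding the population
    if len(coef.shape) > 1:
        coef = np.array([np.mean(np.abs(c)) for c in coef.transpose()])

    print(coef)                      # -> [0.4  0.45 0.  ]
    print(np.argsort(coef)[::-1])    # features in order of coefficient size
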
12 changes: 6 additions & 6 deletions few/variation.py
@@ -23,10 +23,10 @@ def variation(self,parents):
self.ml_type != 'SVC' and self.ml_type != 'SVR'):
# this is needed because svm has a bug that throws valueerror on
# attribute check
if hasattr(self.ml.named_steps['ml'],'coef_'):
if hasattr(self.pipeline.named_steps['ml'],'coef_'):
# for l1 regularization, filter individuals with 0 coefficients
if self.weight_parents:
weights = self.ml.named_steps['ml'].coef_
weights = self.pipeline.named_steps['ml'].coef_
if len(weights.shape)>1: # handle multi-coefficient models
weights = [np.mean(abs(c)) for c in weights.transpose()]
# softmax transformation of the weights
@@ -36,20 +36,20 @@ def variation(self,parents):
self.population_size, p=weights)))
else:
offspring = copy.deepcopy(list(
x for i,x in zip(self.ml.named_steps['ml'].coef_,
x for i,x in zip(self.pipeline.named_steps['ml'].coef_,
self.valid(parents)) if (i != 0).any()))
elif hasattr(self.ml.named_steps['ml'],'feature_importances_'):
elif hasattr(self.pipeline.named_steps['ml'],'feature_importances_'):
# for tree methods, filter out individuals with 0 feature importance
if self.weight_parents:
weights = self.ml.named_steps['ml'].feature_importances_
weights = self.pipeline.named_steps['ml'].feature_importances_
# softmax transformation of the weights
weights = np.exp(weights)/np.sum(np.exp(weights))
offspring = copy.deepcopy(list(
np.random.choice(self.valid(parents),
self.population_size, p=weights)))
else:
offspring = copy.deepcopy(list(
x for i,x in zip(self.ml.named_steps['ml'].feature_importances_,
x for i,x in zip(self.pipeline.named_steps['ml'].feature_importances_,
self.valid(parents)) if i != 0))
else:
offspring = copy.deepcopy(self.valid(parents))
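
Note: when weight_parents is enabled, the coefficients (or feature importances) attributed to the surviving individuals are passed through a softmax so that parents are sampled in proportion to the weight the ML model assigned them. Below is a minimal sketch of that sampling step with invented weights and individual labels.

    import numpy as np

    # |coefficient| attributed to each valid individual (values invented)
    weights = np.array([0.9, 0.1, 0.0, 0.4])
    # softmax transformation of the weights, as in variation()
    weights = np.exp(weights) / np.sum(np.exp(weights))

    parents = ['ind0', 'ind1', 'ind2', 'ind3']
    population_size = 6
    offspring = list(np.random.choice(parents, population_size, p=weights))
    print(offspring)   # individuals with larger coefficients appear more often
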
