- Removed the ability of models to use an ExtraTreesClassifier at each node (the `use_etc_split` option).

- Updated tests to reflect the removal of the ExtraTreesClassifier split option.
jrudar · Jul 12, 2023 · f8a2dba · f8a2dba
1 parent 1159692
commit f8a2dba
Show file tree

Hide file tree

Showing 5 changed files with 21 additions and 65 deletions.
diff --git a/LANDMark/LANDMark.py b/LANDMark/LANDMark.py
@@ -35,7 +35,6 @@ def __init__(
         use_etc: bool = True,
         etc_max_depth: int = 5,
         etc_max_trees: int = 128,
-        use_etc_split: bool = False,
         resampler=None,
         use_cascade: bool = False,
         n_jobs: int = 4,
@@ -58,7 +57,6 @@ def __init__(
         self.use_etc = use_etc
         self.etc_max_depth = etc_max_depth
         self.etc_max_trees = etc_max_trees
-        self.use_etc_split = use_etc_split
         self.resampler = resampler
         self.use_cascade = use_cascade
 
@@ -99,7 +97,6 @@ def fit(self, X: np.ndarray, y: np.ndarray) -> LANDMarkClassifier:
                 use_etc=self.use_etc,
                 etc_max_depth=self.etc_max_depth,
                 etc_max_trees=self.etc_max_trees,
-                use_etc_split=self.use_etc_split,
                 resampler=self.resampler,
                 use_cascade=self.use_cascade,
             ),
@@ -164,7 +161,7 @@ def proximity(self, X: np.ndarray, prox_type: str = "path") -> np.ndarray:
 
             return csr_array(emb.astype(np.uint8))
 
-        else:
+        elif prox_type == "path":
             if hasattr(self, "node_set"):
                 embs = [
                     est.proximity(X, prox_type) for est in self.estimators_.estimators_
@@ -185,18 +182,24 @@ def proximity(self, X: np.ndarray, prox_type: str = "path") -> np.ndarray:
                 return csr_array(emb)
 
             else:
+                # Get the list of nodes associated with each sample in X
                 embs = [
                     est.proximity(X, prox_type) for est in self.estimators_.estimators_
                 ]
 
+                # Create a list of all nodes across all trees in the forest
                 node_set = set()
                 [node_set.update(est.all_nodes) for est in self.estimators_.estimators_]
 
                 node_set = list(node_set)
+
+                # Create the embedding matrix
                 emb = np.zeros(shape=(X.shape[0], len(node_set)), dtype=np.uint8)
 
+                # Create a mapping between node id and index in the embedding matrix
                 self.node_set = {node: i for i, node in enumerate(node_set)}
-
+
+                # Update the embedding matrix
                 for tree_emb in embs:
                     for sample, nodes in tree_emb.items():
                         for node in nodes:

diff --git a/LANDMark/lm_linear_clfs.py b/LANDMark/lm_linear_clfs.py
@@ -25,7 +25,6 @@ def __init__(self, model_type, n_feat=0.8, minority=6, use_etc_split=True):
         self.model_type = model_type
         self.n_feat = n_feat
         self.minority = minority
-        self.use_etc_split = use_etc_split
 
     def fit(self, X, y):
         if X.shape[1] >= 4:
@@ -103,22 +102,11 @@ def fit(self, X, y):
 
         # Otherwise use an Extra Trees Classifier or Nothing
         else:
-            if self.use_etc_split:
-                self.clf = ExtraTreesClassifier(128, max_depth=3).fit(X_re, y_re)
-
-                return self, self.decision_function(X)
-
-            else:
-                return self, None
+            return self, None
 
     def predict(self, X):
         return self.clf.predict(X[:, self.features])
 
     def decision_function(self, X):
-        if self.y_min > self.minority:
-            return self.clf.decision_function(X[:, self.features])
-
-        else:
-            D = self.clf.predict_proba(X[:, self.features])
+        return self.clf.decision_function(X[:, self.features])
 
-            return np.where(D > 0.5, 1, -1)
diff --git a/LANDMark/lm_nnet_clfs.py b/LANDMark/lm_nnet_clfs.py
@@ -61,7 +61,6 @@ class ANNClassifier(ClassifierMixin, BaseEstimator):
     def __init__(self, n_feat=0.8, minority=6, use_etc_split=True):
         self.n_feat = n_feat
         self.minority = minority
-        self.use_etc_split = use_etc_split
 
     def fit(self, X, y):
         self.model_type = "nonlinear_nnet"
@@ -132,13 +131,7 @@ def fit(self, X, y):
 
         # Otherwise use an Extra Trees Classifier or Nothing
         else:
-            if self.use_etc_split:
-                self.clf = ExtraTreesClassifier(128, max_depth=3).fit(X_trf, y_trf)
-
-                return self, self.decision_function(X)
-
-            else:
-                return self, None
+            return self, None
 
     def predict_proba(self, X):
         if issparse(X):
@@ -147,26 +140,22 @@ def predict_proba(self, X):
         else:
             X_not_sparse = X
 
-        if self.y_min > self.minority:
-            clf = LMNNet(n_in=self.n_in, n_out=self.n_out)
+        clf = LMNNet(n_in=self.n_in, n_out=self.n_out)
 
-            clf.load_state_dict(self.params)
+        clf.load_state_dict(self.params)
 
-            n_batch = pyt.arange(0, len(X_not_sparse), 16)
+        n_batch = pyt.arange(0, len(X_not_sparse), 16)
 
-            X_tensor = pyt.tensor(X_not_sparse[:, self.features].astype(np.float32))
+        X_tensor = pyt.tensor(X_not_sparse[:, self.features].astype(np.float32))
 
-            predictions = []
-            for start in n_batch:
-                p = clf(X_tensor[start : start + 16]).detach().cpu().numpy()
-                predictions.extend(p)
+        predictions = []
+        for start in n_batch:
+            p = clf(X_tensor[start : start + 16]).detach().cpu().numpy()
+            predictions.extend(p)
 
-            predictions = np.asarray(predictions)
+        predictions = np.asarray(predictions)
 
-            del clf
-
-        else:
-            predictions = self.clf.predict_proba(X_not_sparse[:, self.features])
+        del clf
 
         return predictions
 

diff --git a/LANDMark/tree.py b/LANDMark/tree.py
@@ -127,7 +127,6 @@ def get_split(
         use_etc,
         etc_max_depth,
         etc_max_trees,
-        use_etc_split,
         N,
         current_depth,
         use_oracle,
@@ -218,7 +217,6 @@ def get_split(
                     use_etc=use_etc,
                     etc_max_depth=etc_max_depth,
                     etc_max_trees=etc_max_trees,
-                    use_etc_split=use_etc_split,
                     N=X.shape[0],
                     current_depth=current_depth + 1,
                     use_oracle=False,
@@ -243,7 +241,6 @@ def get_split(
                     use_etc=use_etc,
                     etc_max_depth=etc_max_depth,
                     etc_max_trees=etc_max_trees,
-                    use_etc_split=use_etc_split,
                     N=X.shape[0],
                     current_depth=current_depth + 1,
                     use_oracle=False,
@@ -263,25 +260,21 @@ def get_split(
                             model_type="lr_l2",
                             n_feat=max_features,
                             minority=minority_sz_lm,
-                            use_etc_split=use_etc_split,
                         ),
                         LMClassifier(
                             model_type="sgd_l2",
                             n_feat=max_features,
                             minority=minority_sz_lm,
-                            use_etc_split=use_etc_split,
                         ),
                         LMClassifier(
                             model_type="ridge",
                             n_feat=max_features,
                             minority=minority_sz_lm,
-                            use_etc_split=use_etc_split,
                         ),
                         LMClassifier(
                             model_type="lsvc",
                             n_feat=max_features,
                             minority=minority_sz_lm,
-                            use_etc_split=use_etc_split,
                         ),
                     ]:
                         model, D = clf.fit(X, y)
@@ -313,13 +306,11 @@ def get_split(
                             model_type="lr_l1",
                             n_feat=max_features,
                             minority=minority_sz_lm,
-                            use_etc_split=use_etc_split,
                         ),
                         LMClassifier(
                             model_type="sgd_l1",
                             n_feat=max_features,
                             minority=minority_sz_lm,
-                            use_etc_split=use_etc_split,
                         ),
                     ]:
                         model, D = clf.fit(X, y)
@@ -351,7 +342,6 @@ def get_split(
                             ANNClassifier(
                                 n_feat=max_features,
                                 minority=minority_sz_nnet,
-                                use_etc_split=use_etc_split,
                             )
                         ]:
                             model, D = clf.fit(X, y)
@@ -465,7 +455,6 @@ def get_split(
                         use_etc=use_etc,
                         etc_max_depth=etc_max_depth,
                         etc_max_trees=etc_max_trees,
-                        use_etc_split=use_etc_split,
                         N=X.shape[0],
                         current_depth=current_depth + 1,
                         use_oracle=use_oracle,
@@ -490,7 +479,6 @@ def get_split(
                         use_etc=use_etc,
                         etc_max_depth=etc_max_depth,
                         etc_max_trees=etc_max_trees,
-                        use_etc_split=use_etc_split,
                         N=X.shape[0],
                         current_depth=current_depth + 1,
                         use_oracle=use_oracle,
@@ -548,7 +536,6 @@ def __init__(
         self.use_etc = use_etc
         self.etc_max_depth = etc_max_depth
         self.etc_max_trees = etc_max_trees
-        self.use_etc_split = use_etc_split
         self.resampler = resampler
         self.use_cascade = use_cascade
 
@@ -587,7 +574,6 @@ def fit(self, X, y):
             use_etc=self.use_etc,
             etc_max_depth=self.etc_max_depth,
             etc_max_trees=self.etc_max_trees,
-            use_etc_split=self.use_etc_split,
             N=X.shape[0],
             current_depth=1,
             use_oracle=self.use_oracle,

diff --git a/tests/test_landmark.py b/tests/test_landmark.py
@@ -84,11 +84,6 @@ def test_models():
     BAcc = balanced_accuracy_score(y_test, p)
     assert BAcc > 0.7
 
-    # Tests the ANN for min samples
-    clf, _ = ANNClassifier().fit(X_train[0:4], y_train[0:4])
-    p = clf.predict(X_test)
-    D = clf.decision_function(X_test)
-
     clf, _ = ETClassifier().fit(X_train, y_train)
     p = clf.predict(X_test)
     BAcc = balanced_accuracy_score(y_test, p)
@@ -105,11 +100,6 @@ def test_models():
     assert BAcc > 0.7
     D = clf.decision_function(X_test)
 
-    # Tests the ETC model for min samples
-    clf, _ = LMClassifier(model_type="lr_l2").fit(X_train[0:4], y_train[0:4])
-    p = clf.predict(X_test)
-    D = clf.decision_function(X_test)
-
     clf, _ = LMClassifier(model_type="sgd_l2").fit(X_train, y_train)
     p = clf.predict(X_test)
     BAcc = balanced_accuracy_score(y_test, p)