diff --git a/LANDMark/LANDMark.py b/LANDMark/LANDMark.py index 80a1548..bf0df4a 100644 --- a/LANDMark/LANDMark.py +++ b/LANDMark/LANDMark.py @@ -35,7 +35,6 @@ def __init__( use_etc: bool = True, etc_max_depth: int = 5, etc_max_trees: int = 128, - use_etc_split: bool = False, resampler=None, use_cascade: bool = False, n_jobs: int = 4, @@ -58,7 +57,6 @@ def __init__( self.use_etc = use_etc self.etc_max_depth = etc_max_depth self.etc_max_trees = etc_max_trees - self.use_etc_split = use_etc_split self.resampler = resampler self.use_cascade = use_cascade @@ -99,7 +97,6 @@ def fit(self, X: np.ndarray, y: np.ndarray) -> LANDMarkClassifier: use_etc=self.use_etc, etc_max_depth=self.etc_max_depth, etc_max_trees=self.etc_max_trees, - use_etc_split=self.use_etc_split, resampler=self.resampler, use_cascade=self.use_cascade, ), @@ -164,7 +161,7 @@ def proximity(self, X: np.ndarray, prox_type: str = "path") -> np.ndarray: return csr_array(emb.astype(np.uint8)) - else: + elif prox_type == "path": if hasattr(self, "node_set"): embs = [ est.proximity(X, prox_type) for est in self.estimators_.estimators_ @@ -185,18 +182,24 @@ def proximity(self, X: np.ndarray, prox_type: str = "path") -> np.ndarray: return csr_array(emb) else: + # Get the list of nodes associated with each sample in X embs = [ est.proximity(X, prox_type) for est in self.estimators_.estimators_ ] + # Create a list of all nodes across all trees in the forest node_set = set() [node_set.update(est.all_nodes) for est in self.estimators_.estimators_] node_set = list(node_set) + + # Create the embedding matrix emb = np.zeros(shape=(X.shape[0], len(node_set)), dtype=np.uint8) + # Create a mapping between node id and index in the embedding matrix self.node_set = {node: i for i, node in enumerate(node_set)} - + + # Update the embedding matrix for tree_emb in embs: for sample, nodes in tree_emb.items(): for node in nodes: diff --git a/LANDMark/lm_linear_clfs.py b/LANDMark/lm_linear_clfs.py index 55813a9..d0a1078 100644 --- a/LANDMark/lm_linear_clfs.py +++ b/LANDMark/lm_linear_clfs.py @@ -25,7 +25,6 @@ def __init__(self, model_type, n_feat=0.8, minority=6, use_etc_split=True): self.model_type = model_type self.n_feat = n_feat self.minority = minority - self.use_etc_split = use_etc_split def fit(self, X, y): if X.shape[1] >= 4: @@ -103,22 +102,11 @@ def fit(self, X, y): # Otherwise use an Extra Trees Classifier or Nothing else: - if self.use_etc_split: - self.clf = ExtraTreesClassifier(128, max_depth=3).fit(X_re, y_re) - - return self, self.decision_function(X) - - else: - return self, None + return self, None def predict(self, X): return self.clf.predict(X[:, self.features]) def decision_function(self, X): - if self.y_min > self.minority: - return self.clf.decision_function(X[:, self.features]) - - else: - D = self.clf.predict_proba(X[:, self.features]) + return self.clf.decision_function(X[:, self.features]) - return np.where(D > 0.5, 1, -1) diff --git a/LANDMark/lm_nnet_clfs.py b/LANDMark/lm_nnet_clfs.py index 55579af..777d3bb 100644 --- a/LANDMark/lm_nnet_clfs.py +++ b/LANDMark/lm_nnet_clfs.py @@ -61,7 +61,6 @@ class ANNClassifier(ClassifierMixin, BaseEstimator): def __init__(self, n_feat=0.8, minority=6, use_etc_split=True): self.n_feat = n_feat self.minority = minority - self.use_etc_split = use_etc_split def fit(self, X, y): self.model_type = "nonlinear_nnet" @@ -132,13 +131,7 @@ def fit(self, X, y): # Otherwise use an Extra Trees Classifier or Nothing else: - if self.use_etc_split: - self.clf = ExtraTreesClassifier(128, max_depth=3).fit(X_trf, y_trf) - - return self, self.decision_function(X) - - else: - return self, None + return self, None def predict_proba(self, X): if issparse(X): @@ -147,26 +140,22 @@ def predict_proba(self, X): else: X_not_sparse = X - if self.y_min > self.minority: - clf = LMNNet(n_in=self.n_in, n_out=self.n_out) + clf = LMNNet(n_in=self.n_in, n_out=self.n_out) - clf.load_state_dict(self.params) + clf.load_state_dict(self.params) - n_batch = pyt.arange(0, len(X_not_sparse), 16) + n_batch = pyt.arange(0, len(X_not_sparse), 16) - X_tensor = pyt.tensor(X_not_sparse[:, self.features].astype(np.float32)) + X_tensor = pyt.tensor(X_not_sparse[:, self.features].astype(np.float32)) - predictions = [] - for start in n_batch: - p = clf(X_tensor[start : start + 16]).detach().cpu().numpy() - predictions.extend(p) + predictions = [] + for start in n_batch: + p = clf(X_tensor[start : start + 16]).detach().cpu().numpy() + predictions.extend(p) - predictions = np.asarray(predictions) + predictions = np.asarray(predictions) - del clf - - else: - predictions = self.clf.predict_proba(X_not_sparse[:, self.features]) + del clf return predictions diff --git a/LANDMark/tree.py b/LANDMark/tree.py index d2e7c14..adbc652 100644 --- a/LANDMark/tree.py +++ b/LANDMark/tree.py @@ -127,7 +127,6 @@ def get_split( use_etc, etc_max_depth, etc_max_trees, - use_etc_split, N, current_depth, use_oracle, @@ -218,7 +217,6 @@ def get_split( use_etc=use_etc, etc_max_depth=etc_max_depth, etc_max_trees=etc_max_trees, - use_etc_split=use_etc_split, N=X.shape[0], current_depth=current_depth + 1, use_oracle=False, @@ -243,7 +241,6 @@ def get_split( use_etc=use_etc, etc_max_depth=etc_max_depth, etc_max_trees=etc_max_trees, - use_etc_split=use_etc_split, N=X.shape[0], current_depth=current_depth + 1, use_oracle=False, @@ -263,25 +260,21 @@ def get_split( model_type="lr_l2", n_feat=max_features, minority=minority_sz_lm, - use_etc_split=use_etc_split, ), LMClassifier( model_type="sgd_l2", n_feat=max_features, minority=minority_sz_lm, - use_etc_split=use_etc_split, ), LMClassifier( model_type="ridge", n_feat=max_features, minority=minority_sz_lm, - use_etc_split=use_etc_split, ), LMClassifier( model_type="lsvc", n_feat=max_features, minority=minority_sz_lm, - use_etc_split=use_etc_split, ), ]: model, D = clf.fit(X, y) @@ -313,13 +306,11 @@ def get_split( model_type="lr_l1", n_feat=max_features, minority=minority_sz_lm, - use_etc_split=use_etc_split, ), LMClassifier( model_type="sgd_l1", n_feat=max_features, minority=minority_sz_lm, - use_etc_split=use_etc_split, ), ]: model, D = clf.fit(X, y) @@ -351,7 +342,6 @@ def get_split( ANNClassifier( n_feat=max_features, minority=minority_sz_nnet, - use_etc_split=use_etc_split, ) ]: model, D = clf.fit(X, y) @@ -465,7 +455,6 @@ def get_split( use_etc=use_etc, etc_max_depth=etc_max_depth, etc_max_trees=etc_max_trees, - use_etc_split=use_etc_split, N=X.shape[0], current_depth=current_depth + 1, use_oracle=use_oracle, @@ -490,7 +479,6 @@ def get_split( use_etc=use_etc, etc_max_depth=etc_max_depth, etc_max_trees=etc_max_trees, - use_etc_split=use_etc_split, N=X.shape[0], current_depth=current_depth + 1, use_oracle=use_oracle, @@ -548,7 +536,6 @@ def __init__( self.use_etc = use_etc self.etc_max_depth = etc_max_depth self.etc_max_trees = etc_max_trees - self.use_etc_split = use_etc_split self.resampler = resampler self.use_cascade = use_cascade @@ -587,7 +574,6 @@ def fit(self, X, y): use_etc=self.use_etc, etc_max_depth=self.etc_max_depth, etc_max_trees=self.etc_max_trees, - use_etc_split=self.use_etc_split, N=X.shape[0], current_depth=1, use_oracle=self.use_oracle, diff --git a/tests/test_landmark.py b/tests/test_landmark.py index 805a098..4d9ad36 100644 --- a/tests/test_landmark.py +++ b/tests/test_landmark.py @@ -84,11 +84,6 @@ def test_models(): BAcc = balanced_accuracy_score(y_test, p) assert BAcc > 0.7 - # Tests the ANN for min samples - clf, _ = ANNClassifier().fit(X_train[0:4], y_train[0:4]) - p = clf.predict(X_test) - D = clf.decision_function(X_test) - clf, _ = ETClassifier().fit(X_train, y_train) p = clf.predict(X_test) BAcc = balanced_accuracy_score(y_test, p) @@ -105,11 +100,6 @@ def test_models(): assert BAcc > 0.7 D = clf.decision_function(X_test) - # Tests the ETC model for min samples - clf, _ = LMClassifier(model_type="lr_l2").fit(X_train[0:4], y_train[0:4]) - p = clf.predict(X_test) - D = clf.decision_function(X_test) - clf, _ = LMClassifier(model_type="sgd_l2").fit(X_train, y_train) p = clf.predict(X_test) BAcc = balanced_accuracy_score(y_test, p)