Skip to content

Commit

Permalink
- Removed the ability of models to use an ExtraTreesClassifier at each node.
Browse files Browse the repository at this point in the history
- Updated the tests to reflect the above change.
  • Loading branch information
jrudar committed Jul 12, 2023
1 parent 1159692 commit f8a2dba
Show file tree
Hide file tree
Showing 5 changed files with 21 additions and 65 deletions.
13 changes: 8 additions & 5 deletions LANDMark/LANDMark.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@ def __init__(
use_etc: bool = True,
etc_max_depth: int = 5,
etc_max_trees: int = 128,
use_etc_split: bool = False,
resampler=None,
use_cascade: bool = False,
n_jobs: int = 4,
Expand All @@ -58,7 +57,6 @@ def __init__(
self.use_etc = use_etc
self.etc_max_depth = etc_max_depth
self.etc_max_trees = etc_max_trees
self.use_etc_split = use_etc_split
self.resampler = resampler
self.use_cascade = use_cascade

Expand Down Expand Up @@ -99,7 +97,6 @@ def fit(self, X: np.ndarray, y: np.ndarray) -> LANDMarkClassifier:
use_etc=self.use_etc,
etc_max_depth=self.etc_max_depth,
etc_max_trees=self.etc_max_trees,
use_etc_split=self.use_etc_split,
resampler=self.resampler,
use_cascade=self.use_cascade,
),
Expand Down Expand Up @@ -164,7 +161,7 @@ def proximity(self, X: np.ndarray, prox_type: str = "path") -> np.ndarray:

return csr_array(emb.astype(np.uint8))

else:
elif prox_type == "path":
if hasattr(self, "node_set"):
embs = [
est.proximity(X, prox_type) for est in self.estimators_.estimators_
Expand All @@ -185,18 +182,24 @@ def proximity(self, X: np.ndarray, prox_type: str = "path") -> np.ndarray:
return csr_array(emb)

else:
# Get the list of nodes associated with each sample in X
embs = [
est.proximity(X, prox_type) for est in self.estimators_.estimators_
]

# Create a list of all nodes across all trees in the forest
node_set = set()
[node_set.update(est.all_nodes) for est in self.estimators_.estimators_]

node_set = list(node_set)

# Create the embedding matrix
emb = np.zeros(shape=(X.shape[0], len(node_set)), dtype=np.uint8)

# Create a mapping between node id and index in the embedding matrix
self.node_set = {node: i for i, node in enumerate(node_set)}


# Update the embedding matrix
for tree_emb in embs:
for sample, nodes in tree_emb.items():
for node in nodes:
Expand Down
16 changes: 2 additions & 14 deletions LANDMark/lm_linear_clfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ def __init__(self, model_type, n_feat=0.8, minority=6, use_etc_split=True):
self.model_type = model_type
self.n_feat = n_feat
self.minority = minority
self.use_etc_split = use_etc_split

def fit(self, X, y):
if X.shape[1] >= 4:
Expand Down Expand Up @@ -103,22 +102,11 @@ def fit(self, X, y):

# Otherwise use an Extra Trees Classifier or Nothing
else:
if self.use_etc_split:
self.clf = ExtraTreesClassifier(128, max_depth=3).fit(X_re, y_re)

return self, self.decision_function(X)

else:
return self, None
return self, None

def predict(self, X):
return self.clf.predict(X[:, self.features])

def decision_function(self, X):
if self.y_min > self.minority:
return self.clf.decision_function(X[:, self.features])

else:
D = self.clf.predict_proba(X[:, self.features])
return self.clf.decision_function(X[:, self.features])

return np.where(D > 0.5, 1, -1)
33 changes: 11 additions & 22 deletions LANDMark/lm_nnet_clfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,6 @@ class ANNClassifier(ClassifierMixin, BaseEstimator):
def __init__(self, n_feat=0.8, minority=6, use_etc_split=True):
self.n_feat = n_feat
self.minority = minority
self.use_etc_split = use_etc_split

def fit(self, X, y):
self.model_type = "nonlinear_nnet"
Expand Down Expand Up @@ -132,13 +131,7 @@ def fit(self, X, y):

# Otherwise use an Extra Trees Classifier or Nothing
else:
if self.use_etc_split:
self.clf = ExtraTreesClassifier(128, max_depth=3).fit(X_trf, y_trf)

return self, self.decision_function(X)

else:
return self, None
return self, None

def predict_proba(self, X):
if issparse(X):
Expand All @@ -147,26 +140,22 @@ def predict_proba(self, X):
else:
X_not_sparse = X

if self.y_min > self.minority:
clf = LMNNet(n_in=self.n_in, n_out=self.n_out)
clf = LMNNet(n_in=self.n_in, n_out=self.n_out)

clf.load_state_dict(self.params)
clf.load_state_dict(self.params)

n_batch = pyt.arange(0, len(X_not_sparse), 16)
n_batch = pyt.arange(0, len(X_not_sparse), 16)

X_tensor = pyt.tensor(X_not_sparse[:, self.features].astype(np.float32))
X_tensor = pyt.tensor(X_not_sparse[:, self.features].astype(np.float32))

predictions = []
for start in n_batch:
p = clf(X_tensor[start : start + 16]).detach().cpu().numpy()
predictions.extend(p)
predictions = []
for start in n_batch:
p = clf(X_tensor[start : start + 16]).detach().cpu().numpy()
predictions.extend(p)

predictions = np.asarray(predictions)
predictions = np.asarray(predictions)

del clf

else:
predictions = self.clf.predict_proba(X_not_sparse[:, self.features])
del clf

return predictions

Expand Down
14 changes: 0 additions & 14 deletions LANDMark/tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,6 @@ def get_split(
use_etc,
etc_max_depth,
etc_max_trees,
use_etc_split,
N,
current_depth,
use_oracle,
Expand Down Expand Up @@ -218,7 +217,6 @@ def get_split(
use_etc=use_etc,
etc_max_depth=etc_max_depth,
etc_max_trees=etc_max_trees,
use_etc_split=use_etc_split,
N=X.shape[0],
current_depth=current_depth + 1,
use_oracle=False,
Expand All @@ -243,7 +241,6 @@ def get_split(
use_etc=use_etc,
etc_max_depth=etc_max_depth,
etc_max_trees=etc_max_trees,
use_etc_split=use_etc_split,
N=X.shape[0],
current_depth=current_depth + 1,
use_oracle=False,
Expand All @@ -263,25 +260,21 @@ def get_split(
model_type="lr_l2",
n_feat=max_features,
minority=minority_sz_lm,
use_etc_split=use_etc_split,
),
LMClassifier(
model_type="sgd_l2",
n_feat=max_features,
minority=minority_sz_lm,
use_etc_split=use_etc_split,
),
LMClassifier(
model_type="ridge",
n_feat=max_features,
minority=minority_sz_lm,
use_etc_split=use_etc_split,
),
LMClassifier(
model_type="lsvc",
n_feat=max_features,
minority=minority_sz_lm,
use_etc_split=use_etc_split,
),
]:
model, D = clf.fit(X, y)
Expand Down Expand Up @@ -313,13 +306,11 @@ def get_split(
model_type="lr_l1",
n_feat=max_features,
minority=minority_sz_lm,
use_etc_split=use_etc_split,
),
LMClassifier(
model_type="sgd_l1",
n_feat=max_features,
minority=minority_sz_lm,
use_etc_split=use_etc_split,
),
]:
model, D = clf.fit(X, y)
Expand Down Expand Up @@ -351,7 +342,6 @@ def get_split(
ANNClassifier(
n_feat=max_features,
minority=minority_sz_nnet,
use_etc_split=use_etc_split,
)
]:
model, D = clf.fit(X, y)
Expand Down Expand Up @@ -465,7 +455,6 @@ def get_split(
use_etc=use_etc,
etc_max_depth=etc_max_depth,
etc_max_trees=etc_max_trees,
use_etc_split=use_etc_split,
N=X.shape[0],
current_depth=current_depth + 1,
use_oracle=use_oracle,
Expand All @@ -490,7 +479,6 @@ def get_split(
use_etc=use_etc,
etc_max_depth=etc_max_depth,
etc_max_trees=etc_max_trees,
use_etc_split=use_etc_split,
N=X.shape[0],
current_depth=current_depth + 1,
use_oracle=use_oracle,
Expand Down Expand Up @@ -548,7 +536,6 @@ def __init__(
self.use_etc = use_etc
self.etc_max_depth = etc_max_depth
self.etc_max_trees = etc_max_trees
self.use_etc_split = use_etc_split
self.resampler = resampler
self.use_cascade = use_cascade

Expand Down Expand Up @@ -587,7 +574,6 @@ def fit(self, X, y):
use_etc=self.use_etc,
etc_max_depth=self.etc_max_depth,
etc_max_trees=self.etc_max_trees,
use_etc_split=self.use_etc_split,
N=X.shape[0],
current_depth=1,
use_oracle=self.use_oracle,
Expand Down
10 changes: 0 additions & 10 deletions tests/test_landmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,11 +84,6 @@ def test_models():
BAcc = balanced_accuracy_score(y_test, p)
assert BAcc > 0.7

# Tests the ANN for min samples
clf, _ = ANNClassifier().fit(X_train[0:4], y_train[0:4])
p = clf.predict(X_test)
D = clf.decision_function(X_test)

clf, _ = ETClassifier().fit(X_train, y_train)
p = clf.predict(X_test)
BAcc = balanced_accuracy_score(y_test, p)
Expand All @@ -105,11 +100,6 @@ def test_models():
assert BAcc > 0.7
D = clf.decision_function(X_test)

# Tests the ETC model for min samples
clf, _ = LMClassifier(model_type="lr_l2").fit(X_train[0:4], y_train[0:4])
p = clf.predict(X_test)
D = clf.decision_function(X_test)

clf, _ = LMClassifier(model_type="sgd_l2").fit(X_train, y_train)
p = clf.predict(X_test)
BAcc = balanced_accuracy_score(y_test, p)
Expand Down

0 comments on commit f8a2dba

Please sign in to comment.