Merge pull request #13 from jrudar/Cascade

Preservation of Proximity Information Within LANDMark Trees
jrudar committed on Jul 12, 2023 · commit 99f5b05
2 parents: 4c386c1 + f00efbb
Show file tree

Hide file tree

Showing 15 changed files with 1,389 additions and 275 deletions.
diff --git a/.gitignore b/.gitignore
@@ -108,4 +108,5 @@ ENV/
 .ruff_cache
 /.vs/LANDMark/v16
 /.vs/LANDMark/config
-/.vs
+/.vs
+/notebooks/Untitled.ipynb
diff --git a/LANDMark/LANDMark.py b/LANDMark/LANDMark.py
@@ -12,9 +12,10 @@
 
 from typing import Optional, List
 
+from scipy.sparse import csr_array, issparse
 
-class LANDMarkClassifier(BaseEstimator, ClassifierMixin):
 
+class LANDMarkClassifier(BaseEstimator, ClassifierMixin):
     def __init__(
         self,
         n_estimators: int = 64,
@@ -27,14 +28,16 @@ def __init__(
         use_oracle: bool = True,
         use_lm_l2: bool = True,
         use_lm_l1: bool = True,
+        minority_sz_lm: int = 6,
         use_nnet: bool = True,
         nnet_min_samples: int = 32,
+        minority_sz_nnet: int = 6,
         use_etc: bool = True,
         etc_max_depth: int = 5,
         etc_max_trees: int = 128,
-        resampler = None,
+        resampler=None,
         use_cascade: bool = False,
-        n_jobs: int = 4
+        n_jobs: int = 4,
     ):
         # Tree construction parameters
         self.n_estimators = n_estimators
@@ -47,8 +50,10 @@ def __init__(
         self.use_oracle = use_oracle
         self.use_lm_l2 = use_lm_l2
         self.use_lm_l1 = use_lm_l1
+        self.minority_sz_lm = minority_sz_lm
         self.use_nnet = use_nnet
         self.nnet_min_samples = nnet_min_samples
+        self.minority_sz_nnet = minority_sz_nnet
         self.use_etc = use_etc
         self.etc_max_depth = etc_max_depth
         self.etc_max_trees = etc_max_trees
@@ -85,12 +90,15 @@ def fit(self, X: np.ndarray, y: np.ndarray) -> LANDMarkClassifier:
                 use_oracle=self.use_oracle,
                 use_lm_l2=self.use_lm_l2,
                 use_lm_l1=self.use_lm_l1,
+                minority_sz_lm=self.minority_sz_lm,
                 use_nnet=self.use_nnet,
                 nnet_min_samples=self.nnet_min_samples,
+                minority_sz_nnet=self.minority_sz_nnet,
                 use_etc=self.use_etc,
                 etc_max_depth=self.etc_max_depth,
                 etc_max_trees=self.etc_max_trees,
-                resampler=self.resampler
+                resampler=self.resampler,
+                use_cascade=self.use_cascade,
             ),
             n_estimators=self.n_estimators,
             class_names=self.classes_,
@@ -140,23 +148,78 @@ def score(self, X: np.ndarray, y: np.ndarray) -> float:
 
         return score
 
-    def proximity(self, X: np.ndarray) -> np.ndarray:
+    def proximity(self, X: np.ndarray, prox_type: str = "path") -> np.ndarray:
         check_is_fitted(self, attributes=["classes_", "estimators_"])
 
-        tree_mats = []
+        if prox_type == "terminal":
+            tree_mats = []
+
+            for estimator in self.estimators_.estimators_:
+                tree_mats.append(estimator.proximity(X, prox_type))
+
+            emb = np.hstack(tree_mats)
+
+            return csr_array(emb.astype(np.uint8))
+
+        elif prox_type == "path":
+            if hasattr(self, "node_set"):
+                embs = [
+                    est.proximity(X, prox_type) for est in self.estimators_.estimators_
+                ]
+
+                if X.ndim == 1:
+                    emb = np.zeros(shape=(1, len(self.node_set)), dtype=np.uint8)
+                else:
+                    emb = np.zeros(
+                        shape=(X.shape[0], len(self.node_set)), dtype=np.uint8
+                    )
+
+                for tree_emb in embs:
+                    for sample, nodes in tree_emb.items():
+                        for node in nodes:
+                            emb[sample, self.node_set[node]] = 1
+
+                return csr_array(emb)
 
-        for estimator in self.estimators_.estimators_:
-            tree_mats.append(estimator.proximity(X))
+            else:
+                # Get the list of nodes associated with each sample in X
+                embs = [
+                    est.proximity(X, prox_type) for est in self.estimators_.estimators_
+                ]
 
-        emb = np.hstack(tree_mats)
+                # Create a list of all nodes across all trees in the forest
+                node_set = set()
+                [node_set.update(est.all_nodes) for est in self.estimators_.estimators_]
 
-        return emb
+                node_set = list(node_set)
 
-    def _check_params(self, X: np.ndarray, y: np.ndarray) -> List[np.ndarray, np.ndarray]:
+                # Create the embedding matrix
+                emb = np.zeros(shape=(X.shape[0], len(node_set)), dtype=np.uint8)
+
+                # Create a mapping between node id and index in the embedding matrix
+                self.node_set = {node: i for i, node in enumerate(node_set)}
+
+                # Update the embedding matrix
+                for tree_emb in embs:
+                    for sample, nodes in tree_emb.items():
+                        for node in nodes:
+                            emb[sample, self.node_set[node]] = 1
+
+                return csr_array(emb)
+
+    def _check_params(
+        self, X: np.ndarray, y: np.ndarray
+    ) -> List[np.ndarray, np.ndarray]:
         SUPPORTED_IMPURITY = {"gain", "gain-ratio", "tsallis", "tsallis-gain-ratio"}
 
         # Check that X and y meet the minimum requirements
-        X_conv, y_conv = check_X_y(X, y, accept_sparse=False)
+        X_conv, y_conv = check_X_y(X, y, accept_sparse=True)
+
+        if not issparse(X_conv):
+            sparsity = 1.0 - (np.count_nonzero(X_conv) / X_conv.size)
+
+            if sparsity >= 0.9:
+                X_conv = csr_array(X_conv)
 
         if not isinstance(self.n_estimators, int):
             raise TypeError("'n_estimators' must be an integer.")
@@ -174,9 +237,11 @@ def _check_params(self, X: np.ndarray, y: np.ndarray) -> List[np.ndarray, np.nda
 
         if isinstance(self.max_depth, type(None)):
             pass
+
         elif isinstance(self.max_depth, int):
             if self.max_depth <= 0:
                 raise ValueError("'max_depth' must be an greater than zero.")
+
         else:
             raise TypeError("'max_depth' must be an integer greater than zero or None.")
 
@@ -192,6 +257,7 @@ def _check_params(self, X: np.ndarray, y: np.ndarray) -> List[np.ndarray, np.nda
         if isinstance(self.min_gain, float):
             if self.min_gain < 0:
                 raise ValueError("'min_gain' must be greater than or equal to zero.")
+
         else:
             raise TypeError("'min_gain' must be float.")
 
@@ -233,7 +299,7 @@ def _check_params(self, X: np.ndarray, y: np.ndarray) -> List[np.ndarray, np.nda
 
         if not isinstance(self.use_etc, bool):
             raise TypeError("'use_etc' must be True or False.")
-            
+
         if isinstance(self.etc_max_depth, int):
             if self.etc_max_depth <= 0:
                 raise ValueError("'etc_max_depth' must be greater than zero.")
@@ -259,7 +325,7 @@ def _check_params(self, X: np.ndarray, y: np.ndarray) -> List[np.ndarray, np.nda
         if isinstance(self.resampler, type(None)):
             pass
 
-        elif hasattr(self.resampler, "fit_transform") == False:
+        elif hasattr(self.resampler, "fit_transform") is False:
             raise ValueError("'resampler' must have a 'fit_transform(X, y)' function.")
 
         return X_conv, y_conv
diff --git a/LANDMark/lm_dtree_clfs.py b/LANDMark/lm_dtree_clfs.py
@@ -6,6 +6,7 @@
 
 import numpy as np
 
+
 class ETClassifier(ClassifierMixin, BaseEstimator):
     def __init__(self, n_feat=0.8, max_depth=5, max_trees=128):
         self.n_feat = n_feat
@@ -29,26 +30,23 @@ def fit(self, X, y):
 
         self.classes_, y_counts = np.unique(y_re, return_counts=True)
 
-        clf_1 = ExtraTreesClassifier(
+        clf = ExtraTreesClassifier(
             n_estimators=self.max_trees, max_depth=self.max_depth
         )
 
         self.model_type = "nonlinear_etc"
 
-        self.clf_model = clf_1.fit(X_re, y_re)
+        self.clf_model = clf.fit(X_re, y_re)
 
         return self, self.decision_function(X)
 
     def predict(self, X):
         return self.clf_model.predict(X[:, self.features])
 
     def predict_proba(self, X):
-
         return self.clf_model.predict_proba(X[:, self.features])
 
     def decision_function(self, X):
         D = self.clf_model.predict_proba(X[:, self.features])
 
         return np.where(D > 0.5, 1, -1)
-
-
diff --git a/LANDMark/lm_linear_clfs.py b/LANDMark/lm_linear_clfs.py
@@ -1,6 +1,3 @@
-import logging
-import os
-
 import warnings
 from sklearn.exceptions import ConvergenceWarning
 
@@ -12,24 +9,21 @@
 from sklearn.linear_model import (
     RidgeClassifierCV,
     LogisticRegressionCV,
-    LogisticRegression,
     SGDClassifier,
-    RidgeClassifier,
 )
 from sklearn.svm import LinearSVC
-from sklearn.ensemble import ExtraTreesClassifier
 from sklearn.model_selection import GridSearchCV
 from sklearn.utils import resample
-
-from random import choice
+from sklearn.model_selection import StratifiedKFold
 
 from math import ceil
 
 
 class LMClassifier(ClassifierMixin, BaseEstimator):
-    def __init__(self, model_type, n_feat=0.8):
+    def __init__(self, model_type, n_feat=0.8, minority=6, use_etc_split=True):
         self.model_type = model_type
         self.n_feat = n_feat
+        self.minority = minority
 
     def fit(self, X, y):
         if X.shape[1] >= 4:
@@ -48,19 +42,21 @@ def fit(self, X, y):
 
         self.classes_, y_counts = np.unique(y_re, return_counts=True)
 
-        self.y_min = min(y_counts)
-        
-        if self.y_min > 6:
+        self.y_min = min(y_counts) * 0.8
+
+        if self.y_min > self.minority:
             if self.model_type == "lr_l2":
-                self.clf = LogisticRegressionCV(max_iter=2000, cv=5).fit(X_re, y_re)
+                self.clf = LogisticRegressionCV(
+                    max_iter=2000, cv=StratifiedKFold(5)
+                ).fit(X_re, y_re)
 
             elif self.model_type == "lr_l1":
                 solver = "liblinear"
                 if X.shape[0] >= 500:
                     solver = "saga"
 
                 self.clf = LogisticRegressionCV(
-                    max_iter=2000, cv=5, solver=solver, penalty="l1"
+                    max_iter=2000, cv=StratifiedKFold(5), solver=solver, penalty="l1"
                 ).fit(X_re, y_re)
 
             elif self.model_type == "sgd_l2":
@@ -70,7 +66,7 @@ def fit(self, X, y):
                         "alpha": [0.001, 0.01, 0.1, 1.0, 10, 100],
                         "loss": ["hinge", "modified_huber"],
                     },
-                    cv=5,
+                    cv=StratifiedKFold(5),
                 ).fit(X_re, y_re)
 
                 self.clf = self.cv.best_estimator_
@@ -82,41 +78,34 @@ def fit(self, X, y):
                         "alpha": [0.001, 0.01, 0.1, 1.0, 10, 100],
                         "loss": ["hinge", "modified_huber"],
                     },
-                    cv=5,
+                    cv=StratifiedKFold(5),
                 ).fit(X_re, y_re)
 
                 self.clf = self.cv.best_estimator_
 
             elif self.model_type == "ridge":
                 self.clf = RidgeClassifierCV(
-                    alphas=(0.001, 0.01, 0.1, 1.0, 10, 100, 1000), cv=5
+                    alphas=(0.001, 0.01, 0.1, 1.0, 10, 100, 1000), cv=StratifiedKFold(5)
                 ).fit(X_re, y_re)
 
             elif self.model_type == "lsvc":
                 self.cv = GridSearchCV(
                     LinearSVC(max_iter=2000),
                     param_grid={"C": [0.001, 0.01, 0.1, 1.0, 10, 100]},
-                    cv=5,
+                    cv=StratifiedKFold(5),
                 ).fit(X_re, y_re)
 
                 self.clf = self.cv.best_estimator_
 
-        else:
-            self.clf = ExtraTreesClassifier(n_estimators = 128, max_depth = 1)
-
-            self.clf.fit(X_re, y_re)
+            return self, self.decision_function(X)
 
-        return self, self.decision_function(X)
+        # Otherwise use an Extra Trees Classifier or Nothing
+        else:
+            return self, None
 
     def predict(self, X):
         return self.clf.predict(X[:, self.features])
 
     def decision_function(self, X):
-
-        if self.y_min > 6:
-            return self.clf.decision_function(X[:, self.features])
-
-        else:
-            D = self.clf.predict_proba(X[:, self.features])
+        return self.clf.decision_function(X[:, self.features])
 
-            return np.where(D > 0.5, 1, -1)