Add files via upload

jrudar · May 19, 2023 · 93106db
1 parent 89abd33
commit 93106db
Show file tree

Hide file tree

Showing 3 changed files with 92 additions and 28 deletions.
diff --git a/LANDMark/LANDMark.py b/LANDMark/LANDMark.py
@@ -18,6 +18,7 @@ def __init__(
         max_features: float = 0.80,
         min_gain: float = 0.0,
         impurity: str = "gain",
+        q: float = 1.5,
         use_oracle: bool = True,
         use_lm_l2: bool = True,
         use_lm_l1: bool = True,
@@ -26,7 +27,7 @@ def __init__(
         use_etc: bool = True,
         etc_max_depth: int = 5,
         etc_max_trees: int = 128,
-        max_samples_tree: int = -1,
+        resampler = None,
         bootstrap: bool = False,
         n_jobs: int = 4,
     ):
@@ -37,6 +38,7 @@ def __init__(
         self.max_features = max_features
         self.min_gain = min_gain
         self.impurity = impurity
+        self.q = q
         self.use_oracle = use_oracle
         self.use_lm_l2 = use_lm_l2
         self.use_lm_l1 = use_lm_l1
@@ -45,7 +47,7 @@ def __init__(
         self.use_etc = use_etc
         self.etc_max_depth = etc_max_depth
         self.etc_max_trees = etc_max_trees
-        self.max_samples_tree = max_samples_tree
+        self.resampler = resampler
         self.bootstrap = bootstrap
 
         self.n_jobs = n_jobs
@@ -72,6 +74,7 @@ def fit(self, X: np.ndarray, y: np.ndarray) -> LANDMarkClassifier:
                 max_features=self.max_features,
                 min_gain=self.min_gain,
                 impurity=self.impurity,
+                q = self.q,
                 use_oracle=self.use_oracle,
                 bootstrap=self.bootstrap,
                 use_lm_l2=self.use_lm_l2,
@@ -82,7 +85,7 @@ def fit(self, X: np.ndarray, y: np.ndarray) -> LANDMarkClassifier:
                 etc_max_depth=self.etc_max_depth,
                 etc_max_trees=self.etc_max_trees,
             ),
-            max_samples_tree=self.max_samples_tree,
+            resampler=self.resampler,
             n_estimators=self.n_estimators,
             class_names=self.classes_,
             n_jobs=self.n_jobs,

diff --git a/LANDMark/tree.py b/LANDMark/tree.py
@@ -16,23 +16,78 @@
 )
 
 
-def purity_function(N, N_lab, L, R, y, purity_fun="gain"):
-    # Calculate Information Gain
-    if purity_fun == "gain":
-        L_outcome, L_counts = np.unique(y[L], return_counts=True)
-        L_prob = L_counts / L_counts.sum()
-        H_L = entropy(L_prob) * (L_counts.sum() / N)
+def tsallis_fun(N, N_lab, L, R, y, mode, q):
 
-        R_outcome, R_counts = np.unique(y[R], return_counts=True)
-        R_prob = R_counts / R_counts.sum()
-        H_R = entropy(R_prob) * (R_counts.sum() / N)
+    if q == 1: #Special case
+        if "ratio" in mode.split("-"):
+            return entropy_fun(N, N_lab, L, R, y, "gain-ratio")
 
-        H_parent = entropy(N_lab)
+        else:
+            return entropy_fun(N, N_lab, L, R, y, "gain")
+
+    scaler = 1 / (1 - q)
+
+    L_outcome, L_counts = np.unique(y[L], return_counts=True)
+    L_prob = L_counts / L_counts.sum()
+    H_L = (L_counts.sum() / N) * (scaler * (np.power(L_prob, q).sum() - 1))
+
+    R_outcome, R_counts = np.unique(y[R], return_counts=True)
+    R_prob = R_counts / R_counts.sum()
+    H_R = (R_counts.sum() / N) * (scaler * (np.power(R_prob, q).sum() - 1))
+
+    H_parent = scaler * (np.power(N_lab, q).sum() - 1)
+
+    IG = H_parent - H_R - H_L
 
-        IG = H_parent - H_L - H_R
+    if mode == "tsallis":
 
         return IG
 
+    else:
+        norm_factor = np.asarray([(L_counts.sum() / N), (R_counts.sum() / N)])
+        norm_factor = 1 + (scaler * (np.power(norm_factor, q).sum() - 1))
+
+        GR = IG / norm_factor
+
+        return GR
+
+
+def entropy_fun(N, N_lab, L, R, y, mode):
+    L_outcome, L_counts = np.unique(y[L], return_counts=True)
+    L_prob = L_counts / L_counts.sum()
+    H_L = entropy(L_prob) * (L_counts.sum() / N)
+
+    R_outcome, R_counts = np.unique(y[R], return_counts=True)
+    R_prob = R_counts / R_counts.sum()
+    H_R = entropy(R_prob) * (R_counts.sum() / N)
+
+    H_parent = entropy(N_lab)
+
+    IG = H_parent - H_L - H_R
+
+    if mode == "gain":
+
+        return IG
+
+    else:
+        norm_factor = np.asarray([(L_counts.sum() / N), (R_counts.sum() / N)])
+        norm_factor = 1 + entropy(norm_factor)
+
+        GR = IG / norm_factor
+
+        return GR
+
+
+def purity_function(N, N_lab, L, R, y, purity_fun, q):
+
+    if purity_fun == "gain" or purity_fun == "gain-ratio":
+
+        return entropy_fun(N, N_lab, L, R, y, purity_fun)
+
+    elif purity_fun == "tsallis-gain-ratio" or purity_fun == "tsallis":
+
+        return tsallis_fun(N, N_lab, L, R, y, purity_fun, q)
+
 
 class PredictData:
     def __init__(self, node_lab):
@@ -72,6 +127,7 @@ def get_split(self,
                   max_features,
                   min_gain,
                   impurity,
+                  q,
                   use_lm_l2,
                   use_lm_l1,
                   use_nnet,
@@ -123,7 +179,7 @@ def get_split(self,
                 L = np.where(D > 0, True, False)
                 R = np.where(D <= 0, True, False)
 
-                IG = purity_function(counts_sum, counts_prob, L, R, y, impurity)
+                IG = purity_function(counts_sum, counts_prob, L, R, y, impurity, q)
 
                 self.gain = IG
 
@@ -135,6 +191,7 @@ def get_split(self,
                                                  max_features = max_features,
                                                  min_gain = min_gain,
                                                  impurity = impurity,
+                                                 q = q,
                                                  use_lm_l2 = use_lm_l2,
                                                  use_lm_l1 = use_lm_l1,
                                                  use_nnet = use_nnet,
@@ -153,6 +210,7 @@ def get_split(self,
                                                   max_features = max_features,
                                                   min_gain = min_gain,
                                                   impurity = impurity,
+                                                  q = q,
                                                   use_lm_l2 = use_lm_l2,
                                                   use_lm_l1 = use_lm_l1,
                                                   use_nnet = use_nnet,
@@ -192,7 +250,7 @@ def get_split(self,
                         # Calculate Information Gain
                         if X_L_n > 0 and X_R_n > 0:
                             IG = purity_function(
-                                counts_sum, counts_prob, L, R, y, impurity
+                                counts_sum, counts_prob, L, R, y, impurity, q
                             )
 
                             gains.append(IG)
@@ -219,7 +277,7 @@ def get_split(self,
                         # Calculate Information Gain
                         if X_L_n > 0 and X_R_n > 0:
                             IG = purity_function(
-                                counts_sum, counts_prob, L, R, y, impurity
+                                counts_sum, counts_prob, L, R, y, impurity, q
                             )
 
                             gains.append(IG)
@@ -244,7 +302,7 @@ def get_split(self,
                             # Calculate Information Gain
                             if X_L_n > 0 and X_R_n > 0:
                                 IG = purity_function(
-                                    counts_sum, counts_prob, L, R, y, impurity
+                                    counts_sum, counts_prob, L, R, y, impurity, q
                                 )
 
                                 gains.append(IG)
@@ -274,7 +332,7 @@ def get_split(self,
                         # Calculate Information Gain
                         if X_L_n > 0 and X_R_n > 0:
                             IG = purity_function(
-                                counts_sum, counts_prob, L, R, y, impurity
+                                counts_sum, counts_prob, L, R, y, impurity, q
                             )
 
                             gains.append(IG)
@@ -312,6 +370,7 @@ def get_split(self,
                                                  max_features = max_features,
                                                  min_gain = min_gain,
                                                  impurity = impurity,
+                                                 q = q,
                                                  use_lm_l2 = use_lm_l2,
                                                  use_lm_l1 = use_lm_l1,
                                                  use_nnet = use_nnet,
@@ -330,6 +389,7 @@ def get_split(self,
                                                   max_features = max_features,
                                                   min_gain = min_gain,
                                                   impurity = impurity,
+                                                  q = q,
                                                   use_lm_l2 = use_lm_l2,
                                                   use_lm_l1 = use_lm_l1,
                                                   use_nnet = use_nnet,
@@ -361,6 +421,7 @@ def __init__(
         max_features,
         min_gain,
         impurity,
+        q,
         use_oracle,
         bootstrap,
         use_lm_l2,
@@ -376,6 +437,7 @@ def __init__(
         self.max_features = max_features
         self.min_gain = min_gain
         self.impurity = impurity
+        self.q = q
         self.use_oracle = use_oracle
         self.bootstrap = bootstrap
         self.use_lm_l2 = use_lm_l2
@@ -408,6 +470,7 @@ def fit(self, X, y):
                        max_features = self.max_features,
                        min_gain = self.min_gain,
                        impurity = self.impurity,
+                       q = self.q,
                        use_lm_l2 = self.use_lm_l2,
                        use_lm_l1 = self.use_lm_l1,
                        use_nnet = self.use_nnet,

diff --git a/LANDMark/utils.py b/LANDMark/utils.py
@@ -3,20 +3,17 @@
 ##########################################################################################
 # For Bagging Classifier
 from sklearn.base import ClassifierMixin, BaseEstimator, clone
-from sklearn.utils import resample
 from scipy.special import softmax
 from joblib import Parallel, delayed, parallel_backend
 
 
-def _parallel_build(estimator, X, y, max_samples_tree):
-    if X.shape[0] <= max_samples_tree or max_samples_tree == -1:
+def _parallel_build(estimator, X, y, resampler):
+    if isinstance(resampler, type(None)):
         X_trf = X
         y_trf = y
 
     else:
-        X_trf, y_trf = resample(
-            X, y, replace=True, n_samples=max_samples_tree, stratify=y
-        )
+        X_trf, y_trf = clone(resampler).fit_resample(X, y)
 
     trained_estimator = estimator.fit(X_trf, y_trf)
 
@@ -25,10 +22,10 @@ def _parallel_build(estimator, X, y, max_samples_tree):
 
 class Ensemble(ClassifierMixin, BaseEstimator):
     def __init__(
-        self, base_estimator, max_samples_tree, n_estimators, class_names, n_jobs
+        self, base_estimator, resampler, n_estimators, class_names, n_jobs
     ):
         self.base_estimator = base_estimator
-        self.max_samples_tree = max_samples_tree
+        self.resampler = resampler
         self.n_estimators = n_estimators
         self.classes_ = class_names
         self.n_jobs = n_jobs
@@ -37,7 +34,7 @@ def fit(self, X, y):
 
         self.estimators_ = Parallel(n_jobs=self.n_jobs)(
             delayed(_parallel_build)(
-                clone(self.base_estimator), X, y, self.max_samples_tree
+                clone(self.base_estimator), X, y, self.resampler
             )
             for i in range(self.n_estimators)
         )
@@ -98,6 +95,7 @@ def predict_proba(self, X):
 
 
 ##########################################################################################
+# For Neural Network Models
 import tensorflow as tf
 import keras.backend as K