From 93106dbcc6f874c4ce29af8e7b685fb3569aa6a5 Mon Sep 17 00:00:00 2001
From: jrudar <37217429+jrudar@users.noreply.github.com>
Date: Fri, 19 May 2023 13:57:34 -0400
Subject: [PATCH] Add files via upload

---
 LANDMark/LANDMark.py |  9 +++--
 LANDMark/tree.py     | 95 ++++++++++++++++++++++++++++++++++++--------
 LANDMark/utils.py    | 16 ++++----
 3 files changed, 92 insertions(+), 28 deletions(-)

diff --git a/LANDMark/LANDMark.py b/LANDMark/LANDMark.py
index 75690b7..0305006 100644
--- a/LANDMark/LANDMark.py
+++ b/LANDMark/LANDMark.py
@@ -18,6 +18,7 @@ def __init__(
         max_features: float = 0.80,
         min_gain: float = 0.0,
         impurity: str = "gain",
+        q: float = 1.5,
         use_oracle: bool = True,
         use_lm_l2: bool = True,
         use_lm_l1: bool = True,
@@ -26,7 +27,7 @@ def __init__(
         use_etc: bool = True,
         etc_max_depth: int = 5,
         etc_max_trees: int = 128,
-        max_samples_tree: int = -1,
+        resampler=None,
         bootstrap: bool = False,
         n_jobs: int = 4,
     ):
@@ -37,6 +38,7 @@ def __init__(
         self.max_features = max_features
         self.min_gain = min_gain
         self.impurity = impurity
+        self.q = q
         self.use_oracle = use_oracle
         self.use_lm_l2 = use_lm_l2
         self.use_lm_l1 = use_lm_l1
@@ -45,7 +47,7 @@ def __init__(
         self.use_etc = use_etc
         self.etc_max_depth = etc_max_depth
         self.etc_max_trees = etc_max_trees
-        self.max_samples_tree = max_samples_tree
+        self.resampler = resampler
        self.bootstrap = bootstrap
        self.n_jobs = n_jobs

@@ -72,6 +74,7 @@ def fit(self, X: np.ndarray, y: np.ndarray) -> LANDMarkClassifier:
                 max_features=self.max_features,
                 min_gain=self.min_gain,
                 impurity=self.impurity,
+                q=self.q,
                 use_oracle=self.use_oracle,
                 bootstrap=self.bootstrap,
                 use_lm_l2=self.use_lm_l2,
@@ -82,7 +85,7 @@ def fit(self, X: np.ndarray, y: np.ndarray) -> LANDMarkClassifier:
                 etc_max_depth=self.etc_max_depth,
                 etc_max_trees=self.etc_max_trees,
             ),
-            max_samples_tree=self.max_samples_tree,
+            resampler=self.resampler,
             n_estimators=self.n_estimators,
             class_names=self.classes_,
             n_jobs=self.n_jobs,
diff --git a/LANDMark/tree.py b/LANDMark/tree.py
index 6cc67a2..2c0f1c6 100644
--- a/LANDMark/tree.py
+++ b/LANDMark/tree.py
@@ -16,23 +16,78 @@
 )


-def purity_function(N, N_lab, L, R, y, purity_fun="gain"):
-    # Calculate Information Gain
-    if purity_fun == "gain":
-        L_outcome, L_counts = np.unique(y[L], return_counts=True)
-        L_prob = L_counts / L_counts.sum()
-        H_L = entropy(L_prob) * (L_counts.sum() / N)
+def tsallis_fun(N, N_lab, L, R, y, mode, q):

-        R_outcome, R_counts = np.unique(y[R], return_counts=True)
-        R_prob = R_counts / R_counts.sum()
-        H_R = entropy(R_prob) * (R_counts.sum() / N)
+    if q == 1:  # Special case: q = 1 recovers Shannon entropy
+        if "ratio" in mode.split("-"):
+            return entropy_fun(N, N_lab, L, R, y, "gain-ratio")

-        H_parent = entropy(N_lab)
+        else:
+            return entropy_fun(N, N_lab, L, R, y, "gain")
+
+    scaler = 1 / (1 - q)
+
+    L_outcome, L_counts = np.unique(y[L], return_counts=True)
+    L_prob = L_counts / L_counts.sum()
+    H_L = (L_counts.sum() / N) * (scaler * (np.power(L_prob, q).sum() - 1))
+
+    R_outcome, R_counts = np.unique(y[R], return_counts=True)
+    R_prob = R_counts / R_counts.sum()
+    H_R = (R_counts.sum() / N) * (scaler * (np.power(R_prob, q).sum() - 1))
+
+    H_parent = scaler * (np.power(N_lab, q).sum() - 1)
+
+    IG = H_parent - H_R - H_L

-        IG = H_parent - H_L - H_R
+    if mode == "tsallis":

         return IG

+    else:
+        norm_factor = np.asarray([(L_counts.sum() / N), (R_counts.sum() / N)])
+        norm_factor = 1 + (scaler * (np.power(norm_factor, q).sum() - 1))
+
+        GR = IG / norm_factor
+
+        return GR
+
+
+def entropy_fun(N, N_lab, L, R, y, mode):
+    L_outcome, L_counts = np.unique(y[L], return_counts=True)
+    L_prob = L_counts / L_counts.sum()
+    H_L = entropy(L_prob) * (L_counts.sum() / N)
+
+    R_outcome, R_counts = np.unique(y[R], return_counts=True)
+    R_prob = R_counts / R_counts.sum()
+    H_R = entropy(R_prob) * (R_counts.sum() / N)
+
+    H_parent = entropy(N_lab)
+
+    IG = H_parent - H_L - H_R
+
+    if mode == "gain":
+
+        return IG
+
+    else:
+        norm_factor = np.asarray([(L_counts.sum() / N), (R_counts.sum() / N)])
+        norm_factor = 1 + entropy(norm_factor)
+
+        GR = IG / norm_factor
+
+        return GR
+
+
+def purity_function(N, N_lab, L, R, y, purity_fun, q):
+
+    if purity_fun == "gain" or purity_fun == "gain-ratio":
+
+        return entropy_fun(N, N_lab, L, R, y, purity_fun)
+
+    elif purity_fun == "tsallis-gain-ratio" or purity_fun == "tsallis":
+
+        return tsallis_fun(N, N_lab, L, R, y, purity_fun, q)
+

 class PredictData:
     def __init__(self, node_lab):
@@ -72,6 +127,7 @@ def get_split(self,
         max_features,
         min_gain,
         impurity,
+        q,
         use_lm_l2,
         use_lm_l1,
         use_nnet,
@@ -123,7 +179,7 @@ def get_split(self,
             L = np.where(D > 0, True, False)
             R = np.where(D <= 0, True, False)

-            IG = purity_function(counts_sum, counts_prob, L, R, y, impurity)
+            IG = purity_function(counts_sum, counts_prob, L, R, y, impurity, q)

             self.gain = IG
@@ -135,6 +191,7 @@ def get_split(self,
                 max_features = max_features,
                 min_gain = min_gain,
                 impurity = impurity,
+                q = q,
                 use_lm_l2 = use_lm_l2,
                 use_lm_l1 = use_lm_l1,
                 use_nnet = use_nnet,
@@ -153,6 +210,7 @@ def get_split(self,
                 max_features = max_features,
                 min_gain = min_gain,
                 impurity = impurity,
+                q = q,
                 use_lm_l2 = use_lm_l2,
                 use_lm_l1 = use_lm_l1,
                 use_nnet = use_nnet,
@@ -192,7 +250,7 @@ def get_split(self,
                 # Calculate Information Gain
                 if X_L_n > 0 and X_R_n > 0:
                     IG = purity_function(
-                        counts_sum, counts_prob, L, R, y, impurity
+                        counts_sum, counts_prob, L, R, y, impurity, q
                     )
                     gains.append(IG)
@@ -219,7 +277,7 @@ def get_split(self,
                 # Calculate Information Gain
                 if X_L_n > 0 and X_R_n > 0:
                     IG = purity_function(
-                        counts_sum, counts_prob, L, R, y, impurity
+                        counts_sum, counts_prob, L, R, y, impurity, q
                     )
                     gains.append(IG)
@@ -244,7 +302,7 @@ def get_split(self,
                 # Calculate Information Gain
                 if X_L_n > 0 and X_R_n > 0:
                     IG = purity_function(
-                        counts_sum, counts_prob, L, R, y, impurity
+                        counts_sum, counts_prob, L, R, y, impurity, q
                     )
                     gains.append(IG)
@@ -274,7 +332,7 @@ def get_split(self,
                 # Calculate Information Gain
                 if X_L_n > 0 and X_R_n > 0:
                     IG = purity_function(
-                        counts_sum, counts_prob, L, R, y, impurity
+                        counts_sum, counts_prob, L, R, y, impurity, q
                     )
                     gains.append(IG)
@@ -312,6 +370,7 @@ def get_split(self,
                 max_features = max_features,
                 min_gain = min_gain,
                 impurity = impurity,
+                q = q,
                 use_lm_l2 = use_lm_l2,
                 use_lm_l1 = use_lm_l1,
                 use_nnet = use_nnet,
@@ -330,6 +389,7 @@ def get_split(self,
                 max_features = max_features,
                 min_gain = min_gain,
                 impurity = impurity,
+                q = q,
                 use_lm_l2 = use_lm_l2,
                 use_lm_l1 = use_lm_l1,
                 use_nnet = use_nnet,
@@ -361,6 +421,7 @@ def __init__(
         max_features,
         min_gain,
         impurity,
+        q,
         use_oracle,
         bootstrap,
         use_lm_l2,
@@ -376,6 +437,7 @@ def __init__(
         self.max_features = max_features
         self.min_gain = min_gain
         self.impurity = impurity
+        self.q = q
         self.use_oracle = use_oracle
         self.bootstrap = bootstrap
         self.use_lm_l2 = use_lm_l2
@@ -408,6 +470,7 @@ def fit(self, X, y):
             max_features = self.max_features,
             min_gain = self.min_gain,
             impurity = self.impurity,
+            q = self.q,
             use_lm_l2 = self.use_lm_l2,
             use_lm_l1 = self.use_lm_l1,
             use_nnet = self.use_nnet,
diff --git a/LANDMark/utils.py b/LANDMark/utils.py
index 79c337b..2336264 100644
--- a/LANDMark/utils.py
+++ b/LANDMark/utils.py
@@ -3,20 +3,17 @@
 ##########################################################################################
 # For Bagging Classifier
 from sklearn.base import ClassifierMixin, BaseEstimator, clone
-from sklearn.utils import resample
 from scipy.special import softmax

 from joblib import Parallel, delayed, parallel_backend


-def _parallel_build(estimator, X, y, max_samples_tree):
-    if X.shape[0] <= max_samples_tree or max_samples_tree == -1:
+def _parallel_build(estimator, X, y, resampler):
+    if resampler is None:
         X_trf = X
         y_trf = y

     else:
-        X_trf, y_trf = resample(
-            X, y, replace=True, n_samples=max_samples_tree, stratify=y
-        )
+        X_trf, y_trf = clone(resampler).fit_resample(X, y)

     trained_estimator = estimator.fit(X_trf, y_trf)

@@ -25,10 +22,10 @@ def _parallel_build(estimator, X, y, max_samples_tree):

 class Ensemble(ClassifierMixin, BaseEstimator):
     def __init__(
-        self, base_estimator, max_samples_tree, n_estimators, class_names, n_jobs
+        self, base_estimator, resampler, n_estimators, class_names, n_jobs
     ):
         self.base_estimator = base_estimator
-        self.max_samples_tree = max_samples_tree
+        self.resampler = resampler
         self.n_estimators = n_estimators
         self.classes_ = class_names
         self.n_jobs = n_jobs
@@ -37,7 +34,7 @@ def fit(self, X, y):

         self.estimators_ = Parallel(n_jobs=self.n_jobs)(
             delayed(_parallel_build)(
-                clone(self.base_estimator), X, y, self.max_samples_tree
+                clone(self.base_estimator), X, y, self.resampler
             )
             for i in range(self.n_estimators)
         )
@@ -98,6 +95,7 @@ def predict_proba(self, X):


 ##########################################################################################
+# For Neural Network Models
 import tensorflow as tf
 import keras.backend as K
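
Notes (reviewer sketches, not part of the patch):

The split criteria added in tree.py implement the Tsallis entropy,
S_q(p) = (1 / (1 - q)) * (sum_i p_i^q - 1), which converges to the Shannon
entropy as q -> 1; that limit is the special case handled at the top of
tsallis_fun. A minimal standalone sketch for sanity-checking the math; the
helper name tsallis_entropy is illustrative and not part of LANDMark:

    import numpy as np
    from scipy.stats import entropy

    def tsallis_entropy(p, q):
        # Same form as in tsallis_fun: scaler * (sum(p ** q) - 1),
        # with scaler = 1 / (1 - q)
        return (1 / (1 - q)) * (np.power(p, q).sum() - 1)

    p = np.asarray([0.25, 0.25, 0.5])
    print(tsallis_entropy(p, 1.0001))  # ~1.0397, approaching the Shannon value
    print(entropy(p))                  # 1.0397... (natural-log Shannon entropy)

As in entropy_fun, the gain returned by tsallis_fun is the parent impurity
minus the weighted child impurities, and the "-ratio" modes divide that gain
by one plus the impurity of the split proportions.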
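The resampler parameter that replaces max_samples_tree is duck-typed:
_parallel_build only calls clone(resampler).fit_resample(X, y), so any
scikit-learn-compatible object with a fit_resample method should work,
including the samplers from imbalanced-learn. A usage sketch, assuming
imbalanced-learn is installed and that LANDMarkClassifier is importable from
the package root:

    import numpy as np
    from imblearn.under_sampling import RandomUnderSampler
    from LANDMark import LANDMarkClassifier

    X = np.random.randn(200, 10)
    y = np.random.randint(0, 2, 200)

    # Each tree in the ensemble is fit on clone(resampler).fit_resample(X, y)
    clf = LANDMarkClassifier(
        impurity="tsallis", q=1.5, resampler=RandomUnderSampler()
    )
    clf.fit(X, y)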
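If the old max_samples_tree behaviour is still wanted, a small adapter can
reproduce it under the new interface. StratifiedBootstrap below is a
hypothetical name, not part of this patch; its fit_resample mirrors the
resample(...) call that this commit removes from utils.py:

    from sklearn.base import BaseEstimator
    from sklearn.utils import resample

    class StratifiedBootstrap(BaseEstimator):
        # Anything clone()-able that implements fit_resample(X, y) qualifies.
        def __init__(self, n_samples=100):
            self.n_samples = n_samples

        def fit_resample(self, X, y):
            # Stratified bootstrap of a fixed size, as the old code did
            return resample(
                X, y, replace=True, n_samples=self.n_samples, stratify=y
            )

    # LANDMarkClassifier imported as in the previous example
    clf = LANDMarkClassifier(resampler=StratifiedBootstrap(n_samples=50))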