In [1]:
from aeon.datasets import load_italy_power_demand
# Load the ItalyPowerDemand dataset: X holds the series, y the class labels.
# return_type='numpy2d' asks the loader for a 2D array
# (presumably (n_cases, n_timepoints) -- confirm against the aeon loader docs).
X, y = load_italy_power_demand(return_type='numpy2d')

import numpy as np
from aeon.distances import distance

In [2]:
import numpy as np

from aeon.classification.base import BaseClassifier

from aeon.distances import distance

class Node:
    """A single node of a proximity tree.

    Parameters
    ----------
    node_id : str
        Identifier of the node inside the tree.
    _is_leaf : bool
        True when the node is a leaf, False when it is an internal node.
    label : object, default=None
        Class label attached to the node. Optional so that internal nodes,
        which are described by their splitter, can be created without one.
    splitter : object, default=None
        Splitter stored at an internal node; None for leaves.

    Attributes
    ----------
    children : dict
        Child nodes; starts empty and is filled by whoever builds the tree.
    """

    def __init__(
            self,
            node_id,
            _is_leaf,
            label=None,
            splitter=None,
    ):
        self.node_id = node_id
        self._is_leaf = _is_leaf
        self.label = label
        self.splitter = splitter
        # A fresh dict per instance so nodes never share children.
        self.children = {}

class ProximityTree(BaseClassifier):
    """Proximity tree classifier for time series.

    Every internal node holds a splitter made of one randomly drawn exemplar
    per class and one randomly parameterised distance measure. A series is
    routed to the branch of its nearest exemplar. At each node several
    candidate splitters are drawn and the one with the highest Gini gain is
    kept.

    Parameters
    ----------
    n_splitters : int, default=5
        Number of candidate splitters evaluated at each node.
    max_depth : int or None, default=None
        Maximum depth of the tree; None means no limit.
    min_samples_split : int, default=2
        Minimum number of cases a node needs in order to be split.
    random_state : int, RandomState instance or None, default=0
        Seed (or generator) used for every random draw made by this instance.
    n_jobs : int, default=1
        Stored only; not used by this implementation.
    verbose : int, default=0
        Stored only; not used by this implementation.
    """

    def __init__(
        self,
        n_splitters: int = 5,
        max_depth: int = None,
        min_samples_split: int = 2,
        random_state: int = 0,
        n_jobs: int = 1,
        verbose: int = 0,
    ) -> None:
        # Hyper-parameters are stored under the exact constructor argument
        # names so that scikit-learn style get_params/clone can find them.
        self.n_splitters = n_splitters
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.random_state = random_state
        self.n_jobs = n_jobs
        self.verbose = verbose
        super().__init__()

    def _get_rng(self, reset=False):
        """Return the random generator of this instance.

        The generator is created lazily from ``random_state`` so the helper
        methods can also be called on an unfitted estimator. ``reset=True``
        re-creates it, which makes repeated fits reproducible.
        """
        if reset or getattr(self, "_rng", None) is None:
            seed = self.random_state
            if isinstance(seed, np.random.RandomState):
                self._rng = seed
            else:
                self._rng = np.random.RandomState(seed)
        return self._rng

    def get_parameter_value(self, X=None):
        """Generate random parameter values.

        For a list of distance measures, generate a dictionary
        of parameterized distances.

        Parameters
        ----------
        X : np.ndarray of shape (n_cases, n_timepoints)
            Data whose standard deviation scales the ERP and LCSS ranges.

        Returns
        -------
        distance_param : dict
            Maps each distance name to a dictionary of its parameters.

        Raises
        ------
        ValueError
            If X is None.
        """
        if X is None:
            raise ValueError("X is required to derive the parameter ranges")
        rng = self._get_rng()
        X_std = X.std()
        param_ranges = {
            "euclidean": {},
            "dtw": {"window": (0, 0.25)},
            "ddtw": {"window": (0, 0.25)},
            "wdtw": {"g": (0, 1)},
            "wddtw": {"g": (0, 1)},
            "erp": {"g": (X_std / 5, X_std)},
            "lcss": {"epsilon": (X_std / 5, X_std), "window": (0, 0.25)},
        }
        random_params = {}
        for measure, ranges in param_ranges.items():
            random_params[measure] = {
                param: np.round(rng.uniform(low, high), 3)
                for param, (low, high) in ranges.items()
            }
        # For TWE: lmbda is an integer in [0, 8] and nu is 10**-e with e
        # drawn from 1..5 (inclusive).
        lmbda = rng.randint(0, 9)
        exponent_range = np.arange(1, 6)
        random_exponent = rng.choice(exponent_range)
        nu = 1 / 10**random_exponent
        random_params["twe"] = {"lmbda": lmbda, "nu": nu}

        # For MSM: c is 10**e with e drawn from -2..2 (inclusive).
        base = 10
        exponents = np.arange(-2, 3, dtype=np.float64)
        random_index = rng.randint(0, len(exponents))
        c = base ** exponents[random_index]
        random_params["msm"] = {"c": c}

        return random_params

    def get_candidate_splitter(self, X, y):
        """Generate candidate splitter.

        Draws one random exemplar per class and one randomly parameterised
        distance measure.

        Parameters
        ----------
        X : np.ndarray shape (n_cases, n_timepoints)
            The training input samples.
        y : np.array shape (n_cases,) or (n_cases,1)
            The class labels.

        Returns
        -------
        splitter : list of two dictionaries
            ``[exemplars, {distance_name: parameters}]`` where ``exemplars``
            maps each class label to one series of that class.
        """
        rng = self._get_rng()
        # Flatten so that an (n_cases, 1) label array can be used as a mask.
        _y = np.asarray(y).ravel()

        exemplars = {}
        for label in np.unique(_y):
            X_label = X[_y == label]
            exemplars[label] = X_label[rng.randint(0, X_label.shape[0])]

        # Pick one measure uniformly among those that were parameterised
        # (previously the count was hard-coded).
        parameterized_distances = self.get_parameter_value(X)
        measures = list(parameterized_distances.keys())
        dist = measures[rng.randint(0, len(measures))]
        splitter = [exemplars, {dist: parameterized_distances[dist]}]

        return splitter

    def gini(self, y):
        """Get gini score at a specific node.

        Parameters
        ----------
        y : 1d numpy array
            array of class labels

        Returns
        -------
        score : float
            gini score for the set of class labels (i.e. how pure they are). A
            larger score means more impurity. Zero means
            pure.

        Raises
        ------
        ValueError
            If y is empty.
        """
        n_instances = y.shape[0]
        if n_instances == 0:
            raise ValueError("y empty")
        # Gini impurity: one minus the sum of squared class proportions.
        _, class_counts = np.unique(y, return_counts=True)
        proportions = np.divide(class_counts, n_instances)
        return 1 - np.sum(np.power(proportions, 2))

    def gini_gain(self, y, y_subs):
        """Get gini score of a split, i.e. the gain from parent to children.

        Parameters
        ----------
        y : 1d array
            array of class labels at parent
        y_subs : list of 1d array like
            list of array of class labels, one array per child

        Returns
        -------
        score : float
            gini score of the split from parent class labels to children. Note a
            higher score means better gain,
            i.e. a better split

        Raises
        ------
        ValueError
            If y is not one dimensional, or if the parent is empty while a
            child is not.
        """
        if y.ndim != 1:
            raise ValueError("y must be a 1d array of class labels")
        parent_n_instances = y.shape[0]
        # An empty parent is only consistent with empty children.
        if parent_n_instances == 0:
            for child in y_subs:
                if len(child) > 0:
                    raise ValueError("children populated but parent empty")
            return 0.5
        # Start from the parent impurity and subtract each child's impurity
        # weighted by the share of parent instances it received.
        score = self.gini(y)
        for child_class_labels in y_subs:
            child_size = len(child_class_labels)
            # ignore empty children
            if child_size > 0:
                child_score = self.gini(child_class_labels)
                score -= child_score * child_size / parent_n_instances
        return score

    @staticmethod
    def _nearest_exemplar(x, splitter):
        """Return the position (in exemplar order) of the exemplar nearest to x.

        Ties go to the earlier exemplar. Position 0 is returned when no
        distance is finite.
        """
        exemplars, measure_params = splitter
        measure, params = next(iter(measure_params.items()))
        nearest = 0
        min_dist = float('inf')
        for position, exemplar in enumerate(exemplars.values()):
            # The parameters are unpacked as keyword arguments; aeon's
            # distance() takes them as **kwargs.
            dist = distance(x, exemplar, metric=measure, **params)
            if dist < min_dist:
                min_dist = dist
                nearest = position
        return nearest

    def _route(self, X, splitter):
        """Return, for every case in X, the position of its nearest exemplar."""
        return np.array(
            [self._nearest_exemplar(x, splitter) for x in X], dtype=int
        )

    def _build_tree(self, X, y, depth, node_id, parent_target_value=None):
        """Recursively build the (sub)tree for the cases reaching a node.

        Parameters
        ----------
        X : np.ndarray
            Cases reaching this node.
        y : np.ndarray of shape (n_cases,)
            Class labels of those cases.
        depth : int
            Depth of this node (the root has depth 0).
        node_id : str
            Identifier given to the node.
        parent_target_value : object, default=None
            Majority label of the parent; used when no case reaches the node.

        Returns
        -------
        node : Node
            Root of the built subtree.
        """
        # If the data reaching the node is empty, inherit the parent's label.
        if len(X) == 0:
            return Node(
                node_id=node_id, _is_leaf=True, label=parent_target_value
            )

        # Majority class among the cases in the current node.
        target_value = self._find_target_value(y)

        depth_reached = self.max_depth is not None and depth >= self.max_depth
        is_pure = len(np.unique(y)) == 1
        too_small = len(y) < self.min_samples_split
        if depth_reached or is_pure or too_small:
            return Node(node_id=node_id, _is_leaf=True, label=target_value)

        # Find the best splitter; None means no candidate was drawn.
        splitter, _, _ = self.get_best_splitter(X, y, self.n_splitters)
        if splitter is None:
            return Node(node_id=node_id, _is_leaf=True, label=target_value)

        exemplar_labels = list(splitter[0].keys())
        branch = self._route(X, splitter)
        # If every case falls into the same branch the split makes no
        # progress; stop here so the recursion always terminates.
        if np.bincount(branch, minlength=len(exemplar_labels)).max() == len(X):
            return Node(node_id=node_id, _is_leaf=True, label=target_value)

        node = Node(
            node_id=node_id,
            _is_leaf=False,
            label=target_value,
            splitter=splitter,
        )
        for position, label in enumerate(exemplar_labels):
            mask = branch == position
            node.children[label] = self._build_tree(
                X[mask],
                y[mask],
                depth=depth + 1,
                node_id=f"{node_id}.{position}",
                parent_target_value=target_value,
            )
        return node

    @staticmethod
    def _find_target_value(y):
        """Get the class label of highest frequency."""
        unique, counts = np.unique(y, return_counts=True)
        return unique[np.argmax(counts)]

    def get_best_splitter(self, X, y, n_splitters):
        """Draw candidate splitters and keep the one with the highest Gini gain.

        Parameters
        ----------
        X : np.ndarray
            Cases reaching the node.
        y : np.ndarray of shape (n_cases,) or (n_cases, 1)
            Class labels of those cases.
        n_splitters : int
            Number of candidate splitters to draw.

        Returns
        -------
        best_splitter : list or None
            Splitter with the highest gain; None when n_splitters is not
            positive.
        gains : list of float
            Gini gain of every candidate, in drawing order.
        splitters : list
            Every candidate, in drawing order.
        """
        y = np.asarray(y).ravel()
        max_gain = float('-inf')
        best_splitter = None
        # For quality analysis
        gains = []
        splitters = []
        for _ in range(n_splitters):
            splitter = self.get_candidate_splitter(X, y)
            branch = self._route(X, splitter)
            # One array of class labels per exemplar, in exemplar order.
            y_subs = [y[branch == b] for b in range(len(splitter[0]))]
            gini_index = self.gini_gain(y, y_subs)
            gains.append(gini_index)
            splitters.append(splitter)
            if gini_index > max_gain:
                max_gain = gini_index
                best_splitter = splitter
        return best_splitter, gains, splitters

    def _fit(self, X, y):
        """Build the tree from the training data and store it in ``self.root``."""
        # Re-seed so that fitting twice with the same seed gives the same tree.
        self._get_rng(reset=True)
        X = np.asarray(X)
        y = np.asarray(y).ravel()
        # Sorted unique labels; they define the column order of _predict_proba.
        self._classes = np.unique(y)
        self.root = self._build_tree(
            X, y, depth=0, node_id='0', parent_target_value=None
        )
        return self

    def _classify(self, x):
        """Route one series from the root to a leaf and return the leaf label."""
        node = self.root
        while not node._is_leaf:
            exemplar_labels = list(node.splitter[0].keys())
            position = self._nearest_exemplar(x, node.splitter)
            node = node.children[exemplar_labels[position]]
        return node.label

    def _predict(self, X):
        """Predict one class label per case by descending the fitted tree."""
        return np.array([self._classify(x) for x in X])

    def _predict_proba(self, X):
        """Return one-hot class probabilities derived from ``_predict``.

        Columns follow the sorted unique training labels.
        """
        predictions = self._predict(X)
        return (predictions[:, None] == self._classes[None, :]).astype(float)


In [3]:
# Create the ProximityTree instance (n_splitters=6) whose helper methods are
# exercised directly in the cells below.
clf = ProximityTree(n_splitters = 6)

In [4]:
# Draw one random candidate splitter and display it. The result is a
# two-element list: [exemplar series per class label, {distance name: parameters}].
splitter = clf.get_candidate_splitter(X,y)
splitter

[{'1': array([-0.98381213, -1.0871899 , -1.4800256 , -1.5420523 , -1.5834034 ,
         -1.3973234 , -1.2939455 , -0.32219416,  0.60820608,  1.1044196 ,
          1.2698241 ,  1.0423929 ,  0.9183395 ,  0.40145047,  0.40145047,
          0.50482828,  0.58753052,  1.0423929 ,  1.1457707 ,  0.95969062,
          0.44280159,  0.1740193 , -0.34286972, -0.5703009 ]),
  '2': array([ 0.91295034,  0.59654878,  0.04284604, -0.58995708, -0.74815786,
         -0.66905747, -1.7764629 , -1.9346637 , -0.98545903, -0.19445513,
          0.28014722,  0.67564917,  0.51744839,  0.12194644,  0.12194644,
         -0.58995708, -0.51085669, -0.51085669, -0.27355552,  0.04284604,
          0.438348  ,  0.51744839,  2.4949582 ,  2.0203558 ])},
 {'lcss': {'epsilon': 0.384, 'window': 0.145}}]

In [11]:
# Class labels that own an exemplar in this splitter (keys of the exemplar dict).
labels = list(splitter[0].keys())
labels

['1', '2']

In [9]:
# Name of the distance measure chosen for this splitter: the first key of
# splitter[1] (get_candidate_splitter puts exactly one there).
measure = list(splitter[1].keys())[0]
measure

'lcss'

In [33]:
# One empty bucket per exemplar label; each bucket collects class labels.
y_subs = [[] for label in labels]

In [34]:
# Put the label of case 13 into the second bucket by hand, then turn every
# bucket into a NumPy array (the list-of-arrays shape documented for
# gini_gain's y_subs). The bucket left empty becomes a float64 array.
y_subs[1].append(y[13])
y_subs = [np.array(ele) for ele in y_subs]
y_subs

[array([], dtype=float64), array(['2'], dtype='<U1')]

In [13]:
# Parameter dictionary drawn for the chosen distance measure.
splitter[1][measure]

{'epsilon': 0.384, 'window': 0.145}

In [4]:
def get_best_splitter(X, y, n_splitters):
    """Draw candidate splitters from ``clf`` and keep the best one.

    Every case in X is assigned to its nearest exemplar; the candidate whose
    partition has the highest Gini gain wins.

    Parameters
    ----------
    X : np.ndarray
        Cases to split; one case per row.
    y : np.ndarray of shape (n_cases,)
        Class labels of those cases.
    n_splitters : int
        Number of candidate splitters to draw.

    Returns
    -------
    best_splitter : list or None
        Candidate with the highest gain; None if no candidate was drawn.
    gains : list of float
        Gini gain of every candidate, in drawing order.
    splitters : list
        Every candidate, in drawing order.
    best_y_subs : list of np.ndarray or None
        Partition of y produced by ``best_splitter`` (one array per exemplar);
        None if no candidate was drawn.
    """
    max_gain = float('-inf')
    best_splitter = None
    # Partition that belongs to best_splitter, not to the last candidate.
    best_y_subs = None
    # For quality analysis
    gains = []
    splitters = []
    for _ in range(n_splitters):
        splitter = clf.get_candidate_splitter(X, y)
        exemplars = splitter[0]
        labels = list(exemplars.keys())
        measure = list(splitter[1].keys())[0]
        params = splitter[1][measure]
        y_subs = [[] for _ in labels]
        for j in range(X.shape[0]):
            min_dist = float('inf')
            # Default to the first bucket so a case is never dropped when no
            # distance is finite.
            sub = 0
            for k, label in enumerate(labels):
                # Parameters are unpacked as keyword arguments; aeon's
                # distance() takes them as **kwargs.
                dist = distance(X[j], exemplars[label], metric=measure, **params)
                if dist < min_dist:
                    min_dist = dist
                    sub = k
            y_subs[sub].append(y[j])
        y_subs = [np.array(ele) for ele in y_subs]
        gini_index = clf.gini_gain(y, y_subs)
        gains.append(gini_index)
        splitters.append(splitter)
        if gini_index > max_gain:
            max_gain = gini_index
            best_splitter = splitter
            best_y_subs = y_subs
    return best_splitter, gains, splitters, best_y_subs

In [5]:
# Evaluate 6 candidate splitters on the whole dataset and keep the winner plus
# the per-candidate diagnostics.
best_splitter, gains, splitters, y_subs = get_best_splitter(X,y,6)

In [6]:
# Gini gain of each candidate splitter, in the order they were drawn.
gains

[0.0029773176118786937,
 0.04615381025799996,
 0.03560220213065943,
 0.22702491070608757,
 0.008705269269608601,
 0.0641445553932831]

In [7]:
# Every candidate splitter that was evaluated (exemplars and distance parameters).
splitters

[[{'1': array([-1.0468305 , -1.3498092 , -1.5129516 , -1.6294819 , -1.652788  ,
          -1.4663395 , -1.0002183 , -0.37095477,  0.65451181,  1.1672451 ,
           1.0740209 ,  0.88757239,  0.65451181,  0.39814517,  0.46806334,
           0.58459363,  0.53798152,  0.39814517,  0.51467546,  1.4236117 ,
           1.0507148 ,  0.46806334,  0.14177852, -0.39426083]),
   '2': array([-0.29284315, -1.0045632 , -1.2269757 , -1.5383533 , -1.4493883 ,
          -1.3604233 , -1.1380107 ,  0.01853437,  0.2409469 ,  1.219562  ,
           1.397492  ,  1.130597  ,  0.81921946,  0.01853437, -0.20387815,
          -0.07043063, -0.33732566, -0.42629067, -0.42629067,  0.46335942,
           1.8867996 ,  1.664387  ,  0.50784193,  0.10749939])},
  {'dtw': {'window': 0.199}}],
 [{'1': array([-0.5855154 , -0.81972157, -1.1877598 , -1.455424  , -1.7900042 ,
          -1.388508  , -1.0539277 , -0.41822529, -0.45168331,  0.11710308,
           0.51859936,  0.65243145,  0.55205738,  0.21747715, -0.11710308,


In [8]:
# The candidate with the highest Gini gain.
best_splitter

[{'1': array([-1.1015447 , -1.2969407 , -1.5314158 , -1.5900346 , -1.6291138 ,
         -1.4141783 , -1.1015447 , -0.10502533,  0.59840013,  1.0087316 ,
          0.95011285,  0.83287528,  0.77425649,  0.36392497,  0.38346457,
          0.42254376,  0.55932093,  1.2627464 ,  1.282286  ,  1.0673504 ,
          0.55932093,  0.28576659, -0.10502533, -0.47627765]),
  '2': array([-0.92155957, -1.2832933 , -1.438322  , -1.5158363 , -1.5933507 ,
         -1.5416745 , -1.4641601 , -0.68901651,  0.65456568,  1.0421375 ,
          1.1713281 ,  1.14549   ,  0.88710876,  0.78375628,  0.88710876,
          0.86127064,  0.70624192,  0.5512132 ,  0.42202262,  0.42202262,
          0.24115578,  0.34450826,  0.37034638, -0.04306353])},
 {'ddtw': {'window': 0.133}}]

In [9]:
# Number of label buckets in the returned partition.
len(y_subs)

2

In [41]:
import numpy as np

def get_best_splitter(X, y, n_splitters):
    """Draw candidate splitters from ``clf`` and keep the best one.

    Every case in X is assigned to its nearest exemplar; the candidate whose
    partition has the highest Gini gain wins.

    Parameters
    ----------
    X : np.ndarray
        Cases to split; one case per row.
    y : np.ndarray of shape (n_cases,)
        Class labels of those cases.
    n_splitters : int
        Number of candidate splitters to draw.

    Returns
    -------
    best_splitter : list or None
        Candidate with the highest gain; None if no candidate was drawn.
    gains : list of float
        Gini gain of every candidate, in drawing order.
    splitters : list
        Every candidate, in drawing order.
    best_y_subs : list of np.ndarray or None
        Partition of y produced by ``best_splitter`` (one array per exemplar);
        None if no candidate was drawn.
    """
    max_gain = float('-inf')
    best_splitter = None
    best_y_subs = None  # Track y_subs for the best splitter
    # For quality analysis
    gains = []
    splitters = []

    for _ in range(n_splitters):
        splitter = clf.get_candidate_splitter(X, y)
        exemplars = splitter[0]
        # Use every exemplar label; taking only the first one would size the
        # buckets by the length of that label and compare a single exemplar.
        labels = list(exemplars.keys())
        measure = list(splitter[1].keys())[0]
        params = splitter[1][measure]
        y_subs = [[] for _ in range(len(labels))]

        for j in range(X.shape[0]):
            min_dist = float('inf')
            # Default to the first bucket so a case is never dropped when no
            # distance is finite.
            sub = 0
            for k in range(len(labels)):
                # Parameters are unpacked as keyword arguments; aeon's
                # distance() takes them as **kwargs.
                dist = distance(X[j], exemplars[labels[k]], metric=measure, **params)
                if dist < min_dist:
                    min_dist = dist
                    sub = k
            y_subs[sub].append(y[j])

        y_subs = [np.array(ele) for ele in y_subs]
        gini_index = clf.gini_gain(y, y_subs)
        gains.append(gini_index)
        splitters.append(splitter)

        if gini_index > max_gain:
            max_gain = gini_index
            best_splitter = splitter
            best_y_subs = y_subs  # Update best_y_subs

    return best_splitter, gains, splitters, best_y_subs


In [11]:
# Re-run the search with 6 candidates; y_subs now receives the fourth return
# value of get_best_splitter.
best_splitter, gains, splitters, y_subs = get_best_splitter(X,y,6)

In [12]:
# Class labels that ended up in each bucket (one array per exemplar).
y_subs

[array(['1', '1', '2', '1', '2', '1', '1', '2', '2', '1', '1', '2', '1',
        '1', '1', '2', '1', '1', '2', '1', '1', '1', '1', '1', '2', '2',
        '1', '1', '2', '1', '2', '2', '1', '2', '1', '2', '1', '1', '2',
        '2', '1', '2', '2', '1', '1', '2', '2', '2', '1', '2', '2', '1',
        '1', '2', '1', '1', '2', '1', '2', '2', '2', '2', '1', '2', '2',
        '2', '2', '2', '1', '1', '2', '1', '1', '1', '2', '1', '1', '2',
        '1', '2', '1', '1', '1', '2', '1', '1', '1', '2', '2', '1', '2',
        '1', '2', '1', '2', '2', '2', '1', '2', '1', '1', '2', '2', '2',
        '2', '1', '2', '2', '2', '2', '2', '2', '2', '2', '1', '2', '2',
        '1', '2', '2', '2', '2', '2', '1', '1', '2', '1', '2', '1', '1',
        '1', '2', '2', '2', '2', '1', '2', '2', '1', '1', '1', '2', '1',
        '2', '1', '1', '1', '2', '2', '1', '2', '1', '1', '2', '1', '1',
        '1', '2', '1', '2', '1', '1', '1', '1', '1', '2', '2', '1', '2',
        '1', '1', '2', '1', '1', '2', '1', '1', '2'

In [13]:
# Number of label buckets in the returned partition.
len(y_subs)

2

In [14]:
# Gini gain of each candidate splitter, in the order they were drawn.
gains

[0.04016552827019579,
 0.004428874219237711,
 0.2046877647162143,
 0.08169976407282095,
 0.11379071802913782,
 0.02122238041697176]

In [15]:
# Every candidate splitter that was evaluated (exemplars and distance parameters).
splitters

[[{'1': array([-0.32859526, -0.94077274, -1.3368876 , -1.4089085 , -1.5529502 ,
          -1.372898  , -1.1928458 , -0.9047623 , -0.50864746, -0.14854306,
           0.39161353,  0.71570749,  1.0398015 ,  0.31959265,  0.24757177,
          -0.11253262, -0.1845535 ,  0.35560309,  1.2558641 ,  1.5439476 ,
           1.6159685 ,  1.3999058 ,  0.85974925,  0.24757177]),
   '2': array([ 0.87310195,  0.33234204, -0.41120284, -0.6815828 , -0.95196275,
          -1.0195578 , -0.41120284, -1.6279127 , -1.2899377 , -0.54639282,
           0.12955707,  0.53512701,  0.53512701,  0.26474705, -0.27601286,
          -0.27601286, -0.47879783, -0.61398781, -0.47879783, -0.34360785,
           0.67031698,  2.3601917 ,  2.0898117 ,  1.6166468 ])},
  {'twe': {'lmbda': 4, 'nu': 0.001}}],
 [{'1': array([ 0.43000648,  0.00754397, -0.53562211, -0.89773284, -1.0184364 ,
          -0.83738105, -0.53562211, -0.05280782, -0.53562211,  0.3696547 ,
           0.79211719,  0.43000648,  0.00754397, -0.17351139, -0.53

In [16]:
# The candidate with the highest Gini gain.
best_splitter

[{'1': array([-0.58060799, -1.1742341 , -1.3929384 , -1.3929384 , -1.4866688 ,
         -1.6428862 , -1.2054775 , -0.20568623,  0.41918333,  1.1690268 ,
          1.575192  ,  1.3252442 ,  0.82534857,  0.41918333, -0.01822537,
         -0.04946885,  0.04426159,  0.45042681,  1.2002703 ,  1.2002703 ,
          0.88783553,  0.35669638, -0.14319928, -0.58060799]),
  '2': array([-0.75366866, -1.1758313 , -1.3706757 , -1.4356238 , -1.5330459 ,
         -1.4356238 , -1.0784092 , -0.81861676,  0.41539726,  1.2597227 ,
          1.2921967 ,  1.3896189 ,  1.0324043 ,  0.77261183,  0.77261183,
          1.0324043 ,  1.0973524 ,  0.74013778,  0.3829232 ,  0.22055294,
         -0.29903191, -0.26655785,  0.02570862, -0.26655785])},
 {'twe': {'lmbda': 5, 'nu': 1e-05}}]