In [6]:
import matplotlib.pyplot as plt
from sklearn.base import BaseEstimator, clone
from sklearn.cluster import AgglomerativeClustering
from sklearn.datasets import make_blobs
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.metaestimators import available_if
from sklearn.utils.validation import check_is_fitted

import numpy as np; import pandas as pd
from typing import List, Union

In [7]:
def _classifier_has(attr):
    """Check if we can delegate a method to the underlying classifier.

    First, we check the first fitted classifier if available, otherwise we
    check the unfitted classifier.
    """
    return lambda estimator: (
        hasattr(estimator.classifier_, attr)
        if hasattr(estimator, "classifier_")
        else hasattr(estimator.classifier, attr)
    )

In [8]:
class InductiveClusterer(BaseEstimator):
    """Cluster the training data, then train a classifier on the cluster labels.

    Parameters
    ----------
    clusterer : estimator exposing ``fit_predict``
        Cloned in ``fit`` and used to label the training data.
    classifier : estimator exposing ``fit``
        Cloned in ``fit`` and trained on the labels produced by the clusterer;
        ``predict`` / ``decision_function`` are delegated to it when it has them.
    """

    def __init__(self, clusterer, classifier):
        self.clusterer = clusterer
        self.classifier = classifier

    def fit(self, X, y=None):
        """Fit fresh clones of both estimators on ``X``; the ``y`` argument is not used."""
        self.clusterer_ = clone(self.clusterer)
        self.classifier_ = clone(self.classifier)
        # The cluster assignments become the classifier's training targets.
        cluster_labels = self.clusterer_.fit_predict(X)
        self.classifier_.fit(X, cluster_labels)
        return self

    @available_if(_classifier_has("predict"))
    def predict(self, X):
        """Return the fitted classifier's ``predict`` output for ``X``."""
        check_is_fitted(self)
        fitted_classifier = self.classifier_
        return fitted_classifier.predict(X)

    @available_if(_classifier_has("decision_function"))
    def decision_function(self, X):
        """Return the fitted classifier's ``decision_function`` output for ``X``."""
        check_is_fitted(self)
        fitted_classifier = self.classifier_
        return fitted_classifier.decision_function(X)


In [55]:
# Scratch check of the sum-of-squared-deviations expression used for SSE/SSTO.
((np.array([1, 2, 3]) - 3) ** 2).sum()

5

In [185]:
class one_way_ANOVA(BaseEstimator):
    """Predict each observation by the mean response of its segment.

    ``fit`` stores the mean of ``y`` for every distinct value in ``X``;
    ``predict`` looks those means up. ``score`` is a *property* (not the
    usual ``score(X, y)`` method): it is the R^2 of the most recent
    ``predict`` output against the response passed to ``fit``, so it is only
    meaningful when ``predict`` was called on the training ``X``.

    Attributes set by the methods
    -----------------------------
    response : np.ndarray, the ``y`` given to ``fit``.
    segment_means : pd.Series, mean of ``y`` indexed by segment value.
    y_pred : np.ndarray, output of the latest ``predict`` call.
    _rsq : cached R^2, present only after ``score`` has been read.
    """

    def __init__(self):
        pass

    def fit(self, X, y):
        """Store the per-segment mean of ``y``; returns ``self``."""
        # Materialise y once. Keeping a generator here (as before) meant the
        # data was exhausted after the first `score` evaluation.
        observed = list(y)
        self.response = np.array(observed)
        self.segment_means = pd.DataFrame(zip(X, observed), columns = ['x','y']).groupby(
            'x')['y'].mean()
        # A re-fit makes earlier predictions and any cached R^2 stale.
        self.__dict__.pop('_rsq', None)
        self.__dict__.pop('y_pred', None)
        return self

    def fit_transform(self, X, y):
        """Fit on ``(X, y)`` and return the in-sample predictions for ``X``."""
        return self.fit(X, y).predict(X)

    def predict(self, X):
        """Return the stored segment mean for every element of ``X``.

        Raises
        ------
        ValueError
            If ``fit`` has not been called.
        KeyError
            If ``X`` contains a segment that was not seen during ``fit``.
        """
        if not hasattr(self, 'segment_means'):
            raise ValueError('Fit the model first')
        # New predictions invalidate the cached R^2.
        self.__dict__.pop('_rsq', None)
        # Evaluate the lookup once and keep a concrete array for `score`.
        predictions = np.array([self.segment_means.at[x] for x in X])
        self.y_pred = predictions
        # Hand back a copy so callers cannot mutate the stored predictions.
        return predictions.copy()

    @property
    def score(self):
        """R^2 = (SSTO - SSE) / SSTO for the latest predictions (cached).

        Raises
        ------
        AttributeError
            If ``predict`` has not been called since the last ``fit``.
        """
        if hasattr(self, '_rsq'):
            return self._rsq
        if not hasattr(self, 'y_pred'):
            raise AttributeError('Call predict before reading score')
        y = np.asarray(self.response)
        y_pred = np.asarray(self.y_pred)
        ssto = np.square(y - np.mean(y)).sum()
        sse  = np.square(y - y_pred).sum()
        # NOTE: a constant response gives ssto == 0 and hence nan/inf here.
        rsq  = (ssto-sse)/ssto
        self._rsq = rsq
        return rsq

In [186]:
class ClusteredSegmentation(BaseEstimator):
    """Merge fine-grained segments into ``n_clusters`` groups ordered by mean response.

    Parameters
    ----------
    clusterer : estimator exposing ``fit_predict`` and an ``n_clusters`` parameter
        Cloned on every ``fit_transform`` call; clusters the per-segment means.
    regressor : estimator exposing ``fit_transform(labels, y)``
        Cloned on every ``fit_transform`` call and fitted on the ranked
        cluster labels. The default instance is created once when the class
        is defined; it is only ever cloned, never fitted directly.
    """

    def __init__(self, clusterer, regressor = one_way_ANOVA()):
        self.clusterer = clusterer
        self.regressor = regressor

    def fit_transform(self, X: Union[pd.Series, np.ndarray, List], y: Union[pd.Series, np.ndarray, List], n_clusters):
        """Cluster segments by mean response and return the regressor's predictions.

        Steps: (1) replace each observation by the mean of ``y`` within its
        ``X`` segment, (2) cluster those means into ``n_clusters`` groups,
        (3) relabel the groups 0..k-1 in increasing order of their mean ``y``
        and store the result in ``self.clusterer_.labels_``, (4) fit the
        regressor on the relabelled groups.

        Parameters
        ----------
        X, y : one value per observation; numpy inputs are flattened to 1-D.
        n_clusters : forwarded to the clusterer via ``set_params``.

        Returns
        -------
        Whatever ``regressor.fit_transform(labels, y)`` returns.
        """
        if isinstance(X, np.ndarray):
            X = X.reshape(-1)
        if isinstance(y, np.ndarray):
            y = y.reshape(-1)

        # Pair the inputs once; both grouping steps below reuse this frame.
        observations = pd.DataFrame(zip(X, y), columns = ['x','y'])
        segment_means = observations.groupby('x')['y'].mean()
        # Per-observation segment mean, as the single feature column for clustering.
        segment_feature = observations['x'].map(segment_means).to_numpy().reshape(-1,1)

        self.clusterer_ = clone(self.clusterer)
        self.regressor_ = clone(self.regressor)
        self.clusterer_.set_params(n_clusters = n_clusters)

        group_id_raw = self.clusterer_.fit_predict(segment_feature)

        # Rank the raw cluster ids by their mean response (lowest mean -> 0).
        cluster_means = observations['y'].groupby(group_id_raw).mean().sort_values()
        rank_of_label = pd.Series(np.arange(len(cluster_means)), index = cluster_means.index)

        # Overwrite the clusterer's labels with the ordered ids so that
        # downstream code sees groups sorted by mean response.
        self.clusterer_.labels_ = pd.Series(group_id_raw).map(rank_of_label).to_numpy()
        return self.regressor_.fit_transform(self.clusterer_.labels_, y)

    @property
    def score(self):
        """The fitted regressor's ``score`` attribute; AttributeError before fitting."""
        if hasattr(self, "clusterer_"):
            return self.regressor_.score
        else:
            raise AttributeError("Fit-transform the estimator first")
            

In [187]:
# Template clusterer with default settings; ClusteredSegmentation.fit_transform
# clones it and overrides n_clusters via set_params.
clusterer = AgglomerativeClustering()

In [188]:
# Only the clusterer is passed, so the default regressor (one_way_ANOVA) is used.
c = ClusteredSegmentation(clusterer)

In [189]:
# Display the estimator's repr.
c

In [193]:
# Round the largest group index up to the nearest power of two.
max_group_index = 2**10 - 1
p = 1
while p < max_group_index:
    p *= 2


In [194]:
# Display the power of two computed in the previous cell.
p

1024

In [190]:
# Build the demo data from the project-local `Dissertation` helpers.
# NOTE(review): this import sits mid-notebook and the cell's execution count
# (190) is lower than the cells shown above it; move imports to the top cell
# and re-run in order so the notebook survives Restart & Run All.
from Dissertation import RandomGenerator, convert_to_int
# NOTE(review): the meaning of the positional arguments (10, 1_000) is not
# visible here -- confirm against RandomGenerator's signature.
rg = RandomGenerator(10, 1_000, order_by = 'var', use_dask = False, verbose= False)
# Apply convert_to_int to every row of the generator's private `_X` frame
# (presumably collapsing each row into one integer segment id -- TODO confirm),
# then shape the result as a single column.
X = rg._X.apply(convert_to_int, axis = 1).to_numpy().reshape(-1,1)
# Calling the generator yields an object with a 'y' column; take it as a single column.
y = rg()['y'].to_numpy().reshape(-1,1)


In [191]:
# Fit with the number of clusters taken from the generator's config and
# show only the shape of the returned predictions.
c.fit_transform(X, y, n_clusters = rg.config.parameter_size).shape

(1000,)

In [192]:
# `score` is a property that forwards to the fitted regressor's `score`.
c.score

0.8079594750889644

In [153]:
# Debug peek at the regressor's cached R^2; `_rsq` is only set once `score`
# has been read. NOTE(review): leftover inspection cell with an older
# execution count -- consider removing.
c.regressor_._rsq

In [127]:
# Debug: re-fit the regressor directly on the stored cluster labels.
# NOTE(review): `y` here is the (n, 1) column from the data cell, not the
# flattened copy used inside ClusteredSegmentation.fit_transform; the
# execution count (127) also predates the current class definitions.
c.regressor_.fit_transform(c.clusterer_.labels_, y)

In [131]:
# Debug: the same fit-then-predict spelled out explicitly.
# NOTE(review): this dumps the whole prediction array (truncated below);
# prefer showing a slice such as `[:5]`.
c.regressor_.fit(c.clusterer_.labels_, y).predict(c.clusterer_.labels_)

array([[ 1.93799718e+01],
       [-4.16925235e+00],
       [ 1.03972759e+01],
       [-8.92875311e+00],
       [ 5.20566902e-01],
       [-6.72663756e-01],
       [ 1.45745339e+00],
       [ 3.58405995e-03],
       [-3.14639506e+00],
       [ 1.45745339e+00],
       [-9.44725773e+00],
       [-1.24076489e+01],
       [ 4.83275832e+00],
       [ 7.70628934e+00],
       [ 8.37334932e+00],
       [-1.81511594e+00],
       [-2.39416913e+00],
       [-1.10916169e+01],
       [-4.80259113e+00],
       [-1.26263291e+00],
       [ 7.70628934e+00],
       [-6.72663756e-01],
       [-1.10916169e+01],
       [ 5.20566902e-01],
       [-1.34315632e+01],
       [ 4.83275832e+00],
       [ 4.83275832e+00],
       [ 1.45745339e+00],
       [-6.73242662e+00],
       [-1.26263291e+00],
       [ 1.45745339e+00],
       [ 2.26847775e+00],
       [-3.72323806e+00],
       [-6.72663756e-01],
       [-4.80259113e+00],
       [ 1.64502166e+01],
       [-1.10916169e+01],
       [-9.44725773e+00],
       [ 8.3

In [33]:
# Parameter-handling experiment: rebinds `c` with a placeholder string instead
# of a real clusterer. NOTE(review): execution counts 33-37 predate the class
# definition shown above (the outputs list no `regressor` parameter).
c = ClusteredSegmentation('hello')

In [36]:
# Replace the `clusterer` parameter through set_params; the call's return value is displayed.
c.set_params(**{'clusterer':'bye'})

ClusteredSegmentation(clusterer='bye')

In [37]:
# Read the parameters back to confirm the update took effect.
c.get_params()

{'clusterer': 'bye'}