In [13]:
from typing import Any, Dict
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN, AffinityPropagation, KMeans
from sklearn.metrics import silhouette_score
from sklearn.mixture import GaussianMixture
import matplotlib.pyplot as plt
from sklearn.preprocessing import (
    MaxAbsScaler,
    MinMaxScaler,
    OneHotEncoder,
    OrdinalEncoder,
    RobustScaler,
    StandardScaler,
)
from pyclustering.cluster.clarans import clarans as Clarans


class ScalerOptions:
    def __init__(self, attributes: list[str] = [], scalers: list[str] = [""]):
        self.attributes: list[str] = attributes
        self.scalers: list[str] = [name.lower() for name in scalers]


class EncoderOptions:
    def __init__(self, attributes: list[str] = [], encoders: list[str] = [""]):
        self.attributes: list[str] = attributes
        self.encoders: list[str] = [name.lower() for name in encoders]


class ModelOption:
    def __init__(self, model_name: str, options: Dict[str, Any] = {}):
        self.model_name: str = model_name.lower()
        self.options: Dict[str, Any] = options

In [14]:
from matplotlib.pyplot import axhline


def kmeans(dataset: pd.DataFrame, options: Dict[str, Any] = {}, k: int = 1):
    """
    K-Means Clustering
    return list of results : list[Results]
    # Parameters
    dataset: dataset
    options: options dictionaly for model
    k: number of clusters
    """
    n_init = options["n_init"] if "n_init" in options else 3
    max_iter = options["max_iter"] if "max_iter" in options else 300
    tol = options["tol"] if "tol" in options else 1e-4
    verbose = options["verbose"] if "verbose" in options else 0
    random_state = options["random_state"] if "random_state" in options else None
    best_predicted = None
    model = KMeans(
        n_clusters=k,
        n_init=n_init,
        max_iter=max_iter,
        tol=tol,
        verbose=verbose,
        random_state=random_state,
    )
    predicted = model.fit_predict(dataset)
    _, sizes = np.unique(predicted, return_counts=True)
    silhouette = silhouette_score(dataset, predicted)
    result = {
        "predicted": predicted,
        "k": k,
        "ninti": n_init,
        "max_iter": max_iter,
        "tol": tol,
        "verbose": verbose,
        "silhouette": silhouette,
        "centroids": pd.DataFrame(
            model.cluster_centers_,
        ),
        "random_state": random_state,
        "sizes": sizes,
    }

    return [result]


def gmm(dataset: pd.DataFrame, options: Dict[str, Any] = {}, k: int = 1):
    """
    Gaussian Mixture Clustring.
    return list of results : list[Results]
    # Parameters
    dataset: dataset
    options: options dictionary for model
        covariance_types :list[str]
    k: number of clusters
    """
    covariance_types = (
        options["covariance_types"] if "covariance_types" in options else ["full"]
    )
    tol = options["tol"] if "tol" in options else 1e-3
    max_iter = options["max_iters"] if "max_iters" in options else 100
    n_init = options["n_inits"] if "n_inits" in options else 1
    random_state = options["random_state"] if "random_state" in options else None
    verbose = options["verbose"] if "verbose" in options else 0
    results = []
    for covariance_type in covariance_types:
        model = GaussianMixture(
            n_components=k,
            covariance_type=covariance_type,
            max_iter=max_iter,
            tol=tol,
            verbose=verbose,
            random_state=random_state,
        )
        predicted = model.fit_predict(dataset)
        _, sizes = np.unique(predicted, return_counts=True)

        result = {
            "predicted": predicted,
            "k": k,
            "ninti": n_init,
            "max_iter": max_iter,
            "tol": tol,
            "verbose": verbose,
            "random_state": random_state,
            "mean": model.means_,
            "covariances": model.covariances_,
            "sizes": sizes,
        }
        results.append(result)

    return results


def clarans(dataset: pd.DataFrame, options: Dict[str, Any] = {}, k: int = 1):
    """
    Clarans Clustring.
    return list of results : list[Results]
    # Parameters
    dataset: dataset
    options: options for model
        numlocals: The number of local minima obtained
        maxneighbors: The maximum number of neighbors examined
    k: number of clusters
    """
    numlocals = options["numlocals"] if "numlocals" in options else [3]
    maxneighbors = options["maxneighbors"] if "maxneighbors" in options else [3]

    results = []
    for numlocal in numlocals:
        for maxneighbor in maxneighbors:
            model = Clarans(dataset.values, k, numlocal, maxneighbor)
            model.process()
            clusters = model.get_clusters()
            medoids = model.get_medoids()
            # labeling
            print(dataset.values.shape)
            predicted = np.zeros(dataset.values.shape[0])
            sizes = []
            for idx, cluster in enumerate(clusters):
                predicted[cluster] = idx
                sizes.append(len(cluster))
            results.append(
                {
                    "numlocals": numlocals,
                    "maxneighbor": maxneighbor,
                    "medoids": medoids,
                    "predicted": predicted,
                    "k": k,
                    "sizes": sizes,
                }
            )
    return results


def dbscan(dataset: pd.DataFrame, options: Dict[str, Any] = {}, k: int = 1):
    """
    dbscan Clustring.
    return list of results : list[Results]
    # Parameters
    dataset: dataset
    options: options for model
        min_samples:list[int],  The number of samples (or total weight) in a neighborhood for a point to be considered as a core point.
        eps:list[float],   The maximum distance between two samples for one to be considered as in the neighborhood of the other.
        algorithms:list[str]    The algorithm to be used by the NearestNeighbors module to compute pointwise distances and find nearest neighbors
        metrics:list[str]   The metric to use when calculating distance between instances in a feature array
    k: number of clusters
    """
    eps_list = options["eps"] if "eps" in options else [0.8]
    min_samples = options["min_samples"] if "min_samples" in options else [3]
    algorithms = options["algorithms"] if "algorithms" in options else ["auto"]
    metrics = options["metrics"] if "metrics" in options else ["euclidean"]
    results = []
    for eps in eps_list:
        for min_sample in min_samples:
            for algorithm in algorithms:
                for metric in metrics:
                    model = DBSCAN(
                        eps=eps,
                        min_samples=min_sample,
                        metric=metric,
                        algorithm=algorithm,
                    )
                    predicted = model.fit_predict(dataset)
                    _, sizes = np.unique(predicted, return_counts=True)
                    result = {
                        "predicted": predicted,
                        "k": k,
                        "eps": eps,
                        "min_sample": min_sample,
                        "algorithm": algorithm,
                        "sizes": sizes,
                    }
                    results.append(result)
    return results


def affinity(dataset, options):
    """
    dbscan Clustring.
    return list of results : list[Results]
    # Parameters
    dataset: dataset
    options: options for model
        convergence_iters:list[int],   Number of iterations with no change in the number of estimated clusters that stops the convergence.
        affinities:list[str]   Which affinity to use. At the moment ‘precomputed’ and euclidean are supported. ‘euclidean’ uses the negative squared euclidean distance between points.
    """
    convergence_iters = (
        options["convergence_iters"] if "convergence_iters" in options else [15]
    )
    max_iter = options["max_iter"] if "max_iter" in options else 200
    affinities = options["affinities"] if "affinities" in options else ["euclidean"]

    verbose = options["verbose"] if "verbose" in options else 0
    random_state = options["random_state"] if "random_state" in options else None
    results = []
    for convergence_iter in convergence_iters:
        for affinity in affinities:
            model = AffinityPropagation(
                max_iter=max_iter,
                verbose=verbose,
                random_state=random_state,
                affinity=affinity,
                convergence_iter=convergence_iter,
            )
            predicted = model.fit_predict(dataset)
            silhouette = silhouette_score(dataset, predicted)
            _, sizes = np.unique(predicted, return_counts=True)
            results.append(
                {
                    "max_iter": max_iter,
                    "verbose": verbose,
                    "convergence_iter": convergence_iter,
                    "affinity": affinity,
                    "k": len(model.cluster_centers_indices_),
                    "silhouette": silhouette,
                    "predicted": predicted,
                    "sizes": sizes,
                    "random_state": random_state,
                }
            )
    return results


def AutoML(
    X: pd.DataFrame,
    Y: pd.DataFrame,
    scale_option: ScalerOptions,
    encode_option: EncoderOptions,
    mode_list: list[ModelOption],
    k_list: list[int],
):
    def evaluate(evaluttes, y_custer):
        for evalutte in evaluttes:
            score = np.sum(np.power(evalutte["sizes"] - y_custer["sizes"], 2))
            evalutte["score"] = score
        return evaluttes

    def predict_by_y(y, k):
        ret = kmeans(y, k=k)[0]
        return {"predicted": ret["predicted"], "sizes": ret["sizes"]}

    def scale(scaler_name: str, attrs: list[str], dataset: pd.DataFrame):
        scaler_name = scaler_name.split("scaler")[0]
        if scaler_name == "":
            return dataset.copy()
        elif scaler_name == "standard":
            scaler = StandardScaler()
            pass
        elif scaler_name == "robust":
            scaler = RobustScaler()
            pass
        elif scaler_name == "minmax":
            scaler = MinMaxScaler()
            pass
        elif scaler_name == "maxabs":
            scaler = MaxAbsScaler()
            pass
        else:
            raise Exception("Not Defined Scaler")
        new_data = dataset.copy()
        new_data[attrs] = scaler.fit_transform(dataset[attrs])
        return new_data

    def encode(encoder_name: str, attrs: list[str], dataset: pd.DataFrame):
        encoder_name = encoder_name.split("encoder")[0]
        if encoder_name == "":
            return dataset.copy()
        copied = dataset.copy()
        if encoder_name == "onehot":
            encoder = OneHotEncoder()
            copied.drop(columns=attrs)
            encoded = pd.get_dummies(dataset[attrs])
            copied = pd.concat([copied, encoded], axis=1)

        elif encoder_name == "ordinal":
            encoder = OrdinalEncoder()
            copied[attrs] = encoder.fit_transform(dataset[attrs])
        else:
            raise Exception("Not Define Encoder")
        return copied

    all_results = []
    for scaler in scale_option.scalers:
        scaled = scale(scaler, scale_option.attributes, X)
        for encoder in encode_option.encoders:
            encoded = encode(encoder, encode_option.attributes, scaled)
            aff_modelList=[model for model in mode_list if model.model_name in ['affinity','affinitypropagation']]
            for model in aff_modelList:
                results = affinity(encoded, model.options)
                for result in results:
                    print(type(result['sizes']))
                    y_cluster = predict_by_y(Y, len(result['sizes']))
                    result = evaluate ([result], y_cluster)
                    all_results.extend(result)

            for k in k_list:
                y_cluster = predict_by_y(Y, k)
                for model in mode_list:
                    if model.model_name == "kmeans":
                        results = kmeans(encoded, model.options, k)

                    elif model.model_name == "gmm":
                        results = gmm(encoded, model.options, k)
                    elif model.model_name == "clarans":
                        results = clarans(encoded, model.options, k)
                    elif model.model_name == "dbscan":
                        results = dbscan(encoded, model.options, k)
                        print(results)
                    elif (
                        model.model_name == "affinity"
                        or model.model_name == "affinitypropagation"
                    ):
                        continue
                    else:
                        raise Exception("Not define Model")
                    result = evaluate(results, y_cluster)
                    all_results.extend(result)
                
            
    return all_results

def load_data():
    pd.set_option("display.max_columns", None)

    df = pd.read_csv("housing.csv")
    df.dropna(axis=0, inplace=True)
    df.reset_index(inplace=True)
    y = df.loc[:, ["median_house_value"]]
    x = df.loc[
        :,
        [
            "longitude",
            "latitude",
            "housing_median_age",
            "total_rooms",
            "total_bedrooms",
            "population",
            "households",
            "median_income",
            "ocean_proximity",
        ],
    ]
    x2 = df.iloc[:,[0]]
    x1 = df.sample(n=2, axis=1)
    copied = pd.concat([x1, x2], axis=1)
    print(copied)

    # set columns sets
    numerical_columns = [
        "longitude",
        "latitude",
        "housing_median_age",
        "total_rooms",
        "total_bedrooms",
        "population",
        "households",
        "median_income",
    ]
    categorical_columns = ["ocean_proximity"]
    return x, y, numerical_columns, categorical_columns





In [15]:
def main():
    x, y, numericals, categoricals = load_data()
    scale_option = ScalerOptions(
        attributes=numericals,
        scalers=["standardscaler", "robustscaler", "minmaxscaler", "maxabsscaler"],
    )
    encode_option = EncoderOptions(
        attributes=categoricals, encoders=["onehot", "ordinal"]
    )
    # K-means, EM(GMM), CLARANS, DBSCAN, Affinity Propagation
    k_means = ModelOption(model_name="kmeans")
    gmm = ModelOption(model_name="gmm")
    clarans = ModelOption(model_name="clarans")
    dbscan = ModelOption("dbscan")
    affinity = ModelOption("affinity")
    for i in range(5):
        result1 = AutoML(
            x.iloc[:100, :],
            y.iloc[:100],
            ScalerOptions(numericals, ["standard"]),
            EncoderOptions(categoricals, ["ordinal"]),
            [
                k_means,
                # clarans,
                # dbscan,
                affinity,
            ],
            k_list=[3],
        )
        print(result1)
        print(type(result1))
    # AutoML(
    #     x,
    #     y,
    #     scale_option,
    #     encode_option,
    #     [k_means, gmm, clarans, dbscan, affinity],
    #     k_list=[3, 4, 5],
    # )




In [16]:
if __name__ == "__main__":
    main()


       population  housing_median_age  index
0           322.0                41.0      0
1          2401.0                21.0      1
2           496.0                52.0      2
3           558.0                52.0      3
4           565.0                52.0      4
...           ...                 ...    ...
20428       845.0                25.0  20635
20429       356.0                18.0  20636
20430      1007.0                17.0  20637
20431       741.0                18.0  20638
20432      1387.0                16.0  20639

[20433 rows x 3 columns]
<class 'numpy.ndarray'>
[{'max_iter': 200, 'verbose': 0, 'convergence_iter': 15, 'affinity': 'euclidean', 'k': 13, 'silhouette': 0.23085639527322566, 'predicted': array([ 1,  0,  1,  1,  5,  5,  2,  2,  2,  2,  3,  2,  2,  5,  2,  5,  3,
        3,  3,  5,  4,  3,  3,  3,  3,  4,  4,  3,  3,  4,  3,  3,  3,  4,
        3,  4,  3,  3,  5,  2,  5,  4,  4,  5,  5,  5,  5,  4,  4,  4,  9,
        3, 12,  3,  4,  4,  4,  4,  4,  9,  7,