In [None]:
import numpy as np
import holoviews as hv
from sklearn.mixture import BayesianGaussianMixture
import pandas as pd
import logging
from scipy.spatial.distance import cdist
from scipy.optimize import linear_sum_assignment
from dpm.dpgmm import WeightedDPGMM
# One seed drives NumPy's global RNG here and is reused as `random_state`
# by the mixture models fitted further down.
random_seed = 43
np.random.seed(seed=random_seed)

# Render every HoloViews object with the Bokeh backend.
hv.extension("bokeh")

### make some data

In [None]:
num_clusters = 10
N = 200

# The random draws happen in a fixed order (centres, labels, noise, weights)
# so that a given seed always reproduces the same data set.
x_means = 20 * np.random.rand(1, num_clusters, 2) - 10  # cluster centres in [-10, 10)
y = np.random.randint(num_clusters, size=N)             # true cluster id of each point
noise = .08 * np.random.randn(N, 1, 2)                  # Gaussian jitter, one draw per point

# One-hot mask of shape (N, num_clusters, 1): 1.0 in the slot of the point's
# own cluster, 0.0 everywhere else.
one_hot = np.eye(num_clusters)[y][:, :, np.newaxis]

# Masking keeps only the centre of each point's own cluster. The jitter is
# broadcast over all `num_clusters` slots before the sum, so every point ends
# up with its centre plus `num_clusters` times its jitter draw.
x = (noise + x_means * one_hot).sum(axis=1)

# One row per point: coordinates, a random integer weight in [1, 50) and the
# ground-truth label stored as a string.
x_df = pd.DataFrame(x, columns=["x", "y"]).assign(
    weight=np.random.randint(1, 50, size=len(x)),
    true_cluster=y.astype(str),
)

### fit weighted model

In [None]:
# Fit the weighted mixture directly on the N points; the `weight` column is
# handed over as per-sample weights.
model = WeightedDPGMM(
    n_components=20,
    verbose=1,
    max_iter=1000,
    random_state=random_seed,
)
weighted_labels = model.fit_predict(x, sample_weight=x_df["weight"])

# Labels are stored as strings, matching the `true_cluster` column.
x_df["predicted_cluster"] = weighted_labels.astype(str)

### fit unweighted model
Replicate each data point "sample_weight" times (its value in the `weight` column), so that a plain, unweighted model sees the same effective data set.

In [None]:
# Baseline without sample-weight support: the weights are emulated by
# repeating every point `weight` times before fitting.
# NOTE(review): the weighted model is built with max_iter=1000 while this one
# keeps the library default -- confirm the comparison is meant to differ there.
model_unweighted = BayesianGaussianMixture(n_components=20, verbose=1, random_state=random_seed)

# A single np.repeat along axis 0 yields the same rows, in the same order, as
# tiling row by row. It also keeps the coordinates as floats: iterating over
# x_df.T mixes the numeric and string columns, which gives object-dtype rows.
x_dupl = pd.DataFrame(
    np.repeat(x_df[["x", "y"]].to_numpy(), x_df["weight"].to_numpy(), axis=0),
    columns=["x", "y"],
)

# Fit on the coordinate columns only and store the labels as strings, like the
# other cluster columns.
x_dupl["predicted_cluster"] = model_unweighted.fit_predict(x_dupl[["x", "y"]].to_numpy()).astype(str)

### plot and compare

In [None]:
def _active_weights_desc(weights, floor=1e-3):
    """Return the mixture weights that are >= `floor`, sorted largest first."""
    return np.sort(weights[weights >= floor])[::-1]


# --- scatter panels: ground truth, weighted fit, unweighted fit -------------
# In the first two panels the marker opacity follows the normalised sample
# weight (0.1 for the lightest point up to 1.0 for the heaviest).
true_panel = hv.Points(
    x_df, kdims=["x", "y"], vdims=["true_cluster", "weight"], label="true"
).opts(color="true_cluster", fill_alpha=.9 * hv.dim("weight").norm() + .1)

weighted_panel = hv.Points(
    x_df, kdims=["x", "y"], vdims=["predicted_cluster", "weight"], label="predicted weighted"
).opts(color="predicted_cluster", fill_alpha=.9 * hv.dim("weight").norm() + .1)

# The duplicated points overlap, so a low constant alpha is used here.
unweighted_panel = hv.Points(
    x_dupl, kdims=["x", "y"], vdims=["predicted_cluster"], label="predicted unweighted"
).opts(color="predicted_cluster", fill_alpha=.1)

plot_points = true_panel + weighted_panel + unweighted_panel

# --- cluster priors: share of the total weight per cluster, largest first ----
cluster_mass = x_df.groupby("true_cluster")["weight"].sum().sort_values(ascending=False)
true_prior = cluster_mass / cluster_mass.sum()

# Replace the cluster labels by their rank so all three curves share an axis.
true_prior_ranked = true_prior.to_frame("prior").reset_index(drop=True).reset_index()

curve_style = dict(width=400, line_width=3)
true_curve = hv.Curve(true_prior_ranked, ["index"], ["prior"], label="true").opts(**curve_style)
weighted_curve = hv.Curve(
    _active_weights_desc(model.weights_), ["index"], ["prior"], label="predicted weighted"
).opts(**curve_style)
unweighted_curve = hv.Curve(
    _active_weights_desc(model_unweighted.weights_), ["index"], ["prior"], label="predicted unweighted"
).opts(**curve_style)

prior_plots = true_curve * weighted_curve * unweighted_curve

# --- shared styling and final layout -----------------------------------------
points_style = hv.opts.Points(
    cmap="Category20", size=10, width=400, height=400, show_legend=False, tools=['hover']
)
plot_points = plot_points.opts(points_style)
(plot_points + prior_plots).cols(3)