In [26]:
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE

import bokeh
from bokeh.plotting import figure, output_file, show
from bokeh.io import output_notebook
from bokeh.models import CategoricalColorMapper, HoverTool
from bokeh.palettes import Dark2_5 as palette_alpha

import pandas as pd
import numpy as np

# Configure Bokeh (via bokeh.io.output_notebook) to render plots inline in this notebook.
output_notebook()
# NOTE(review): no matplotlib import or plotting call is visible in this notebook,
# so this magic may be a leftover — confirm before removing.
%matplotlib inline

In [12]:
# Synthetic 2-feature dataset of 300 samples; `cluster` receives the second
# array returned by make_blobs (the generating blob of each sample).
# random_state is pinned so Restart & Run All regenerates the same points.
features, cluster = make_blobs(n_samples=300, n_features=2, random_state=42)

In [13]:
# Scatter plot of the raw samples: one translucent marker per row of `features`.
# `width`/`height` are used instead of `plot_width`/`plot_height`, which were
# removed in Bokeh 3.0; a title and axis labels let the figure stand alone.
p = figure(width=400, height=400, title="Generated samples",
           x_axis_label="feature 0", y_axis_label="feature 1")

# add a circle renderer with a size, color, and alpha
p.circle(features[:, 0], features[:, 1], size=20, color="navy", alpha=0.5)

# show the results
show(p)

In [23]:
# Fit KMeans with 5 clusters on the samples. random_state is pinned so the
# cluster ids (and therefore the plot colours) are the same on every run.
km = KMeans(n_clusters=5, random_state=42).fit(features)

# One row per sample: the two feature columns plus the assigned cluster.
df = pd.DataFrame(features, columns=['x', 'y'])
# Need to stringify, but only for bokeh (cannot accept integers for categorical labels!)
df['cluster'] = [str(label) for label in km.predict(features)]

In [24]:
# Hover tooltip rows: (label shown, "@column" looked up in the data source).
tooltips = [
    ("x value", "@x"),
    ("y value", "@y"),
    ("cluster", "@cluster")
]

# Map each cluster label to a palette colour. The factors are sorted so that a
# given label always gets the same colour, regardless of the order in which
# labels first appear in `df`.
color_mapper = CategoricalColorMapper(factors=sorted(df.cluster.unique()),
                                      palette=palette_alpha)

# `width`/`height` are used instead of `plot_width`/`plot_height`, which were
# removed in Bokeh 3.0; a title and axis labels let the figure stand alone.
p = figure(width=400, height=400, title="KMeans cluster assignments",
           x_axis_label="x", y_axis_label="y")

p.add_tools(HoverTool(
    tooltips=tooltips
))

# add a circle renderer with a size, color, and alpha;
# the fill/line colour is taken from the `cluster` column via the mapper
p.circle("x", "y", source=df, size=20,
         color={
             'field': 'cluster',
             'transform': color_mapper
         }, alpha=0.5)

# show the results
show(p)

In [9]:
# Distinct cluster labels, each passed through str(), as a plain list.
list(map(str, df.cluster.unique()))

['1', '0']

In [16]:
# Cluster index that the fitted `km` assigns to every row of `features`
# (bare expression, so the notebook displays the resulting array).
km.predict(features)

array([1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1,
       1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1,
       1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1,
       0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], d

In [63]:
def bootstrap_cluster(df, n, random_state=None):
    """Cluster one bootstrap resample of ``df`` with KMeans.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns only; the resampled frame is passed unchanged to
        ``KMeans.fit``.
    n : int
        Number of clusters to fit.
    random_state : int or None, optional
        Seed forwarded to both the resampling and KMeans so a run can be
        reproduced. The default ``None`` keeps the original unseeded
        behaviour.

    Returns
    -------
    pandas.Series
        Cluster label for each distinct row that was drawn, indexed by that
        row's label in ``df``. Rows never drawn by the resample are absent.
    """
    # Draw as many rows as df has, with replacement (rows may repeat).
    X = df.sample(frac=1.0, replace=True, random_state=random_state)
    km = KMeans(n_clusters=n, random_state=random_state).fit(X)
    cluster_series = pd.Series(km.labels_, index=X.index)
    # Rows drawn more than once appear under the same index label; keep only
    # the first entry for each label.
    return cluster_series.loc[~cluster_series.index.duplicated(keep='first')]

In [66]:
# Run one 3-cluster bootstrap on the frame with its `cluster` column removed.
bootstrap_cluster(df.drop(labels='cluster', axis='columns'), 3)

186    2
154    2
3      2
82     0
152    2
22     2
57     0
141    1
130    2
288    1
31     0
117    1
200    1
49     0
270    0
36     2
114    1
6      1
13     0
64     1
83     2
85     1
29     2
26     0
7      2
183    0
177    1
40     0
151    1
176    2
      ..
0      1
25     2
67     1
251    2
28     1
285    2
105    0
231    2
101    0
201    1
227    1
169    2
282    1
206    0
295    1
65     2
108    1
54     2
210    0
102    2
292    1
17     1
276    1
170    0
293    2
48     0
100    0
261    2
225    1
172    0
Length: 199, dtype: int32

In [54]:
# Two small integer Series for experimenting with index handling:
# d1 gets the default 0..3 index, d2 gets the same labels in a shuffled order.
d1 = pd.Series(data=[1, 1, 0, 1])
d2 = pd.Series(data=[0, 1, 1, 1], index=[0, 2, 3, 1])


In [62]:
# Build a label Series keyed by row index, then keep only the first entry for
# each repeated index value and display the result.
# NOTE(review): `clusters` and `index` are not assigned in any visible cell, so
# this cell fails on a fresh kernel — presumably they held KMeans labels and the
# index of a resampled frame; confirm and define them here.
df3 = pd.Series(clusters, index)
df3 = df3.loc[~df3.index.duplicated(keep='first')]
df3

284    1
37     0
241    1
96     0
226    1
100    1
234    1
280    0
136    2
118    0
43     0
271    0
245    1
14     1
196    0
22     2
199    0
279    2
254    2
244    2
242    2
16     0
261    2
74     0
144    1
104    1
79     1
166    1
2      1
164    2
      ..
182    1
124    1
120    2
239    1
66     1
293    2
95     0
63     2
220    0
165    1
256    1
9      1
42     1
267    1
75     0
294    2
143    1
273    1
115    2
25     2
53     1
56     2
4      0
172    1
276    0
177    0
91     1
184    1
183    1
51     2
Length: 196, dtype: int32

In [51]:
# Display d2 to inspect its values alongside its index labels.
d2

0    0
2    1
3    1
1    1
dtype: int64

In [52]:
# Element-wise `&` of two Series whose index labels are in different orders
# (d1 uses the default index; d2 was built with index [0, 2, 3, 1]).
d1 & d2

0    False
1     True
2    False
3     True
dtype: bool

Better idea: have `bootstrap_cluster` return the clusters it fitted (not just the labels of the sampled rows), then predict on _everything_!