In [1]:
from numpy import argmax
from wine_functions import fetch_wine_data, create_wine_dataframe, split_dataset
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans 
from sklearn.metrics import silhouette_score

# Data fetch is left disabled here; uncomment to run fetch_wine_data()
# (what it fetches is defined in wine_functions - confirm before re-enabling).
# fetch_wine_data()

# Build the wine DataFrame, then unpack the four pieces split_dataset returns.
wine = create_wine_dataframe()
train, train_labels, test, test_labels = split_dataset(wine)

# Fit the scaler on the training data and keep the fitted object around,
# then produce the standardised training matrix from it.
scaler = StandardScaler().fit(train)
train_scaled = scaler.transform(train)

# Candidate cluster counts for the sweeps: 2 through 15 inclusive.
k_range = [*range(2, 16)]

In [2]:
# Preview the first rows of the training data.
train.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,color
3123,6.6,0.21,0.49,18.15,0.042,41.0,158.0,0.9997,3.28,0.39,8.7,0
196,7.3,0.58,0.3,2.4,0.074,15.0,55.0,0.9968,3.46,0.59,10.2,1
1704,7.3,0.13,0.32,14.4,0.051,34.0,109.0,0.9974,3.2,0.35,9.2,0
3417,6.0,0.26,0.29,1.0,0.032,27.0,96.0,0.9896,3.38,0.44,12.3,0
4422,7.6,0.27,0.34,5.0,0.04,18.0,56.0,0.99084,3.06,0.48,12.4,0


In [7]:
# Working notes:
#   1. Scale the data, then cluster a few columns at a time
#      (originally noted as "classification"; the sweeps in this notebook
#      run KMeans clustering - confirm the intended wording).
#   2. Use the silhouette score to find the best number of clusters for
#      these columns.
#   3. Check that these are tight clusters (decide on a metric - not
#      visualization).

# Show the first ten rows of the scaled training data.
print(train_scaled[:10])

[[-0.46935245 -0.78073241  1.17454856  2.66686169 -0.40269511  0.59551226
   0.75338272  1.67253327  0.3763427  -0.96224797 -1.50751815 -0.57135108]
 [ 0.06920116  1.44737084 -0.12923929 -0.63316943  0.52435095 -0.87338284
  -1.07695808  0.70627749  1.48969631  0.3976081  -0.24986334  1.75023733]
 [ 0.06920116 -1.26248447  0.00800153  1.88114    -0.1419634   0.20004051
  -0.11736193  0.90619248 -0.11848112 -1.23421919 -1.08829988 -0.57135108]
 [-0.93096983 -0.47963738 -0.19785971 -0.92650553 -0.692397   -0.19543125
  -0.34837582 -1.69270236  0.99487249 -0.62228396  1.51085339 -0.57135108]
 [ 0.30000985 -0.41941837  0.14524236 -0.08840238 -0.46063549 -0.70389494
  -1.05918778 -1.27954472 -0.98442282 -0.35031274  1.59469705 -0.57135108]
 [ 0.14613739 -1.20226546 -0.12923929 -0.86364779 -0.66342681 -0.30842318
  -0.43722732 -0.66647209  1.92266716 -0.96224797  0.08551128 -0.57135108]
 [-0.62322491 -1.14204645  0.28248319 -0.7588849  -0.63445662  0.7085042
   0.61122032 -0.82640408  1.2422

In [4]:
# Silhouette sweep over the first four scaled columns (indices 0-3):
# fit KMeans for every k in k_range and report the k with the best score.
columns_1 = train_scaled[:, :4]
cluster_train_1 = columns_1.copy()  # independent copy of the slice

# Two leading placeholders stand in for k=0 and k=1, which are never fitted,
# so that list index == k and argmax below yields the cluster count directly.
# This relies on k_range starting at 2. -1 is the lower bound of the
# silhouette score.
scores_1 = [-1, -1] # dummy values for n=0,1

for k in k_range:
    # random_state pins the centroid initialisation so the scores (and the
    # selected k, which is decided by small margins) are reproducible;
    # n_init is explicit so the number of restarts does not depend on the
    # installed scikit-learn version's default.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(cluster_train_1)
    score = silhouette_score(cluster_train_1, kmeans.labels_)
    scores_1.append(score)
    print(f"{k} clusters silhouette score: {score:.5f}")

# Index of the highest score == best number of clusters (see placeholders).
score_index_1 = argmax(scores_1)
print(f"{score_index_1} clusters with a silhouette score of "
      f"{scores_1[score_index_1]:.5f}")

2 clusters silhouette score: 0.33320
3 clusters silhouette score: 0.33183
4 clusters silhouette score: 0.33593
5 clusters silhouette score: 0.26909
6 clusters silhouette score: 0.26779
7 clusters silhouette score: 0.25776
8 clusters silhouette score: 0.24467
9 clusters silhouette score: 0.23856
10 clusters silhouette score: 0.24538
11 clusters silhouette score: 0.22499
12 clusters silhouette score: 0.23301
13 clusters silhouette score: 0.23119
14 clusters silhouette score: 0.21507
15 clusters silhouette score: 0.23090
4 clusters with a silhouette score of 0.33593


In [5]:
# Silhouette sweep over scaled columns 4-7:
# fit KMeans for every k in k_range and report the k with the best score.
columns_2 = train_scaled[:, 4:8]
cluster_train_2 = columns_2.copy()  # independent copy of the slice

# Two leading placeholders stand in for k=0 and k=1, which are never fitted,
# so that list index == k and argmax below yields the cluster count directly.
# This relies on k_range starting at 2. -1 is the lower bound of the
# silhouette score.
scores_2 = [-1, -1]

for k in k_range:
    # random_state pins the centroid initialisation so the scores (and the
    # selected k) are reproducible; n_init is explicit so the number of
    # restarts does not depend on the installed scikit-learn version's default.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(cluster_train_2)
    score = silhouette_score(cluster_train_2, kmeans.labels_)
    scores_2.append(score)
    print(f"{k} clusters silhouette score: {score:.5f}")

# Index of the highest score == best number of clusters (see placeholders).
score_index_2 = argmax(scores_2)
print(f"{score_index_2} clusters with a silhouette score of "
      f"{scores_2[score_index_2]:.5f}")

2 clusters silhouette score: 0.36239
3 clusters silhouette score: 0.36947
4 clusters silhouette score: 0.38089
5 clusters silhouette score: 0.32368
6 clusters silhouette score: 0.29561
7 clusters silhouette score: 0.30001
8 clusters silhouette score: 0.28110
9 clusters silhouette score: 0.25384
10 clusters silhouette score: 0.24827
11 clusters silhouette score: 0.25000
12 clusters silhouette score: 0.23860
13 clusters silhouette score: 0.23632
14 clusters silhouette score: 0.23953
15 clusters silhouette score: 0.22886
4 clusters with a silhouette score of 0.38089


In [6]:
# Silhouette sweep over the remaining scaled columns (index 8 onward):
# fit KMeans for every k in k_range and report the k with the best score.
columns_3 = train_scaled[:, 8:]
cluster_train_3 = columns_3.copy()  # independent copy of the slice

# Two leading placeholders stand in for k=0 and k=1, which are never fitted,
# so that list index == k and argmax below yields the cluster count directly.
# This relies on k_range starting at 2. -1 is the lower bound of the
# silhouette score.
scores_3 = [-1, -1]

for k in k_range:
    # random_state pins the centroid initialisation so the scores (and the
    # selected k) are reproducible; n_init is explicit so the number of
    # restarts does not depend on the installed scikit-learn version's default.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(cluster_train_3)
    score = silhouette_score(cluster_train_3, kmeans.labels_)
    scores_3.append(score)
    print(f"{k} clusters silhouette score: {score:.5f}")

# Index of the highest score == best number of clusters (see placeholders).
score_index_3 = argmax(scores_3)
print(f"{score_index_3} clusters with a silhouette score of "
      f"{scores_3[score_index_3]:.5f}")

2 clusters silhouette score: 0.40130
3 clusters silhouette score: 0.31373
4 clusters silhouette score: 0.31099
5 clusters silhouette score: 0.31228
6 clusters silhouette score: 0.30897
7 clusters silhouette score: 0.30901
8 clusters silhouette score: 0.29055
9 clusters silhouette score: 0.25677
10 clusters silhouette score: 0.26374
11 clusters silhouette score: 0.26249
12 clusters silhouette score: 0.26007
13 clusters silhouette score: 0.25083
14 clusters silhouette score: 0.25396
15 clusters silhouette score: 0.25538
2 clusters with a silhouette score of 0.40130


In [21]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Testing ColumnTransformer: standardise every column, then run KMeans on a
# subset of the scaled columns and pass the remaining columns through.

# Positional indices 8-11, i.e. the same columns as train_scaled[:, 8:].
# Integer indices are used because the scaler step hands on a plain array.
cluster_attribs = list(range(8, 12))

cluster_pipeline = ColumnTransformer(
    # As a transformer, KMeans contributes its transform() output (distance
    # to each cluster centre, one column per cluster) in place of the
    # selected columns; those columns come first, followed by the
    # passthrough remainder.
    # n_clusters=2 is hard-coded; it matches the best count reported by the
    # silhouette sweep over columns 8+ - revisit if that sweep changes.
    # random_state keeps the fitted centres (and therefore the order and
    # values of the distance columns) reproducible; n_init is explicit so the
    # number of restarts does not depend on the scikit-learn version default.
    [("kmeans", KMeans(n_clusters=2, n_init=10, random_state=42), cluster_attribs)],
    remainder="passthrough"
)

full_pipeline = Pipeline([
    ('std_scaler', StandardScaler()),
    ('col_cluster', cluster_pipeline),
])

# Fit both steps on the unscaled training data and transform it in one go.
train_scaled_clustered = full_pipeline.fit_transform(train)

# Show the first ten transformed rows.
print(train_scaled_clustered[:10])

[[ 1.75782032  3.25514088 -0.46935245 -0.78073241  1.17454856  2.66686169
  -0.40269511  0.59551226  0.75338272  1.67253327]
 [ 2.98052472  1.01529135  0.06920116  1.44737084 -0.12923929 -0.63316943
   0.52435095 -0.87338284 -1.07695808  0.70627749]
 [ 1.43948397  3.33590072  0.06920116 -1.26248447  0.00800153  1.88114
  -0.1419634   0.20004051 -0.11736193  0.90619248]
 [ 1.95564784  3.11760132 -0.93096983 -0.47963738 -0.19785971 -0.92650553
  -0.692397   -0.19543125 -0.34837582 -1.69270236]
 [ 1.75723116  3.42720221  0.30000985 -0.41941837  0.14524236 -0.08840238
  -0.46063549 -0.70389494 -1.05918778 -1.27954472]
 [ 2.2425184   3.16465363  0.14613739 -1.20226546 -0.12923929 -0.86364779
  -0.66342681 -0.30842318 -0.43722732 -0.66647209]
 [ 2.05960459  2.34444538 -0.62322491 -1.14204645  0.28248319 -0.7588849
  -0.63445662  0.7085042   0.61122032 -0.82640408]
 [ 2.72512789  1.84198472 -0.16160753  1.68824687 -0.95268425 -0.82174263
   0.17670868 -1.32535056 -1.16580957  0.1398517 ]
 [ 2