In [17]:
import numpy as np
import pandas as pd
from collections import Counter

from sklearn import metrics
from sklearn import preprocessing
from sklearn import cluster

In [2]:
# Toy example: Rand index between a reference partition and a predicted one.
labels_true = [1, 1, 1, 2, 2]
labels_pred = [1, 1, 2, 2, 3]
print(metrics.rand_score(labels_true, labels_pred))

0.6


In [3]:
# Toy partitions scored with the chance-adjusted Rand index.
toy_true, toy_pred = [1, 1, 1, 2, 2], [1, 1, 2, 2, 3]
print(metrics.adjusted_rand_score(toy_true, toy_pred))

0.09090909090909091


In [8]:
# Read the feature matrices first, then the matching label vectors.
# np.loadtxt returns plain float arrays, so the labels come back as floats.
df_train, df_test = (
    np.loadtxt(path) for path in ('data/train.txt', 'data/test.txt')
)
y_train, y_test = (
    np.loadtxt(path) for path in ('data/train_labels.txt', 'data/test_labels.txt')
)

In [10]:
# Stack the train and test splits row-wise into one feature array and one
# label vector; the clustering below runs on the pooled data.
X = np.concatenate([df_train, df_test], axis=0)
y = np.concatenate([y_train, y_test], axis=0)

In [11]:
# Dimensions of the pooled feature array (rows, columns).
X.shape

(10299, 561)

In [13]:
# Tally how many samples carry each label value.
cnt = Counter()
cnt.update(y)
cnt

Counter({5.0: 1906, 4.0: 1777, 6.0: 1944, 1.0: 1722, 3.0: 1406, 2.0: 1544})

In [14]:
# Number of distinct label values.
np.unique(y).size

6

In [16]:
# Standardise every feature column, then show the first value (rounded)
# as a quick spot check.
scaler = preprocessing.StandardScaler()
X_scaled = scaler.fit(X).transform(X)
round(X_scaled[0, 0], 2)

0.21

In [20]:
# Internal validation metrics for K-Means, one list entry per cluster count
# tried (2 through 9): silhouette, Calinski-Harabasz and Davies-Bouldin.
# NOTE(review): n_init is left at the library default, which differs between
# scikit-learn releases - pin it if the numbers must be reproducible.
sil_score, kh_score, db_score = [], [], []

for k in range(2, 10):
    model = cluster.KMeans(n_clusters=k, random_state=42)
    labels = model.fit_predict(X_scaled)
    sil_score.append(metrics.silhouette_score(X_scaled, labels))
    kh_score.append(metrics.calinski_harabasz_score(X_scaled, labels))
    db_score.append(metrics.davies_bouldin_score(X_scaled, labels))

In [21]:
# Print the three metric series: silhouette, Calinski-Harabasz, Davies-Bouldin.
for series in (sil_score, kh_score, db_score):
    print(series)

[0.39373247640770803, 0.31548375272734164, 0.15052911849826184, 0.12723671885995455, 0.11096892121098482, 0.08541910378666204, 0.07618332955957029, 0.07648837092046107]
[7880.813903657111, 5034.4752572242, 3696.3381978005555, 3027.0761722557045, 2556.7735736855657, 2216.563937360621, 1974.9714963243528, 1790.9531728187399]
[1.070744182238566, 1.786516465365282, 2.3409301040454653, 2.431375173184107, 2.367036374724894, 2.6819795075319033, 2.6112259939773117, 2.581888277475603]


In [24]:
# Fit K-Means with six clusters and compare the partition with the true labels.
model = cluster.KMeans(n_clusters=6, random_state=42).fit(X_scaled)

# Printed in order: homogeneity, completeness, adjusted Rand index.
for score_fn in (
    metrics.homogeneity_score,
    metrics.completeness_score,
    metrics.adjusted_rand_score,
):
    print(score_fn(y, model.labels_))

0.5404263978615286
0.5807797117737106
0.419224271062724


In [28]:
# Cluster index assigned to each sample by the fitted model.
model.labels_

array([5, 5, 5, ..., 1, 1, 1], dtype=int32)

In [38]:
# Group the true labels by the cluster each sample was assigned to.
# The keys are taken from the fitted model rather than written out by hand,
# so this cell keeps working when the number of clusters changes.
cnt = {cluster_id: [] for cluster_id in range(model.n_clusters)}
for cluster_id, true_label in zip(model.labels_, y):
    cnt[cluster_id].append(true_label)

In [41]:
# One Counter per cluster (clusters 0-5 -> c1-c6): how many samples of each
# true label ended up in that cluster.
c1, c2, c3, c4, c5, c6 = (Counter(cnt[cluster_id]) for cluster_id in range(6))


In [43]:
# Show the true-label composition of every cluster, one Counter per line.
for cluster_counter in (c1, c2, c3, c4, c5, c6):
    print(cluster_counter)

Counter({5.0: 566, 4.0: 451, 6.0: 330, 2.0: 2})
Counter({2.0: 1242, 1.0: 903, 3.0: 321, 6.0: 5, 4.0: 1})
Counter({6.0: 1556, 4.0: 91})
Counter({3.0: 889, 1.0: 742, 2.0: 295})
Counter({3.0: 196, 1.0: 77, 2.0: 5})
Counter({5.0: 1340, 4.0: 1234, 6.0: 53})


In [48]:
# Contingency table: true labels (rows) against assigned clusters (columns).
ct = pd.crosstab(y, model.labels_)
# Row names for the sorted label values: walking, ascent, descent, sitting,
# standing, lying.
# NOTE(review): 'хотьба' looks like a misspelling of 'ходьба'; left as is so
# the displayed table does not change.
ct.index = ['хотьба','подъем', 'спуск', 'сиденье', 'стоянье', 'лежанье']
# Number the clusters from 1, sized from the table itself instead of a
# hard-coded cluster count.
ct.columns = list(range(1, ct.shape[1] + 1))

In [49]:
# Display the label-by-cluster contingency table.
ct

Unnamed: 0,1,2,3,4,5,6
хотьба,0,903,0,742,77,0
подъем,2,1242,0,295,5,0
спуск,0,321,0,889,196,0
сиденье,451,1,91,0,0,1234
стоянье,566,0,0,0,0,1340
лежанье,330,5,1556,0,0,53


In [50]:
# Refit K-Means with two clusters; `model` now refers to this new estimator.
model = cluster.KMeans(n_clusters=2, random_state=42).fit(X_scaled)
model

In [51]:
# Contingency table: true labels (rows) against assigned clusters (columns).
ct = pd.crosstab(y, model.labels_)
# Row names for the sorted label values: walking, ascent, descent, sitting,
# standing, lying.
# NOTE(review): 'хотьба' looks like a misspelling of 'ходьба'; left as is so
# the displayed table does not change.
ct.index = ['хотьба','подъем', 'спуск', 'сиденье', 'стоянье', 'лежанье']
# Number the clusters from 1, sized from the table itself instead of a
# hard-coded cluster count.
ct.columns = list(range(1, ct.shape[1] + 1))

In [52]:
# Display the label-by-cluster contingency table.
ct

Unnamed: 0,1,2
хотьба,1722,0
подъем,1536,8
спуск,1406,0
сиденье,3,1774
стоянье,0,1906
лежанье,12,1932


In [53]:
# Completeness of the current model's partition against the true labels.
completeness = metrics.completeness_score(y, model.labels_)
print(completeness)

0.979530559699631


In [54]:
# Agglomerative clustering into two groups, scored by completeness against
# the true labels.
model = cluster.AgglomerativeClustering(n_clusters=2).fit(X_scaled)
print(metrics.completeness_score(labels_true=y, labels_pred=model.labels_))

0.9999999999999993
