### Load data

In [1]:
import pickle
with open('splitData.pickle', 'rb') as f:
    texts_train = pickle.load(f)
    texts_val = pickle.load(f)
    texts_test = pickle.load(f)
    labels_train = pickle.load(f)
    labels_val = pickle.load(f)
    labels_test = pickle.load(f)

texts  = texts_train  + texts_val  + texts_test
labels = labels_train + labels_val + labels_test
# check the ratio of each dataset
total = len(texts_train) + len(texts_val) + len(texts_test)
len(texts_train)/total, len(texts_val)/total, len(texts_test)/total

(0.5997757847533632, 0.20011210762331838, 0.20011210762331838)

# Q2-Clustering

### Step 0: Vectorise text

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

texts_vector = vectorizer.fit_transform(texts)
texts_vector.shape

(1784, 22406)

## Conducting Kmeans

In [277]:
import numpy as np
def kmeans(k, matrix, max_iter=300):
    num_data, dim_feature = matrix.shape

    # Step1: pick k random centroids
    #### centroids = np.random.random([k,22406]) -- to be deleted
    # pick centroids from data randomly to make sure at least one point is allocated to one point
    centroids = matrix[np.random.randint(0,num_data,k)]


    # new centroids to be calculated
    new_centroids = np.zeros((k,dim_feature))

    # labels(=index of centroids) assigned to data
    labels = np.zeros(num_data)

    # iterate until convergence or max_iteration
    for itr in range(max_iter):
        
        # iterate through all data
        for i in range(num_data):
            # matrix is a sparse vector, so make it dense array
            ith_vec = np.squeeze(np.asarray(matrix[i].todense()))

            #Step2: Assign each vector to its closest centroid
            dists = np.sum(np.square(centroids - ith_vec), axis=1)
            labels[i] = np.argmin(dists)

        #Step3:Recalculate the centroids
        for j in range(k):
            new_centroids[j] = matrix[labels==j].mean(axis=0)

        # If step 2 and 3 converges, exit for-loop         
        if np.all(new_centroids == centroids):
            print("converged")
            break;
        centroids = new_centroids

    return labels, new_centroids


kmeans(5,texts_vector)


converged


(array([4., 3., 0., ..., 0., 0., 0.]),
 array([[8.95291492e-05, 0.00000000e+00, 0.00000000e+00, ...,
         0.00000000e+00, 8.33883006e-05, 0.00000000e+00],
        [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
        [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
         4.80700128e-04, 0.00000000e+00, 0.00000000e+00],
        [0.00000000e+00, 0.00000000e+00, 2.75341432e-04, ...,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
        [0.00000000e+00, 4.51193508e-05, 0.00000000e+00, ...,
         0.00000000e+00, 0.00000000e+00, 7.37179081e-05]]))

### Step 1: pick k random centroids

In [187]:
k = 5
import numpy as np
centroids = np.random.rand([k,22406])

TypeError: 'list' object cannot be interpreted as an integer

### Step 2: Assign each vector to its closest centroid

In [81]:
a = np.matrix([[1,8,3],[1,2,3],[4,5,6]])
print(np.diag(a@a.T))
np.argmin(np.diag(a@a.T))
type(texts_vector[0]- centroids[0])
a = np.squeeze(texts_vector.todense())[0]
a.flatten(order='C')
a = np.matrix([[1,2,3],[2,3,4]])
np.squeeze(np.asarray(a[0]))
a[0]
chooseClosest(np.random.random(100), n]))

[74 14 77]


AttributeError: 'numpy.ndarray' object has no attribute 'todense'

In [218]:
texts_vector[0]

<1x22406 sparse matrix of type '<class 'numpy.float64'>'
	with 89 stored elements in Compressed Sparse Row format>

In [205]:
def kmeans(k, X, max_iter=300):
    X_size,n_features = X.shape
    
    # ランダムに重心の初期値を初期化
    centroids  = X[np.random.choice(X_size,k)]
    print(centroids.shape)
    print(X[0])
    
    # 前の重心と比較するために、仮に新しい重心を入れておく配列を用意
    new_centroids = np.zeros((k, n_features))
    
    # 各データ所属クラスタ情報を保存する配列を用意
    cluster = np.zeros(X_size)
    
    # ループ上限回数まで繰り返し
    for epoch in range(max_iter):
        
        # 入力データ全てに対して繰り返し
        for i in range(X_size):
            
            # データから各重心までの距離を計算（ルートを取らなくても大小関係は変わらないので省略）
            distances = np.sum((centroids - X[i]) ** 2, axis=1)
            
            # データの所属クラスタを距離の一番近い重心を持つものに更新
            cluster[i] = np.argsort(distances)[0]
            
        # すべてのクラスタに対して重心を再計算
        for j in range(k):
            new_centroids[j] = X[cluster==j].mean(axis=0)
            
        # もしも重心が変わっていなかったら終了
        if np.sum(new_centroids == centroids) == k:
            print("break")
            break
        centroids =  new_centroids
    return cluster

In [206]:
kmeans(5,texts_vector)

(5, 22406)
  (0, 9731)	0.03630563141937073
  (0, 18077)	0.006669801828950603
  (0, 7039)	0.02297311195087983
  (0, 1525)	0.008617157780715076
  (0, 4251)	0.056729367699878484
  (0, 745)	0.013062431082661039
  (0, 18176)	0.042811557376255534
  (0, 13646)	0.0459019624129647
  (0, 799)	0.004739743781593408
  (0, 12444)	0.004745058255048318
  (0, 13980)	0.012314383326603123
  (0, 6224)	0.12567955188364327
  (0, 12681)	0.14649717567928672
  (0, 7782)	0.04925040674637123
  (0, 7040)	0.04173540802092905
  (0, 5524)	0.103250760042658
  (0, 21418)	0.13554021448705597
  (0, 3129)	0.02653507864079888
  (0, 3344)	0.031247354943097196
  (0, 1156)	0.018945090647647145
  (0, 21559)	0.04631600532649999
  (0, 2979)	0.04629710100060329
  (0, 2357)	0.01822478138222561
  (0, 7171)	0.06898201586663655
  (0, 18683)	0.0118155057017452
  :	:
  (0, 18840)	0.01883993345854352
  (0, 10185)	0.0068167841169117895
  (0, 7989)	0.01625671310928976
  (0, 8618)	0.029848084472949555
  (0, 13008)	0.005556207205176289
  (

ValueError: inconsistent shapes

In [95]:
# choose the closest point
def chooseClosest(p1, centroids):
    #
    p1_array = np.squeeze(np.asarray(p1.todense()))
    diff_mat =  centroids - p1_array
    dist2 = np.diag(diff_mat @ diff_mat.T)
    assigned = np.argmin(dist2)
    # print(dist2)
    return assigned
chooseClosest(texts_vector[0],centroids)

2

In [67]:
# a = np.random.random((2,22406))
# np.apply_along_axis(chooseClosest(),axis=0,arr=a,args=centroids)

TypeError: chooseClosest() missing 2 required positional arguments: 'p1' and 'centroids'

In [135]:
centroids = np.random.random([k,22406])
def assignToCentroid(matrix, centroids):
    new_labels = []
    for i in range(matrix.shape[0]):
        new_labels.append(chooseClosest(matrix[i],centroids))
    return new_labels
new_labels = assignToCentroid(texts_vector, centroids)
Counter(new_labels)

Counter({4: 1784})

### recalculate the centroids based on the closest vectors

In [182]:
def recalcCentroids(matrix, new_labels):
    new_centroids = []
    dim = len(new_labels)
    for i in range(k):
        ith = np.vstack([matrix[j] for j in range(dim) if new_labels[j] == i])
        # print(ith)
        new_centroids.append(np.mean(ith,axis=0))
    return (np.vstack(new_centroids))
a = recalcCentroids(texts_vector.todense(), new_labels)
# # texts_vector.todense()
# new_labels[0] = 0
# new_labels[1] = 1
# new_labels[2] = 2
# new_labels[3] = 3
# Counter(new_labels)
np.max(a[4])

0.13432953110365448

In [173]:
# def recalcCentroids(matrix, new_labels):
a = np.matrix(np.random.random([5,10]))
print(type(a))
la = [0,1,0,1,1]
# b = np.matrix([a[i] for i in range(len(la)) if la[i]==1])
# np.mean(b,axis=0)
tet=np.vstack(([a[i] for i in range(5) if la[i] == 1]))

<class 'numpy.matrix'>


In [214]:
from sklearn.cluster import KMeans
from collections import Counter
kmeans = KMeans(n_clusters=10, random_state=10).fit_predict(texts_vector)
Counter(kmeans)

Counter({6: 689,
         1: 303,
         3: 30,
         8: 364,
         4: 77,
         9: 160,
         7: 19,
         2: 64,
         0: 17,
         5: 61})