In [2]:
import numpy as np

from sklearn.datasets import fetch_20newsgroups
from collections import defaultdict
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import euclidean_distances , _euclidean_distances
import numbers

In [3]:
#################################################################
# Load Dataset
#################################################################

dataset = fetch_20newsgroups(
    remove=("headers", "footers", "quotes"),
    subset="all",
    shuffle=True,
    random_state=42,
)
# categories = [
#     "alt.atheism",
#     "talk.religion.misc",
#     "comp.graphics",
#     "sci.space",
# ]
 
# dataset = fetch_20newsgroups(
#     remove=("headers", "footers", "quotes"),
#     subset="all",
#     categories=categories,
#     shuffle=True,
#     random_state=42,
# )
 

labels = dataset.target
unique_labels, category_sizes = np.unique(labels, return_counts=True)
true_k = unique_labels.shape[0]

In [4]:
print("labels.shape ", labels.shape)
print("unique_labels.shape ", unique_labels.shape)
print("category_sizes.shape ", category_sizes.shape)
print("true_k ", true_k)

labels.shape  (18846,)
unique_labels.shape  (20,)
category_sizes.shape  (20,)
true_k  20


In [5]:
#################################################################
# Vectorize 
#################################################################
vectorizer = TfidfVectorizer(
    max_df=0.5,
    min_df=5,
    stop_words="english",
)

X_tfidf = vectorizer.fit_transform(dataset.data)

In [6]:
X_tfidf.shape[0]

18846

In [7]:
#################################################################
# Evaluate Fitness
#################################################################
def fit_and_evaluate(km, X, n_runs=5):

    scores = defaultdict(list)
    for seed in range(n_runs):
        km.set_params(random_state=seed)
        km.fit(X)
        scores["Homogeneity"].append(metrics.homogeneity_score(labels, km.labels_))
        scores["Completeness"].append(metrics.completeness_score(labels, km.labels_))
        scores["V-measure"].append(metrics.v_measure_score(labels, km.labels_))
        scores["Adjusted Rand-Index"].append(
            metrics.adjusted_rand_score(labels, km.labels_)
        )
        scores["Silhouette Coefficient"].append(
            metrics.silhouette_score(X, km.labels_, sample_size=2000)
        )
    for score_name, score_values in scores.items():
        mean_score, std_score = np.mean(score_values), np.std(score_values)
        print(f"{score_name}: {mean_score:.3f} ± {std_score:.3f}")


In [8]:
#################################################################
# (TODO): Implement K-Means  
#################################################################

def check_random_state(seed):
    # check_random_state from sklearn.utils
    if seed is None or seed is np.random:
        return np.random.mtrand._rand
    if isinstance(seed, numbers.Integral):
        return np.random.RandomState(seed)

class KMeans:
    labels_ = [] # predicted labels (y)
    def __init__(self, n_clusters, max_iter=300):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        

    def fit(self, X_train):
        n_samples, n_features = X_train.shape
        self.centroids = np.empty((self.n_clusters, n_features), dtype=X_train.dtype)

        # Randomly select centroid
        random_state = check_random_state(self.random_state)
        center_id = random_state.choice(n_samples) # return an integer
        self.centroids[0] = X_train[center_id].toarray() 

        # Initialize K (n_clusters) centroids #
        for c in range(1, self.n_clusters):
            # Calculate distances from points to the centroids
            dists = np.sum(euclidean_distances(self.centroids, X_train), axis=0)
            dists = dists/np.sum(dists) # Normalize the distances
            # Choose remaining points based on their distances
            new_centroid_idx = random_state.choice(n_samples, size=1, p=dists)
            self.centroids[c] = X_train[new_centroid_idx].toarray()
            
        # Iterate, adjust centroids until converged or until passed max_iter
        iteration = 0
        prev_centroids = None
        while np.not_equal(self.centroids, prev_centroids).any() and (iteration < self.max_iter):
            # Assigning vector to nearest centroid #
            dists = euclidean_distances(self.centroids, X_train)
            centroid_idx = np.argmin(dists, axis=0)
                
            # Save current centroids
            prev_centroids = self.centroids
            
            # Update centroids by mean of cluster #
            new_centroids = np.empty((self.n_clusters, n_features), dtype=X_train.dtype)
            for i in range(self.n_clusters):
                cluster_i = X_train[centroid_idx == i]
                if cluster_i.size == 0: 
                    new_centroids[i] = prev_centroids[i]
                else:
                    new_centroids[i] = np.mean(cluster_i, axis=0)

            # Reassign centroids to new centroids
            self.centroids = new_centroids

            iteration += 1
        
        # Now that's done, let's calculate the current labels
        distances = euclidean_distances(self.centroids, X_train)
        self.labels_ = np.argmin(distances, axis=0)
        

    def set_params(self, random_state):
        self.random_state = random_state

In [9]:
# init K-Means
kmeans = KMeans(n_clusters=true_k, max_iter=10)
fit_and_evaluate(kmeans, X_tfidf)

Homogeneity: 0.276 ± 0.018
Completeness: 0.329 ± 0.017
V-measure: 0.300 ± 0.017
Adjusted Rand-Index: 0.076 ± 0.012
Silhouette Coefficient: 0.001 ± 0.003


In [10]:
# Test with actual KMeans performance class
from sklearn.cluster import KMeans as real_KMeans
real_kmeans = real_KMeans(init="k-means++", n_clusters=true_k, n_init=4)

fit_and_evaluate(real_kmeans, X_tfidf)

Homogeneity: 0.171 ± 0.100
Completeness: 0.395 ± 0.037
V-measure: 0.218 ± 0.087
Adjusted Rand-Index: 0.046 ± 0.026
Silhouette Coefficient: 0.005 ± 0.003


In [11]:
X_tfidf.shape

(18846, 24164)

In [12]:
kmeans.centroids[0].shape

(24164,)

In [131]:
dists = euclidean_distances(kmeans.centroids, X_tfidf)
dists

array([[1.40558583, 1.40418857, 1.39503057, ..., 1.41421356, 1.39731476,
        1.38785215],
       [1.19463731, 1.19463731, 1.18892012, ..., 1.19463731, 1.19463731,
        1.18187191],
       [1.05513771, 1.0732083 , 1.06971876, ..., 1.0732083 , 1.0732083 ,
        1.0732083 ],
       [1.06920505, 1.06920505, 1.06390841, ..., 1.06920505, 1.06920505,
        1.06920505]])

In [132]:
dists.shape

(4, 3387)

In [113]:
dists = np.sum(dists, axis=0)
dists = dists / np.sum(dists)

In [148]:
dists

array([[1.40558583, 1.40418857, 1.39503057, ..., 1.41421356, 1.39731476,
        1.38785215],
       [1.19463731, 1.19463731, 1.18892012, ..., 1.19463731, 1.19463731,
        1.18187191],
       [1.05513771, 1.0732083 , 1.06971876, ..., 1.0732083 , 1.0732083 ,
        1.0732083 ],
       [1.06920505, 1.06920505, 1.06390841, ..., 1.06920505, 1.06920505,
        1.06920505]])

In [128]:
np.random.choice(range(X_tfidf.shape[0]), p=dists)

3369

In [137]:
centroid_idx = np.argmin(dists, axis=0)
centroid_idx.shape

(3387,)

In [177]:
np.mean(X_tfidf[centroid_idx == 2], axis=0).A1.shape

(7929,)

In [163]:
for ind in range(4):
    m = X_tfidf[centroid_idx == ind]
    print(m.shape)

(1, 7929)
(0, 7929)
(313, 7929)
(3073, 7929)


In [218]:
!pip show threadpoolctl

Name: threadpoolctl
Version: 2.2.0
Summary: threadpoolctl
Home-page: https://github.com/joblib/threadpoolctl
Author: Thomas Moreau
Author-email: thomas.moreau.2010@gmail.com
License: BSD-3-Clause
Location: /Users/hungnguyen/opt/anaconda3/lib/python3.8/site-packages
Requires: 
Required-by: imbalanced-learn, scikit-learn


In [1]:
import threadpoolctl
threadpoolctl.__version__

'3.1.0'

# 1.1 ReLU

In [3]:
import numpy as np

In [4]:
# Neural network
v1 = np.array([0.79, -0.14, -0.13, -0.24, -0.4])
v2 = np.array([-0.77, 0.76, 0.78, -0.51, -0.92])
bv1 = 0.02
bv2 = -0.01

w1 = np.array([0.8, 0.58])
w2 = np.array([0.18, 0.32])
w3 = np.array([0.94, -0.24])
bw1 = 0
bw2 = 0.01
bw3 = 0.03

In [22]:
def ReLU(x):
    if x > 0:
        return (x + abs(x))/2
    else: return 0

# Faster method
def ReLuLu(x):
    return x * (x > 0)

# Sigmoid
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [5]:
# Experiment setup
# sigmoids represent labels [0, 1, 2]
x = np.array([1, 2, 3, 4, 5])
y_true = 1

In [24]:
# Forward pass #
# Output of layer "v"
out_v1 = np.dot(x, v1) + bv1 # negative num
out_v1 = ReLU(out_v1)

out_v2 = np.dot(x, v2) + bv2 # negative num
out_v2 = ReLU(out_v2)

# Output of layer "w"
x_layer_v = np.array([out_v1, out_v2])

out_w1 = np.dot(x_layer_v, w1) + bw1
out_w1 = sigmoid(out_w1)

out_w2 = np.dot(x_layer_v, w2) + bw2
out_w2 = sigmoid(out_w2)

out_w3 = np.dot(x_layer_v, w3) + bw3
out_w3 = sigmoid(out_w3)

pred_y = np.array([out_w1, out_w2, out_w3])
for i,val in enumerate(pred_y):
    print(i, val)

0 0.5
1 0.5024999791668749
2 0.5074994375506203


In [30]:
# Calculate log loss
def L(x, y):
    return np.dot(-y, np.log(pred_y))

L(pred_y, np.array([0, y_true, 0]))

0.6881596805078625

In [41]:
out_v1

0

In [47]:
# Output of layer "v"
out_v = np.dot(np.array([v1, v2]), x) + np.array([bv1, bv2])
print(out_v)
out_v = np.vectorize(ReLU)(out_v)

# Output of layer "w"
out_w = np.dot(np.array([w1, w2, w3]), out_v) + np.array([bw1, bw2, bw3])
print(out_w)
out_w = np.vectorize(sigmoid)(out_w)



[-2.82 -3.56]


array([0, 0])

In [51]:
np.dot(x , np.array([v1, v2]))

array([-2.84, -3.55])