In [2]:
import collections
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import reuters
import random
import numpy as np
import math

def word_tokenizer(text):
    """Tokenize ``text`` and return the Porter stems of its non-stopword tokens.

    Args:
        text: The string handed to ``nltk.word_tokenize``.

    Returns:
        A list of stemmed tokens, in the order produced by ``word_tokenize``.
        A token is dropped when it matches an entry of
        ``stopwords.words('english')`` exactly (the comparison is done on the
        raw token, before stemming).
    """
    tokens = word_tokenize(text)
    stemmer = PorterStemmer()
    # Load the stopword list once and keep it in a set: evaluating
    # stopwords.words('english') inside the comprehension condition re-ran it
    # for every token and made each membership test a linear list scan.
    english_stopwords = set(stopwords.words('english'))
    return [stemmer.stem(token) for token in tokens if token not in english_stopwords]


def vectorize(sentences):
    """Fit a TF-IDF vectorizer on ``sentences`` and return the resulting matrix.

    The vectorizer tokenizes with ``word_tokenizer``, is additionally given the
    NLTK English stopword list, lowercases its input, and is configured with
    ``max_df=0.9`` / ``min_df=0.1`` (floats, which scikit-learn documents as
    document-frequency proportions).

    Args:
        sentences: Iterable of raw text documents.

    Returns:
        Whatever ``TfidfVectorizer.fit_transform`` returns; callers in this
        file convert it to a dense array with ``.toarray()``.
    """
    vectorizer_options = {
        'tokenizer': word_tokenizer,
        'stop_words': stopwords.words('english'),
        'max_df': 0.9,
        'min_df': 0.1,
        'lowercase': True,
    }
    # Fitting and transforming happen in one step; the fitted vectorizer
    # itself is not kept.
    return TfidfVectorizer(**vectorizer_options).fit_transform(sentences)

    
def k_means_cluster(sentences, nb_of_clusters=3, initialize='random'):
    """Cluster ``sentences`` with scikit-learn's ``KMeans`` on their TF-IDF vectors.

    Args:
        sentences: Raw text documents, vectorized with ``vectorize``.
        nb_of_clusters: Forwarded to ``KMeans`` as ``n_clusters``.
        initialize: Forwarded to ``KMeans`` as ``init``.

    Returns:
        A plain dict mapping each label found in ``labels_`` to the list of
        document positions (indices into ``sentences``) carrying that label.
    """
    # n_init=1: a single run from the given initialisation, no restarts.
    model = KMeans(n_clusters=nb_of_clusters, init=initialize, n_init=1)
    dense_features = vectorize(sentences).toarray()
    model.fit(dense_features)

    members_by_label = {}
    for position, label in enumerate(model.labels_):
        members_by_label.setdefault(label, []).append(position)
    return members_by_label


def euc_distance(a, b):
    """Return the Euclidean distance between two equal-length vectors.

    Args:
        a: Sized sequence of numbers.
        b: Sized sequence of numbers with the same length as ``a``.

    Returns:
        ``sqrt(sum((a[i] - b[i]) ** 2))`` as a float; ``0.0`` for two empty
        sequences.

    Raises:
        ValueError: If ``a`` and ``b`` differ in length.
    """
    if len(a) != len(b):
        # This used to print a message and return False. False is numerically
        # 0, so a caller ranking by distance would see the mismatched vector
        # as a perfect match; fail loudly instead.
        raise ValueError("a, b not same length")
    return math.sqrt(sum(math.pow(x - y, 2) for x, y in zip(a, b)))


def get_distance(array, centers):
    """Measure how far ``array`` is from every entry of ``centers``.

    Args:
        array: The vector to compare.
        centers: Indexable, sized collection of center vectors.

    Returns:
        A list of ``(distance, center_index)`` tuples, one per center, in
        center-index order. Distances come from ``euc_distance``.
    """
    return [
        (euc_distance(array, centers[index]), index)
        for index in range(len(centers))
    ]



def find_center(tfidf_array, _cluster_id, num_of_clusters):
    """Compute the mean vector of every cluster.

    Args:
        tfidf_array: Indexable collection of equal-length row vectors; the
            width is taken from ``tfidf_array[0]``.
        _cluster_id: Per-row cluster labels; each is converted with ``int()``.
        num_of_clusters: Number of centers to produce.

    Returns:
        A ``(num_of_clusters, width)`` float array. Row ``k`` is the mean of
        the rows labelled ``k``; a cluster with no members keeps an all-zero
        row.
    """
    members_per_cluster = np.zeros(num_of_clusters)
    centers = np.zeros((num_of_clusters, len(tfidf_array[0])))

    # Accumulate per-cluster sums and member counts in one pass.
    for row_index in range(len(tfidf_array)):
        label = int(_cluster_id[row_index])
        members_per_cluster[label] += 1
        centers[label] += tfidf_array[row_index]

    # Turn sums into means, skipping empty clusters to avoid dividing by zero.
    occupied = members_per_cluster != 0
    centers[occupied] /= members_per_cluster[occupied][:, np.newaxis]
    return centers



def find_cluster(tfidf_array, centers):
    """Assign every row of ``tfidf_array`` to its nearest center.

    Args:
        tfidf_array: Sized, iterable collection of row vectors.
        centers: Collection of center vectors, passed to ``get_distance``.

    Returns:
        A float array (built with ``np.zeros``) of length
        ``len(tfidf_array)`` whose ``i``-th entry is the index of the center
        closest to row ``i``. On a tie the lowest center index wins.

    Raises:
        ValueError: If there are rows to assign but ``centers`` is empty
            (raised by ``min`` on the empty distance list).
    """
    assignments = np.zeros(len(tfidf_array))
    for row_index, row in enumerate(tfidf_array):
        # min() scans once and returns the first smallest entry, which is the
        # same element a stable sort would place first, without sorting the
        # whole list just to read its head.
        _, nearest_center = min(get_distance(row, centers), key=lambda pair: pair[0])
        assignments[row_index] = nearest_center
    return assignments



def my_k_means_cluster(sentences, num_of_clusters, initialize):
    """Cluster ``sentences`` with a hand-written k-means on their TF-IDF vectors.

    Args:
        sentences: Raw text documents, vectorized with ``vectorize``.
        num_of_clusters: Number of clusters.
        initialize: Either the string ``'random'`` (pick ``num_of_clusters``
            distinct documents as starting centers via ``random.sample``) or a
            collection of starting center vectors, used as given.

    Returns:
        The label array returned by the final ``find_cluster`` call: one
        cluster index per document, in input order.

    Raises:
        ValueError: If ``initialize`` is a string other than ``'random'``.
            ``random.sample`` also raises ``ValueError`` when more clusters
            than documents are requested.
    """
    # Vectorize the documents into a dense array.
    tfidf_array = vectorize(sentences).toarray()
    num_of_data = len(tfidf_array)

    # Choose the starting centers.
    if isinstance(initialize, str):
        if initialize != 'random':
            # Any other string used to fall through to the "fixed centers"
            # branch and be treated as a list of center vectors, which
            # silently produced meaningless clusters.
            raise ValueError(
                "unsupported initialize value %r; expected 'random' or an "
                "array of centers" % (initialize,)
            )
        cluster_center_index = random.sample(range(num_of_data), num_of_clusters)
        cluster_center = [tfidf_array[idx] for idx in cluster_center_index]
    else:
        cluster_center = initialize  # fixed, caller-supplied centers

    # Start from an impossible labelling (-1 everywhere) so the loop body runs
    # at least once.
    cluster_id = np.full(num_of_data, -1.0)
    new_cluster_id = find_cluster(tfidf_array, cluster_center)

    # Alternate center update and reassignment until the labels stop changing.
    while not np.array_equal(cluster_id, new_cluster_id):
        cluster_center = find_center(tfidf_array, new_cluster_id, num_of_clusters)
        # find_cluster returns a fresh array on every call, so keeping a
        # reference to the previous labelling is enough; no copy is needed.
        cluster_id = new_cluster_id
        new_cluster_id = find_cluster(tfidf_array, cluster_center)

    return new_cluster_id



In [None]:
# Experiment 1: random initialisation.
# Load the first four documents of three Reuters categories, then cluster
# three documents per category with both k-means implementations.

files = reuters.fileids('coffee')
coffee0, coffee1, coffee2, coffee3 = [reuters.raw(fileids=files[k]) for k in range(4)]

files = reuters.fileids('cotton')
cotton0, cotton1, cotton2, cotton3 = [reuters.raw(fileids=files[k]) for k in range(4)]

files = reuters.fileids('crude')
crude0, crude1, crude2, crude3 = [reuters.raw(fileids=files[k]) for k in range(4)]

# Only documents 0-2 of each category are clustered; the fourth is loaded but
# not used here.
sentences = [
    coffee0, coffee1, coffee2,
    cotton0, cotton1, cotton2,
    crude0, crude1, crude2,
]
nclusters = 3
init = 'random'

# Hand-written implementation: prints one label per document.
my_clusters = my_k_means_cluster(sentences, num_of_clusters=nclusters, initialize=init)
print("my_kmeans", my_clusters)

# scikit-learn implementation: prints a dict of label -> document indices.
sklearn_clusters = k_means_cluster(sentences, nb_of_clusters=nclusters, initialize=init)
print("sklearn", sklearn_clusters)


#
# for cluster in range(nclusters):
#     print ("cluster ",cluster,":")
#     for i,sentence in enumerate(sklearn_clusters[cluster]):
#         print ("\tsentence ",sentence,": ",sentences[sentence][:30])



In [None]:
# Experiment 2: fixed (non-random) initial centers.
# Same documents as the random-initialisation run, but both implementations
# start from the same three document vectors.

files = reuters.fileids('coffee')
coffee0, coffee1, coffee2, coffee3 = [reuters.raw(fileids=files[k]) for k in range(4)]

files = reuters.fileids('cotton')
cotton0, cotton1, cotton2, cotton3 = [reuters.raw(fileids=files[k]) for k in range(4)]

files = reuters.fileids('crude')
crude0, crude1, crude2, crude3 = [reuters.raw(fileids=files[k]) for k in range(4)]

sentences = [
    coffee0, coffee1, coffee2,
    cotton0, cotton1, cotton2,
    crude0, crude1, crude2,
]
nclusters = 3

# Centers are not random: use the vectors of sentences[2] (coffee2),
# sentences[4] (cotton1) and sentences[8] (crude2), one per category.
tfidf_matrix = vectorize(sentences)
tfidf_array = tfidf_matrix.toarray()
init = tfidf_array[[2, 4, 8]]

# Hand-written implementation: prints one label per document.
my_clusters = my_k_means_cluster(sentences, num_of_clusters=nclusters, initialize=init)
print("my_kmeans", my_clusters)

# scikit-learn implementation: prints a dict of label -> document indices.
sklearn_clusters = k_means_cluster(sentences, nb_of_clusters=nclusters, initialize=init)
print("sklearn", sklearn_clusters)


#
# for cluster in range(nclusters):
#     print ("cluster ",cluster,":")
#     for i,sentence in enumerate(sklearn_clusters[cluster]):
#         print ("\tsentence ",sentence,": ",sentences[sentence][:30])

