In [1]:
import graphlab

'''Check GraphLab Create version'''
from distutils.version import StrictVersion
assert (StrictVersion(graphlab.version) >= StrictVersion('1.8.5')), 'Updated GraphLab'

In [3]:
from em_utilities import *

In [4]:
wiki = graphlab.SFrame('people_wiki.gl/').head(5000)
wiki['tf_idf'] = graphlab.text_analytics.tf_idf(wiki['text'])

This non-commercial license of GraphLab Create for academic use is assigned to pukaracharya2052@gmail.com and will expire on July 25, 2018.


[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1510255961.log


In [5]:
tf_idf, map_index_to_word = sframe_to_scipy(wiki, 'tf_idf')

In [6]:
tf_idf = normalize(tf_idf)

In [7]:
for i in range(5):
    doc = tf_idf[i]
    print(np.linalg.norm(doc.todense()))

1.0
1.0
1.0
1.0
1.0


In [8]:
from sklearn.cluster import KMeans

np.random.seed(5)
num_clusters = 25

# Use scikit-learn's k-means to simplify workflow
kmeans_model = KMeans(n_clusters=num_clusters, n_init=5, max_iter=400, random_state=1, n_jobs=-1)
kmeans_model.fit(tf_idf)
centroids, cluster_assignment = kmeans_model.cluster_centers_, kmeans_model.labels_

means = [centroid for centroid in centroids]

In [9]:
cluster_assignment

array([22, 23, 15, ...,  2,  2,  8], dtype=int32)

In [10]:
num_docs = tf_idf.shape[0]
weights = []
for i in xrange(num_clusters):
    # Compute the number of data points assigned to cluster i:
    num_assigned = cluster_assignment[cluster_assignment == i].shape[0] # YOUR CODE HERE
    w = float(num_assigned) / num_docs
    weights.append(w)

In [11]:
covs = []
for i in xrange(num_clusters):
    member_rows = tf_idf[cluster_assignment==i]
    cov = (member_rows.power(2) - 2*member_rows.dot(diag(means[i]))).sum(axis=0).A1 / member_rows.shape[0] \
          + means[i]**2
    cov[cov < 1e-8] = 1e-8
    covs.append(cov)

In [12]:
out = EM_for_high_dimension(tf_idf, means, covs, weights, cov_smoothing=1e-10)

In [13]:
out['loglik']

[3855847476.7012835, 4844053202.46348, 4844053202.46348]

In [14]:
len(out['means'])

25

In [15]:
def visualize_EM_clusters(tf_idf, means, covs, map_index_to_word):
    print('')
    print('==========================================================')

    num_clusters = len(means)
    for c in xrange(num_clusters):
        print('Cluster {0:d}: Largest mean parameters in cluster '.format(c))
        print('\n{0: <12}{1: <12}{2: <12}'.format('Word', 'Mean', 'Variance'))
        
        # The k'th element of sorted_word_ids should be the index of the word 
        # that has the k'th-largest value in the cluster mean. Hint: Use np.argsort().
        sorted_word_ids = np.argsort(-means[c])

        for i in sorted_word_ids[:5]:
            print '{0: <12}{1:<10.2e}{2:10.2e}'.format(map_index_to_word['category'][i], 
                                                       means[c][i],
                                                       covs[c][i])
        print '\n=========================================================='

In [16]:
visualize_EM_clusters(tf_idf, out['means'], out['covs'], map_index_to_word)


Cluster 0: Largest mean parameters in cluster 

Word        Mean        Variance    
minister    7.57e-02    7.42e-03
election    5.89e-02    3.21e-03
party       5.89e-02    2.61e-03
liberal     2.93e-02    4.55e-03
elected     2.91e-02    8.95e-04

Cluster 1: Largest mean parameters in cluster 

Word        Mean        Variance    
film        1.76e-01    6.07e-03
films       5.50e-02    2.97e-03
festival    4.66e-02    3.60e-03
feature     3.69e-02    1.81e-03
directed    3.39e-02    2.22e-03

Cluster 2: Largest mean parameters in cluster 

Word        Mean        Variance    
art         1.26e-01    6.83e-03
museum      5.62e-02    7.27e-03
gallery     3.65e-02    3.40e-03
artist      3.61e-02    1.44e-03
design      3.20e-02    4.59e-03

Cluster 3: Largest mean parameters in cluster 

Word        Mean        Variance    
basketball  1.86e-01    7.78e-03
nba         1.01e-01    1.22e-02
points      6.25e-02    5.92e-03
coach       5.57e-02    5.91e-03
team        4.68e-02    1.30e

In [17]:
np.random.seed(5) # See the note below to see why we set seed=5.
num_clusters = len(means)
num_docs, num_words = tf_idf.shape

random_means = []
random_covs = []
random_weights = []

for k in range(num_clusters):
    
    # Create a numpy array of length num_words with random normally distributed values.
    # Use the standard univariate normal distribution (mean 0, variance 1).
    # YOUR CODE HERE
    mean = np.random.normal(0, 1, num_words)
    
    # Create a numpy array of length num_words with random values uniformly distributed between 1 and 5.
    # YOUR CODE HERE
    cov = np.random.uniform(1,6,num_words)

    # Initially give each cluster equal weight.
    # YOUR CODE HERE
    weight = 1
    
    random_means.append(mean)
    random_covs.append(cov)
    random_weights.append(weight)

In [18]:
out_random_init = EM_for_high_dimension(tf_idf, random_means, random_covs, random_weights, cov_smoothing=1e-5)

In [19]:
out_random_init['loglik']

[-793165403.68923426,
 2282407852.9796767,
 2362262754.345324,
 2362514453.9992857,
 2362514453.9995394,
 2362514453.9995394]

In [20]:
visualize_EM_clusters(tf_idf, out_random_init['means'], out_random_init['covs'], map_index_to_word)


Cluster 0: Largest mean parameters in cluster 

Word        Mean        Variance    
she         3.90e-02    5.42e-03
her         2.54e-02    2.14e-03
music       2.12e-02    2.34e-03
singapore   1.77e-02    5.52e-03
bbc         1.17e-02    1.83e-03

Cluster 1: Largest mean parameters in cluster 

Word        Mean        Variance    
she         1.63e-02    2.46e-03
he          1.35e-02    1.09e-04
music       1.16e-02    1.12e-03
university  1.06e-02    3.07e-04
her         1.03e-02    8.35e-04

Cluster 2: Largest mean parameters in cluster 

Word        Mean        Variance    
she         3.12e-02    3.56e-03
her         2.41e-02    2.52e-03
music       1.51e-02    1.44e-03
he          1.10e-02    1.16e-04
festival    1.07e-02    2.03e-03

Cluster 3: Largest mean parameters in cluster 

Word        Mean        Variance    
she         2.70e-02    3.39e-03
her         1.81e-02    1.56e-03
film        1.48e-02    2.16e-03
series      1.06e-02    5.52e-04
physics     1.05e-02    4.08e