# We try text clustering, with news groups data. 

The tutorial has been taken from http://scikit-learn.org/stable/auto_examples/text/document_clustering.html#sphx-glr-auto-examples-text-document-clustering-py

Author of the turorial
###### Author: Peter Prettenhofer <peter.prettenhofer@gmail.com>
######         Lars Buitinck
###### License: BSD 3 clause


### Load some categories data from the training set

In [25]:
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]

# Uncomment the following to do the analysis on all the categories
# categories = None

In [26]:
print("Loading 20 newsgroups dataset for categories:")
print(categories)


from sklearn.datasets import fetch_20newsgroups
dataset = fetch_20newsgroups(subset='all', categories=categories,
                             shuffle=True, random_state=42)

print("%d documents" % len(dataset.data))
print("%d categories" % len(dataset.target_names))
print()

Loading 20 newsgroups dataset for categories:
['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
3387 documents
4 categories



In [27]:
labels = dataset.target
labels[1:10]

array([1, 1, 3, 1, 1, 2, 2, 2, 0])

In [28]:
import numpy as np
print(np.unique(labels))
true_k = np.unique(labels).shape[0]
true_k

[0 1 2 3]


4

In [29]:
print("Extracting features from the training dataset using a sparse vectorizer")

from time import time
t0 = time()

from sklearn.feature_extraction.text import TfidfVectorizer

use_idf = True
n_features = 10

vectorizer = TfidfVectorizer(max_df=0.5, max_features=n_features,
                                 min_df=2, stop_words='english',
                                 use_idf=use_idf)

X = vectorizer.fit_transform(dataset.data)

print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X.shape)
print()

Extracting features from the training dataset using a sparse vectorizer
done in 0.633680s
n_samples: 3387, n_features: 10



In [30]:
#Do LSA ?

from sklearn.cluster import KMeans, MiniBatchKMeans
verbose = True
km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1,  verbose=verbose)

print("Clustering sparse data with %s" % km)
t0 = time()
km.fit(X)
print("done in %0.3fs" % (time() - t0))
print()

Clustering sparse data with KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=4, n_init=1, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=True)
Initialization complete
Iteration  0, inertia 2405.973
Iteration  1, inertia 1857.665
Iteration  2, inertia 1821.776
Iteration  3, inertia 1757.259
Iteration  4, inertia 1701.503
Iteration  5, inertia 1683.263
Iteration  6, inertia 1677.052
Iteration  7, inertia 1675.694
Iteration  8, inertia 1675.098
Iteration  9, inertia 1674.673
Iteration 10, inertia 1674.584
Iteration 11, inertia 1674.180
Iteration 12, inertia 1673.973
Iteration 13, inertia 1673.932
Iteration 14, inertia 1673.893
Iteration 15, inertia 1673.861
Iteration 16, inertia 1673.702
Iteration 17, inertia 1673.675
Iteration 18, inertia 1673.634
Converged at iteration 18: center shift 4.060555e-06 within tolerance 6.970741e-06
done in 0.121s



In [31]:
from sklearn import metrics
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(labels, km.labels_))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, km.labels_, sample_size=1000))

print()

Homogeneity: 0.025
Completeness: 0.026
V-measure: 0.026
Adjusted Rand-Index: 0.024
Silhouette Coefficient: 0.191

