In [1]:
#The 20 Newsgroups data set is a collection of approximately 20,000 newsgroup documents,
#partitioned (nearly) evenly across 20 different newsgroups. To the best of our knowledge,
#it was originally collected by Ken Lang, probably for his paper 
#“Newsweeder: Learning to filter netnews,” though he does not explicitly mention this
#collection.
#The 20 newsgroups collection has become a popular data set for experiments
#in text applications of machine learning techniques, such as text classification 
#and text clustering.

In [12]:
# Newsgroups (sub-folder names) to load; each one acts as a class label.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

In [13]:
import sklearn
from sklearn.datasets import fetch_20newsgroups

# Load the training split from disk, restricted to the selected categories.
# File contents are decoded as UTF-8 (undecodable bytes are replaced rather
# than raising) and the documents are shuffled with a fixed seed.
dataset = sklearn.datasets.load_files(
    "../text_analytics/data/twenty_newsgroups/20news-bydate-train/",
    categories=categories,
    load_content=True,
    shuffle=True,
    encoding='UTF-8',
    decode_error='replace',
    random_state=0,
)

In [24]:
# Tokenisation: build a bag-of-words vocabulary from the training documents
# and count term occurrences per document. No stop-word list is applied.
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer(stop_words=None)
X_train_counts = count_vect.fit_transform(raw_documents=dataset.data)


In [16]:
# How many times does GPU appear?
# How many words in total does your dictionary have?
# Which words are the most common?

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Feature representation: TF-IDF weights learned from the training documents.
# No stop-word list is applied. The fitted vectorizer is reused later to
# transform the test documents.
tfidf_transformer = TfidfVectorizer(stop_words=None)
X_train_tfidf = tfidf_transformer.fit_transform(raw_documents=dataset.data)


In [20]:
import numpy as np

# Load the held-out test split with the same categories and decoding options
# as the training split (the shuffle seed differs).
dataset_test = sklearn.datasets.load_files(
    "../text_analytics/data/twenty_newsgroups/20news-bydate-test/",
    categories=categories,
    load_content=True,
    shuffle=True,
    encoding='UTF-8',
    decode_error='replace',
    random_state=42,
)

# Project the test documents into the TF-IDF space fitted on the training data.
X_test = tfidf_transformer.transform(dataset_test.data)

# Ground-truth class ids, and the number of distinct classes among them.
labels = dataset_test.target
true_k = np.unique(labels).size

In [21]:
from sklearn.cluster import KMeans

# Cluster the TF-IDF vectors of the test documents, using one cluster per
# ground-truth category (true_k). The model is fitted on X_test because the
# evaluation below compares km.labels_ with the test-set `labels` and computes
# the silhouette on X_test, so the cluster assignments must line up with those
# rows one-to-one.
# NOTE: no random_state is passed and n_init=1, so the initial centroids (and
# therefore the resulting scores) can differ between runs.
km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
km.fit(X_test)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=4, n_init=1, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [22]:
from sklearn import metrics

# The first four scores compare the cluster assignments with the ground-truth
# labels; the silhouette coefficient is computed from the feature vectors and
# the cluster assignments only.
homogeneity = metrics.homogeneity_score(labels, km.labels_)
print("Homogeneity: %0.3f" % homogeneity)

completeness = metrics.completeness_score(labels, km.labels_)
print("Completeness: %0.3f" % completeness)

v_measure = metrics.v_measure_score(labels, km.labels_)
print("V-measure: %0.3f" % v_measure)

adjusted_rand = metrics.adjusted_rand_score(labels, km.labels_)
print("Adjusted Rand-Index: %.3f" % adjusted_rand)

# sample_size=1000 is passed without a random_state, so this value may vary
# slightly from run to run.
silhouette = metrics.silhouette_score(X_test, km.labels_, sample_size=1000)
print("Silhouette Coefficient: %0.3f" % silhouette)

# TODO: What do these measures mean?

Homogeneity: 0.428
Completeness: 0.453
V-measure: 0.440
Adjusted Rand-Index: 0.418
Silhouette Coefficient: 0.005


In [23]:
print("Top terms per cluster:")
# Each row of km.cluster_centers_ is one centroid expressed in feature space.
# argsort() orders feature indices by ascending weight, so reversing the
# columns puts the highest-weighted (most characteristic) features first.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]

# Look the indices up in the vocabulary of the vectorizer that produced the
# clustered features (assumes km was fitted on output of tfidf_transformer,
# e.g. X_test); a differently configured vectorizer would map the same column
# index to a different word.
# get_feature_names() was removed in newer scikit-learn releases in favour of
# get_feature_names_out(); support both.
if hasattr(tfidf_transformer, "get_feature_names_out"):
    terms = tfidf_transformer.get_feature_names_out()
else:
    terms = tfidf_transformer.get_feature_names()

for i in range(true_k):
    print("Cluster %d:" % i)
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind])


Top terms per cluster:
Cluster 0:
 homosexuality
 homosexual
 church
 god
 paul
 edu
 christianity
 gay
 issues
 sin
Cluster 1:
 god
 edu
 mary
 sin
 com
 religion
 people
 don
 christian
 believe
Cluster 2:
 com
 edu
 article
 writes
 medical
 subject
 lines
 sgi
 organization
 cancer
Cluster 3:
 edu
 graphics
 university
 subject
 lines
 organization
 image
 computer
 ac
 thanks


In [29]:
# TODO: 
# How does it perform?
# What happens if you limit the feature space? e.g. Exclude certain stop words?

# TODO:
# Can you load other categories?
# Which categories are the easiest to cluster?

# TODO:
# What happens if we don't know the number of clusters? 

In [30]:
# OTHER THOUGHTS:
# Can you cluster based on similarity? (Cosine distance)
# What's a nice way to visualise this data?