In [11]:
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
import sklearn.datasets
import nltk
import nltk.stem

# 20 Newsgroups dataset

## Load dataset

In [4]:
# Newsgroup categories kept for this experiment: five computing groups plus sci.space.
groups = [
    'comp.graphics',
    'comp.os.ms-windows.misc',
    'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware',
    'comp.windows.x',
    'sci.space',
]
# Fetch only the training split, restricted to the categories listed above.
trainData = sklearn.datasets.fetch_20newsgroups(
    subset='train',
    categories=groups,
)

## Define encoder

In [17]:
# Shared English Snowball stemmer; the analyzer built by the class below
# looks this name up at module level whenever it processes a document.
stemmer = nltk.stem.SnowballStemmer('english')


class StemmedTfidfVectorizer(TfidfVectorizer):
    """TfidfVectorizer variant that stems every token produced by its analyzer."""

    def build_analyzer(self):
        """Return the parent analyzer wrapped so that each token is stemmed.

        The returned callable takes a single document and lazily yields
        ``stemmer.stem(token)`` for each token the parent analyzer emits.
        """
        baseAnalyzer = super().build_analyzer()

        def stemmedAnalyzer(doc):
            # Generator expression: tokens are stemmed one at a time as consumed.
            return (stemmer.stem(token) for token in baseAnalyzer(doc))

        return stemmedAnalyzer

# Drop stems that occur in fewer than 10 documents or in more than half of
# them, and remove English stop words.
vectorizer = StemmedTfidfVectorizer(
    min_df=10,
    max_df=0.5,
    stop_words='english',
)
# Learn the vocabulary from the training posts and encode them in one pass.
dataVectors = vectorizer.fit_transform(trainData.data)
dataVectors.shape

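## Define similarity measure

No separate similarity function is defined here: KMeans compares documents by the Euclidean distance between their TF-IDF vectors, which `TfidfVectorizer` L2-normalises by default.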

## Modelling with KMeans

In [19]:
nClusters = 50
# With init='random' and n_init=1 the whole result hinges on a single random
# draw of starting centroids, so pin the seed: otherwise cluster ids (and every
# downstream lookup by cluster id) change on each Restart & Run All.
randomSeed = 3
km = KMeans(
    n_clusters=nClusters,
    init='random',
    n_init=1,
    verbose=1,
    random_state=randomSeed,
)
km.fit(dataVectors)

Initialization complete
Iteration 0, inertia 5921.767135688507
Iteration 1, inertia 3215.062227225114
Iteration 2, inertia 3172.2057626828664
Iteration 3, inertia 3149.5092642419786
Iteration 4, inertia 3138.939692352874
Iteration 5, inertia 3133.0219739385493
Iteration 6, inertia 3128.7663891691113
Iteration 7, inertia 3126.3121061730603
Iteration 8, inertia 3124.2389798743498
Iteration 9, inertia 3122.91414161452
Iteration 10, inertia 3122.0468986096666
Iteration 11, inertia 3121.0291502614177
Iteration 12, inertia 3120.268315617974
Iteration 13, inertia 3119.629648173989
Iteration 14, inertia 3118.7314949308816
Iteration 15, inertia 3117.6938513811233
Iteration 16, inertia 3116.56076727272
Iteration 17, inertia 3115.7329367721827
Iteration 18, inertia 3115.104392966361
Iteration 19, inertia 3114.0296674374285
Iteration 20, inertia 3112.9785773058657
Iteration 21, inertia 3112.4791160034565
Iteration 22, inertia 3112.1024944561605
Iteration 23, inertia 3111.350355611858
Iteration 24,

KMeans(init='random', n_clusters=50, n_init=1, verbose=1)

In [20]:
# Cluster index assigned to each training document, in the row order of dataVectors.
km.labels_

array([25, 33, 13, ..., 42, 21,  0])

In [21]:
# Fitted centroids: one row per cluster, one column per term in the vectorizer's vocabulary.
km.cluster_centers_

array([[0.        , 0.        , 0.00194577, ..., 0.        , 0.        ,
        0.        ],
       [0.00421971, 0.00118877, 0.00034221, ..., 0.00070723, 0.        ,
        0.00097567],
       [0.00696256, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.00219239, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.01232016, 0.0012089 , 0.00212849, ..., 0.00030275, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

## Testing with KMeans

In [24]:
# Unseen post used to probe the fitted clustering.
testText = 'Disk drive problems. Hi, I have a problem with my hard disk. After 1 year it is working only sporadically now. I tried to format it, but now it doesn\'t boot any more. Any ideas? Thanks.'
# Encode with the already-fitted vectorizer (transform only, no refit) so the
# new post lands in the same feature space as the training data.
newDataVector = vectorizer.transform([testText])
# predict returns one label per input row; a single post was passed in.
predictedLabels = km.predict(newDataVector)
newDataLabel = predictedLabels[0]
newDataLabel

5

In [34]:
# Boolean mask over training documents: True where the document was assigned
# to the same cluster as the new post.
inSameCluster = km.labels_ == newDataLabel
# Row indices (into the training data) of those documents.
clusterItems = inSameCluster.nonzero()[0]