In [71]:
import collections
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer


def word_tokenizer(text):
        """Tokenize ``text``, drop English stop words and stem what is left.

        Args:
            text: Raw string that is split into tokens with ``word_tokenize``.

        Returns:
            list: Porter-stemmed tokens in their original order. Tokens whose
            lower-cased form is an NLTK English stop word are left out.
        """
        # Evaluate the stop-word list once per call (as a set for O(1) lookups)
        # instead of once per token inside the comprehension.
        stop_words = set(stopwords.words('english'))
        stemmer = PorterStemmer()
        # Compare case-insensitively so "The" is filtered even when the caller
        # has not lower-cased the text beforehand.
        return [stemmer.stem(token)
                for token in word_tokenize(text)
                if token.lower() not in stop_words]


def vectorize(sentences):
        """Build the tf-idf matrix for ``sentences``.

        The vectorizer lower-cases the text, tokenizes it with
        ``word_tokenizer``, filters NLTK's English stop words and is
        configured with ``max_df=0.9`` and ``min_df=0.1``.

        Args:
            sentences: Iterable of raw text documents.

        Returns:
            The result of ``TfidfVectorizer.fit_transform`` for ``sentences``.
        """
        english_stop_words = stopwords.words('english')
        vectorizer_options = {
                'tokenizer': word_tokenizer,
                'stop_words': english_stop_words,
                'max_df': 0.9,
                'min_df': 0.1,
                'lowercase': True,
        }
        # learn the vocabulary and weight the sentences in one pass
        return TfidfVectorizer(**vectorizer_options).fit_transform(sentences)

def cluster_sentences(sentences, nb_of_clusters=5, random_state=None):
        """Group ``sentences`` into ``nb_of_clusters`` k-means clusters.

        Args:
            sentences: Sequence of raw text documents; they are converted to a
                tf-idf matrix with ``vectorize``.
            nb_of_clusters: Number of clusters requested from ``KMeans``.
            random_state: Forwarded to ``KMeans``. The default ``None`` keeps
                the previous unseeded behaviour; pass an int to get the same
                cluster numbering on every run.

        Returns:
            dict: Maps each cluster label that occurred (as a plain ``int``)
            to the list of indices into ``sentences`` assigned to it. A label
            that received no sentence has no key.
        """
        kmeans = KMeans(n_clusters=nb_of_clusters, random_state=random_state)
        kmeans.fit(vectorize(sentences))

        clusters = collections.defaultdict(list)
        for index, label in enumerate(kmeans.labels_):
                # int(): use plain Python ints as keys rather than NumPy scalars
                clusters[int(label)].append(index)
        return dict(clusters)


In [74]:
        # Toy corpus: seven short sentences about nature, apples and fruit.
        sentences = ["Nature is beautiful","I like green apples",
                    "We should protect the trees","Fruit trees provide fruits",
                    "Green apples are tasty", 'Life is beautiful',
                    'Pineapples are my favorite fruits']
        nclusters= 3
        clusters = cluster_sentences(sentences, nclusters)
        for cluster in range(nclusters):
                print ("cluster ",cluster,":")
                # cluster_sentences only returns keys for labels that occurred,
                # so fall back to an empty list instead of raising KeyError.
                for sentence_index in clusters.get(cluster, []):
                        print ("\tsentence ",sentence_index,": ",sentences[sentence_index])

cluster  0 :
	sentence  2 :  We should protect the trees
	sentence  3 :  Fruit trees provide fruits
	sentence  6 :  Pineapples are my favorite fruits
cluster  1 :
	sentence  0 :  Nature is beautiful
	sentence  5 :  Life is beautiful
cluster  2 :
	sentence  1 :  I like green apples
	sentence  4 :  Green apples are tasty


In [52]:
from nltk.corpus import reuters

# Print the last ten file ids of the Reuters corpus. Sample listings:
#   ['test/14826', 'test/14828', 'test/14829', 'test/14832', ...]
#   ['training/9982', 'training/9984', 'training/9985', 'training/9988',
#    'training/9989', 'training/999', 'training/9992', 'training/9993',
#    'training/9994', 'training/9995']
print(reuters.fileids()[-10:])

# reuters.categories() lists every category label, e.g.
#   ['acq', 'alum', 'barley', 'bop', 'carcass', 'castor-oil', 'cocoa',
#    'coconut', 'coconut-oil', 'coffee', 'copper', 'copra-cake', 'corn',
#    'cotton', 'cotton-oil', 'cpi', 'cpu', 'crude', 'dfl', 'dlr', ...]

# Categories of a single document; shown as the cell result.
reuters.categories(fileids=['test/14829'])
# reuters.sents(categories='acq')


['training/9982', 'training/9984', 'training/9985', 'training/9988', 'training/9989', 'training/999', 'training/9992', 'training/9993', 'training/9994', 'training/9995']


['crude', 'nat-gas']

In [54]:
# (category, positions within reuters.fileids(category)) of the sample
# documents, in the order they are clustered.
# NOTE(review): coffee uses documents 1-3 while the other categories use 0-2,
# exactly as in the previous hand-written version -- confirm that skipping the
# first coffee document is intended.
sample_documents = [
        ('coffee', (1, 2, 3)),
        ('cotton', (0, 1, 2)),
        ('crude', (0, 1, 2)),
]

# Each "sentence" here is the raw text of one whole Reuters document.
sentences = []
for category, positions in sample_documents:
        files = reuters.fileids(category)
        sentences.extend(reuters.raw(fileids=files[position]) for position in positions)

nclusters= 3
clusters = cluster_sentences(sentences, nclusters)
for cluster in range(nclusters):
        print ("cluster ",cluster,":")
        # cluster_sentences only returns keys for labels that occurred, so use
        # an empty list for a missing cluster instead of raising KeyError.
        for i, document_index in enumerate(clusters.get(cluster, [])):
                print ("\tsentence ",i,": ",sentences[document_index])

0 1
1 1
2 1
3 0
4 0
5 0
6 2
7 2
8 2
cluster  0 :
	sentence  0 :  PAKISTAN COTTON CROP SEEN RECORD 7.6 MLN BALES
  Pakistan is likely to produce a record
  7.6 mln bales (375 lbs each) of cotton from the current 1986/87
  crop, exceeding a target of 7.2 mln bales, Food and Agriculture
  Minister Mohammad Ibrahim Baluch said.
      He told a Pakistan Central Cotton Committee meeting here
  the present was the third consecutive poroduction
  record-setting year and said the momentum would be accelerated
  in the future, the official APP news agency reported.
      Baluch said indications were that Pakistan is to attain a
  record cotton production of 7.6 mln bales, compared to the
  1985/86 crop of 7.2 mln bales which also represented the target
  earlier set by authorities for this year's production.
  


	sentence  1 :  CERTIFICATED COTTON STOCKS
  Certificated cotton stocks deliverable
  on the New York Cotton Exchange No 2 cotton futures contract as
  of April 8 were reported at 34,66

In [57]:
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD

# Build the tf-idf matrix with the same helper (and therefore the same
# vectorizer settings) that cluster_sentences uses.
tfidf_matrix = vectorize(sentences)

# Fit a two-component truncated SVD on the tf-idf matrix; seeded so the
# reported numbers are repeatable.
svd = TruncatedSVD(n_components=2, n_iter=7, random_state=42)
svd.fit(tfidf_matrix)

# Per-component share of variance, their total, and the singular values.
print(svd.explained_variance_ratio_)
print(svd.explained_variance_ratio_.sum())
print(svd.singular_values_)


[ 0.00735569  0.17575117]
0.183106863748
[ 1.27431521  1.14183419]


In [None]:
import pylab as pl
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA

# NOTE(review): `iris` and `pca_2d` are not defined in any visible cell. They
# are reconstructed here as scikit-learn's Iris data set and its two-component
# PCA projection, which is what the names and the legend labels suggest --
# confirm against the original source of this snippet.
iris = load_iris()
pca_2d = PCA(n_components=2).fit_transform(iris.data)

# (target value, colour, marker, legend label) for each of the three classes.
species_styles = [
    (0, 'r', '+', 'Setosa'),
    (1, 'g', 'o', 'Versicolor'),
    (2, 'b', '*', 'Virginica'),
]

# One scatter call per class; the returned handles feed the legend.
handles = []
for target, colour, marker, _label in species_styles:
    members = iris.target == target
    handles.append(
        pl.scatter(pca_2d[members, 0], pca_2d[members, 1], c=colour, marker=marker)
    )

pl.legend(handles, [label for _target, _colour, _marker, label in species_styles])
pl.title('Iris dataset with 3 clusters and known outcomes')
pl.show()

In [58]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

# Four of the 20 Newsgroups topics, loaded from the training subset only.
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]
newsgroups = fetch_20newsgroups(categories=categories, subset="train")

# Default tf-idf settings: fit on the posts and transform them in one step.
newsgroups_vectorizer = TfidfVectorizer()
vectors = newsgroups_vectorizer.fit_transform(newsgroups.data)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [60]:
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
# Reduce the tf-idf vectors to 50 SVD components first, then embed those in
# two dimensions with t-SNE.
X_reduced = TruncatedSVD(n_components=50, random_state=0).fit_transform(vectors)
# Seed t-SNE as well (the SVD step already is) so that re-running the cell
# reproduces the same embedding.
X_embedded = TSNE(n_components=2, perplexity=40, verbose=2,
                  random_state=0).fit_transform(X_reduced)


[t-SNE] Computing 121 nearest neighbors...
[t-SNE] Indexed 2034 samples in 0.006s...
[t-SNE] Computed neighbors for 2034 samples in 0.429s...
[t-SNE] Computed conditional probabilities for sample 1000 / 2034
[t-SNE] Computed conditional probabilities for sample 2000 / 2034
[t-SNE] Computed conditional probabilities for sample 2034 / 2034
[t-SNE] Mean sigma: 0.107856
[t-SNE] Computed conditional probabilities in 0.102s
[t-SNE] Iteration 50: error = 73.6986389, gradient norm = 0.1196073 (50 iterations in 5.483s)
[t-SNE] Iteration 100: error = 73.9073334, gradient norm = 0.1122805 (50 iterations in 4.767s)
[t-SNE] Iteration 150: error = 73.3661728, gradient norm = 0.0849126 (50 iterations in 3.497s)
[t-SNE] Iteration 200: error = 73.0950623, gradient norm = 0.1006097 (50 iterations in 3.199s)
[t-SNE] Iteration 250: error = 73.4374695, gradient norm = 0.1069705 (50 iterations in 3.163s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 73.437469
[t-SNE] Iteration 300: err

In [68]:
# Shell escape: asks conda to update every package in the environment.
# NOTE(review): the recorded run stopped at conda's "Proceed ([y]/n)?" prompt
# and was interrupted (^C), so this cell did not apply any update.
!conda update --all

Solving environment: / 
  - conda-forge::bleach-1.5.0-py36_0
  - defaults::bleach-1.5.0-py36done

## Package Plan ##

  environment location: /Users/mac/anaconda3


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    terminado-0.8.1            |           py36_1          21 KB
    lzo-2.10                   |       h362108e_2         190 KB
    glib-2.56.1                |       h35bc53a_0         4.7 MB
    numpy-1.14.5               |   py36h648b28d_4          36 KB
    certifi-2018.4.16          |           py36_0         142 KB
    openpyxl-2.5.4             |           py36_0         321 KB
    absl-py-0.2.2              |           py36_0         135 KB
    wheel-0.31.1               |           py36_0          62 KB
    flask-1.0.2                |           py36_1         119 KB
    termcolor-1.1.0            |           py36_1           7 KB
    astropy-3.0.3              |   py36h1

    pandas:                             0.20.3-py36hd6655d8_2              --> 0.23.2-py36h6440ff4_0  
    pandoc:                             1.19.2.1-ha5e8f32_1                --> 2.2.1-h1a437c5_0       
    pandocfilters:                      1.4.2-py36h3b0b094_1               --> 1.4.2-py36_1           
    path.py:                            10.3.1-py36hd33c240_0              --> 11.0.1-py36_0          
    pathlib2:                           2.3.0-py36h877a6d8_0               --> 2.3.2-py36_0           
    patsy:                              0.4.1-py36ha1b3fa5_0               --> 0.5.0-py36_0           
    pcre:                               8.41-h29eefc5_0                    --> 8.42-h378b8a2_0        
    pep8:                               1.7.0-py36hc268eb1_0               --> 1.7.1-py36_0           
    pexpect:                            4.2.1-py36h3eac828_0               --> 4.6.0-py36_0           
    pillow:                             4.2.1-py36h0263179_0    

Proceed ([y]/n)? ^C


In [70]:
# Shell escape: installs basemap and fiona from the conda-forge channel.
# NOTE(review): the recorded output is cut off inside the download list, so
# it does not show whether the installation completed.
!conda install -c conda-forge basemap fiona

Solving environment: done

## Package Plan ##

  environment location: /Users/mac/anaconda3

  added / updated specs: 
    - basemap
    - fiona


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    libpq-9.6.3                |                0          89 KB  conda-forge
    openjpeg-2.3.0             |                2         401 KB  conda-forge
    glib-2.55.0                |                0         4.8 MB  conda-forge
    giflib-5.1.4               |       h470a237_0          72 KB  conda-forge
    gdal-2.2.4                 |           py36_0         845 KB  conda-forge
    click-plugins-1.0.3        |             py_1           8 KB  conda-forge
    libnetcdf-4.6.1            |                2         1.2 MB  conda-forge
    geotiff-1.4.2              |                1         1.1 MB  conda-forge
    xerces-c-3.2.0             |                0         3.5 MB  conda-forge
    fio

In [67]:
import numpy as np
from matplotlib.pyplot import *

import matplotlib.pyplot as plt

# Use the explicit pyplot namespace and the figure/axes objects so the cell
# does not depend on names injected by a star import.
fig = plt.figure(figsize=(10, 10))
ax = plt.axes(frameon=False)
# Hide both tick sets: the embedding axes carry no interpretable units.
plt.setp(ax, xticks=(), yticks=())
fig.subplots_adjust(left=0.0, bottom=0.0, right=1.0, top=0.9,
                    wspace=0.0, hspace=0.0)
# One "x" per newsgroup post, coloured by its newsgroup label; the trailing
# semicolon suppresses the artist repr in the cell output.
ax.scatter(X_embedded[:, 0], X_embedded[:, 1],
           c=newsgroups.target, marker="x");

ImportError: dlopen(/Users/mac/anaconda3/lib/python3.6/site-packages/matplotlib/ft2font.cpython-36m-darwin.so, 2): Library not loaded: @rpath/libpng16.16.dylib
  Referenced from: /Users/mac/anaconda3/lib/libfreetype.6.dylib
  Reason: Incompatible library version: libfreetype.6.dylib requires version 51.0.0 or later, but libpng16.16.dylib provides version 49.0.0

In [20]:
from nltk.corpus import reuters


# Toy corpus: seven short sentences about nature, apples and fruit.
sentences = [
        "Nature is beautiful",
        "I like green apples",
        "We should protect the trees",
        "Fruit trees provide fruits",
        "Green apples are tasty",
        'Life is beautiful',
        'Pineapples are my favorite fruits',
]
nclusters = 3
clusters = cluster_sentences(sentences, nclusters)

for cluster in range(nclusters):
        print ("cluster ",cluster,":")
        members = clusters[cluster]
        # The printed number is the position inside the cluster, not the
        # sentence's index in `sentences`.
        for i, sentence in enumerate(members):
                print ("\tsentence ",i,": ",sentences[sentence])

cluster  0 :
	sentence  0 :  I like green apples
	sentence  1 :  Green apples are tasty
cluster  1 :
	sentence  0 :  We should protect the trees
	sentence  1 :  Fruit trees provide fruits
	sentence  2 :  Pineapples are my favorite fruits
cluster  2 :
	sentence  0 :  Nature is beautiful
	sentence  1 :  Life is beautiful


In [12]:
#!/usr/bin/env python3

import nltk
import pickle

from nltk.corpus.reader.api import CorpusReader
from nltk.corpus.reader.api import CategorizedCorpusReader

# File ids of the form "<category>/<hex name>.json": the category part allows
# lower-case letters, underscores and whitespace, the name part lower-case hex
# digits, and the id must not start with a dot. Not referenced in this cell.
DOC_PATTERN = r'(?!\.)[a-z_\s]+/[a-f0-9]+\.json'
# Same layout with a ".pickle" extension; default ``fileids`` pattern of
# PickledCorpusReader.
PKL_PATTERN = r'(?!\.)[a-z_\s]+/[a-f0-9]+\.pickle'
# Captures the directory part of a file id; used as the default
# ``cat_pattern`` of PickledCorpusReader.
CAT_PATTERN = r'([a-z_\s]+)/.*'


class PickledCorpusReader(CategorizedCorpusReader, CorpusReader):
    """
    Categorized corpus reader for documents stored as one pickle per file.

    The accessors below iterate an unpickled document as paragraphs, each
    paragraph as sentences and each sentence as tokens; per the method
    docstrings a token is a ``(token, tag)`` tuple.
    """

    def __init__(self, root, fileids=PKL_PATTERN, **kwargs):
        """
        Initialize the corpus reader.  Categorization arguments
        (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
        the ``CategorizedCorpusReader`` constructor.  The remaining arguments
        are passed to the ``CorpusReader`` constructor.

        :param root: root directory of the corpus.
        :param fileids: list or regular expression selecting the corpus
            files; defaults to ``PKL_PATTERN``.
        :param kwargs: ``cat_*`` categorization options plus any keyword
            accepted by ``CorpusReader`` (e.g. ``encoding``).
        """
        # Add the default category pattern if not passed into the class.
        if not any(key.startswith('cat_') for key in kwargs):
            kwargs['cat_pattern'] = CAT_PATTERN

        # CategorizedCorpusReader takes the kwargs dict itself and removes the
        # cat_* entries it consumes, so whatever is left afterwards belongs to
        # CorpusReader and is forwarded instead of being silently dropped.
        CategorizedCorpusReader.__init__(self, kwargs)
        CorpusReader.__init__(self, root, fileids, **kwargs)

    def _resolve(self, fileids, categories):
        """
        Returns a list of fileids or categories depending on what is passed
        to each internal corpus reader function. This primarily bubbles up to
        the high level ``docs`` method, but is implemented here similar to
        the nltk ``CategorizedPlaintextCorpusReader``.

        :raises ValueError: if both ``fileids`` and ``categories`` are given.
        :return: the fileids of ``categories`` when categories are given,
            otherwise ``fileids`` unchanged (possibly ``None``).
        """
        if fileids is not None and categories is not None:
            raise ValueError("Specify fileids or categories, not both")

        if categories is not None:
            return self.fileids(categories)
        return fileids

    def docs(self, fileids=None, categories=None):
        """
        Returns the document loaded from a pickled object for every file in
        the corpus. Similar to the BaleenCorpusReader, this uses a generator
        to achieve memory safe iteration.
        """
        # Resolve the fileids and the categories
        fileids = self._resolve(fileids, categories)

        # Create a generator, loading one document into memory at a time.
        # Only the path is needed; encoding and fileid are ignored.
        for path, _enc, _fileid in self.abspaths(fileids, True, True):
            with open(path, 'rb') as f:
                # SECURITY: unpickling executes arbitrary code embedded in the
                # file -- only point this reader at corpora you trust.
                yield pickle.load(f)

    def paras(self, fileids=None, categories=None):
        """
        Returns a generator of paragraphs where each paragraph is a list of
        sentences, which is in turn a list of (token, tag) tuples.
        """
        for doc in self.docs(fileids, categories):
            for paragraph in doc:
                yield paragraph

    def sents(self, fileids=None, categories=None):
        """
        Returns a generator of sentences where each sentence is a list of
        (token, tag) tuples.
        """
        for paragraph in self.paras(fileids, categories):
            for sentence in paragraph:
                yield sentence

    def words(self, fileids=None, categories=None):
        """
        Returns a generator of (token, tag) tuples.
        """
        for sentence in self.sents(fileids, categories):
            for token in sentence:
                yield token


if __name__ == '__main__':
    from collections import Counter

    # Tally every item yielded by the reader's words() generator.
    corpus = PickledCorpusReader('../corpus')
    words = Counter(corpus.words())

    # Distinct entries versus total number of entries seen.
    vocabulary_size = len(words)
    total_count = sum(words.values())
    print("{:,} vocabulary {:,} word count".format(vocabulary_size, total_count))


0 vocabulary 0 word count


In [8]:
import os
import nltk
import unicodedata
import numpy as np

from itertools import groupby
from operator import itemgetter

from reader import PickledCorpusReader

from nltk.corpus import wordnet as wn
from nltk.cluster import KMeansClusterer

# NLTK's English stop-word list as a set; default filter used by normalize().
STOPWORDS = set(nltk.corpus.stopwords.words('english'))
# Shared WordNet lemmatizer instance used by normalize().
lemmatizer = nltk.WordNetLemmatizer()


def is_punct(token):
    """
    Return True when every character of ``token`` is in a Unicode
    punctuation category (a category whose name starts with 'P').

    An empty token has no non-punctuation character and yields True.
    """
    for char in token:
        if not unicodedata.category(char).startswith('P'):
            # one non-punctuation character is enough to decide
            return False
    return True


def wnpos(tag):
    """
    Return the WordNet POS constant for a Penn Treebank tag.

    Only the first character of ``tag`` is inspected: 'N' maps to noun,
    'V' to verb, 'R' to adverb and 'J' to adjective. Anything else,
    including an empty or missing tag, falls back to noun.
    """
    penn_to_wordnet = {
        'N': wn.NOUN,
        'V': wn.VERB,
        'R': wn.ADV,
        'J': wn.ADJ,
    }
    # Slice rather than index so '' (or None) takes the noun fallback instead
    # of raising IndexError / TypeError.
    initial = (tag or '')[:1]
    return penn_to_wordnet.get(initial, wn.NOUN)


def normalize(document, stopwords=STOPWORDS):
    """
    Lazily yield the lemma of every meaningful token in ``document``.

    ``document`` is an iterable of (token, tag) pairs. Each token is
    lower-cased and stripped; tokens that are pure punctuation or that
    appear in ``stopwords`` are skipped, and the rest are lemmatized with
    the WordNet part of speech derived from their tag.
    """
    for raw_token, pos_tag in document:
        cleaned = raw_token.lower().strip()

        # keep only tokens that are neither punctuation nor stop words
        if not is_punct(cleaned) and cleaned not in stopwords:
            yield lemmatizer.lemmatize(cleaned, wnpos(pos_tag))


class KMeansTopics(object):
    """
    Clusters the documents of a corpus with NLTK's k-means implementation,
    representing every document as a binary vocabulary-presence vector.
    """

    def __init__(self, corpus, k=10, categories=('news',)):
        """
        corpus is a corpus object, e.g. an HTMLCorpusReader()
        or an HTMLPickledCorpusReader() object

        k is the number of clusters

        categories is the corpus category (a single name or an iterable of
        names) whose documents build the vocabulary and are clustered; it
        defaults to the 'news' category.
        """
        self.k = k
        self.model = None
        # Accept a bare string as well as an iterable of category names.
        if isinstance(categories, str):
            self.categories = [categories]
        else:
            self.categories = list(categories)
        # Sort the vocabulary so the feature order (and therefore every
        # document vector) is the same from run to run; a bare set has no
        # stable iteration order for strings.
        self.vocab = sorted(
            set(normalize(corpus.words(categories=self.categories)))
            )

    def vectorize(self, document):
        """
        Vectorizes a document consisting of a list of part of speech
        tagged tokens using the segmentation and tokenization methods.

        Returns a ``np.short`` array with one entry per vocabulary term:
        1 when the normalized document contains the term, 0 otherwise.
        """
        features = set(normalize(document))
        return np.array([
            token in features for token in self.vocab], np.short)

    def cluster(self, corpus):
        """
        Fits the K-Means model to the given data.

        Every file of ``self.categories`` in ``corpus`` is vectorized and
        clustered with cosine distance; the fitted model is stored in
        ``self.model``.
        """
        cosine = nltk.cluster.util.cosine_distance
        self.model = KMeansClusterer(
            self.k, distance=cosine, avoid_empty_clusters=True)
        self.model.cluster([
            self.vectorize(
                corpus.words(fileid)
            ) for fileid in corpus.fileids(categories=self.categories)
        ])

    def classify(self, document):
        """
        Pass through to the internal model classify

        ``cluster`` must have been called first; until then ``self.model``
        is None.
        """
        return self.model.classify(self.vectorize(document))

if __name__ == '__main__':
    corpus = PickledCorpusReader('../corpus')

    clusterer = KMeansTopics(corpus, k=7)
    clusterer.cluster(corpus)

    # Pair every news document with the cluster the fitted model assigns it.
    groups = []
    for fileid in corpus.fileids(categories=['news']):
        assigned = clusterer.classify(corpus.words(fileid))
        groups.append((assigned, fileid))

    # A stable sort on the cluster id lists the documents cluster by cluster;
    # clusters are printed 1-based.
    groups.sort(key=itemgetter(0))
    for cluster, fname in groups:
        print("Cluster {}: {}".format(cluster+1,fname))


ModuleNotFoundError: No module named 'reader'

In [2]:
# NOTE(review): this install attempt failed in the recorded run ("No matching
# distribution found for reader"). The `from reader import PickledCorpusReader`
# statement in this notebook names a class that is defined in one of the
# notebook's own cells, so `reader` is presumably a local module rather than a
# PyPI package -- confirm.
!pip install reader

Collecting reader
[31m  Could not find a version that satisfies the requirement reader (from versions: )[0m
[31mNo matching distribution found for reader[0m
