<a href="https://colab.research.google.com/github/kleczekr/tolkenizer/blob/master/foxbook_hierarchical_clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import unicodedata

import nltk
from nltk.cluster import KMeansClusterer
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import AgglomerativeClustering
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

In [None]:
class HierarchicalClusters:
  """Pipeline step wrapping scikit-learn's ``AgglomerativeClustering``.

  ``fit`` is a no-op; the clustering itself runs inside ``transform``,
  which also keeps the results on the instance as ``labels`` and
  ``children``.
  """

  def __init__(self):
    self.model = AgglomerativeClustering()

  def fit(self, documents, labels=None):
    """Return ``self`` unchanged; both arguments are ignored."""
    return self

  def transform(self, documents):
    """Cluster ``documents`` and return the cluster assigned to each one.

    The wrapped model is fitted on every call. Afterwards the model's
    ``labels_`` and ``children_`` are available as ``self.labels`` and
    ``self.children``.
    """
    assignments = self.model.fit_predict(documents)
    # Copy the fitted attributes so callers do not need to reach into the model.
    self.labels = self.model.labels_
    self.children = self.model.children_
    return assignments

In [None]:
class TextNormalizer(BaseEstimator, TransformerMixin):
  """Scikit-learn transformer that turns tagged documents into plain text.

  A document is unpacked as an iterable of paragraphs, each paragraph as an
  iterable of sentences, and each sentence as an iterable of
  ``(token, pos_tag)`` pairs. Punctuation-only tokens and stopwords are
  dropped; every remaining token is lemmatized with WordNet and lowercased.

  Relies on ``unicodedata``, ``nltk``, ``wn`` (``nltk.corpus.wordnet``) and
  ``WordNetLemmatizer`` being imported at module level.
  """

  def __init__(self, language='english'):
    """Load the stopword list for ``language`` and create the lemmatizer."""
    # BaseEstimator.get_params() (used by repr() and clone()) reads every
    # __init__ argument back from an attribute of the same name; without
    # this assignment those calls raise AttributeError.
    self.language = language
    self.stopwords = set(nltk.corpus.stopwords.words(language))
    self.lemmatizer = WordNetLemmatizer()

  def is_punct(self, token):
    """Return True when every character of ``token`` is Unicode punctuation."""
    return all(unicodedata.category(char).startswith('P') for char in token)

  def is_stopword(self, token):
    """Return True when the lowercased ``token`` is in the stopword set."""
    return token.lower() in self.stopwords

  def normalize(self, document):
    """Return the list of lowercased lemmas kept from one tagged document."""
    return [
        self.lemmatize(token, tag).lower()
        for paragraph in document
        for sentence in paragraph
        for (token, tag) in sentence
        if not self.is_punct(token) and not self.is_stopword(token)
    ]

  def lemmatize(self, token, pos_tag):
    """Lemmatize ``token`` using the WordNet class matching ``pos_tag``.

    Only the first character of the tag is inspected (N, V, R or J); any
    other tag, including an empty one, is treated as a noun.
    """
    # Slicing instead of indexing keeps an empty tag from raising IndexError.
    tag = {
        'N': wn.NOUN,
        'V': wn.VERB,
        'R': wn.ADV,
        'J': wn.ADJ,
    }.get(pos_tag[:1], wn.NOUN)
    return self.lemmatizer.lemmatize(token, tag)

  def fit(self, X, y=None):
    """Return ``self`` unchanged; there is nothing to learn from ``X``."""
    return self

  def transform(self, documents):
    """Return one space-joined string of normalized tokens per document."""
    return [' '.join(self.normalize(doc)) for doc in documents]

In [None]:
class OneHotVectorizer(BaseEstimator, TransformerMixin):
  """Transformer producing binary (presence/absence) term vectors.

  Wraps a ``CountVectorizer(binary=True)``. ``fit`` is a no-op: the
  vectorizer is refitted on whatever is passed to ``transform``, on every
  call.
  """

  def __init__(self):
    self.vectorizer = CountVectorizer(binary=True)

  def fit(self, documents, labels=None):
    """Return ``self`` unchanged; both arguments are ignored."""
    return self

  def transform(self, documents):
    """Fit the vectorizer on ``documents`` and return one dense row each.

    Returns a list with one 1-D array per document, obtained by
    densifying each row of the matrix returned by ``fit_transform``.
    """
    return [
        sparse_row.toarray()[0]
        for sparse_row in self.vectorizer.fit_transform(documents)
    ]

In [None]:
# Build a corpus reader rooted at '../corpus' and request the documents in
# its 'news' category.
# NOTE(review): PickledCorpusReader is not imported anywhere in this
# notebook -- confirm where it is defined before running this cell.
corpus = PickledCorpusReader('../corpus')
docs = corpus.docs(categories=['news'])

In [None]:
# Three stages: normalize the text, encode it as binary term vectors,
# then cluster the encoded documents.
model = Pipeline(steps=[
  ('norm', TextNormalizer()),
  ('vect', OneHotVectorizer()),
  ('clusters', HierarchicalClusters()),
])

# The return value is discarded; the cluster assignments are read back
# from the attribute the final step stores on itself.
model.fit_transform(docs)
labels = model.named_steps['clusters'].labels

# Materialize the file ids of the same category so they can be paired with
# the labels by position.
pickles = list(corpus.fileids(categories=['news']))

for idx, fileid in enumerate(pickles):
  print('Document "{}" assigned to cluster {}.'.format(fileid, labels[idx]))

In [None]:
import numpy as np
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram

In [None]:
def plot_dendrogram(children, **kwargs):
  """Plot a dendrogram for the merge tree of an agglomerative clustering.

  Args:
    children: 2-D NumPy array with one row per merge and two columns of
      node indices (the layout of ``AgglomerativeClustering.children_``).
    **kwargs: forwarded unchanged to ``scipy.cluster.hierarchy.dendrogram``.

  Returns:
    None. The figure is displayed with ``plt.show()``.
  """
  # No real merge distances are available here, so the merge index is used
  # as a stand-in for both the distance and the observation-count columns.
  distance = position = np.arange(children.shape[0])

  # Assemble the four-column linkage matrix that dendrogram() expects.
  linkage_matrix = np.column_stack([
      children, distance, position
  ]).astype(float)

  # Draw on a fresh 10x5 figure; dendrogram() plots on the current axes.
  _, ax = plt.subplots(figsize=(10, 5))
  dendrogram(linkage_matrix, **kwargs)
  # tick_params takes booleans; the legacy 'off' strings are truthy in
  # current Matplotlib and would leave the ticks visible.
  ax.tick_params(axis='x', bottom=False, top=False, labelbottom=False)
  plt.tight_layout()
  plt.show()

In [None]:
# Fetch the merge tree stored on the 'clusters' pipeline step and plot it.
children = model.named_steps['clusters'].children
plot_dendrogram(children)