# Document Clustering with DEC (Deep Embedded Clustering)

### References

* Preprocess: https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24
* DEC: https://github.com/Tony607/Keras_Deep_Clustering/blob/master/Keras-DEC.ipynb
* overall : https://github.com/madigun697/data_analysis/blob/master/Topic_Modeling/Topic_Modeling_DEC_Drug.ipynb

## Preprocess data

In [0]:
import pandas as pd
import numpy as np
import nltk
import re

# Widen the column display limit so long titles are not truncated in DataFrame output.
pd.set_option('display.max_colwidth', 999)
# Tab-separated input without a header row; column names are assigned in a later cell.
drug_data = pd.read_csv('../data/titles_condition_by_t.tsv', sep='\t', header=None)

In [0]:
# The file was read with header=None, so name the two columns explicitly.
drug_data.columns = ['id', 'title']
drug_data.head()

Unnamed: 0,id,title
0,4106,Analysis of efficacy
1,4107,Comparisons of postoperative CA19-9 levels on survival of ESPAC-4 with the CONOKO-01 and JASPAC-1 trials
2,4108,Pattern of disease relapse
3,4109,Grade 1–5 adverse events with gemcitabine alone and gemcitabine plus capecitabine
4,4112,Treatment with zoledronic acid


In [0]:
# Trim surrounding whitespace, then turn titles that are now empty into NaN so
# that missing titles show up in the isna() report below.
# The result is assigned back to the column instead of calling
# `drug_data['title'].replace(..., inplace=True)`: an in-place call on a column
# selection can act on a temporary object (and always does under pandas
# Copy-on-Write), which would leave the DataFrame unchanged.
drug_data['title'] = drug_data['title'].str.strip().replace('', np.nan)
print(drug_data.isna().any())

id       False
title    False
dtype: bool


In [0]:
# A broader mapping that was tried earlier, kept for reference:
# rep = {'nbsp':'', 'table':'', 'legend':'', 'mg/dl':'', 'g/l':'', 'yrs':'year', '\n':' ', ';':'', 'kg/m2':'', 'n=':''}

# Literal substrings to rewrite in every title (an empty value deletes the match).
# Matching is case-sensitive and not limited to whole words.
_raw_rep = {'nbsp': '', 'table': '', 'legend': '', 'yrs': 'year', '\n': ' '}

# Key the mapping by the regex-escaped form of each substring so the keys can be
# joined directly into one alternation pattern.
rep = {re.escape(needle): replacement for needle, replacement in _raw_rep.items()}
pattern = re.compile("|".join(rep))


def _replace_match(match):
    # `rep` is keyed by escaped text, so escape the matched text before the lookup.
    return rep[re.escape(match.group(0))]


# str() makes non-string cells (e.g. NaN) safe to pass to the regex.
drug_data.title = [pattern.sub(_replace_match, str(title)) for title in drug_data.title]

In [0]:
# Spot-check the first ten titles after substring replacement.
drug_data.title[:10]

0                                                                                        Analysis of efficacy
1    Comparisons of postoperative CA19-9 levels on survival of ESPAC-4 with the CONOKO-01 and JASPAC-1 trials
2                                                                                  Pattern of disease relapse
3                           Grade 1–5 adverse events with gemcitabine alone and gemcitabine plus capecitabine
4                                                                              Treatment with zoledronic acid
5                                                                                    Treatment with docetaxel
6                                Treatments ever used at relapse, at the discretion of the treating clinician
7                                            Worst adverse event  (grade)  reported over entire time on trial
8                                                        Chemotherapy delivery and trial drug discontinuation
9         

In [0]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim.parsing.preprocessing import strip_numeric
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *

In [0]:
# Seed NumPy's global random number generator.
np.random.seed(2018)

import nltk
# Download the WordNet corpus used by WordNetLemmatizer (skipped by nltk when already up to date).
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/grace/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [0]:
# Stemmer and stop-word list shared by the preprocessing helpers.
# NOTE: in this notebook `stemmer` is rebound to a LancasterStemmer in a later
# cell, so this Snowball instance is only in effect until that cell runs.
stemmer = SnowballStemmer('english')
# gensim's built-in stop words plus two extra words specific to this dataset.
STOP_WORDS = [*gensim.parsing.preprocessing.STOPWORDS, 'table', 'legend']

In [0]:
# stemming
# -porter stemmer
# -lancaster stemmer
# -snowball stemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.snowball import SnowballStemmer

def lemmatize_stemming(text):
    """Lemmatize a single token as a verb, then stem the resulting lemma.

    The stemming step uses the module-level ``stemmer``, so the output depends
    on whichever stemmer that name is bound to when this function is called.
    """
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)

def preprocess(text):
    """Tokenize ``text`` and return its lemmatized, stemmed tokens.

    Tokenization is delegated to gensim's ``simple_preprocess`` with
    ``deacc=True``, which strips accent marks from the tokens. Tokens found in
    ``STOP_WORDS`` and tokens of a single character are discarded; every
    remaining token is passed through ``lemmatize_stemming``.
    """
    tokens = gensim.utils.simple_preprocess(text, deacc=True)
    # An earlier variant also applied strip_numeric(token) before stemming.
    return [
        lemmatize_stemming(tok)
        for tok in tokens
        if len(tok) > 1 and tok not in STOP_WORDS
    ]

def preprocess_token_only(text):
    """Tokenize ``text`` and return the surviving tokens without stemming.

    Uses the same tokenization and filtering as ``preprocess`` (gensim's
    ``simple_preprocess`` with ``deacc=True`` to strip accent marks, then
    removal of ``STOP_WORDS`` entries and single-character tokens), but
    returns the tokens as they are instead of lemmatizing and stemming them.
    """
    tokens = gensim.utils.simple_preprocess(text, deacc=True)
    return [tok for tok in tokens if len(tok) > 1 and tok not in STOP_WORDS]

# Rebinds the module-level `stemmer` (previously a SnowballStemmer) to Lancaster.
# lemmatize_stemming() and tokenize_and_stem() look the name up at call time,
# so every call made after this line uses the Lancaster stemmer.
stemmer = LancasterStemmer()
#tokenizing
def tokenize_and_stem(text):
    """Split ``text`` into lowercase word tokens and return their stems.

    Sentences are split with ``nltk.sent_tokenize`` and words with
    ``nltk.word_tokenize``. A token is kept only if it contains at least one
    ASCII letter and is longer than two characters. Kept tokens are stemmed
    with the module-level ``stemmer``.
    """
    stems = []
    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            lowered = word.lower()
            if len(lowered) > 2 and re.search('[a-zA-Z]', lowered):
                stems.append(stemmer.stem(lowered))
    return stems

def tokenize_only(text):
    """Split ``text`` into lowercase word tokens without stemming them.

    Applies the same filter as ``tokenize_and_stem``: a token is kept only if
    it contains at least one ASCII letter and is longer than two characters.
    """
    has_letter = re.compile('[a-zA-Z]').search
    lowered_words = (
        word.lower()
        for sentence in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sentence)
    )
    return [tok for tok in lowered_words if has_letter(tok) and len(tok) > 2]

In [0]:
# Run preprocess() on every title and store the token lists in a new column
# (%time is an IPython magic that reports how long the statement took).
%time drug_data['processed'] = drug_data['title'].map(preprocess)
drug_data.processed[:10]

CPU times: user 11.2 s, sys: 154 ms, total: 11.4 s
Wall time: 12 s


0                                                        [analys, eff]
1    [comparison, postop, ca, level, surv, espac, conoko, jaspac, tri]
2                                            [pattern, diseas, relaps]
3         [grad, advers, ev, gemcitabin, gemcitabin, plu, capecitabin]
4                                                [tre, zoledron, acid]
5                                                     [tre, docetaxel]
6                                    [tre, relaps, discret, tre, clin]
7                   [worst, advers, ev, grad, report, entir, tim, tri]
8                      [chemotherapy, delivery, tri, drug, discontinu]
9                                                         [advers, ev]
Name: processed, dtype: object

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from sklearn.manifold import TSNE

# TF-IDF features built from the raw titles, tokenized with the custom
# preprocess() helper (which already drops STOP_WORDS and stems the tokens).
# max_df / min_df are floats, i.e. document-frequency proportions: terms that
# occur in more than 80% or fewer than 1% of the titles are discarded.
# NOTE(review): stop_words='english' is applied on top of the filtering done
# inside preprocess(); because preprocess() returns stemmed tokens, the
# built-in list may not match them — confirm this combination is intended.
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, 
                                   max_features=10000, 
                                   min_df=0.01, 
                                   stop_words='english', 
                                   use_idf=True, 
                                   lowercase=True, 
                                   tokenizer=preprocess)
#                                    tokenizer=preprocess, ngram_range=(1,2))

# Learn the vocabulary and build the sparse document-term matrix in one pass.
%time tfidf_matrix = tfidf_vectorizer.fit_transform(drug_data.title)
print(tfidf_matrix.shape)

CPU times: user 9.89 s, sys: 123 ms, total: 10 s
Wall time: 10.1 s
(27960, 147)


### L2 Normalization
 - The L2 norm measures the magnitude (Euclidean length) of a vector in n-dimensional space.

In [0]:
# Embed the densified TF-IDF matrix with t-SNE, initialised from PCA.
# This step is slow: the recorded run below took about 16 minutes.
%time tfidf_tsne_result = TSNE(learning_rate=300, init='pca')\
                    .fit_transform(np.array(tfidf_matrix.toarray()))
# L2 normalisation of the embedding is currently disabled:
# tfidf_vect = normalize(tfidf_tsne_result, norm='l2')

CPU times: user 15min 16s, sys: 43.7 s, total: 16min
Wall time: 16min 3s


In [0]:
# Peek at the first ten t-SNE coordinates.
tfidf_tsne_result[:10]

array([[ 60.524284,  20.698196],
       [ 38.65348 ,  30.628326],
       [-21.634258,  37.96718 ],
       [ 43.116405,  69.07914 ],
       [ 65.26964 ,   6.54224 ],
       [ 65.24174 ,   6.496863],
       [ 62.44582 ,   4.066565],
       [ 35.853405,  54.66684 ],
       [ 47.055466,  38.351044],
       [ 42.254856,  69.44895 ]], dtype=float32)

In [0]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words counts over the already-preprocessed tokens: each token list is
# re-joined into a space-separated string so CountVectorizer can tokenize it.
vect = CountVectorizer()
%time vect.fit([' '.join(d) for d in drug_data.processed])
# Dense count matrix. Despite its name, `tsne_data` is the vectorizer output
# (the input to t-SNE), not a t-SNE result; it is also what the DEC input `x`
# is derived from later in the notebook.
%time tsne_data = vect.transform([' '.join(d) for d in drug_data.processed]).toarray()
# t-SNE embedding of the count matrix, initialised from PCA.
%time tsne_result = TSNE(learning_rate=300, init='pca').fit_transform(np.array(tsne_data))

CPU times: user 286 ms, sys: 10.9 ms, total: 297 ms
Wall time: 297 ms


CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

CPU times: user 336 ms, sys: 167 ms, total: 503 ms
Wall time: 504 ms


In [0]:
import numpy as np

from keras.models import Model
from keras import backend as K
from keras import layers
from keras.layers import Input, Dense, Conv2D, MaxPooling2D, UpSampling2D, Flatten, Reshape, Conv2DTranspose
from keras.models import Model
from keras.engine.topology import Layer, InputSpec

# from sklearn.cluster import KMeans
# from keras.datasets import mnist
from sklearn import metrics

# Re-seed NumPy's global RNG (replaces the seed of 2018 set earlier in the notebook).
np.random.seed(0)

In [0]:
class ClusteringLayer(Layer):
    """
    Clustering layer converts input sample (feature) to soft label, i.e. a vector that represents the probability of the
    sample belonging to each cluster. The probability is calculated with student's t-distribution.

    # Example
    ```
        model.add(ClusteringLayer(n_clusters=10))
    ```
    # Arguments
        n_clusters: number of clusters.
        weights: list of Numpy array with shape `(n_clusters, n_features)` which represents the initial cluster centers.
        alpha: degrees of freedom parameter in Student's t-distribution. Default to 1.0.
    # Input shape
        2D tensor with shape: `(n_samples, n_features)`.
    # Output shape
        2D tensor with shape: `(n_samples, n_clusters)`.
    """

    def __init__(self, n_clusters, weights=None, alpha=1.0, **kwargs):
        # Accept the legacy `input_dim` keyword by translating it to `input_shape`.
        if 'input_shape' not in kwargs and 'input_dim' in kwargs:
            kwargs['input_shape'] = (kwargs.pop('input_dim'),)
        super(ClusteringLayer, self).__init__(**kwargs)
        self.n_clusters = n_clusters
        self.alpha = alpha
        self.initial_weights = weights
        self.input_spec = InputSpec(ndim=2)

    def build(self, input_shape):
        """Create the trainable cluster-centre matrix of shape `(n_clusters, n_features)`."""
        assert len(input_shape) == 2
        input_dim = input_shape[1]
        self.input_spec = InputSpec(dtype=K.floatx(), shape=(None, input_dim))
        # `shape` is passed by keyword: the position of `shape` in add_weight's
        # signature differs between Keras releases, while the keyword is stable.
        self.clusters = self.add_weight(shape=(self.n_clusters, input_dim),
                                        initializer='glorot_uniform',
                                        name='clusters')
        if self.initial_weights is not None:
            self.set_weights(self.initial_weights)
            # The initial centres are only needed once; drop the reference.
            del self.initial_weights
        self.built = True

    def call(self, inputs, **kwargs):
        """ student t-distribution, as same as used in t-SNE algorithm.
         Measure the similarity between embedded point z_i and centroid µ_j.
                 q_ij = (1 + dist(z_i, µ_j)^2 / alpha)^(-(alpha + 1) / 2), then normalize it over j.
                 q_ij can be interpreted as the probability of assigning sample i to cluster j.
                 (i.e., a soft assignment)
        Arguments:
            inputs: the variable containing data, shape=(n_samples, n_features)
        Return:
            q: student's t-distribution, or soft labels for each sample. shape=(n_samples, n_clusters)
        """
        # Broadcast (n_samples, 1, n_features) against (n_clusters, n_features)
        # to get the squared distance of every sample to every centre.
        sq_dist = K.sum(K.square(K.expand_dims(inputs, axis=1) - self.clusters), axis=2)
        q = 1.0 / (1.0 + sq_dist / self.alpha)
        q **= (self.alpha + 1.0) / 2.0
        # Normalize each row so a sample's n_clusters values add up to 1.
        q = q / K.sum(q, axis=1, keepdims=True)
        return q

    def compute_output_shape(self, input_shape):
        assert input_shape and len(input_shape) == 2
        return input_shape[0], self.n_clusters

    def get_config(self):
        """Return the layer configuration, including `alpha` so that a saved
        layer is restored with the same degrees of freedom."""
        config = {'n_clusters': self.n_clusters, 'alpha': self.alpha}
        base_config = super(ClusteringLayer, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

In [0]:
def autoencoder(dims, act='relu', init='glorot_uniform'):
    """
    Fully connected auto-encoder model, symmetric.
    Arguments:
        dims: list of number of units in each layer of encoder. dims[0] is input dim, dims[-1] is units in hidden layer.
            The decoder is symmetric with encoder. So number of layers of the auto-encoder is 2*len(dims)-1.
            Must contain at least two entries (input size and embedding size).
        act: activation, not applied to Input, Hidden and Output layers
        init: kernel initializer used for every Dense layer
    return:
        (ae_model, encoder_model), Model of autoencoder and model of encoder
    raises:
        ValueError: if dims has fewer than two entries.
    """
    # Validate before touching Keras: with a single entry the code below would
    # silently build a layer named 'encoder_-1' instead of failing clearly.
    if len(dims) < 2:
        raise ValueError(
            'dims must contain at least the input size and the embedding size, got %r' % (dims,))

    n_stacks = len(dims) - 1
    # input
    inputs = Input(shape=(dims[0],), name='input')

    # internal layers in encoder
    x = inputs
    for i in range(n_stacks - 1):
        x = Dense(dims[i + 1], activation=act, kernel_initializer=init, name='encoder_%d' % i)(x)

    # hidden layer (no activation): the embedding features are extracted from here
    encoded = Dense(dims[-1], kernel_initializer=init, name='encoder_%d' % (n_stacks - 1))(x)

    # internal layers in decoder, mirroring the encoder widths in reverse
    x = encoded
    for i in range(n_stacks - 1, 0, -1):
        x = Dense(dims[i], activation=act, kernel_initializer=init, name='decoder_%d' % i)(x)

    # output (no activation), same width as the input
    decoded = Dense(dims[0], kernel_initializer=init, name='decoder_0')(x)

    ae_model = Model(inputs=inputs, outputs=decoded, name='AE')
    encoder_model = Model(inputs=inputs, outputs=encoded, name='encoder')
    return ae_model, encoder_model

In [0]:
def target_distribution(q):
    """Compute the auxiliary target distribution from soft assignments ``q``.

    Each entry is squared and divided by its column sum (``q.sum(0)``), and
    every row of the result is then rescaled to sum to 1.

    Arguments:
        q: 2D array of soft assignments, one row per sample.
    Return:
        Array of the same shape as ``q`` whose rows each sum to 1.
    """
    squared = q ** 2
    column_totals = q.sum(0)
    weight = squared / column_totals
    row_totals = weight.sum(1)
    # Transpose so the per-row totals broadcast along the last axis, then restore.
    return (weight.T / row_totals).T

## Fully Connected DEC

In [0]:
from keras.initializers import VarianceScaling
from keras.optimizers import SGD, Adam

In [0]:
# Alternative input that was tried earlier: scale tfidf_matrix by its maximum
# in the same way instead of the count matrix.

# Scale the count matrix into [0, 1] by its largest entry.
# The maximum is taken over the whole array. The previous expression iterated
# over the rows of tsne_data and used each row of counts as an index into
# tsne_data, which is very slow, only inspects the first few rows, and can
# raise IndexError when a count is not a valid row number.
max_count = np.max(tsne_data) * 1.
x = np.divide(tsne_data, max_count)
n_clusters = 8

In [0]:
# Autoencoder layer widths: input width, three encoder layers, then a 10-unit embedding.
dims = [x.shape[-1], 500, 500, 2000, 10]
# Kernel initializer handed to the autoencoder builder.
init = VarianceScaling(scale=1. / 3., mode='fan_in', distribution='uniform')
# Optimizer for autoencoder pre-training; the SGD variant below is kept as an alternative.
# pretrain_optimizer = SGD(lr=1, momentum=0.9)
pretrain_optimizer = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
pretrain_epochs = 300
batch_size = 256

In [0]:
# NOTE: this rebinds the name `autoencoder` from the builder function to the
# returned Keras model. Re-running this cell without first re-running the cell
# that defines the function will therefore call the model object instead of
# the builder and is expected to fail.
autoencoder, encoder = autoencoder(dims, init=init)
# Attach the soft-assignment layer to the encoder's embedding output.
clustering_layer = ClusteringLayer(n_clusters, name='clustering')(encoder.output)
# Joint model with two outputs: the cluster assignments and the reconstruction.
model = Model(inputs=encoder.input, outputs=[clustering_layer, autoencoder.output])

In [0]:
# Print the layer-by-layer structure of the joint model.
model.summary()

In [0]:
# Pre-train the autoencoder on its own: the input is also the target, and the
# reconstruction error is measured with mean squared error.
autoencoder.compile(optimizer=pretrain_optimizer, loss='mse')
%time autoencoder.fit(x, x, batch_size=batch_size, epochs=pretrain_epochs) #, callbacks=cb)
# Saving the pre-trained weights is currently disabled:
# autoencoder.save_weights('./data_output/drug_ae_weights.h5')