# cluster a set of documents using Python

1. tokenizing
2. stemming (*based on stemming lib, results change)- reduce a word to its stem or root form
3. calculate cosine distance between each document = measure of similarity
4. cluster documents using the k-means algorithm
5. using multidimensional scaling to reduce dimensionality within the corpus
6. conduct a hierarchical clustering on the corpus using Ward clustering

References :
- https://datascienceschool.net/view-notebook/3e7aadbf88ed4f0d87a76f9ddc925d69/
- https://medium.com/@vegi/visualizing-higher-dimensional-data-using-t-sne-on-tensorboard-7dbf22682cf2

In [1]:
import numpy as np
import pandas as pd
import nltk
import re
import os
import codecs
from sklearn import feature_extraction
import mpld3
import psycopg2
# import db_conn
from IPython.display import display
from sklearn.manifold import TSNE
from sklearn.preprocessing import normalize

In [2]:
# conn = get_connection()

def get_article_tables(is_file):
    if is_file:
        return pd.read_csv('../data/titles_condition_by_t.tsv', sep='\t', header=None)
#         return pd.read_csv('../topic_modeling/best_files/dic_unigram_size_6000/mallet_top_sen.tsv', sep='\t')
    else:
        curs = conn.cursor()

        select_sql = """SELECT id, table_title, strip_tags(CONTENT) as content FROM article_tables order by id""" # limit 10000
        curs.execute(select_sql)
        return curs.fetchall()

In [3]:
train_data = get_article_tables(True)
# train_data = train_data[['id', 'Origin_Text']]
train_data.columns=['id', 'title']
# clean_content = [x['content'].lower() for x in train_data]

In [4]:
train_data.head()

Unnamed: 0,id,title
0,4106,Analysis of efficacy
1,4107,Comparisons of postoperative CA19-9 levels on ...
2,4108,Pattern of disease relapse
3,4109,Grade 1–5 adverse events with gemcitabine alon...
4,4112,Treatment with zoledronic acid


In [5]:
train_data.title = train_data.title.str.strip()
train_data['title'].replace('', np.nan, inplace=True)
print(train_data.isna().any())

id       False
title    False
dtype: bool


In [6]:
train_data.loc[train_data.title.isna()]
train_data.dropna(subset=['title'], inplace=True)
print(train_data.isna().any())

Unnamed: 0,id,title


id       False
title    False
dtype: bool


In [7]:
train_data = train_data.astype(str)

In [8]:
# rep = {'nbsp':'', 'table':'', 'legend':'', 'mg/dl':'', 'g/l':'', 'yrs':'year', '\n':' ', ';':'', 'kg/m2':'', 'n=':''}
rep = {'nbsp':'', 'table':'', 'legend':'', 'yrs':'year', '\n':' '}
# clean_content = [pattern.sub(lambda m: rep[re.escape(m.group(0))], x['content']) for x in train_data]
rep = dict((re.escape(k), v) for k, v in rep.items())
pattern = re.compile("|".join(rep.keys()))
train_data.title = [pattern.sub(lambda m: rep[re.escape(m.group(0))], str(x)) for x in train_data.title]


In [9]:
train_data.title[:10]

0                                 Analysis of efficacy
1    Comparisons of postoperative CA19-9 levels on ...
2                           Pattern of disease relapse
3    Grade 1–5 adverse events with gemcitabine alon...
4                       Treatment with zoledronic acid
5                             Treatment with docetaxel
6    Treatments ever used at relapse, at the discre...
7    Worst adverse event  (grade)  reported over en...
8    Chemotherapy delivery and trial drug discontin...
9                                       Adverse events
Name: title, dtype: object

In [10]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim.parsing.preprocessing import strip_numeric
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *

In [11]:
np.random.seed(2018)

import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/grace/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [12]:
#stopwords
stemmer = SnowballStemmer('english')
STOP_WORDS = list(gensim.parsing.preprocessing.STOPWORDS)
STOP_WORDS.extend(['table', 'legend'])

In [13]:
# stemming
# -porter stemmer
# -lancaster stemmer
# -snowball stemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.snowball import SnowballStemmer

def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
#     deacc=True removes punctuations
    for token in gensim.utils.simple_preprocess(text, deacc=True):
        if token not in STOP_WORDS and len(token)>1:
#             result.append(lemmatize_stemming(strip_numeric(token)))
            result.append(lemmatize_stemming(token))
    return result

def preprocess_token_only(text):
    result = []
#     deacc=True removes punctuations
    for token in gensim.utils.simple_preprocess(text, deacc=True):
        if token not in STOP_WORDS and len(token)>1:
#             result.append(lemmatize_stemming(strip_numeric(token)))
            result.append(token)
    return result

stemmer = LancasterStemmer()
#tokenizing
def tokenize_and_stem(text):
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    #filter tokens not containing letters
    filtered = []
    for token in tokens:
        if re.search('[a-zA-Z]', token) and len(token)>2:
            filtered.append(token)
#     stems = [stemmer.stem(t, pos='v') for t in filtered]
    stems = [stemmer.stem(t) for t in filtered]
    return stems

def tokenize_only(text):
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered = []
    for token in tokens:
        if re.search('[a-zA-Z]', token) and len(token)>2:
            filtered.append(token)
    return filtered

In [14]:
total_vocab_stemmed = []
total_vocab_tokenized = []

for i in train_data.title.tolist():
    all_stemmed = preprocess(i)
    total_vocab_stemmed.extend(all_stemmed)
    
    all_tokenized = preprocess_token_only(i)
    total_vocab_tokenized.extend(all_tokenized)

processed_docs = pd.DataFrame()
processed_docs = pd.concat([train_data.id, train_data.title.map(preprocess)], axis=1)

In [15]:
processed_docs[:10]

Unnamed: 0,id,title
0,4106,"[analys, eff]"
1,4107,"[comparison, postop, ca, level, surv, espac, c..."
2,4108,"[pattern, diseas, relaps]"
3,4109,"[grad, advers, ev, gemcitabin, gemcitabin, plu..."
4,4112,"[tre, zoledron, acid]"
5,4113,"[tre, docetaxel]"
6,4114,"[tre, relaps, discret, tre, clin]"
7,4115,"[worst, advers, ev, grad, report, entir, tim, ..."
8,4117,"[chemotherapy, delivery, tri, drug, discontinu]"
9,4118,"[advers, ev]"


In [16]:
#create dataframe with stemmed vocab and tokenized words (link)
vocab_frame = pd.DataFrame({'words':total_vocab_tokenized}, index=total_vocab_stemmed)
vocab_frame.drop_duplicates(inplace=True)
vocab_frame.head()

Unnamed: 0,words
analys,analysis
eff,efficacy
comparison,comparisons
postop,postoperative
ca,ca


## Tf-idf and document similarity

In [17]:
#frequency-inverse document frequencey(tf-idf) vectorize parameters and convert the document list into tf-idf matrix
# 1. count word occurrences by document
# 2. transform into a document-term matrix = term frequency matrix

#max_df = max frequency within the documents
#min_idf = if 5, the term would have to be in at least 5 of the documents to be considered, 0.2 = 20% of documents
#ngram_ranges

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_df=0.8, 
                                   max_features=10000, 
                                   min_df=0.01, 
                                   stop_words='english', 
                                   use_idf=True, 
                                   lowercase=True, 
                                   tokenizer=preprocess)
#                                    tokenizer=preprocess, ngram_range=(1,2))

%time tfidf_matrix = tfidf_vectorizer.fit_transform(train_data.title)
print(tfidf_matrix.shape)

CPU times: user 10 s, sys: 85.3 ms, total: 10.1 s
Wall time: 10.2 s
(27960, 147)


In [18]:
terms = tfidf_vectorizer.get_feature_names()

In [19]:
terms[:10]

['accord',
 'act',
 'acut',
 'adjust',
 'advers',
 'ag',
 'analys',
 'angiograph',
 'artery',
 'assess']

Ref : http://www.pbarrett.net/techpapers/euclid.pdf
Normalize :

The problem of raw distance coefficient is that it has no obvious bound value for the maximum distance.
Basically you don't know from its size whether a coefficient indicates a small or large distance.

In [20]:
%time tfidf_tsne_result = TSNE(learning_rate=300, init='pca', \
                               n_iter=250, random_state=0)\
                    .fit_transform(np.array(tfidf_matrix.toarray()))
tfidf_vect = normalize(tfidf_tsne_result, norm='l2')

CPU times: user 8min 57s, sys: 23.8 s, total: 9min 21s
Wall time: 13min 22s


### Visualization with Tensorboard

In [21]:
import os
import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

  return f(*args, **kwds)
  from ._conv import register_converters as _register_converters


In [22]:
tf_data = tf.Variable(tfidf_tsne_result)
with tf.Session() as sess:
    saver = tf.train.Saver([tf_data])
    sess.run(tf_data.initializer)
#     saver.save(sess, os.path.join(LOG_DIR, 'tf_data.ckpt'))
    config = projector.ProjectorConfig()
    
    embedding = config.embeddings.add()
    embedding.tensor_name = tf_data.name
    
#     embedding.metadata_path = metadata
#     projector.visualize_embeddings(tf.summary.FileWriter(LOG_DIR), config)

In [23]:
#dist = cosine similarity of each document
from sklearn.metrics.pairwise import cosine_similarity

dist = 1 - cosine_similarity(tfidf_matrix)

In [24]:
dist

array([[ 0.00000000e+00,  1.00000000e+00,  1.00000000e+00, ...,
         8.53883514e-01,  1.00000000e+00,  1.00000000e+00],
       [ 1.00000000e+00, -2.22044605e-16,  1.00000000e+00, ...,
         5.88511646e-01,  1.00000000e+00,  1.00000000e+00],
       [ 1.00000000e+00,  1.00000000e+00,  0.00000000e+00, ...,
         1.00000000e+00,  1.00000000e+00,  1.00000000e+00],
       ...,
       [ 8.53883514e-01,  5.88511646e-01,  1.00000000e+00, ...,
         0.00000000e+00,  1.00000000e+00,  1.00000000e+00],
       [ 1.00000000e+00,  1.00000000e+00,  1.00000000e+00, ...,
         1.00000000e+00, -2.22044605e-16,  1.00000000e+00],
       [ 1.00000000e+00,  1.00000000e+00,  1.00000000e+00, ...,
         1.00000000e+00,  1.00000000e+00, -2.22044605e-16]])

## HashingVectorizer, CountVectorizer

- 문서 집합에서 단어 토큰을 생성하고 각 단어의 수를 세어 BOW 인코딩한 벡터를 만든다.
- HashingVectorizer를 사용하면 해시 함수를 사용하여 단어에 대한 인덱스 번호를 생성하기 때문에 메모리 및 실행 시간을 줄일 수 있다.

In [25]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer

vect = CountVectorizer(max_df=0.8, \
                       min_df=0.01, \
                       max_features=10000,\
                       lowercase=True,\
                       tokenizer=preprocess,\
                       stop_words='english')
# vect = HashingVectorizer(n_features=10000,\
#                        lowercase=True,\
#                        tokenizer=preprocess,\
#                        stop_words='english')

# vect = CountVectorizer(max_df=0.8, min_df=0.02, ngram_range=(2,2))
count_matrix = vect.fit_transform(train_data.title)
vect.vocabulary_

{'accord': 0,
 'act': 1,
 'acut': 2,
 'adjust': 3,
 'advers': 4,
 'ag': 5,
 'analys': 6,
 'angiograph': 7,
 'artery': 8,
 'assess': 9,
 'assocy': 10,
 'bas': 11,
 'blood': 12,
 'cardiac': 13,
 'cardiovascul': 14,
 'cas': 15,
 'categ': 16,
 'caus': 17,
 'chang': 18,
 'childr': 19,
 'class': 20,
 'clin': 21,
 'cohort': 22,
 'combin': 23,
 'comp': 24,
 'comparison': 25,
 'comply': 26,
 'cont': 27,
 'control': 28,
 'coron': 29,
 'correl': 30,
 'country': 31,
 'cox': 32,
 'dat': 33,
 'day': 34,
 'dea': 35,
 'death': 36,
 'diagnos': 37,
 'diff': 38,
 'diseas': 39,
 'distribut': 40,
 'dos': 41,
 'drug': 42,
 'eff': 43,
 'effect': 44,
 'end': 45,
 'endpoint': 46,
 'estim': 47,
 'ev': 48,
 'exerc': 49,
 'fact': 50,
 'fail': 51,
 'flow': 52,
 'follow': 53,
 'frequ': 54,
 'funct': 55,
 'gen': 56,
 'group': 57,
 'hazard': 58,
 'heal': 59,
 'heart': 60,
 'hemodynam': 61,
 'high': 62,
 'hospit': 63,
 'incid': 64,
 'ind': 65,
 'independ': 66,
 'index': 67,
 'individ': 68,
 'infarct': 69,
 'infect': 7

In [26]:
%time count_tsne_result = TSNE(learning_rate=300, init='pca')\
                    .fit_transform(np.array(count_matrix.toarray()))
count_vect = normalize(count_tsne_result, norm='l2')

CPU times: user 20min 18s, sys: 1min 44s, total: 22min 2s
Wall time: 2h 39min 15s


In [42]:
#k-means- predetermined number of clusters
nums =[8]
from sklearn.cluster import KMeans
from __future__ import print_function

def run_kmeans(vect):
#     for num in nums:
#         print('cluster : %s' % str(num))
    num_clusters = 8
    km = KMeans(n_clusters=num_clusters,\
                random_state=0,\
                init='random',\
                algorithm='auto',\
                max_iter=30000)
    %time km.fit(vect)
    clusters = km.labels_.tolist()

    documents = {'id':[x for x in processed_docs.id],
                'content': train_data.title.tolist(),
                 'title': processed_docs.title.tolist(),
                'cluster':clusters}

    clu_docu = pd.DataFrame(documents, index=[clusters], columns=['id','content','title','cluster'])

    print(clu_docu['cluster'].value_counts())

    #top words nearest to the cluster centroid
    print('Top terms per clusters')
    print()
    order_centroids = km.cluster_centers_.argsort()[:, ::-1]

    for i in range(num_clusters):
        print('Cluster %d words:' % i, end='')

        for ind in order_centroids[i, :]:
            print('%s' % vocab_frame.loc[terms[ind].split(' ')].values.tolist()[0][0], end=',')
        print()

        print("Cluster %d titles:" % i, end='')
        adver_contents = [x for x in clu_docu.loc[i]['content'].tolist() if x.lower().find('advers')>=0]
        print('count of adverse included in content %s' % str(len(adver_contents)))

    return km, clu_docu, km.cluster_centers_

In [32]:
km, tfidf_clu, tfidf_centers = run_kmeans(tfidf_vect)

cluster : 8
CPU times: user 517 ms, sys: 21.3 ms, total: 538 ms
Wall time: 553 ms
2    6515
7    4416
1    3628
0    3268
4    3248
5    2871
3    2696
6    1318
Name: cluster, dtype: int64
Top terms per clusters

Cluster 0 words:activation,according,
Cluster 0 titles:count of adverse included in content 21
Cluster 1 words:according,activation,
Cluster 1 titles:count of adverse included in content 2
Cluster 2 words:activation,according,
Cluster 2 titles:count of adverse included in content 33
Cluster 3 words:according,activation,
Cluster 3 titles:count of adverse included in content 7
Cluster 4 words:activation,according,
Cluster 4 titles:count of adverse included in content 3
Cluster 5 words:according,activation,
Cluster 5 titles:count of adverse included in content 3
Cluster 6 words:activation,according,
Cluster 6 titles:count of adverse included in content 34
Cluster 7 words:according,activation,
Cluster 7 titles:count of adverse included in content 1094


In [45]:
km, tfidf_clu, tfidf_centers = run_kmeans(tfidf_matrix)
import pickle

output = open('../data/output/kmeans.pkl', 'wb')
pickle.dump({'topic_model': km}, output)
output.close()

CPU times: user 18.6 s, sys: 53.2 ms, total: 18.7 s
Wall time: 18.7 s
1    15185
3     3020
2     2096
7     2059
4     1730
0     1696
6     1602
5      572
Name: cluster, dtype: int64
Top terms per clusters

Cluster 0 words:treatment,events,patients,adverse,effectiveness,population,analysis,groups,study,according,weeks,related,days,response,rate,months,safety,clinician,period,year,follow,change,risk,time,trials,incidence,number,drug,randomised,outcomes,reported,failure,hospitalization,states,death,medication,different,primary,mortality,summary,endpoints,efficacy,comparisons,data,ratio,versus,association,disease,infarction,comparative,myocardial,severely,cause,results,control,end,mean,overall,dose,parameters,type,heart,non,survival,measures,acute,estimated,therapy,coronial,subgroups,assessed,factors,frequencies,stratified,prevalences,total,adjusted,blood,concentrations,complications,post,score,second,variable,levels,month,hemodynamic,infection,angiographic,function,cohorts,specific,ac

Cluster 6 words:groups,treatment,patients,study,age,control,comparisons,risk,according,different,mortality,rate,events,outcomes,clinician,data,follow,mean,change,related,results,number,coronial,year,cause,effectiveness,variable,incidence,time,randomised,death,measures,score,analysis,ratio,parameters,states,predicted,hospitalization,hemodynamic,adverse,medication,days,value,comparative,months,sex,disease,exercise,factors,adjusted,distribution,dose,population,procedures,heart,index,function,period,myocardial,prevalences,trials,deaths,type,angiographic,cardiac,primary,test,model,high,blood,concentrations,ventricular,non,health,specific,women,association,weeks,total,survival,left,month,subjects,frequencies,diagnosed,flow,end,summary,odds,complications,regions,artery,levels,infarction,drug,country,points,intervals,versus,second,response,categories,proportional,therapy,pressure,endpoints,estimated,participants,activation,overall,presenting,post,severely,combined,case,reported,hazards,stratif

In [46]:
km.cluster_centers_.shape

(8, 147)