# cluster a set of documents using Python

1. tokenizing
2. stemming - reduce a word to its stem or root form (*results change depending on the stemming library used)
3. calculate the cosine distance between each pair of documents as a measure of similarity
4. cluster the documents using the k-means algorithm
5. use multidimensional scaling to reduce dimensionality within the corpus
6. conduct a hierarchical clustering on the corpus using Ward clustering

References :
- https://datascienceschool.net/view-notebook/3e7aadbf88ed4f0d87a76f9ddc925d69/

In [2]:
import codecs
import os
import re

import mpld3
import nltk
import numpy as np
import pandas as pd
import psycopg2
from IPython.display import display
from sklearn import feature_extraction
from sklearn.manifold import TSNE
from sklearn.preprocessing import normalize
# import db_conn

In [3]:
# conn = get_connection()

def get_article_tables(is_file):
    """Load the article-table titles either from a TSV file or from the database.

    Parameters
    ----------
    is_file : bool
        When truthy, read '../data/titles_condition_by_t.tsv' (tab separated,
        no header row) and return it as a pandas DataFrame. Otherwise run the
        SELECT below through the module-level ``conn`` connection.

    Returns
    -------
    pandas.DataFrame or list
        A DataFrame for the file branch; the result of ``cursor.fetchall()``
        for the database branch.

    Notes
    -----
    The database branch relies on a global ``conn`` that this notebook only
    creates in a commented-out line, so it raises NameError until a connection
    is opened by the caller.
    """
    if is_file:
        return pd.read_csv('../data/titles_condition_by_t.tsv', sep='\t', header=None)

    select_sql = """SELECT id, table_title, strip_tags(CONTENT) as content FROM article_tables order by id""" # limit 10000
    curs = conn.cursor()
    try:
        curs.execute(select_sql)
        return curs.fetchall()
    finally:
        # Release the cursor even when the query fails.
        curs.close()

In [4]:
# Read the titles from the TSV export (file branch of get_article_tables).
train_data = get_article_tables(True)

# The file has no header row, so name the two columns explicitly.
train_data.columns = ['id', 'title']

In [5]:
train_data.head()

Unnamed: 0,id,title
0,4106,Analysis of efficacy
1,4107,Comparisons of postoperative CA19-9 levels on ...
2,4108,Pattern of disease relapse
3,4109,Grade 1–5 adverse events with gemcitabine alon...
4,4112,Treatment with zoledronic acid


In [6]:
# Trim surrounding whitespace and turn titles that end up empty into NaN.
# The result is assigned back to the column: calling
# `train_data['title'].replace(..., inplace=True)` operates on an intermediate
# Series and is not guaranteed to update the DataFrame on newer pandas.
train_data['title'] = train_data['title'].str.strip().replace('', np.nan)
print(train_data.isna().any())

id       False
title    False
dtype: bool


In [7]:
# Show the rows whose title is missing. An expression that is not the last
# statement of a cell is not rendered, hence the explicit display().
display(train_data.loc[train_data.title.isna()])

# Drop those rows by rebinding instead of mutating in place, so re-running the
# cell stays predictable.
train_data = train_data.dropna(subset=['title'])
print(train_data.isna().any())

Unnamed: 0,id,title


id       False
title    False
dtype: bool


In [8]:
# Cast every column of the frame (the id column included) to str.
train_data = train_data.astype(str)

In [9]:
# Literal fragments to rewrite in every title: markup residue is removed,
# 'yrs' becomes 'year' and newlines become spaces.
# NOTE(review): the fragments are matched anywhere in the text, not as whole
# words, so e.g. 'table' is also removed from inside longer words - confirm
# that this is intended.
rep = {'nbsp':'', 'table':'', 'legend':'', 'yrs':'year', '\n':' '}

# Key the mapping by the regex-escaped fragment so it can feed the alternation.
rep = {re.escape(fragment): substitute for fragment, substitute in rep.items()}
pattern = re.compile("|".join(rep))


def _swap_fragment(match):
    """Return the replacement registered for the fragment that matched."""
    return rep[re.escape(match.group(0))]


train_data.title = [pattern.sub(_swap_fragment, str(raw_title)) for raw_title in train_data.title]


In [10]:
train_data.title[:10]

0                                 Analysis of efficacy
1    Comparisons of postoperative CA19-9 levels on ...
2                           Pattern of disease relapse
3    Grade 1–5 adverse events with gemcitabine alon...
4                       Treatment with zoledronic acid
5                             Treatment with docetaxel
6    Treatments ever used at relapse, at the discre...
7    Worst adverse event  (grade)  reported over en...
8    Chemotherapy delivery and trial drug discontin...
9                                       Adverse events
Name: title, dtype: object

In [11]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim.parsing.preprocessing import strip_numeric
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *

In [12]:
# Seed NumPy's global random number generator.
np.random.seed(2018)

# Fetch the WordNet corpus; WordNetLemmatizer is used later in the notebook.
# The call is the last expression of the cell, so its return value is displayed.
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/grace/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [13]:
# Stemmer used by lemmatize_stemming(). It is looked up by name at call time,
# and a later cell rebinds `stemmer` to a LancasterStemmer, so this Snowball
# instance is only in effect until that cell has run.
stemmer = SnowballStemmer('english')

# Stop words: gensim's built-in list plus two words that are noise in table
# titles. Kept as a set because preprocess() performs one membership test per
# token, which is a linear scan on a list.
STOP_WORDS = set(gensim.parsing.preprocessing.STOPWORDS) | {'table', 'legend'}

In [14]:
# stemming
# -porter stemmer
# -lancaster stemmer
# -snowball stemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.snowball import SnowballStemmer

def lemmatize_stemming(text):
    """Lemmatize one token as a verb, then stem the lemma.

    The stemming step uses whatever object the module-level name ``stemmer``
    is bound to at the time of the call.
    """
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)

def preprocess(text):
    """Tokenize ``text``, drop stop words and one-character tokens, then stem.

    The token filtering is delegated to preprocess_token_only() so both
    functions always return lists of equal length and matching order; the
    vocabulary frame pairs their outputs position by position.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        One lemmatized and stemmed form per retained token.
    """
    return [lemmatize_stemming(token) for token in preprocess_token_only(text)]

def preprocess_token_only(text):
    """Tokenize ``text`` and keep the tokens unstemmed.

    Tokens come from gensim's simple_preprocess with deacc=True; stop words
    and tokens of a single character are discarded.
    """
    return [
        token
        for token in gensim.utils.simple_preprocess(text, deacc=True)
        if token not in STOP_WORDS and len(token) > 1
    ]

# Rebinds the module-level `stemmer` (a SnowballStemmer until this point) to a
# Lancaster stemmer. lemmatize_stemming() and tokenize_and_stem() resolve
# `stemmer` when they are called, so every call made after this line uses
# Lancaster stems.
stemmer = LancasterStemmer()
#tokenizing
def tokenize_and_stem(text):
    """Tokenize ``text`` with NLTK and return the stem of every kept token.

    Token selection is delegated to tokenize_only() (lower-cased tokens that
    contain a letter and are longer than two characters), so the two functions
    cannot drift apart. Stemming uses the module-level ``stemmer``.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        Stems, in the order the tokens appear in ``text``.
    """
    return [stemmer.stem(token) for token in tokenize_only(text)]

def tokenize_only(text):
    """Split ``text`` into lower-cased NLTK word tokens without stemming.

    Only tokens that contain at least one ASCII letter and are longer than two
    characters are returned.
    """
    lowered = (
        word.lower()
        for sentence in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sentence)
    )
    return [word for word in lowered if re.search('[a-zA-Z]', word) and len(word) > 2]

In [15]:
# Run each preprocessing function once per title and reuse the results below.
title_stems = train_data.title.map(preprocess)
title_tokens = train_data.title.map(preprocess_token_only)

# Flat vocabularies over the whole corpus. Both functions keep the same tokens
# in the same order, so position k of one list corresponds to position k of the
# other.
total_vocab_stemmed = [stem for stems in title_stems for stem in stems]
total_vocab_tokenized = [token for tokens in title_tokens for token in tokens]

# One row per document: its id next to the list of stems of its title.
processed_docs = pd.concat([train_data.id, title_stems], axis=1)

In [16]:
processed_docs[:10]

Unnamed: 0,id,title
0,4106,"[analys, eff]"
1,4107,"[comparison, postop, ca, level, surv, espac, c..."
2,4108,"[pattern, diseas, relaps]"
3,4109,"[grad, advers, ev, gemcitabin, gemcitabin, plu..."
4,4112,"[tre, zoledron, acid]"
5,4113,"[tre, docetaxel]"
6,4114,"[tre, relaps, discret, tre, clin]"
7,4115,"[worst, advers, ev, grad, report, entir, tim, ..."
8,4117,"[chemotherapy, delivery, tri, drug, discontinu]"
9,4118,"[advers, ev]"


In [17]:
# Lookup table from a stem (index) to the surface words it was derived from;
# rows repeating an already seen word are dropped.
vocab_frame = pd.DataFrame(
    {'words': total_vocab_tokenized},
    index=total_vocab_stemmed,
).drop_duplicates()
vocab_frame.head()

Unnamed: 0,words
analys,analysis
eff,efficacy
comparison,comparisons
postop,postoperative
ca,ca


## Tf-idf and document similarity

In [18]:
# Term frequency-inverse document frequency (tf-idf): configure the vectorizer
# and convert the list of titles into a tf-idf matrix.
# 1. count word occurrences by document
# 2. transform into a document-term matrix = term frequency matrix

# max_df = upper bound on document frequency; 0.8 ignores terms that occur in more than 80% of the documents
# min_df = lower bound on document frequency; if 5, the term would have to be in at least 5 of the documents
#          to be considered, 0.2 = 20% of documents (0.01 below = 1% of documents)
# ngram_range = sizes of the n-grams to extract (only used in the commented-out variant below)
# NOTE(review): stop_words='english' is combined with a custom tokenizer that already filters
# stop words and stems tokens; the built-in list is not stemmed - confirm it is still wanted.

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_df=0.8, 
                                   max_features=10000, 
                                   min_df=0.01, 
                                   stop_words='english', 
                                   use_idf=True, 
                                   lowercase=True, 
                                   tokenizer=preprocess)
#                                    tokenizer=preprocess, ngram_range=(1,2))

# Fit on the cleaned titles; the printed shape is (number of documents, number of kept terms).
%time tfidf_matrix = tfidf_vectorizer.fit_transform(train_data.title)
print(tfidf_matrix.shape)

CPU times: user 9.62 s, sys: 73.7 ms, total: 9.69 s
Wall time: 9.72 s
(27960, 147)


In [19]:
# Vocabulary of the fitted vectorizer, in column order of tfidf_matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement and returns an ndarray, which is
# converted to a plain list so `terms` keeps its original type.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    terms = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    terms = tfidf_vectorizer.get_feature_names()

In [20]:
terms[:10]

['accord',
 'act',
 'acut',
 'adjust',
 'advers',
 'ag',
 'analys',
 'angiograph',
 'artery',
 'assess']

In [21]:
%time tfidf_tsne_result = TSNE(learning_rate=300, init='pca')\
                    .fit_transform(np.array(tfidf_matrix.toarray()))
tfidf_vect = normali(tfidf_tsne_result, norm='l2')

CPU times: user 15min 31s, sys: 45.6 s, total: 16min 16s
Wall time: 16min 45s


NameError: name 'noralize' is not defined

In [None]:
# dist = cosine distance between every pair of documents (1 - cosine similarity)
# NOTE(review): the result has one row and one column per document; with the
# 27960 documents reported by the shape printed above, a dense float64 array of
# that size would need roughly 6 GB - confirm this fits in memory before running.
from sklearn.metrics.pairwise import cosine_similarity

dist = 1 - cosine_similarity(tfidf_matrix)

In [None]:
dist

## HashingVectorizer, CountVectorizer

- 문서 집합에서 단어 토큰을 생성하고 각 단어의 수를 세어 BOW 인코딩한 벡터를 만든다.
- HashingVectorizer를 사용하면 해시 함수를 사용하여 단어에 대한 인덱스 번호를 생성하기 때문에 메모리 및 실행 시간을 줄일 수 있다.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer

# Bag-of-words counts built with the same tokenizer and frequency bounds as the
# tf-idf vectorizer. Line continuations are not needed inside the parentheses.
vect = CountVectorizer(
    tokenizer=preprocess,
    stop_words='english',
    lowercase=True,
    max_features=10000,
    max_df=0.8,
    min_df=0.01,
)

# Alternatives that were tried:
# vect = HashingVectorizer(n_features=10000, lowercase=True, tokenizer=preprocess, stop_words='english')
# vect = CountVectorizer(max_df=0.8, min_df=0.02, ngram_range=(2,2))

count_matrix = vect.fit_transform(train_data.title)
vect.vocabulary_

In [1]:
%time count_tsne_result = TSNE(learning_rate=300, init='pca')\
                    .fit_transform(np.array(count_matrix.toarray()))
count_vect = noralize(count_tsne_result, norm='l2')

NameError: name 'TSNE' is not defined

NameError: name 'noralize' is not defined

In [None]:
#k-means- predetermined number of clusters
nums =[8]
from sklearn.cluster import KMeans
from __future__ import print_function

def run_kmeans(vect):
    for num in nums:
        print('cluster : %s' % str(num))
        num_clusters = num
        km = KMeans(n_clusters=num_clusters,\
                    random_state=0,\
                    init='random',\
                    algorithm='auto',\
                    max_iter=30000)
        %time km.fit(vect)
        clusters = km.labels_.tolist()

        documents = {'id':[x for x in processed_docs.id],
                    'content': train_data.title.tolist(),
                     'title': processed_docs.title.tolist(),
                    'cluster':clusters}

        clu_docu = pd.DataFrame(documents, index=[clusters], columns=['id','content','title','cluster'])

        print(clu_docu['cluster'].value_counts())

        #top words nearest to the cluster centroid
        print('Top terms per clusters')
        print()
        order_centroids = km.cluster_centers_.argsort()[:, ::-1]

        for i in range(num_clusters):
            print('Cluster %d words:' % i, end='')

            for ind in order_centroids[i, :]:
                print('%s' % vocab_frame.loc[terms[ind].split(' ')].values.tolist()[0][0], end=',')
            print()

            print("Cluster %d titles:" % i, end='')
            adver_contents = [x for x in clu_docu.loc[i]['content'].tolist() if x.lower().find('advers')>=0]
            print('count of adverse included in content %s' % str(len(adver_contents)))

        return clu_docu

In [None]:
# Cluster the documents on tfidf_vect (the normalised t-SNE embedding of the tf-idf matrix).
tfidf_clu = run_kmeans(tfidf_vect)

In [None]:
# Cluster the documents on count_vect (the normalised t-SNE embedding of the count matrix).
# NOTE(review): run_kmeans reads the global `terms`, which comes from the tf-idf
# vectorizer, not from the CountVectorizer - confirm both vocabularies match.
count_clu = run_kmeans(count_vect)

In [None]:
tfidf_clu.shape

In [None]:
count_clu.shape

In [None]:
# Rows that received the same cluster label in both runs. Both frames are
# indexed by their own cluster labels, so the two `cluster` Series are not
# identically labelled and a direct Series comparison raises ValueError;
# the comparison is therefore done by row position on the underlying arrays.
# NOTE(review): k-means label numbers are arbitrary per run, so equal labels
# do not by themselves mean that the two clusterings agree.
same_label = count_clu.cluster.values == tfidf_clu.cluster.values
count_clu[same_label]

In [None]:
#visualize document clusters
import matplotlib.pyplot as plt