In [1]:
#cluster a set of documents using Python
#identify the latent structures within the synopses of the top 100 films

#1. tokenizing
#2. stemming (*based on stemming lib, results change)- reduce a word to its stem or root form
#3. calculate cosine distance between each document = measure of similarity
#4. cluster documents using the k-means algorithm
#5. using multidimensional scaling to reduce dimensionality within the corpus
#6. conduct a hierarchical clustering on the corpus using Ward clustering
#7. Latent Dirichlet Allocation(LDA)????

In [3]:
import numpy as np
import pandas as pd
import nltk
import re
import os
import codecs
from sklearn import feature_extraction
import mpld3
import psycopg2
# import db_conn
from IPython.display import display

In [62]:
# conn = get_connection()

def get_article_tables(is_file):
    if is_file:
#         return pd.read_csv('../topic_modeling/titles_condition.tsv', sep='\t', header=None)
        return pd.read_csv('../topic_modeling/best_files/dic_unigram_size_6000/mallet_top_sen.tsv', sep='\t')
    else:
        curs = conn.cursor()

        select_sql = """SELECT id, table_title, strip_tags(CONTENT) as content FROM article_tables order by id""" # limit 10000
        curs.execute(select_sql)
        return curs.fetchall()

In [65]:
#sample data
# from sklearn.datasets import fetch_20newsgroups
# twenty_train = fetch_20newsgroups(subset='train', shuffle=True)
# twenty_train.target_names
# from nltk.corpus import brown
# from keras.datasets import imdb
# (train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=1000)


train_data = get_article_tables(True)
train_data = train_data[['id', 'Origin_Text']]
train_data.columns=['id', 'title']
# clean_content = [x['content'].lower() for x in train_data]

In [66]:
train_data.head()

Unnamed: 0,id,title
0,39402,Association of mortality from all causes with ...
1,39431,Female relative risk of death at ages 1574 yea...
2,32718,Unadjusted and Adjusted Association of Heart R...
3,39430,Male relative risk of death at ages 1574 years...
4,35795,Single nucleotide polymorphism (SNP) haploty...


In [67]:
rep = {'nbsp':'', 'table':'', 'legend':'', 'mg/dl':'', 'g/l':'', 'yrs':'year', '\n':' ', ';':'', 'kg/m2':'', 'n=':''}#, 'e.g', '(', ')'}

# clean_content = [pattern.sub(lambda m: rep[re.escape(m.group(0))], x['content']) for x in train_data]
rep = dict((re.escape(k), v) for k, v in rep.items())
pattern = re.compile("|".join(rep.keys()))
clean_content = [pattern.sub(lambda m: rep[re.escape(m.group(0))], str(x)) for x in train_data.title]

In [68]:
clean_content[:10]

['Association of mortality from all causes with frequency of non-beverage alcohol drinking and mean volume ethanol consumed from beverages',
 'Female relative risk of death at ages 1574 years, by proxy-reported alcohol intake  (other drinkers vs reference drinkers, excluding never-drinkers)  and certified cause  (other causes vs control diseases)',
 'Unadjusted and Adjusted Association of Heart Rate Variability Variables With All-Cause Mortality, Cardiac Mortality, Sudden Cardiac Mortality, Sudden Cardiac Autopsy-Verified Mortality, Sudden Cardiac Mortality for Both Genders, Nonsudden Cardiac Mortality and Nonsudden Cardiac Mortality With Cerebrovascular Mortality  ',
 'Male relative risk of death at ages 1574 years, by proxy-reported alcohol intake  (other drinkers vs reference drinkers, excluding never-drinkers)  and certified cause  (other causes vs control diseases)',
 "Single nucleotide polymorphism  (SNP)  haplotype frequency estimates in patients with Crohn's disease with and wi

In [69]:
# re.sub('[^A-Za-z ]+', '', train_data[5])
# re.sub('[^A-Za-z0-9 ]+', '', train_data[5])

In [70]:
#stopwords
stopwords = nltk.corpus.stopwords.words('english')
stopwords[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [71]:
# stemming
# -porter stemmer
# -lancaster stemmer
# -snowball stemmer
from nltk.stem.lancaster import LancasterStemmer
# from nltk.stem.snowball import SnowballStemmer
# stemmer = SnowballStemmer('english')
stemmer = LancasterStemmer()
#tokenizing
def tokenize_and_stem(text):
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    #filter tokens not containing letters
    filtered = []
    for token in tokens:
        if re.search('[a-zA-Z]', token) and len(token)>2:
            filtered.append(token)
#     stems = [stemmer.stem(t, pos='v') for t in filtered]
    stems = [stemmer.stem(t) for t in filtered]
    return stems

def tokenize_only(text):
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered = []
    for token in tokens:
        if re.search('[a-zA-Z]', token) and len(token)>2:
            filtered.append(token)
    return filtered

In [72]:
total_vocab_stemmed = []
total_vocab_tokenized = []

for i in clean_content:
    all_stemmed = tokenize_and_stem(i)
    total_vocab_stemmed.extend(all_stemmed)
    
    all_tokenized = tokenize_only(i)
    total_vocab_tokenized.extend(all_tokenized)

In [73]:
total_vocab_tokenized[:10]

['association',
 'mortality',
 'from',
 'all',
 'causes',
 'with',
 'frequency',
 'non-beverage',
 'alcohol',
 'drinking']

In [74]:
#create dataframe with stemmed vocab and tokenized words (link)
vocab_frame = pd.DataFrame({'words':total_vocab_tokenized}, index=total_vocab_stemmed)
vocab_frame.drop_duplicates(inplace=True)
vocab_frame.head()

Unnamed: 0,words
assocy,association
mort,mortality
from,from
al,all
caus,causes


In [75]:
# len([brown.raw(__id) for __id in [_id for _id in brown.fileids()]])
# data_list = [brown.raw(__id) for __id in [_id for _id in brown.fileids()]]

In [76]:
#Tf-idf and document similarity
#frequency-inverse document frequencey(tf-idf) vectorize parameters and convert the document list into tf-idf matrix
# 1. count word occurrences by document
# 2. transform into a document-term matrix = term frequency matrix

#max_df = max frequency within the documents
#min_idf = if 5, the term would have to be in at least 5 of the documents to be considered, 0.2 = 20% of documents
#ngram_ranges

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_df=0.8, 
                                   max_features=200000, 
                                   min_df=0.01, 
                                   stop_words='english', 
                                   use_idf=True, 
                                   lowercase=True, 
                                   tokenizer=tokenize_and_stem, ngram_range=(1,2))

%time tfidf_matrix = tfidf_vectorizer.fit_transform(clean_content)
print()
print(tfidf_matrix.shape)

CPU times: user 296 ms, sys: 3.62 ms, total: 300 ms
Wall time: 299 ms

(400, 510)


In [77]:
terms = tfidf_vectorizer.get_feature_names()

In [78]:
terms

['5-year',
 'acc',
 'accord',
 'act',
 'acut',
 'adjust',
 'adjust od',
 'admin',
 'admin paracetamol',
 'adult',
 'adult ag',
 'adult year',
 'advers',
 'advers ev',
 'aft',
 'ag',
 'ag group',
 'ag year',
 'age-adjusted',
 'age-standardised',
 'age-standardised dea',
 'al',
 'al caus',
 'al grad',
 'alcohol',
 'alcohol drink',
 'all-cause',
 'allel',
 'analys',
 'analys cox',
 'analys independ',
 'analys predict',
 'angiograph',
 'antibody',
 'antibody respons',
 'aort',
 'aort psg',
 'ar',
 'area',
 'art',
 'artery',
 'artery bypass',
 'assay',
 'assess',
 'assist',
 'assocy',
 'asthm',
 'atop',
 'atp',
 'atp immunog',
 'bas',
 'bef',
 'bef cur',
 'bev',
 'bir',
 'blood',
 'blood flow',
 'blood press',
 'bmi',
 'breast',
 'bypass',
 'cad',
 'candid',
 'candid heart',
 'captopril',
 'captopril mg/kg',
 'cardiac',
 'cardiomyopathy',
 'cardiovascul',
 'cardiovascul diseas',
 'cas',
 'categ',
 'caus',
 'cel',
 'cerv',
 'cha2ds2-vasc',
 'chang',
 'check',
 'check bef',
 'chemotherapy',
 

In [79]:
#dist = cosine similarity of each document
from sklearn.metrics.pairwise import cosine_similarity

dist = 1 - cosine_similarity(tfidf_matrix)

In [80]:
dist

array([[ 0.00000000e+00,  7.04848956e-01,  8.07491131e-01, ...,
         1.00000000e+00,  1.00000000e+00,  1.00000000e+00],
       [ 7.04848956e-01, -2.22044605e-16,  1.00000000e+00, ...,
         1.00000000e+00,  9.37638829e-01,  9.55212910e-01],
       [ 8.07491131e-01,  1.00000000e+00, -2.22044605e-16, ...,
         9.61542562e-01,  1.00000000e+00,  1.00000000e+00],
       ...,
       [ 1.00000000e+00,  1.00000000e+00,  9.61542562e-01, ...,
         0.00000000e+00,  9.06334136e-01,  5.14163635e-01],
       [ 1.00000000e+00,  9.37638829e-01,  1.00000000e+00, ...,
         9.06334136e-01,  0.00000000e+00,  8.48980280e-01],
       [ 1.00000000e+00,  9.55212910e-01,  1.00000000e+00, ...,
         5.14163635e-01,  8.48980280e-01, -2.22044605e-16]])

In [81]:
#k-means- predetermined number of clusters
# nums=[3, 4, 5]
nums =[8]
from sklearn.cluster import KMeans
from __future__ import print_function

for num in nums:
    print('cluster : %s' % str(num))
    num_clusters = num
    km = KMeans(n_clusters=num_clusters)
    %time km.fit(tfidf_matrix)
    clusters = km.labels_.tolist()

    documents = {'id':[x for x in train_data.id],
                'content':clean_content,
                 'title': [str(x) for x in train_data.title],
                'cluster':clusters}

    clu_docu = pd.DataFrame(documents, index=[clusters], columns=['id','content','title','cluster'])

    print(clu_docu['cluster'].value_counts())

    #top words nearest to the cluster centroid
    print('Top terms per clusters')
    print()
    order_centroids = km.cluster_centers_.argsort()[:, ::-1]

    for i in range(num_clusters):
        print('Cluster %d words:' % i, end='')

        for ind in order_centroids[i, :]:
            print('%s' % vocab_frame.loc[terms[ind].split(' ')].values.tolist()[0][0], end=',')
        print()

        print("Cluster %d titles:" % i, end='')
        title_list = clu_docu.loc[i]['title'].values.tolist()
        adver_titles = [x for x in title_list if x.lower().find('adverse')>=0]
        adver_contents = [x for x in clu_docu.loc[i]['content'].values.tolist() if x.lower().find('adverse')>=0]
        print('count of adverse included in title %s' % str(len(adver_titles)))
        print('count of adverse included in content %s' % str(len(adver_contents)))
        print(adver_titles)

cluster : 8
CPU times: user 487 ms, sys: 2.03 ms, total: 489 ms
Wall time: 488 ms


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=8, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

0    96
2    59
6    51
7    50
3    48
1    46
4    30
5    20
Name: cluster, dtype: int64
Top terms per clusters

Cluster 0 words:ages,rate,country,years,number,health,mortality,diagnosed,changes,prevalence,sex,adulthood,childhood,age-standardised,all,total,population,combined,antibody,registry,women,ages,minor,ideation,death,frequency,malignant,birth,pancreas,deaths,group,sample,breast,globally,mg/kg,association,alcohol,expressed,artificial,polymorphism,causes,levels,survival,method,periods,maternal,beverages,stratified,developing,during,daly,trends,lesions,disorders,depression,prostate,alleles,discrimination,according,relative,expected,ischemic,incidence,individuals,continent,continent,years,people,newborn,net,time,mortality,net,millions,alcohol,smokers,pregnant,cytology,childhood,percentage,children,mg/kg,mg/kg/min,study,neonatal,death,life,drinking,costa,positive,colon,colon,rectum,sex,categories,covariates,analysis,maternal,typical,estimates,gbd,5-year,national,age-standardised,

Cluster 2 words:coronary,wall,angiographic,left,artery,ventricular,aortic,coronary,perfusion,coronary,segments,tomography,functional,patients,computer,computer,analysis,left,rt-pcr,comparison,magnetic,magnetic,resonance,according,stenosis,versus,motion,imaging,results,myocardial,infarction,emission,wall,index,versus,study,myocardial,territory,emission,fraction,artery,bypass,data,detection,mean,basis,ejection,ejection,normal,group,weights,dysfunction,sensitivity,protocols,severity,association,psg,concomitant,areas,psg,aortic,weights,diagnostic,special,systolic,cmr,accuracy,stent,bmi,concomitant,phase,definite,higher,during,volume,standard,hemodynamic,gradient,predictors,list,ages,primary,cardiac,status,value,classification,hospitalization,heart,genders,late,flow,clinical,invasive,death,acute,different,echocardiography,time,hazard,transplantation,cases,diseases,end,probability,significant,control,adjusted,after,scores,assessing,major,cardiovascular,sensitivity,trial,hazard,month,mortalit

Cluster 4 words:human,virulent,comparing,special,sensitivity,sensitivity,plasma,assay,cells,mouse,rabbit,presence,rna,responses,after,infection,testing,insulin,detection,low,western,study,typical,segments,value,diagnostic,antibody,basis,different,plus,genotype,given,reference,ventricular,dysfunction,severity,combined,cervical,rt-pcr,cytology,prognostic,ejection,ejection,left,fraction,ventricular,parameters,patients,left,sample,clinical,therapy,treatment,conventional,cases,ages,medically,standard,during,levels,grade,changes,serum,contained,assessing,genes,other,before,producing,national,months,rate,net,artery,primary,expressed,maximal,weights,increasing,discrimination,determine,late,systolic,genders,frequency,effect,set,index,total,exercise,positive,stages,scores,ratios,number,versus,more,predictors,relative,analysis,developing,des,double-blind,dose,either,drinking,doppler,developing,either,during,dogs,dtpa-hbv-ipv/hib,drug,diagnosed,diagnosed,echocardiography,disorders,diseases,drug-re

Cluster 6 words:risk,diseases,stroke,risk,heart,factors,infarction,patients,myocardial,death,scores,myocardial,relative,kidney,chronic,cardiovascular,haplotype,cardiovascular,chronic,kidney,clinical,status,heart,failure,transplantation,variability,relative,association,heart,cha2ds2-vasc,causes,drinking,other,genotype,ratios,stroke,ischemic,congestive,congestive,female,according,des,ischaemic,ischaemic,adjusted,estimates,hazard,coronary,expressed,stratified,stent,presence,population,study,categories,outcomes,coronary,years,versus,snp,polymorphism,frequency,ages,nucleotide,nucleotide,single,reference,registry,single,models,scientific,u.s.,scientific,transplantation,unos,heart,registry,candidates,u.s.,candidates,unos,status,definite,polymorphism,all,recipients,hazard,stroke,syndrome,mortality,comparison,intake,control,days,metabolic,comparing,alcohol,basis,drug,life,thrombosis,stent,cad,fraction,western,breast,increasing,list,data,before,maternal,after,stages,measured,criteria,angiographi

In [83]:
km.labels_

array([0, 6, 0, 6, 6, 6, 6, 0, 0, 6, 0, 0, 0, 6, 0, 6, 6, 6, 6, 0, 0, 0,
       0, 0, 2, 6, 6, 0, 6, 6, 2, 0, 0, 0, 0, 6, 4, 2, 6, 7, 0, 6, 6, 0,
       5, 0, 6, 6, 6, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       2, 1, 1, 1, 1, 1, 1, 1, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 6, 2, 2, 2, 2,
       6, 2, 2, 2, 2, 2, 2, 2, 6, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 2, 2, 2,
       2, 2, 6, 2, 6, 2, 2, 2, 2, 4, 2, 2, 6, 2, 2, 2, 6, 2, 2, 2, 2, 2,
       4, 2, 4, 4, 4, 4, 0, 7, 7, 7, 4, 4, 0, 2, 4, 0, 7, 7, 4, 7, 0, 4,
       4, 2, 4, 7, 0, 7, 0, 0, 0, 4, 4, 2, 7, 0, 4, 4, 4, 0, 0, 4, 4, 0,
       4, 0, 4, 4, 4, 4, 2, 4, 5, 6, 6, 6, 5, 5, 6, 5, 5, 5, 5, 5, 6, 5,
       6, 5, 7, 5, 6, 7, 5, 0, 0, 6, 6, 6, 6, 5, 5, 5, 0, 6, 6, 6, 6, 2,
       7, 7, 5, 5, 7, 5, 6, 2, 6, 6, 7, 0, 5, 7, 0,

In [None]:
# from sklearn.externals import joblib

# joblib.dump(km, 'doc_cluster.pkl')
# joblib.load('doc_cluster.pkl)

In [None]:
#multidimensional scaling
import os
import matplotlib.pyplot as plt
import matplotlib as mpl

from sklearn.manifold import MDS

mds = MDS(n_components=2, dissimilarity='precomputed', random_state=1)
pos = mds.fit_transform(dist)


In [None]:
#visualize document clusters
