In [122]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import os
import pandas as pd
import json
import nltk
import unicodedata
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from fuzzywuzzy import fuzz
import operator

nltk.download('punkt') # for tokenize
nltk.download('wordnet') # for stemming & lemmertize

[nltk_data] Downloading package punkt to /Users/hatruong/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/hatruong/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [114]:
def load_dict_from_json(file_path):
    """
    Load content from file then parse it to dictionary

    Args:
        file_path (TYPE): Description

    Returns:
        TYPE: Description
    """
    data = {}
    with open(file_path, 'r') as file:
        text = file.read()
        temp = json.loads(text)
        data = temp
    return data


def load_concept_definition(file_path):
    """Summary

    Args:
        file_path (TYPE): Description

    Returns:
        TYPE: Description

    No Longer Raises:
        e: Description
    """
    concepts = dict()
    data = load_dict_from_json(file_path)
    for f in data['definition']:
        conceptid = f['conceptid']
        concepts[conceptid] = dict()
        concepts[conceptid]['type'] = f['type']
        concepts[conceptid]['min_value'] = f['min_value']
        concepts[conceptid]['max_value'] = f['max_value']
        concepts[conceptid]['multiply'] = f['multiply']
        concepts[conceptid]['data'] = dict()
        for v in f['data']:
            concepts[conceptid]['data'][v['value']] = v['id']

        concepts[conceptid]['segments'] = dict()
        for s in f['segments']:
            concepts[conceptid]['segments'][s['value']] = s['id']

        concepts[conceptid]['hashmaps'] = dict()
        for h in f['hashmaps']:
            concepts[conceptid]['hashmaps'][h['value']] = h['hash']

    item2concept = dict()
    for c in data['item2concept']:
        item2concept[c['itemid']] = c['conceptid']

    return concepts, item2concept

def load_d_item(d_items_fullpath):
    df = pd.read_csv(d_items_fullpath)
    item2label = dict()
    for index, row in df.iterrows():
        item2label[row['itemid']] = row['label']
    return item2label
    

In [115]:
d_items_fullpath = '../data/d_items.csv'
item2label = load_d_item(d_items_fullpath)

In [116]:
concept_dir = '../data'
export_dir = '../data/webpages'
CONCEPT_WEBPAGES_FILE_NAME = 'concept_webpage.csv'
CONCEPT_DEFINITION_FILENAME = 'concept_definition.json'

# load crawled concept definition
concept_fullpath = os.path.join(
        concept_dir, CONCEPT_DEFINITION_FILENAME)
concept_definitions, _ = load_concept_definition(concept_fullpath)
print('Total concepts: %s' % len(concept_definitions))
    
# load crawled concept webpages
concept_webpage_fullpath = os.path.join(
        concept_dir, CONCEPT_WEBPAGES_FILE_NAME)

concept_webpage_dict = dict()
df = pd.read_csv(concept_webpage_fullpath)
for item in df.to_dict('records'):
    concept_webpage_dict[item['conceptid']] = item['encrypted_urls'].split(',')

print('Already crawled %s concepts' % len(concept_webpage_dict))

Total concepts: 6380
Already crawled 74 concepts


In [77]:
# number of carevue concepts
nb_cv_concept = len(
    [idx for idx in concept_webpage_dict.keys() if idx <= 220000])
print('Number of carevue concepts: %s' % nb_cv_concept)

nb_clusters = nb_cv_concept
if nb_cv_concept < len(concept_webpage_dict) / 2:
    nb_clusters = len(concept_webpage_dict) - nb_cv_concept
print('Number of nb_clusters: %s' % nb_clusters)

Number of carevue concepts: 50
Number of nb_clusters: 50


In [78]:
# load data
DATA_DIR = '../data/webpages'
filenames = [f for f in os.listdir(DATA_DIR) if os.path.isfile(os.path.join(DATA_DIR, f))]
print('Total files: %s' % len(filenames))
# datafiles[0]
data = dict()
for fname in filenames:
    with open(os.path.join(DATA_DIR, fname), 'r') as f:
        content = f.read()
        data[fname] = content

Total files: 3204


In [79]:
from enum import Enum

class Replacement(Enum):

    """Define common constants

    Attributes:
        CURRENCY (str): Description
        DATETIME (str): Description
        EMAIL (str): Description
        EMOJI_NEG (str): Description
        EMOJI_POS (str): Description
        NUMBER (str): Description
        PHONE (str): Description
        URL (str): Description

    """
    EMAIL = ' '
    URL = ' '
    NUMBER = ' '
    PHONE = ' '
    CURRENCY = ' '
    DATETIME = ' '
    
def handle_url(text):
    """Summary

    Args:
        text (TYPE): Description

    Returns:
        TYPE: Description
    """
    text = re.sub(r'http\S+', Replacement.URL.value, text)
    return text


def handle_email(text):
    """Summary

    Args:
        text (TYPE): Description

    Returns:
        TYPE: Description
    """
    return re.sub(r'(\w+@\w+)', Replacement.EMAIL.value, text)


def handle_numbers(text):
    """Summary

    Args:
        text (TYPE): Description

    Returns:
        TYPE: Description
    """
    # normal numbers
    text = re.sub(r'^\d+\s|\s\d+\s|\s\d+$', Replacement.NUMBER.value, text)
    text = re.sub(r'\b[\d.\/,]+', Replacement.NUMBER.value, text)
    return text


def handle_phone(text):
    """
    Handle cases:
            XX XXX XXX
            XXX XXX XXX
            XXXXXXXXX
        delimiter: whitespace OR - OR empty

    Args:
        text (TYPE): Description

    Returns:
        TYPE: Description
    """
    return re.sub(r'([\+\s]*\d{2,}[-\s.]?\d{3,4}[-\s.]?\d{3,4})',
                  Replacement.PHONE.value, text)

def remove_non_alphabet(text):
    """

    Args:
        text (TYPE): Description

    Returns:
        TYPE: Description
    """
    text = re.sub(
        r'[^a-zA-Z]', ' ', text
    )
    return text

def handle_datetime(text):
    """
    Handle cases: MM/YYYY, DD/MM/YYYY, DD/MM
    delimiters: /.-

    Args:
        text (TYPE): Description

    Returns:
        TYPE: Description
    """
    # MM/YYYY
    group_1 = r'(\d{1,2}[-./]\d{4})'

    # DD/MM or DD/MM/YYYY
    group_2 = r'(\d{1,2}[-./]\d{1,2}([-./]\d{4})?)'

    # 09h56 OR 12h
    group_3 = r'(\d{1,2}(h|H)(\d{1,2}(min|mins)?)?)'
    return re.sub(r'(' + group_1 + '|' + group_2 + '|' + group_3 + ')',
                  Replacement.DATETIME.value, text)

def preprocess_sentence(text):
    """
    Args:
        text (TYPE): Description

    Returns:
        TYPE: Description
    """
    funcs = [handle_url, handle_phone, handle_datetime, handle_numbers, handle_email,]
    for f in funcs:
#         logger.debug('preprocess %s' % str(f))
        text = f(text)

    return text

def contain_punctuation(word):
    for p in string.punctuation:
        if p in word:
            return True
    return False

def preprocess_document(document):
    """
    """
    document = document.replace('\\n', '\n').strip('"b')
    document = unicodedata.normalize("NFKC", document)

    PUNCTUATIONS = string.punctuation + ' '
    sents = [s for s in nltk.sent_tokenize(document) if len(s) > 1]
#     print('\nstep1: \t', sents)

    # preprocess each sentence
    sents = [preprocess_sentence(s) for s in sents]
#     print('\nstep2: \t', sents)

    # filter again too short sentence
    sents = [s.strip() for s in sents if len(s.strip()) > 1]
#     print('\nstep3: \t', sents)

    # tokenize words
    words = [[w.strip(PUNCTUATIONS) for w in nltk.word_tokenize(s) if contain_punctuation(w) is False]
             for s in sents]
    
    # stemming & lemming
    stemmer = SnowballStemmer("english")
    lemmatizer = WordNetLemmatizer()
    
    words = [[stemmer.stem(w) for w in s] for s in words]
    words = [[lemmatizer.lemmatize(w) for w in s] for s in words]
    
    # concat words to sentences
    sents = [' '.join([w for w in s]) for s in words]
    processed_document = ' '.join([s if s.endswith(tuple(string.punctuation))
             else s + ' .' for s in sents])
#     print('\nstep4: \t', sents)
    return processed_document

In [80]:
doc = data[filenames[0]]
doc

'b"Diastolic | definition of diastolic by Medical dictionary\\nhttps://medical-dictionary.thefreedictionary.com/diastolic\\npertaining to diastole, or the blood pressure at the instant of maximum cardiac relaxation.\\ndi\\xc2\\xb7a\\xc2\\xb7stol\\xc2\\xb7ic\\nRelating to diastole.\\ndiastolic\\nPertaining to DIASTOLE . The diastolic blood pressure is the pressure during diastole and is the lower of the two figures measured. The peak pressure is called the SYSTOLIC pressure.\\nDiastolic\\nThe phase of blood circulation in which the heart\'s pumping chambers (ventricles) are being filled with blood. During this phase, the ventricles are at their most relaxed, and the pressure against the walls of the arteries is at its lowest.\\nWant to thank TFD for its existence? Tell a friend about us , add a link to this page, or visit the webmaster\'s page for free fun content .\\nLink to this page:\\nReferences in periodicals archive ?\\nThe prolongation of IVRT more than 100 msec is a significant 

In [82]:
# preprocess (sample for the first document)
# replace \n = line break
new_doc = doc.replace('\\n', '\n').strip('"b')
new_doc = unicodedata.normalize("NFKC", new_doc)
new_doc = preprocess_document(new_doc)
print(new_doc)

diastol definit of diastol by medic dictionari pertain to diastol or the blood pressur at the instant of maximum cardiac relax . relat to diastol . diastol pertain to diastol . the diastol blood pressur is the pressur dure diastol and is the lower of the two figur measur . the peak pressur is call the systol pressur . diastol the phase of blood circul in which the heart pump chamber ventricl are be fill with blood . dure this phase the ventricl are at their most relax and the pressur against the wall of the arteri is at it lowest . want to thank tfd for it exist . tell a friend about u add a link to this page or visit the webmast page for free fun content . link to this page refer in period archiv . the prolong of ivrt more than msec is a signific indic of earli lv diastol dysfunct . left atrium volum a a surrog marker of left ventricular diastol dysfunct the systol and diastol size of the left ventricl volum of the left ventricl index of the volum of the left ventricl dimens of the le

In [95]:
# TF-IDF vectorizer
fnames = [fname for fname, content in data.items()]
contents = [content for fname, content in data.items()]

t0 = time()
vectorizer = TfidfVectorizer(max_df=0.5, min_df=0.01, stop_words='english',
                             use_idf=True, lowercase=True, preprocessor=preprocess_document)
X = vectorizer.fit_transform(contents)
print("done in %fs" % (time() - t0))

In [96]:
X.shape

(3204, 3528)

In [88]:
len(fnames)

3204

## Find similar concept

In [153]:
cv_itemids = [idx for idx in concept_webpage_dict.keys() if idx <= 220000]

In [157]:
concerned_id = cv_itemids[1]
print('Item[%s]=%s' % (concerned_id, item2label[concerned_id]))
scores = list()
for idx in concept_definitions.keys():
    if idx != concerned_id and idx in item2label.keys():
        s = fuzz.ratio(item2label[concerned_id], item2label[idx])
        scores.append((idx, s, item2label[idx]))

Item[6469]=nephostomy


In [159]:
scores = sorted(scores, key=operator.itemgetter(1), reverse=True)
scores[:5]

[(6782, 95, 'nephrostomy'),
 (6414, 86, 'Nephrostomy'),
 (6021, 80, '(R) nephrostomy'),
 (7061, 77, 'left nephrostomy'),
 (5864, 75, 'ostomy')]

In [127]:
concerned_id

2098

## Cluster

In [92]:
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics
from sklearn.cluster import KMeans, MiniBatchKMeans
import sys
from time import time

In [97]:
n_components = 2
minibatch = True
verbose = 2
true_k = nb_clusters

In [98]:
print("Performing dimensionality reduction using LSA")
t0 = time()
# Vectorizer results are normalized, which makes KMeans behave as
# spherical k-means for better results. Since LSA/SVD results are
# not normalized, we have to redo the normalization.
svd = TruncatedSVD(n_components)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)

X_reduce = lsa.fit_transform(X)

print("done in %fs" % (time() - t0))

explained_variance = svd.explained_variance_ratio_.sum()
print("Explained variance of the SVD step: {}%".format(
    int(explained_variance * 100)))

print()

Performing dimensionality reduction using LSA
done in 0.061733s
Explained variance of the SVD step: 2%



In [99]:
X_reduce.shape

(3204, 2)

In [102]:
# #############################################################################
# Do the actual clustering

if minibatch:
    km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
                         init_size=1000, batch_size=1000, verbose=verbose)
else:
    km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1,
                verbose=verbose)

print("Clustering sparse data with %s" % km)
t0 = time()
km.fit(X_reduce)
print("done in %0.3fs" % (time() - t0))
print()

Clustering sparse data with MiniBatchKMeans(batch_size=1000, compute_labels=True, init='k-means++',
        init_size=1000, max_iter=100, max_no_improvement=10, n_clusters=50,
        n_init=1, random_state=None, reassignment_ratio=0.01, tol=0.0,
        verbose=2)
Init 1/1 with method: k-means++
Inertia for init 1/1: 0.113961
Minibatch iteration 1/400: mean batch inertia: 0.000135, ewa inertia: 0.000135 
Minibatch iteration 2/400: mean batch inertia: 0.000119, ewa inertia: 0.000125 
Minibatch iteration 3/400: mean batch inertia: 0.000131, ewa inertia: 0.000129 
Minibatch iteration 4/400: mean batch inertia: 0.000128, ewa inertia: 0.000128 
Minibatch iteration 5/400: mean batch inertia: 0.000117, ewa inertia: 0.000121 
Minibatch iteration 6/400: mean batch inertia: 0.000124, ewa inertia: 0.000123 
Minibatch iteration 7/400: mean batch inertia: 0.000119, ewa inertia: 0.000120 
Minibatch iteration 8/400: mean batch inertia: 0.000118, ewa inertia: 0.000119 
Minibatch iteration 9/400: mean

In [103]:
# #############################################################################
# metric

# print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
# print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
# print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
# print("Adjusted Rand-Index: %.3f"
#       % metrics.adjusted_rand_score(labels, km.labels_))

# The score is bounded between -1 for incorrect clustering and +1 for highly dense clustering. 
# Scores around zero indicate overlapping clusters.
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X_reduce, km.labels_, sample_size=1000))

print()

Silhouette Coefficient: 0.540



In [104]:
print("Top terms per cluster:")
if n_components:
    original_space_centroids = svd.inverse_transform(km.cluster_centers_)
    order_centroids = original_space_centroids.argsort()[:, ::-1]
else:
    order_centroids = km.cluster_centers_.argsort()[:, ::-1]

terms = vectorizer.get_feature_names()
for i in range(true_k):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end='')
    print()

Top terms per cluster:
Cluster 0: medic doctor health tube drug skin care seizur treatment infect
Cluster 1: patient wa pressur ventil blood medic care tube effect time
Cluster 2: ventil pressur patient volum wa lung breath airway increas blood
Cluster 3: patient medic tube wa care doctor health drug skin treatment
Cluster 4: ventil pressur patient volum lung breath airway peep respiratori wa
Cluster 5: patient wa medic care tube blood doctor effect health drug
Cluster 6: medic patient tube doctor health care drug skin wa treatment
Cluster 7: zone expens explos explor explan explain expiratori expir expertis expert
Cluster 8: patient pressur ventil wa blood volum increas breath care lung
Cluster 9: patient medic wa tube care doctor blood health drug skin
Cluster 10: patient ventil pressur wa volum breath lung blood increas respiratori
Cluster 11: patient wa medic care blood tube effect doctor pressur treatment
Cluster 12: ventil pressur patient volum lung breath airway peep respiratori

Counter({18: 32,
         47: 76,
         10: 39,
         46: 68,
         26: 65,
         5: 92,
         8: 61,
         21: 82,
         16: 63,
         35: 64,
         38: 76,
         49: 29,
         23: 143,
         9: 87,
         48: 87,
         17: 111,
         40: 67,
         19: 104,
         20: 34,
         25: 85,
         11: 84,
         14: 171,
         39: 70,
         37: 81,
         29: 100,
         41: 43,
         1: 72,
         30: 45,
         12: 35,
         15: 67,
         27: 141,
         43: 91,
         45: 27,
         4: 31,
         42: 90,
         22: 43,
         28: 27,
         3: 155,
         32: 46,
         33: 55,
         6: 63,
         0: 12,
         13: 38,
         34: 34,
         36: 33,
         31: 22,
         2: 25,
         44: 13,
         7: 7,
         24: 18})