In [1]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)


In [2]:
df_train=pd.read_csv('/scratch/kh2383/Mortality/data/nursing.csv')

In [3]:
df_train = df_train[df_train.Label == 1]

In [4]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(df_train.TEXT.values))

In [5]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

['multiplemyeloma', 'dx', 'neuropathy', 'bed', 'bound', 'care', 'for', 'by', 'dr', 'at', 'last', 'seen', 'at', 'in', 'gib', 'pt', 'wa', 'in', 'ush', 'at', 'nurs', 'home', 'when', 'maroon', 'stool', 'were', 'note', 'by', 'staff', 'members', 'patient', 'himself', 'wa', 'unawar', 'of', 'rectal', 'bleeding', 'he', 'deni', 'gi', 'symptoms', 'he', 'report', 'slight', 'lightheadedness', 'he', 'wa', 'transfer', 'from', 'nh', 'to', 'ed', 'where', 'his', 'hct', 'wa', 'and', 'plts', 'his', 'ngt', 'lavag', 'wa', 'neg', 'he', 'wa', 'given', 'unit', 'of', 'prbcs', 'and', 'bag', 'of', 'plts', 'gastrointestin', 'bleed', 'lower_hematochezia_brbpr', 'gi', 'bleed', 'gib', 'assessment', 'pt', 'requir', 'addit', 'unit', 'of', 'blood', 'when', 'he', 'arriv', 'in', 'the', 'micu', 'becaus', 'his', 'hct', 'onli', 'bump', 'to', 'after', 'the', 'unit', 'if', 'prbcs', 'that', 'he', 'receiv', 'in', 'the', 'ew', 'and', 'an', 'addit', 'bag', 'of', 'plt', 'for', 'plt_count', 'of', 'he', 'had', 'an', 'egd', 'no', 'ble

In [6]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [7]:
from gensim.utils import simple_preprocess
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

import spacy

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

[['multiplemyeloma', 'neuropathy', 'bed', 'bind', 'care', 'last', 'see', 'gib', 'ush', 'nur', 'home', 'maroon', 'stool', 'note', 'staff', 'member', 'patient', 'wa', 'unawar', 'rectal', 'bleeding', 'deni', 'symptom', 'report', 'slight', 'lightheadedness', 'wa', 'transfer', 'ed', 'hct', 'wa', 'plt', 'ngt', 'lavag', 'wa', 'neg', 'wa', 'give', 'unit', 'prbcs', 'bag', 'plt', 'gastrointestin', 'bleed', 'low', 'gi', 'bleed', 'gib', 'assessment', 'requir', 'addit', 'unit', 'blood', 'arriv', 'micu', 'becaus', 'hct', 'onli', 'bump', 'unit', 'prbcs', 'receiv', 'ew', 'addit', 'bag', 'plt', 'egd', 'bleed', 'wa', 'note', 'liter', 'golytley', 'colonoscopi', 'hct', 'morn', 'wa', 'give', 'addit', 'unit', 'blood', 'plt', 'morn', 'action', 'response', 'plan', 'colonoscopi', 'swallow', 'capsule', 'follow', 'hct', 'pain', 'control', 'acut', 'pain', 'chronic', 'pain', 'assessment', 'chronic', 'back', 'pain', 'take', 'contin', 'morphin', 'ir', 'give', 'contin', 'qhr', 'mg', 'be', 'morphin', 'breakthrough', '

In [8]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 1), (1, 3), (2, 1), (3, 3), (4, 1), (5, 1), (6, 3), (7, 1), (8, 2), (9, 2), (10, 1), (11, 1), (12, 2), (13, 1), (14, 3), (15, 1), (16, 2), (17, 1), (18, 1), (19, 2), (20, 1), (21, 2), (22, 1), (23, 2), (24, 2), (25, 2), (26, 1), (27, 1), (28, 1), (29, 1), (30, 2), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 2), (39, 1), (40, 1), (41, 2), (42, 3), (43, 1), (44, 4), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 2), (61, 3), (62, 1), (63, 1), (64, 1), (65, 1), (66, 3), (67, 1), (68, 1), (69, 1), (70, 1), (71, 6), (72, 1), (73, 1), (74, 3), (75, 4), (76, 2), (77, 1), (78, 1), (79, 1), (80, 1), (81, 1), (82, 1), (83, 1), (84, 1), (85, 1), (86, 1), (87, 1), (88, 3), (89, 2), (90, 1), (91, 2), (92, 1), (93, 1), (94, 1), (95, 1), (96, 1), (97, 1), (98, 1), (99, 1), (100, 1), (101, 1), (102, 1), (103, 1), (104, 4), (105, 1), (106, 8), (107, 1), (108, 1), (109, 1)]]


In [9]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=50, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [10]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(32,
  '0.470*"unrespon" + 0.148*"indicate" + 0.085*"imag" + 0.076*"benadryl" + '
  '0.000*"antibioit" + 0.000*"tnmx" + 0.000*"pipercillian" + '
  '0.000*"precutan_biliari" + 0.000*"patchi_infiltr" + 0.000*"brushing"'),
 (31,
  '0.648*"skin" + 0.160*"integr" + 0.039*"captopril" + 0.000*"pipercillian" + '
  '0.000*"precutan" + 0.000*"technic_difficulty" + 0.000*"tnmx" + '
  '0.000*"ptc_biliari" + 0.000*"nutrapho" + 0.000*"brushing"'),
 (25,
  '0.520*"rt" + 0.224*"transplant" + 0.127*"lt" + 0.000*"pipercillian" + '
  '0.000*"precutan" + 0.000*"tnmx" + 0.000*"somach" + 0.000*"antibioit" + '
  '0.000*"nutrapho" + 0.000*"brushing"'),
 (0,
  '0.347*"mass" + 0.299*"seizur" + 0.121*"dilantin" + 0.087*"seizure" + '
  '0.067*"effus" + 0.004*"lymph_node" + 0.002*"level" + 0.000*"precutan" + '
  '0.000*"cellphon" + 0.000*"precutan_biliari"'),
 (8,
  '0.249*"pull" + 0.243*"desat" + 0.178*"deep" + 0.103*"droplet_precaut" + '
  '0.031*"venous_thrombosi" + 0.027*"obstruct_intestin" + '
  '0.027*"obst

In [11]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -19.4654276579

Coherence Score:  0.460404871051


In [12]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word, mds='mmds')
vis


In [13]:
# Download File: http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
mallet_path = './mallet-2.0.8/bin/mallet' # update this path
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=50, id2word=id2word)

In [14]:
# Show Topics
pprint(ldamallet.show_topics(formatted=False))

# Compute Coherence Score
coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)


[(25,
  [('drain', 0.066939952250101353),
   ('tube', 0.039371142844272261),
   ('fluid', 0.028154421370331997),
   ('small', 0.026277459945643199),
   ('drainag', 0.020571497214589247),
   ('bowel', 0.018304127813565177),
   ('output', 0.016216946709311231),
   ('ngt', 0.016066789795336125),
   ('monitor', 0.014099734222262263),
   ('site', 0.014024655765274712)]),
 (44,
  [('neuro', 0.086891047360891496),
   ('exam', 0.031860767799326645),
   ('check', 0.029107102520249121),
   ('pupil', 0.027095636085922963),
   ('head', 0.026826723461013047),
   ('command', 0.018275301988877776),
   ('seizur', 0.017963363343982273),
   ('sbp', 0.014951541944991233),
   ('eye', 0.014897759420009251),
   ('move', 0.014338421160196629)]),
 (27,
  [('sbp', 0.043739060676779462),
   ('gtt', 0.031778733955659277),
   ('wean', 0.026454929988331388),
   ('start', 0.015096266044340723),
   ('bp', 0.014822782963827304),
   ('plan', 0.014585764294049008),
   ('action', 0.013418903150525088),
   ('titrat', 0.0

In [15]:
ldamallet.print_topics()

[(11,
  '0.071*"insulin" + 0.048*"blood" + 0.034*"plan" + 0.033*"unit" + 0.032*"action" + 0.031*"response" + 0.029*"assessment" + 0.028*"start" + 0.028*"receiv" + 0.027*"sugar"'),
 (31,
  '0.085*"sit" + 0.038*"place" + 0.031*"bipap" + 0.029*"mask" + 0.029*"high" + 0.028*"give" + 0.026*"nrb" + 0.022*"rr" + 0.021*"low" + 0.019*"cpap"'),
 (20,
  '0.074*"sedat" + 0.056*"intub" + 0.053*"vent" + 0.047*"fentanyl" + 0.036*"wean" + 0.032*"ver" + 0.030*"propofol" + 0.028*"abg" + 0.023*"mcg" + 0.016*"extub"'),
 (34,
  '0.047*"cxr" + 0.041*"show" + 0.028*"pna" + 0.025*"pneumonia" + 0.023*"chest" + 0.019*"transfer" + 0.017*"bilater" + 0.016*"worsen" + 0.014*"aspir" + 0.014*"present"'),
 (48,
  '0.211*"continu" + 0.157*"monitor" + 0.055*"order" + 0.051*"remain" + 0.037*"assessment" + 0.033*"plan" + 0.029*"action" + 0.028*"response" + 0.028*"status" + 0.026*"hemodynam"'),
 (22,
  '0.059*"abdomin" + 0.040*"pain" + 0.024*"ercp" + 0.016*"abdoman" + 0.013*"bowel" + 0.013*"distend" + 0.012*"fluid" + 0.012