In [11]:
"""
Creates Word2Vec Model, and LDA Model
input: raw unstructured EMR data (['text'] column in our schema)
output: Word2Vec model (.bin) using sentences and phrases; LDA model
Last update: 1.31.20
Author:  Andrew Malinow, PhD
"""

"\nCreates Word2Vec Model\ninput: raw unstructured EMR data (['text'] column in our schema)\noutput:Word2Vec model (.bin) using sentences, and phrases\nLast update: 1.31.20\nAuthor:  Andrew Malinow, PhD\n"

In [4]:
"""
Imports
"""
import math
import pickle
import re
import time

import gensim
import pandas as pd
import requests
from gensim.models import word2vec

In [5]:
"""
get data
"""
# Endpoint configuration kept in one place instead of repeating the host inline.
NOTES_API_BASE = 'http://10.32.22.16:56733'
# Divisor used to derive the page count from the note count.
# NOTE(review): assumes the service returns 100000 notes per page — TODO confirm.
NOTES_PER_PAGE = 100000
# (connect, read) timeouts in seconds, so an unreachable service fails instead of hanging.
REQUEST_TIMEOUT = (10, 600)

start_time = time.time()
# A single session reuses the underlying connection across all page requests.
with requests.Session() as session:
    count_resp = session.get(NOTES_API_BASE + '/noteeventscount', timeout=REQUEST_TIMEOUT)
    # Surface HTTP errors here rather than as a confusing JSON/KeyError further down.
    count_resp.raise_for_status()
    json_count = count_resp.json()
    count = json_count['note_count']
    page_count = math.ceil(count / NOTES_PER_PAGE)

    all_notes = []
    # Pages are requested as 1..page_count.
    for page in range(1, page_count + 1):
        resp = session.get(NOTES_API_BASE + '/noteevents/page/' + str(page), timeout=REQUEST_TIMEOUT)
        resp.raise_for_status()
        notes = resp.json()['json_notes']
        all_notes.extend(notes)
end_time = time.time()

KeyboardInterrupt: 

In [None]:
# Elapsed time is end minus start; the reverse order yields a negative duration.
print('time to retrieve all notes:', end_time - start_time)

In [None]:
# Elapsed time is end minus start; the reverse order yields a negative duration.
print('time to retrieve all notes:', end_time - start_time)

# Split the retrieved notes into sentences.
# NOTE(review): sent_tokenize is not imported anywhere in the visible notebook —
# presumably nltk.tokenize.sent_tokenize; confirm and add it to the imports cell.
# NOTE(review): str(all_notes) tokenizes the Python repr of the whole list
# (brackets, quotes, escaped newlines) rather than each note's text — confirm
# this is intended.
notes_text = sent_tokenize(str(all_notes))

In [None]:
# Number of sentences produced by the sentence split.
print(len(notes_text))

In [None]:
# Second name for the same list object (no copy is made).
new_sentences = notes_text

In [None]:
"""
validate data
(there should be more sentences than number of records)
"""
# Print both counts so the records-versus-sentences comparison can actually be made.
print('records:', len(all_notes))
print('sentences:', len(new_sentences))

In [None]:
"""
generate n-grams function
"""
def generate_ngrams(s, n, min_token_len=3):
    """Return the word n-grams of ``s`` as space-joined strings.

    The text is lowercased, every character that is not an ASCII letter,
    digit or whitespace is replaced by a space, and tokens shorter than
    ``min_token_len`` characters are dropped before the n-grams are built.

    Args:
        s: Input text.
        n: Number of tokens per n-gram.
        min_token_len: Minimum length a token must have to be kept.

    Returns:
        A list of n-grams in order of appearance, each a string of ``n``
        tokens joined by single spaces. The list is empty when fewer than
        ``n`` tokens remain or when ``n`` is not positive.
    """
    # The text is lowercased first, so the character class only needs a-z.
    cleaned = re.sub(r'[^a-z0-9\s]', ' ', s.lower())

    # split() with no argument splits on any run of whitespace, so tabs and
    # newlines separate tokens too and no empty tokens are produced.
    tokens = [token for token in cleaned.split() if len(token) >= min_token_len]

    # Zipping n progressively shifted views of the token list yields every
    # run of n consecutive tokens.
    shifted = [tokens[offset:] for offset in range(n)]
    return [" ".join(gram) for gram in zip(*shifted)]

In [None]:
"""
generate n-grams for new_sentences to prep for topic modeling
"""
# 5-grams of every sentence; each item is passed through str() before tokenizing.
ngrams = [generate_ngrams(str(sentence), 5) for sentence in new_sentences]

# Same list object under the name the following cells read.
new_sentences_ngrams = ngrams

In [None]:
"""
turn ngrams into single 'words' by replacing " " with "_"
"""
# One list per sentence, holding that sentence's n-grams with spaces turned
# into underscores. The inner iteration covers the n-grams of the current
# sentence only (not the whole corpus), so each entry mirrors its source
# sentence and the work stays linear in the number of n-grams.
sentences_ngrams_concat = [
    [str(ngram).replace(" ", "_") for ngram in sentence_ngrams]
    for sentence_ngrams in new_sentences_ngrams
]

In [None]:
"""
pre-processing: tokenize 'sentences_ngrams_concat' for topic modeling
"""
# Each entry of sentences_ngrams_concat is already a list of tokens, so it is
# copied as-is. Calling str() on the list and tokenizing that text would add
# the list's brackets, quotes and commas to the vocabulary.
ngrams_concat_tokens = [list(tokens) for tokens in sentences_ngrams_concat]
# Second name bound to the same list.
ngram_tokens = ngrams_concat_tokens


In [None]:
"""
feature engineering-derived: use LDA on clinical ngrams concat
create dictionary and corpus and save for future use
"""
start = time.time()
dictionary = gensim.corpora.Dictionary(sentences_ngrams_concat)

# Bag-of-words representation of every sentence.
corpus = [dictionary.doc2bow(text) for text in sentences_ngrams_concat]

# Persist both artifacts; the context manager closes the pickle file even if
# dumping fails.
with open('Default_n_grams-corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
dictionary.save('Default_Dictionary')
end = time.time()
print(end - start)

In [None]:
"""
create Word2Vec Model and save for future use
need to update location for saved model
"""

# Word2Vec is reached through the imported `word2vec` module, and the training
# input is the `ngrams_concat_tokens` list built in the tokenization cell.
# NOTE(review): `size=` is the pre-4.0 gensim keyword (4.x renamed it to
# `vector_size`) — confirm the installed gensim version.
model = word2vec.Word2Vec(ngrams_concat_tokens, size=100, window=10, min_count=1, workers=3)
model.wv.save_word2vec_format('Word2VecModelSentences.bin', binary=True)

In [None]:
"""
Use topic modeling to extract themes
train and save an LDA model
use num_topics parameter to determine the number of topics for the model,
and num_words parameter for how much to show
"""
# Fixed seed so repeated runs start from the same random initialization.
LDA_RANDOM_STATE = 42
lda = gensim.models.LdaMulticore(
    corpus=corpus,
    num_topics=5,
    id2word=dictionary,
    passes=10,
    workers=3,
    random_state=LDA_RANDOM_STATE,
)
lda.save("mimic-lda-full_notes.model")
topics = lda.print_topics(num_words=4)


In [None]:
# One topic per output line.
for topic in topics:
    print(topic)

In [None]:
# most_similar raises KeyError for a word outside the vocabulary, so check
# membership first and report a miss instead of failing the cell.
probe_word = 'heart'
if probe_word in model.wv:
    print(model.wv.most_similar(probe_word))
else:
    print(repr(probe_word), 'is not in the model vocabulary')

In [10]:
"""
Global Variables
"""
model_file = 'Word2VecModel.bin'
# Load binary word2vec-format vectors; undecodable bytes are ignored.
# NOTE(review): the training cell saves 'Word2VecModelSentences.bin', a different
# file name than the one loaded here — confirm which file is intended.
model = gensim.models.KeyedVectors.load_word2vec_format(
    model_file,
    binary=True,
    unicode_errors='ignore',
)