In [22]:
from n2c2_tokenizer import build_n2c2_tokenizer #University of Utah code. Credit to Kelly and Jianlin
import time, os, sys, multiprocessing, nltk, itertools
from collections import Counter
from multiprocessing import Pool
from sqlalchemy import create_engine, MetaData, Table, select

In [None]:
'''
--> Prepare input data and a vocabulary file.
Train the biLM.
Test (compute the perplexity of) the biLM on heldout data.
Write out the weights from the trained biLM to a hdf5 file.
See the instructions above for using the output from Step #4 in downstream models.
'''

In [2]:
'''Step 0: Initalize our tokenizer for MIMIC data'''

# Single switch for the sentence splitter. With it off, the build log printed
# below this cell reports NLTK Punkt as the sentence tokenizer.
ENABLE_PYRUSH_SENTENCE_TOKENIZER = False

n2c2_tokenizer = build_n2c2_tokenizer(
    enable_pyrush_sentence_tokenizer=ENABLE_PYRUSH_SENTENCE_TOKENIZER,
    # Deliberately tied to the same flag: custom preprocessing is disabled
    # exactly when the PyRuSH sentence tokenizer is enabled.
    disable_custom_preprocessing=ENABLE_PYRUSH_SENTENCE_TOKENIZER,
    keep_token_strings=True,
)

# Smoke test on a toy document: the result exposes one list of tokens per sentence.
tokenized_doc_example = n2c2_tokenizer.tokenize_document(
    "I am a simple document. here are my sentences. nlp is the best."
)

print(tokenized_doc_example.sentence_tokens_list)

Building n2c2 tokenizer...
('.', '!')
Enabling NLTK Punkt for sentence tokenization...
Type of sentence tokenizer : <class 'nltk.tokenize.punkt.PunktSentenceTokenizer'>
Enabling custom preprocessing expressions.  Total : 8
Class type initialized for ClinicalSentenceTokenizer for sentence tokenization : <class 'nltk.tokenize.punkt.PunktSentenceTokenizer'>
Compiled 8 total preprocessing regular expressions
Class type initialized for IndexTokenizer for sentence tokenization: <class 'clinical_tokenizers.ClinicalSentenceTokenizer'>
[['I', 'am', 'a', 'simple', 'document', '.'], ['here', 'are', 'my', 'sentences', '.'], ['nlp', 'is', 'the', 'best', '.']]


In [4]:
'''Step 1: Load the Mimic data. I have my Mimic data in an sqlite database. 
For how to do this, see: https://github.com/hclent/PyPatent/blob/master/readMimic.py'''

def getMimicTexts():
    '''
    Load every MIMIC note from the local sqlite database as lowercase text.

    Input: N/A (reads the `TEXT` column of the `mydata` table in ./mimic.db)
    Output: List[Strings], one entry per row, right-stripped and **lowercase**,
    in the order the database returns the rows.
    We're going to use this List[Strings] to create the set of vocabulary words that is needed for BiLM.

    The database connection is closed and the engine disposed before returning,
    including when reading a row raises.
    '''
    t1 = time.time() #start timer

    engine = create_engine('sqlite:///mimic.db') #initiated database engine
    try:
        metadata = MetaData(bind=engine) #init metadata. will be empty
        metadata.reflect(engine) #retrieve db info for metadata (tables, columns, types)
        mydata = Table('mydata', metadata)

        data: list[str] = []

        # NOTE(review): select([...]) and row["TEXT"] are the calling style this
        # notebook was run with (the recorded output shows a ResultProxy);
        # confirm against the installed SQLAlchemy version before upgrading.
        s = select([mydata.c.TEXT])
        print(type(s))

        # The context manager releases the connection even if a row fails midway.
        with engine.connect() as conn:
            result = conn.execute(s)
            print(type(result))
            for row in result:
                # lowercase v important.
                # NB: tokenization will happen later. It is too slow to *NOT* run in parallel.
                data.append(row["TEXT"].rstrip().lower())
    finally:
        # Drop any pooled connections so the sqlite file handle is not kept open.
        engine.dispose()

    print(" * Finished step0: done in %0.3fs." % (time.time() - t1))
    #Takes less than 1 minute.
    return data

In [5]:
# Load the whole corpus into memory once; later cells read this module-level list.
list_of_all_docs = getMimicTexts()

<class 'sqlalchemy.sql.selectable.Select'>
<class 'sqlalchemy.engine.result.ResultProxy'>
 * Finished step0: done in 48.217s.


In [6]:
#### Example ##### 
# Sanity check on the loaded corpus: container type, number of documents,
# and the type of a single document.
print("* list_of_all_docs is a: ", type(list_of_all_docs))
print("* number docs in list_of_all_docs: ", len(list_of_all_docs))
print("* documents in list_of_all_docs are: ", type(list_of_all_docs[0]))
# Left disabled on purpose: uncommenting prints the full text of the first document.
# print("* Example documents: ", list_of_all_docs[0])

* list_of_all_docs is a:  <class 'list'>
* number docs in list_of_all_docs:  2083180
* documents in list_of_all_docs are:  <class 'str'>


In [20]:
#TODO: we also want to output data.txt as a nice string so should probably output that here honestly.
'''Step 2: Create a helper function to run with multiprocessing that will tokenize the document, 
create the set of tokens, and format the sentences for output to data.txt.'''
def getSetOfWords(document):
    '''
    Tokenize one document and package its tokens and its formatted sentences.

    Input: String of the document
    Output: Dict with two keys:
        'tokens'    = List[Strings], every token of the document in order, duplicates kept
        'sentences' = List[Strings], one space-joined string per sentence (for data.txt)

    Despite the name, no set is built here. Relies on the module-level
    `n2c2_tokenizer` created in Step 0.
    '''
    # One inner list of tokens per sentence.
    sentence_token_lists = n2c2_tokenizer.tokenize_document(document).sentence_tokens_list

    # data.txt format: tokens of a sentence separated by single spaces.
    sentence_strings = [' '.join(sentence) for sentence in sentence_token_lists]

    # Flatten to one token list. Duplicates are kept on purpose: the vocabulary
    # file must be sorted by descending token count (after the special tokens
    # <S>, </S> and <UNK>), so whoever consumes this needs the raw occurrences.
    all_tokens = [token for sentence in sentence_token_lists for token in sentence]

    return {'tokens': all_tokens, 'sentences': sentence_strings}

# Try the helper on the first document; the result is a dict with the keys
# "tokens" and "sentences".
unique_words_example = getSetOfWords(list_of_all_docs[0])
#print(unique_words_example["tokens"])
#print("#"*20)
#print(unique_words_example["sentences"])

In [27]:
'''Step 3: Run the helper function asynchronously with multiprocessing to create the vocab.txt and data.txt 
that is necessary to run BiLM.
'''
def createVocabFile(documents=None, max_docs=10,
                    data_path="mimic_data.txt", vocab_path="mimic_vocab.txt"):
    '''
    Tokenize documents in parallel and write the BiLM training data and vocabulary files.

    Input:
        documents  = List[Strings] of raw documents. Defaults to the module-level
                     `list_of_all_docs`.
        max_docs   = Int, only the first `max_docs` documents are processed.
                     The default of 10 keeps the original trial-run behaviour;
                     pass None to process every document.
        data_path  = String, output file with one space-joined sentence per line.
        vocab_path = String, output file with one token per line: the special
                     tokens <S>, </S>, <UNK> first, then corpus tokens from most
                     to least frequent.
    Output: None (writes the two files and prints progress).
    '''
    t1 = time.time() #start the timer

    if documents is None:
        documents = list_of_all_docs
    docs = documents if max_docs is None else documents[:max_docs]

    pool_size = multiprocessing.cpu_count() #NOTE: Usin' all yer CPU's my friend. Change this if you want.
    pool = Pool(pool_size)
    print('* created worker pools')
    results0 = pool.map_async(getSetOfWords, docs)
    print('* did map to getSetOfWords function with docs. WITH async')
    pool.close()
    print('* closed pool')
    pool.join()
    print('* joined pool')
    # One dict per document, each with "tokens" and "sentences".
    list_of_dicts = [r for r in results0.get() if r is not None]
    print("Number of dictionaries created: ", len(list_of_dicts))

    """Step A: create data.txt: Should have 1 sentence per line"""
    # Stream sentences document by document instead of building one flat list.
    # Explicit UTF-8 so the output does not depend on the platform default encoding.
    with open(data_path, "w", encoding="utf-8") as out:
        for doc in list_of_dicts:
            for sent in doc["sentences"]:
                out.write(sent)
                out.write("\n")

    """Step B: create vocab.txt: Should have 1 token per line, as well as AllenNLP special tokens."""
    # Count incrementally per document; avoids materialising every token of the
    # corpus in a single list before counting.
    output_counter = Counter()
    for doc in list_of_dicts:
        output_counter.update(doc["tokens"])
    # Print a summary only: the full counter is the entire vocabulary.
    print("Number of unique tokens: ", len(output_counter))
    print("Most common tokens: ", output_counter.most_common(10))

    #we also need to add these AllenNLP specific things
    allen_specific = ['<S>','</S>','<UNK>'] #these need to go at the top

    #now output to vocab.txt
    with open(vocab_path, "w", encoding="utf-8") as out:
        for special in allen_specific:
            out.write(special)
            out.write("\n")
        # most_common() yields tokens in descending count order, as the vocab file requires.
        for token, _count in output_counter.most_common():
            out.write(token)
            out.write("\n")

    print(" * Created mimic_data.txt & mimic_vocab.txt: done in %0.3fs." % (time.time() - t1))


# Trial run on the first 10 documents; writes mimic_data.txt and mimic_vocab.txt
# to the current working directory.
createVocabFile()

* created worker pools
* initialized map_async to naiveSearchText function with docs
* did map to getSetOfWords function with docs. WITH async
* closed pool
* joined pool
Number of dictionaries created:  10
 * Created mimic_data.txt & mimic_vocab.txt: done in 0.172s.


In [None]:
# 10 docs in 0.176s.
# 100 docs in 0.737s
# 1,000 docs in 
# 10,000 docs in 

In [None]:
'''
Prepare input data and a vocabulary file.
--> Train the biLM.
Test (compute the perplexity of) the biLM on heldout data.
Write out the weights from the trained biLM to a hdf5 file.
See the instructions above for using the output from Step #4 in downstream models.
'''