SCRIPT TO TRAIN W2V/TFIDF ON TOKENIZED TEXT

In [1]:
import pandas as pd
import os
import numpy as np
from gensim.models import Word2Vec, FastText
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim import corpora
from gensim.models import TfidfModel
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [3]:
#skip

In [None]:
# spacy
if os.getcwd() == '/content': # on google drive
    import spacy
    nlp = spacy.load('en')
else: # on CRIS azure
    import scispacy
    import spacy
    nlp = spacy.load('en_core_sci_md')
    

In [3]:
from spacy.lang.en.examples import sentences

In [4]:
import thinc

In [None]:
!pip install spacy && python -m spacy download en

In [6]:
spacy.prefer_gpu()

False

In [None]:
!pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz --no-deps

In [None]:
!python -m spacy download en_core_web_sm

In [None]:
#nlp = spacy.load('en')
nlp = spacy.load('en_core_web_sm')
#nlp = spacy.load('en',parse=True,tag=True, entity=True)

## PREPARE TEXT FILE

In [9]:
def tokenize_spacy(df, text_col=None, tokenization_type='clean', outfile=None):
    tok_snts = []
    if outfile is not None: f = open(outfile, 'w', encoding='utf8')
    data = df if text_col is None else df[text_col]
    for snt in data:
        tkns = nlp.tokenizer(snt)
        if ('low' in tokenization_type) and ('wos' in tokenization_type):
            _tkns = [str(x.text).lower() for x in tkns if not x.is_space]
        elif 'wos' in tokenization_type:
            _tkns = [str(x.text) for x in tkns if not x.is_space]
        elif 'lem' in tokenization_type:
            _tkns = [str(x.lemma_).lower() for x in tkns if not x.is_space and not x.is_punct]
        elif 'stem' in tokenization_type:
            _tkns = [stemmer.stem(str(x.text).lower()) for x in tkns if not x.is_space and not x.is_punct]
        else: # clean by default
            _tkns = [str(x.text).lower() for x in tkns if not x.is_space and not x.is_punct]
        
        if outfile is not None: # flush to file if option selected
            f.write("{}\n".format("\t".join(_tkns)))
        else: # otherwise save in variable
            tok_snts.append(_tkns)

    return tok_snts if outfile is None else None


In [None]:
# PREPARE THE TXT FILE
raw_file = './NOTEEVENTS.csv'
df = pd.read_csv(raw_file)
sentences = tokenize_spacy(df, text_col=None, tokenization_type='clean', outfile=raw_file)
#sentences = tokenize_spacy(df, text_col=None, tokenization_type='clean', outfile=raw_file, replace('.csv','_clean.txt'))

In [4]:
#resume

In [2]:
df = pd.read_csv('NOTEEVENTS_final_clean.csv.gz', compression='gzip')

In [3]:
df.head()

Unnamed: 0,lower_no_punc_tokens
0,'76' 'yo' 'm' 'initially' 'admitted' 'to' '' '...
1,'54yr' 'man' 'with' 'hx' 'metastatic' 'renal' ...
2,'chief' 'complaint' '' '24' 'hour' 'events' ''...
3,'chief' 'complaint' '' 'acute' 'hepatitis' 'hp...
4,'chief' 'complaint' '' 'hpi' '' '24' 'hour' 'e...


In [4]:
type(df)

pandas.core.frame.DataFrame

In [7]:
df = df.astype(str) 

In [5]:
numpy_array = df.to_numpy()

In [9]:
np.savetxt("noteevents_clean.txt", numpy_array, fmt='%s')

## INPUT DATA

In [11]:
# INPUT PARAMETERS

# tokenized text
#filename = '/home/ubuntu/data/mimic/tokenized_scispacy/noteevents_tkns_stem.txt' # with stemming
filename = './noteevents_clean.txt'

# w2v/fasttext params
size=300
window=10
min_count=5
workers=16

# sklearn - tfidf params
min_df=0.2
max_df=0.9
ngram_range=(1, 5)
max_features=1000

# gensim - tfidf params

In [12]:
class SentenceIterator: 
    def __init__(self, filepath): 
        self.filepath = filepath 

    def __iter__(self): 
        for line in open(self.filepath): 
            yield line.split("\t") 

In [13]:
sentences = SentenceIterator(filename) 


## TRAIN GLOVE

In [None]:
!pip install glove

In [None]:
# TRAIN AND SAVE GLOVE
import glove
from glove import Corpus, Glove


corpus = Corpus() #Creating a corpus object
corpus.fit(sentences, window=window) #Training the corpus to generate the co occurence matrix which is used in GloVe

glove = Glove(no_components=5, learning_rate=0.05) 
glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
glove.add_dictionary(corpus.dictionary)
glove.save(filename.replace('.txt','_glove.model').replace('_tkns',''))

## TRAIN FASTEXT

In [16]:
# TRAIN AND SAVE FASTTEXT
ft_model = FastText(size=size, window=window, min_count=min_count)  # instantiate
ft_model.build_vocab(sentences=sentences)
ft_model.train(sentences=sentences, total_examples=sum(1 for _ in sentences), epochs=2)
#ft_model.save(filename.replace('.txt','_fasttext_win10_min5.dat').replace('_tkns',''))
ft_model.save(filename.replace('.txt','_fasttext_win10_min5.dat'))

2021-02-22 10:58:57,837 : INFO : resetting layer weights
2021-02-22 10:59:13,126 : INFO : collecting all words and their counts
2021-02-22 10:59:13,128 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-02-22 10:59:13,262 : INFO : PROGRESS: at sentence #10000, processed 10000 words, keeping 9349 word types
2021-02-22 10:59:13,398 : INFO : PROGRESS: at sentence #20000, processed 20000 words, keeping 18556 word types
2021-02-22 10:59:13,529 : INFO : PROGRESS: at sentence #30000, processed 30000 words, keeping 27752 word types
2021-02-22 10:59:13,628 : INFO : PROGRESS: at sentence #40000, processed 40000 words, keeping 37343 word types
2021-02-22 10:59:13,695 : INFO : PROGRESS: at sentence #50000, processed 50000 words, keeping 47343 word types
2021-02-22 10:59:13,761 : INFO : PROGRESS: at sentence #60000, processed 60000 words, keeping 57343 word types
2021-02-22 10:59:13,827 : INFO : PROGRESS: at sentence #70000, processed 70000 words, keeping 67343 word typ

2021-02-22 10:59:17,508 : INFO : PROGRESS: at sentence #720000, processed 720000 words, keeping 712016 word types
2021-02-22 10:59:17,566 : INFO : PROGRESS: at sentence #730000, processed 730000 words, keeping 721827 word types
2021-02-22 10:59:17,851 : INFO : PROGRESS: at sentence #740000, processed 740000 words, keeping 731787 word types
2021-02-22 10:59:18,167 : INFO : PROGRESS: at sentence #750000, processed 750000 words, keeping 741771 word types
2021-02-22 10:59:18,516 : INFO : PROGRESS: at sentence #760000, processed 760000 words, keeping 751761 word types
2021-02-22 10:59:18,605 : INFO : PROGRESS: at sentence #770000, processed 770000 words, keeping 761756 word types
2021-02-22 10:59:18,699 : INFO : PROGRESS: at sentence #780000, processed 780000 words, keeping 771742 word types
2021-02-22 10:59:18,758 : INFO : PROGRESS: at sentence #790000, processed 790000 words, keeping 781413 word types
2021-02-22 10:59:18,781 : INFO : PROGRESS: at sentence #800000, processed 800000 words, 

2021-02-22 10:59:23,417 : INFO : PROGRESS: at sentence #1430000, processed 1430000 words, keeping 1395642 word types
2021-02-22 10:59:23,542 : INFO : PROGRESS: at sentence #1440000, processed 1440000 words, keeping 1405507 word types
2021-02-22 10:59:23,580 : INFO : PROGRESS: at sentence #1450000, processed 1450000 words, keeping 1415287 word types
2021-02-22 10:59:23,618 : INFO : PROGRESS: at sentence #1460000, processed 1460000 words, keeping 1425039 word types
2021-02-22 10:59:23,658 : INFO : PROGRESS: at sentence #1470000, processed 1470000 words, keeping 1434785 word types
2021-02-22 10:59:23,696 : INFO : PROGRESS: at sentence #1480000, processed 1480000 words, keeping 1444531 word types
2021-02-22 10:59:23,735 : INFO : PROGRESS: at sentence #1490000, processed 1490000 words, keeping 1454304 word types
2021-02-22 10:59:23,774 : INFO : PROGRESS: at sentence #1500000, processed 1500000 words, keeping 1464090 word types
2021-02-22 10:59:23,812 : INFO : PROGRESS: at sentence #1510000,

2021-02-22 10:59:29,173 : INFO : sample=0.001 downsamples 38 most-common words
2021-02-22 10:59:29,174 : INFO : downsampling leaves estimated 15132 word corpus (64.7% of prior 23371)
2021-02-22 10:59:29,656 : INFO : estimated required memory for 1176 words, 78129 buckets and 300 dimensions: 110096912 bytes
2021-02-22 10:59:29,658 : INFO : resetting layer weights
2021-02-22 10:59:45,575 : INFO : training model with 3 workers on 1176 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=10
2021-02-22 10:59:46,579 : INFO : EPOCH 1 - PROGRESS: at 4.80% examples, 304 words/s, in_qsize 0, out_qsize 0
2021-02-22 10:59:47,614 : INFO : EPOCH 1 - PROGRESS: at 11.52% examples, 149 words/s, in_qsize 0, out_qsize 0
2021-02-22 10:59:48,679 : INFO : EPOCH 1 - PROGRESS: at 18.72% examples, 165 words/s, in_qsize 0, out_qsize 0
2021-02-22 10:59:49,752 : INFO : EPOCH 1 - PROGRESS: at 27.84% examples, 333 words/s, in_qsize 0, out_qsize 0
2021-02-22 10:59:50,759 : INFO : EPOCH 1 - PRO

2021-02-22 10:59:28,104 : INFO : collected 2019963 word types from a corpus of 2083112 raw words and 2083112 sentences
2021-02-22 10:59:28,104 : INFO : Loading a fresh vocabulary
2021-02-22 10:59:28,901 : INFO : effective_min_count=5 retains 1176 unique words (0% of original 2019963, drops 2018787)
2021-02-22 10:59:28,902 : INFO : effective_min_count=5 leaves 23371 word corpus (1% of original 2083112, drops 2059741)
2021-02-22 10:59:28,906 : INFO : deleting the raw counts dictionary of 2019963 items
2021-02-22 10:59:29,173 : INFO : sample=0.001 downsamples 38 most-common words
2021-02-22 10:59:29,174 : INFO : downsampling leaves estimated 15132 word corpus (64.7% of prior 23371)
2021-02-22 10:59:29,656 : INFO : estimated required memory for 1176 words, 78129 buckets and 300 dimensions: 110096912 bytes
2021-02-22 10:59:29,658 : INFO : resetting layer weights
2021-02-22 10:59:45,575 : INFO : training model with 3 workers on 1176 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=10
2021-02-22 11:00:02,367 : INFO : EPOCH - 1 : training on 2083112 raw words (15179 effective words) took 16.8s, 904 effective words/s
2021-02-22 11:00:19,226 : INFO : EPOCH - 2 : training on 2083112 raw words (15148 effective words) took 16.8s, 899 effective words/s
2021-02-22 11:00:19,227 : INFO : training on a 4166224 raw words (30327 effective words) took 33.7s, 901 effective words/s
2021-02-22 11:00:21,558 : INFO : saving FastText object under ./noteevents_clean_fasttext_win10_min5.dat, separately None
2021-02-22 11:00:21,559 : INFO : storing np array 'vectors_ngrams' to ./noteevents_clean_fasttext_win10_min5.dat.wv.vectors_ngrams.npy
2021-02-22 11:00:23,183 : INFO : not storing attribute vectors_norm
2021-02-22 11:00:23,184 : INFO : not storing attribute vectors_vocab_norm
2021-02-22 11:00:23,184 : INFO : not storing attribute vectors_ngrams_norm
2021-02-22 11:00:23,185 : INFO : not storing attribute buckets_word
2021-02-22 11:00:23,186 : INFO : storing np array 'vectors_ngrams_lockf' to ./noteevents_clean_fasttext_win10_min5.dat.trainables.vectors_ngrams_lockf.npy
2021-02-22 11:00:24,810 : INFO : saved ./noteevents_clean_fasttext_win10_min5.dat

## TRAIN W2v

In [18]:
# TRAIN AND SAVE W2V
w2v_model = Word2Vec(sentences, size=size, window=window, min_count=min_count, workers=workers)
w2v_model.save(filename.replace('.txt','_w2v_win10_min5.dat').replace('_tkns',''))

# test
w2v_model.wv.most_similar("attention")

2021-02-22 11:04:33,865 : INFO : collecting all words and their counts
2021-02-22 11:04:33,868 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-02-22 11:04:34,021 : INFO : PROGRESS: at sentence #10000, processed 10000 words, keeping 9349 word types
2021-02-22 11:04:34,140 : INFO : PROGRESS: at sentence #20000, processed 20000 words, keeping 18556 word types
2021-02-22 11:04:34,254 : INFO : PROGRESS: at sentence #30000, processed 30000 words, keeping 27752 word types
2021-02-22 11:04:34,342 : INFO : PROGRESS: at sentence #40000, processed 40000 words, keeping 37343 word types
2021-02-22 11:04:34,401 : INFO : PROGRESS: at sentence #50000, processed 50000 words, keeping 47343 word types
2021-02-22 11:04:34,461 : INFO : PROGRESS: at sentence #60000, processed 60000 words, keeping 57343 word types
2021-02-22 11:04:34,518 : INFO : PROGRESS: at sentence #70000, processed 70000 words, keeping 67343 word types
2021-02-22 11:04:34,582 : INFO : PROGRESS: at sentence

2021-02-22 11:04:37,680 : INFO : PROGRESS: at sentence #720000, processed 720000 words, keeping 712016 word types
2021-02-22 11:04:37,729 : INFO : PROGRESS: at sentence #730000, processed 730000 words, keeping 721827 word types
2021-02-22 11:04:37,958 : INFO : PROGRESS: at sentence #740000, processed 740000 words, keeping 731787 word types
2021-02-22 11:04:38,238 : INFO : PROGRESS: at sentence #750000, processed 750000 words, keeping 741771 word types
2021-02-22 11:04:38,536 : INFO : PROGRESS: at sentence #760000, processed 760000 words, keeping 751761 word types
2021-02-22 11:04:38,620 : INFO : PROGRESS: at sentence #770000, processed 770000 words, keeping 761756 word types
2021-02-22 11:04:38,707 : INFO : PROGRESS: at sentence #780000, processed 780000 words, keeping 771742 word types
2021-02-22 11:04:38,765 : INFO : PROGRESS: at sentence #790000, processed 790000 words, keeping 781413 word types
2021-02-22 11:04:38,789 : INFO : PROGRESS: at sentence #800000, processed 800000 words, 

2021-02-22 11:04:42,932 : INFO : PROGRESS: at sentence #1430000, processed 1430000 words, keeping 1395642 word types
2021-02-22 11:04:43,020 : INFO : PROGRESS: at sentence #1440000, processed 1440000 words, keeping 1405507 word types
2021-02-22 11:04:43,053 : INFO : PROGRESS: at sentence #1450000, processed 1450000 words, keeping 1415287 word types
2021-02-22 11:04:43,087 : INFO : PROGRESS: at sentence #1460000, processed 1460000 words, keeping 1425039 word types
2021-02-22 11:04:43,120 : INFO : PROGRESS: at sentence #1470000, processed 1470000 words, keeping 1434785 word types
2021-02-22 11:04:43,154 : INFO : PROGRESS: at sentence #1480000, processed 1480000 words, keeping 1444531 word types
2021-02-22 11:04:43,192 : INFO : PROGRESS: at sentence #1490000, processed 1490000 words, keeping 1454304 word types
2021-02-22 11:04:43,226 : INFO : PROGRESS: at sentence #1500000, processed 1500000 words, keeping 1464090 word types
2021-02-22 11:04:43,261 : INFO : PROGRESS: at sentence #1510000,

2021-02-22 11:04:47,993 : INFO : sample=0.001 downsamples 38 most-common words
2021-02-22 11:04:47,994 : INFO : downsampling leaves estimated 15132 word corpus (64.7% of prior 23371)
2021-02-22 11:04:47,997 : INFO : estimated required memory for 1176 words and 300 dimensions: 3410400 bytes
2021-02-22 11:04:47,998 : INFO : resetting layer weights
2021-02-22 11:04:48,240 : INFO : training model with 16 workers on 1176 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=10
2021-02-22 11:04:49,249 : INFO : EPOCH 1 - PROGRESS: at 4.80% examples, 303 words/s, in_qsize 0, out_qsize 0
2021-02-22 11:04:50,252 : INFO : EPOCH 1 - PROGRESS: at 11.04% examples, 152 words/s, in_qsize 0, out_qsize 0
2021-02-22 11:04:51,266 : INFO : EPOCH 1 - PROGRESS: at 17.28% examples, 143 words/s, in_qsize 0, out_qsize 0
2021-02-22 11:04:52,388 : INFO : EPOCH 1 - PROGRESS: at 25.92% examples, 257 words/s, in_qsize 0, out_qsize 0
2021-02-22 11:04:53,476 : INFO : EPOCH 1 - PROGRESS: at 35.04%

2021-02-22 11:05:32,867 : INFO : EPOCH 3 - PROGRESS: at 49.45% examples, 859 words/s, in_qsize 0, out_qsize 0
2021-02-22 11:05:33,944 : INFO : EPOCH 3 - PROGRESS: at 53.29% examples, 834 words/s, in_qsize 0, out_qsize 0
2021-02-22 11:05:34,953 : INFO : EPOCH 3 - PROGRESS: at 59.53% examples, 754 words/s, in_qsize 0, out_qsize 0
2021-02-22 11:05:36,032 : INFO : EPOCH 3 - PROGRESS: at 65.77% examples, 687 words/s, in_qsize 0, out_qsize 0
2021-02-22 11:05:37,033 : INFO : EPOCH 3 - PROGRESS: at 73.93% examples, 716 words/s, in_qsize 0, out_qsize 0
2021-02-22 11:05:38,100 : INFO : EPOCH 3 - PROGRESS: at 75.85% examples, 660 words/s, in_qsize 0, out_qsize 0
2021-02-22 11:05:39,109 : INFO : EPOCH 3 - PROGRESS: at 81.13% examples, 729 words/s, in_qsize 0, out_qsize 0
2021-02-22 11:05:40,193 : INFO : EPOCH 3 - PROGRESS: at 84.97% examples, 722 words/s, in_qsize 0, out_qsize 0
2021-02-22 11:05:41,194 : INFO : EPOCH 3 - PROGRESS: at 91.21% examples, 705 words/s, in_qsize 0, out_qsize 0
2021-02-22

2021-02-22 11:06:17,613 : INFO : worker thread finished; awaiting finish of 10 more threads
2021-02-22 11:06:17,614 : INFO : worker thread finished; awaiting finish of 9 more threads
2021-02-22 11:06:17,614 : INFO : worker thread finished; awaiting finish of 8 more threads
2021-02-22 11:06:17,615 : INFO : worker thread finished; awaiting finish of 7 more threads
2021-02-22 11:06:17,616 : INFO : worker thread finished; awaiting finish of 6 more threads
2021-02-22 11:06:17,616 : INFO : worker thread finished; awaiting finish of 5 more threads
2021-02-22 11:06:17,617 : INFO : worker thread finished; awaiting finish of 4 more threads
2021-02-22 11:06:17,617 : INFO : worker thread finished; awaiting finish of 3 more threads
2021-02-22 11:06:17,618 : INFO : worker thread finished; awaiting finish of 2 more threads
2021-02-22 11:06:17,619 : INFO : worker thread finished; awaiting finish of 1 more threads
2021-02-22 11:06:17,620 : INFO : worker thread finished; awaiting finish of 0 more thread

KeyError: "word 'attention' not in vocabulary"

2021-02-22 11:04:46,937 : INFO : collected 2019963 word types from a corpus of 2083112 raw words and 2083112 sentences
2021-02-22 11:04:46,938 : INFO : Loading a fresh vocabulary
2021-02-22 11:04:47,731 : INFO : effective_min_count=5 retains 1176 unique words (0% of original 2019963, drops 2018787)
2021-02-22 11:04:47,732 : INFO : effective_min_count=5 leaves 23371 word corpus (1% of original 2083112, drops 2059741)
2021-02-22 11:04:47,736 : INFO : deleting the raw counts dictionary of 2019963 items
2021-02-22 11:04:47,993 : INFO : sample=0.001 downsamples 38 most-common words
2021-02-22 11:04:47,994 : INFO : downsampling leaves estimated 15132 word corpus (64.7% of prior 23371)
2021-02-22 11:04:47,997 : INFO : estimated required memory for 1176 words and 300 dimensions: 3410400 bytes
2021-02-22 11:04:47,998 : INFO : resetting layer weights
2021-02-22 11:04:48,240 : INFO : training model with 16 workers on 1176 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=10
2021-02-22 11:05:06,369 : INFO : EPOCH - 1 : training on 2083112 raw words (15121 effective words) took 18.1s, 834 effective words/s
2021-02-22 11:05:24,363 : INFO : EPOCH - 2 : training on 2083112 raw words (15131 effective words) took 18.0s, 841 effective words/s
2021-02-22 11:05:42,383 : INFO : EPOCH - 3 : training on 2083112 raw words (15072 effective words) took 18.0s, 837 effective words/s
2021-02-22 11:05:59,885 : INFO : EPOCH - 4 : training on 2083112 raw words (15144 effective words) took 17.5s, 866 effective words/s
2021-02-22 11:06:17,620 : INFO : EPOCH - 5 : training on 2083112 raw words (15152 effective words) took 17.7s, 855 effective words/s
2021-02-22 11:06:17,621 : INFO : training on a 10415560 raw words (75620 effective words) took 89.4s, 846 effective words/s
2021-02-22 11:06:17,622 : INFO : saving Word2Vec object under ./noteevents_clean_fasttext_win10_min5.dat, separately None
2021-02-22 11:06:17,623 : INFO : not storing attribute vectors_norm
2021-02-22 11:06:17,624 : INFO : not storing attribute cum_table
2021-02-22 11:06:17,641 : INFO : saved ./noteevents_clean_fasttext_win10_min5.dat
2021-02-22 11:06:17,642 : INFO : precomputing L2-norms of word weight vectors

In [19]:
# test
w2v_model.wv.most_similar("pain")

KeyError: "word 'pain' not in vocabulary"

## TRAIN TFIDF

In [None]:
# TRAIN AND SAVE TFIDF (USING SKLEARN - DOESN'T SEEM TO WORK ON BIG DATASETS)
tfidf = TfidfVectorizer(min_df=min_df, max_df=max_df, ngram_range=ngram_range, sublinear_tf=True, use_idf=True, 
                        max_features=max_features, preprocessor=' '.join)
trained_tfidf = tfidf.fit(sentences)
with open(filename.replace('.txt','_tfidf.pickle'), 'wb') as fin:
          pickle.dump(trained_tfidf, fin)

In [None]:
# TRAIN AND SAVE TFIDF (UING GENSIM)

# Create the Dictionary and Corpus
mydict = corpora.Dictionary(sentences) # create dctionary
mydict.save(filename.replace('.txt','_tfidfdict.dict'))  # save dict to disk
corpus = [mydict.doc2bow(line) for line in sentences] # create corpus
corpora.MmCorpus.serialize(filename.replace('.txt','_tfidfcorpus.mm'), corpus)  # save corpus to disk

tfidf_model = TfidfModel(corpus)  # fit model
tfidf_model.save(filename.replace('.txt','_tfidf.model')) # save model