#### Training vector-based models

In [5]:
import pandas as pd
import numpy as np
from collections import defaultdict, namedtuple
import re
from tqdm.auto import tqdm

from nltk.tokenize import word_tokenize

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models.callbacks import CallbackAny2Vec

from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

import tensorflow as tf
# import tensorflow.keras as K
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential

from eval_utils import create_metrics

In [6]:
# Register tqdm's pandas integration (enables the progress_* methods, e.g. progress_apply).
tqdm.pandas()

# Reset Keras' global state so re-running the notebook starts from a clean session.
tf.keras.backend.clear_session()

Define constants

In [None]:

# --- Run configuration -------------------------------------------------------
# presumably the embedding / vector dimensionality -- TODO confirm against the model cells
dim = 50
# presumably the width of the network's hidden layer -- TODO confirm
hidden_size = 128
# Dataset prefix: inputs are read from dataset/{inname}-parts.feather and -titles.feather.
inname = "c_nurse"
# Prefix for produced artifacts (not referenced in the cells visible here).
outname = "c_nurse"
# Maximum number of rows kept per label when downsampling; None disables downsampling.
per_category_limit = None

Method for downsampling unbalanced data

In [8]:
def limit_samples(df, group, max_count, random_state=None):
    """Downsample ``df`` so that no group holds more than ``max_count`` rows.

    Groups with at most ``max_count`` rows are kept untouched; larger groups
    are reduced to ``max_count`` rows drawn at random without replacement.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to downsample.
    group : str or list of str
        Column name(s) to group by (anything ``DataFrame.groupby`` accepts).
    max_count : int
        Maximum number of rows to keep per group.
    random_state : int, numpy.random.RandomState, numpy.random.Generator or None
        Seed / generator for reproducible sampling. ``None`` (the default)
        keeps the previous behaviour of drawing from NumPy's global RNG.

    Returns
    -------
    pandas.DataFrame
        The selected rows with their original index labels and all original
        columns (including the grouping column), ordered group by group.
        Rows whose group key is NaN are omitted, as with a default ``groupby``.
    """
    # Build ONE generator up front so that every group draws from the same
    # stream; passing the bare integer to each .sample() call would restart the
    # sequence for every group and correlate the draws.
    if isinstance(random_state, (int, np.integer)):
        random_state = np.random.RandomState(random_state)

    # Iterating over the groups (instead of groupby.apply + droplevel) keeps the
    # grouping column and the original index regardless of the pandas version,
    # and does not fail when no group needs to be sampled.
    pieces = [
        rows if len(rows) <= max_count else rows.sample(max_count, random_state=random_state)
        for _, rows in df.groupby(group)
    ]
    if not pieces:
        # Nothing to concatenate (empty input): return an empty frame with the same columns.
        return df.iloc[0:0]
    return pd.concat(pieces)

Loading datasets (segmented and normalized)

In [10]:
# Read the two pre-processed tables that belong to the `inname` dataset.
parts, titles = (
    pd.read_feather(f"dataset/{inname}-parts.feather"),
    pd.read_feather(f"dataset/{inname}-titles.feather"),
)

In [None]:
# Optionally cap the number of rows per label (only when a limit is configured),
# then drop the unlabeled rows (label -1) and renumber the index from 0.
relevant = (
    (parts if per_category_limit is None else limit_samples(parts, "label", per_category_limit))
    .query("label >= 0")
    .reset_index(drop=True)
)

In [15]:
# Preview the first three rows of the filtered frame as a sanity check.
relevant.head(3)


Unnamed: 0,rid,pid,rord,srord,text,stext,title,stitle,label
0,1678764,2,1,3,"PNS: A pos, Ab neg, HBSAg neg, RPR NR, RI, GB...","A pos, Ab neg, HBSAg neg, RPR NR, RI, GBS neg....",PNS,pns,183
1,1678764,2,1,6,Assessment/plan:\nTerm male infant with increa...,Term male infant with increased risk of sepsis...,Assessment/plan,assessment/plan,42
2,1260685,3,1,0,MICU NSG PROG NOTE: days\nRemains stable on hi...,"days\nRemains stable on high dose neo, taperin...",MICU NSG PROG NOTE,micu nsg prog note,700


Preparing the train/test split

In [16]:
# 5-fold stratified splitter; shuffling with a fixed random_state keeps the folds reproducible.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [17]:
# Take only the first of the stratified folds as the train/test split;
# the remaining folds are never generated.
train_index, test_index = next(skf.split(relevant, relevant["label"]))

Tokenizers

In [20]:
def tokenize_doc(text):
    """Split ``text`` into tokens using regex normalisation and whitespace.

    The rewrites are applied in order before splitting:

    1. every ASCII digit becomes ``9`` (numbers collapse to their "shape");
    2. ``.``, ``,`` and ``:`` are padded with spaces so they become separate
       tokens, except when the character is immediately followed by ``#``;
    3. every newline becomes a ``<br>`` token, so line breaks survive the
       whitespace split as explicit, visible tokens.

    Returns the list of whitespace-separated tokens (empty list for empty text).
    """
    rewrites = (
        (r"[0-9]", "9"),
        (r"([\.\,\:])(?!#)", r" \1 "),
        (r"\n", r" <br> "),
    )
    for pattern, replacement in rewrites:
        text = re.sub(pattern, replacement, text)
    return text.split()

def tokenize_nltk(text):
    """Tokenize ``text`` with NLTK's ``word_tokenize``, keeping line breaks as ``<br>``.

    Every ASCII digit is first replaced by ``9``. The text is then tokenized
    one line at a time and a single ``<br>`` token is inserted between
    consecutive lines.

    The marker is added *after* tokenization on purpose: NLTK's Treebank-style
    rules pad ``<`` and ``>`` with spaces, so a ``<br>`` placed in the text
    beforehand would come back as the three tokens ``<``, ``br``, ``>`` instead
    of the single line-break token that ``tokenize_doc`` produces.

    Returns the list of tokens (empty list for empty text).
    """
    text = re.sub(r"[0-9]", "9", text)
    tokens = []
    for line_no, line in enumerate(text.split("\n")):
        if line_no:
            # One marker per newline character, mirroring tokenize_doc.
            tokens.append("<br>")
        tokens.extend(word_tokenize(line))
    return tokens


In [19]:
# Select the active tokenizer: tokenize_doc (regex rewrites + whitespace split).
# Assign tokenize_nltk here instead to use NLTK's word_tokenize.
tokenize = tokenize_doc