Combine the data from all segments into a single dataset for Doc2Vec

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from gensim.utils import simple_preprocess

from sklearn.cluster import AgglomerativeClustering
from sklearn.manifold import TSNE

In [3]:
# Prefix of the input files; the load cell reads dataset/{inname}-parts.feather
# and dataset/{inname}-titles.feather.
inname = "c_nurse"
# NOTE(review): presumably the prefix for files this notebook writes — confirm
# against the save cell.
outname = "c_nurse"
# Maximum number of rows kept per label when downsampling; None skips the
# downsampling step entirely.
per_category_limit = None

Load the datasets (segmented and normalized)

In [4]:
# Read the two Feather files for the configured input prefix:
#   parts  - one row per text segment (text/stext/title/label columns, see preview below)
#   titles - one row per label with its title and frequency
parts = pd.read_feather(f"dataset/{inname}-parts.feather")
titles = pd.read_feather(f"dataset/{inname}-titles.feather")

In [5]:
# Preview the first rows of the segment table.
parts.head()

Unnamed: 0,rid,pid,rord,srord,text,stext,title,stitle,label
0,1678765,2,0,0,Nursing Transfer note,Nursing Transfer note,,,-1
1,1678765,2,0,1,Pt admitted to NICU for sepsis eval. Please se...,Pt admitted to NICU for sepsis eval. Please se...,,,-1
2,1678765,2,0,2,"Infant stable in RA. RR 30-40's, sats 96-100%....","Infant stable in RA. RR 30-40's, sats 96-100%....",,,-1
3,1678764,2,1,0,Neonatology Attending Triage Note,Neonatology Attending Triage Note,,,-1
4,1678764,2,1,1,Baby [**Name (NI) 1**] [**Known lastname 2**] ...,Baby [**Name (NI) 1**] [**Known lastname 2**] ...,,,-1


In [6]:
# Preview the label -> title / frequency lookup table.
titles.head()

Unnamed: 0_level_0,title,freq
label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,resp,172682
1,neuro,160003
2,cv,142843
3,gi,123485
4,plan,108943


In [7]:
def limit_samples(df, group, max_count, random_state=None):
    """Cap every group of ``df`` at ``max_count`` rows.

    Groups that already have at most ``max_count`` rows are kept whole; larger
    groups are replaced by a random sample of exactly ``max_count`` rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to downsample.
    group : label or list of labels
        Column(s) passed to ``df.groupby``.
    max_count : int
        Maximum number of rows kept per group.
    random_state : optional
        Forwarded to ``DataFrame.sample`` for every oversized group. Leave as
        ``None`` (the default) for non-reproducible sampling, or pass a seed
        to make the selection repeatable.

    Returns
    -------
    pandas.DataFrame
        All columns of ``df`` (including the grouping column), rows ordered
        group by group, each row keeping its original index label. An empty
        frame with the same columns is returned when there are no groups.
    """
    # Iterate over the groups explicitly instead of using groupby.apply: the
    # shape of apply's result (extra group-key index level, presence of the
    # grouping column) differs between pandas versions, while iteration always
    # yields the complete sub-frame with its original index.
    pieces = [
        sub if len(sub) <= max_count else sub.sample(max_count, random_state=random_state)
        for _, sub in df.groupby(group)
    ]
    if not pieces:
        # pd.concat raises on an empty list; return an empty frame instead.
        return df.iloc[0:0]
    return pd.concat(pieces)

In [8]:
# Optionally cap the number of rows per label, then keep labelled rows only.
relevant = (
    parts
    if per_category_limit is None
    else limit_samples(parts, "label", per_category_limit)
)
# label == -1 marks unlabeled segments; drop them and renumber the rows from 0.
relevant = relevant.loc[relevant["label"] >= 0].reset_index(drop=True)

In [9]:
# Preview the labelled segments that will be fed to Doc2Vec.
relevant.head()

Unnamed: 0,rid,pid,rord,srord,text,stext,title,stitle,label
0,1678764,2,1,3,"PNS: A pos, Ab neg, HBSAg neg, RPR NR, RI, GB...","A pos, Ab neg, HBSAg neg, RPR NR, RI, GBS neg....",PNS,pns,183
1,1678764,2,1,6,Assessment/plan:\nTerm male infant with increa...,Term male infant with increased risk of sepsis...,Assessment/plan,assessment/plan,42
2,1260685,3,1,0,MICU NSG PROG NOTE: days\nRemains stable on hi...,"days\nRemains stable on high dose neo, taperin...",MICU NSG PROG NOTE,micu nsg prog note,700
3,1260685,3,1,2,CARDIAC: Maintaining map>60 on neo and levofed...,Maintaining map>60 on neo and levofed. He has ...,CARDIAC,cardiac,17
4,1260685,3,1,3,RESP: good abg. good oxygenation. Strong cough...,good abg. good oxygenation. Strong cough. Has ...,RESP,resp,0


Prepare TaggedDocument

In [10]:
# Build one TaggedDocument per segment (row) from its normalized text.
#
# A tag is the identifier under which Doc2Vec stores a learned document
# vector, alongside the per-word vectors. Tagging each row with its own index
# gives every segment a unique vector; tagging with `label` instead would make
# all segments that share a label train one combined embedding.
#
# Only the `stext` column is needed, so iterate over that Series directly
# rather than materializing every full row with DataFrame.iterrows().
tagged_docs = [
    TaggedDocument(words=simple_preprocess(text), tags=[str(idx)])
    for idx, text in relevant["stext"].items()
]

In [11]:
# Sanity check: show the first two tagged documents (tokens and tag).
print(tagged_docs[:2])


[TaggedDocument(words=['pos', 'ab', 'neg', 'hbsag', 'neg', 'rpr', 'nr', 'ri', 'gbs', 'neg', 'pregnancy', 'was', 'uncomplicated', 'delivery', 'was', 'by', 'section', 'after', 'failure', 'to', 'progress', 'apgars', 'mother', 'was', 'treated', 'with', 'antibiotics', 'because', 'of', 'maternal', 'temp', 'of', 'just', 'prior', 'to', 'delivery', 'mother', 'temp', 'was', 'then', 'lower', 'but', 'at', 'hours', 'rose', 'again', 'to'], tags=['0']), TaggedDocument(words=['term', 'male', 'infant', 'with', 'increased', 'risk', 'of', 'sepsis', 'will', 'check', 'cbc', 'diff', 'and', 'plats', 'blood', 'culture', 'will', 'cover', 'with', 'antibiotics', 'for', 'at', 'least', 'hours', 'pending', 'results', 'of', 'cultures', 'further', 'work', 'up', 'with', 'possible', 'lp', 'if', 'culture', 'is', 'positive', 'or', 'clinical', 'signs', 'of', 'sepsis', 'develop'], tags=['1'])]


Train a Doc2Vec Model

In [13]:
# Configure the Doc2Vec model; the vocabulary and weights are built in later cells.
model = Doc2Vec(
    dm=1,            # training algorithm: 1 = Distributed Memory (PV-DM)
    vector_size=50,  # dimensionality of the learned vectors
    window=5,        # max distance between the current and the predicted word
    min_count=3,     # ignore words seen fewer than 3 times (1 only suits tiny toy corpora)
    epochs=40,       # number of passes over the corpus
    workers=16,      # worker threads used for training
)

In [14]:
# Build vocabulary
# Must run before train(): the training cell reads model.corpus_count, which
# is expected to be populated by this scan over tagged_docs.
model.build_vocab(tagged_docs)

In [15]:
# Train model
# total_examples and epochs are passed explicitly, both taken from the model
# itself: the document count it recorded for this corpus and the epoch count
# chosen when the model was constructed.
model.train(
    tagged_docs,
    total_examples=model.corpus_count,
    epochs=model.epochs
)

Infer Embeddings for Each Segment (Row)

In [16]:
def infer_embedding(text):
    """Infer a Doc2Vec vector for one piece of text.

    The text is tokenized with ``simple_preprocess`` (the same tokenizer used
    to build the training documents) and the tokens are handed to the
    notebook-level ``model``'s ``infer_vector``.

    Returns whatever ``model.infer_vector`` produces for those tokens.
    """
    return model.infer_vector(simple_preprocess(text))

In [17]:
# Add an 'embedding' column holding the inferred vector for every segment's stext.
relevant["embedding"] = relevant["stext"].map(infer_embedding)

In [18]:
# Preview the segments together with their new 'embedding' column.
relevant.head()

Unnamed: 0,rid,pid,rord,srord,text,stext,title,stitle,label,embedding
0,1678764,2,1,3,"PNS: A pos, Ab neg, HBSAg neg, RPR NR, RI, GB...","A pos, Ab neg, HBSAg neg, RPR NR, RI, GBS neg....",PNS,pns,183,"[0.43021446, -0.6662299, 1.200484, -0.68246704..."
1,1678764,2,1,6,Assessment/plan:\nTerm male infant with increa...,Term male infant with increased risk of sepsis...,Assessment/plan,assessment/plan,42,"[-0.04399224, -0.59779483, 0.071669385, -0.551..."
2,1260685,3,1,0,MICU NSG PROG NOTE: days\nRemains stable on hi...,"days\nRemains stable on high dose neo, taperin...",MICU NSG PROG NOTE,micu nsg prog note,700,"[0.0025770026, -1.3833206, 0.19549513, -0.4590..."
3,1260685,3,1,2,CARDIAC: Maintaining map>60 on neo and levofed...,Maintaining map>60 on neo and levofed. He has ...,CARDIAC,cardiac,17,"[-0.5564191, 0.41284, 1.6669538, -0.50814295, ..."
4,1260685,3,1,3,RESP: good abg. good oxygenation. Strong cough...,good abg. good oxygenation. Strong cough. Has ...,RESP,resp,0,"[-0.30293128, -0.37995958, 0.49376696, -0.1509..."


In [19]:
def create_name(pre, name, post):
    """Assemble a file name from a prefix, an optional name and a suffix.

    Returns ``"{pre}{name}-{post}"`` when ``name`` is truthy, otherwise
    ``"{pre}{post}"`` (no name and no dash).
    """
    middle = f"{name}-" if name else ""
    return f"{pre}{middle}{post}"

In [20]:
# Derive the output stem from the configured `outname` instead of hard-coding
# "c_nurse", so running the notebook for another prefix cannot silently
# overwrite this category's embeddings. With outname == "c_nurse" the path is
# unchanged: dataset/c_nurse_doc2vec_embedding-parts.feather
name = f"{outname}_doc2vec_embedding"
# Feather requires a default RangeIndex, hence the reset before writing.
relevant.reset_index(drop=True).to_feather(create_name("dataset/", name, "parts.feather"))