The goal of this notebook is to explore the contexts in which matches from our keyword lists occur, and to see if there are any contexts that are worth trying to exclude (e.g., "family history") to make the annotation review process quicker.

In [80]:
import numpy as np
import pandas as pd
import re

import plotly.express as px

from sklearn.feature_extraction.text import CountVectorizer

from cleaning.text_preprocessing import preprocess_text

We will use the most recently expanded cohort.

In [47]:
df = pd.read_csv('../data/interim/2020-06-03-em-investigational_regex_results.csv')
df.head(3)

Unnamed: 0,ICUSTAY_ID,HADM_ID,SUBJECT_ID,ADMITTIME,DISCHTIME,ADMISSION_TYPE,ETHNICITY,HOSPITAL_EXPIRE_FLAG,GENDER,DOD,...,ADMISSION_FLUID_ELECTROLYTE,ADMISSION_BLOOD_LOSS_ANEMIA,ADMISSION_DEFICIENCY_ANEMIAS,ADMISSION_ALCOHOL_ABUSE,ADMISSION_DRUG_ABUSE,ADMISSION_PSYCHOSES,ADMISSION_DEPRESSION,VASOPRESSOR_DURATION_HOURS,CHILD,SPOUSE
0,200053,100696,78895,2166-02-25 07:15:00,2166-03-14 13:50:00,ELECTIVE,OTHER,0,M,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0,0
1,200053,100696,78895,2166-02-25 07:15:00,2166-03-14 13:50:00,ELECTIVE,OTHER,0,M,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0,1
2,200053,100696,78895,2166-02-25 07:15:00,2166-03-14 13:50:00,ELECTIVE,OTHER,0,M,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,1,0


In [48]:
def regex_match_with_window(pattern, text, window_size):
    '''Returns list of dictionaries containing information about
       the match results for pattern in text, including the
       surrounding words within a given window size.

       Output format is: {'context_before': str,
                          'pattern': pattern,
                          'context_after': str,
                          'span': tuple}
    '''

    full_pattern = r'(?P<context_before>(?:\S+\s+){{0,{window_size}}})'\
                   r'(?P<pattern>(?:{pattern}))'\
                   r'(?P<context_after>(?:\s+\S+){{0,{window_size}}})'
    
    full_pattern = full_pattern.format(pattern=pattern,
                                       window_size=window_size)

    matches = re.compile(full_pattern).finditer(text)
    
    matches_with_context = [{**m.groupdict(),
                             **{'span': m.span()}} for m in matches]

    return matches_with_context

## Spouse/Partner

In [49]:
SPOUSE_REGEX = r'(ex(-?))?(wi(f|v)e|husband|partner|spouse|significant other|s/o|(girl|boy)friend|fiance(e?)|companion)(\'?e?s?\'?)'

In [62]:
window_size = 8

df_small = df.sample(n=2000, random_state=1)
df_small['REGEX_MATCH_SPOUSE_W_{}'.format(window_size)] = df_small['TEXT'].apply(preprocess_text)\
                                                                          .apply(lambda text: regex_match_with_window(SPOUSE_REGEX, text, window_size))

In [63]:
spouse_matches = pd.DataFrame.from_records(df_small['REGEX_MATCH_SPOUSE_W_{}'.format(window_size)].sum())
spouse_matches

Unnamed: 0,context_before,pattern,context_after,span
0,family made pt on return from seeing pts,wife,settled back on fentanyl and midazalam drip off,"(99, 192)"
1,code dnr dispo icu comm with patient s,wife,home cell icu care nutrition glycemic control...,"(4420, 4515)"
2,lips and handwriting oriented x expressing con...,husband,and son sister visited today and explained to,"(645, 754)"
3,patient,husband,and son support held this morning as it,"(755, 810)"
4,stress ulcer ppi vap comments communication co...,husband,status full code disposition,"(4944, 5038)"
...,...,...,...,...
401,and picc communication hm wk discussed plan with,partner,chip today disposition goal is to lower peep,"(7843, 7944)"
402,elevation comments communication hm wk discuss...,partner,chip today code status full code disposition ...,"(8515, 8631)"
403,may dicuss contact ip for ct placement social,wife,updated by resident dr she does plan on,"(1141, 1231)"
404,code full confirmed on admit to icu communicat...,wife,hcp family meeting want aggressive management...,"(5247, 5362)"


In [64]:
spouse_matches['pattern'].value_counts()

wife                 236
husband              116
girlfriend            22
partner               20
significant other      4
boyfriend              4
spouse                 4
Name: pattern, dtype: int64

In [65]:
def plot_ngram_counts(texts, n=1, k=20, stop_words=None):
    cv = CountVectorizer(ngram_range=(n, n),
                         stop_words=stop_words)
    cv_fit = cv.fit_transform(texts)
    
    words = cv.get_feature_names()
    counts = np.asarray(cv_fit.sum(axis=0))[0]
    
    df = pd.DataFrame.from_dict(dict(zip(words, counts)), orient='index', columns=['count'])\
                     .rename_axis('{}-gram'.format(n))\
                     .reset_index()\
                     .sort_values(by='count', ascending=True, ignore_index=True)\
                     .tail(k) # only show top k
    
    return px.bar(df, x='count', y='{}-gram'.format(n), height=600, title=texts.name)

### Wife

In [66]:
spouse_matches[spouse_matches['pattern'] == 'wife']

Unnamed: 0,context_before,pattern,context_after,span
0,family made pt on return from seeing pts,wife,settled back on fentanyl and midazalam drip off,"(99, 192)"
1,code dnr dispo icu comm with patient s,wife,home cell icu care nutrition glycemic control...,"(4420, 4515)"
5,mg iv q h vap bundle comments communication,wife,code status full code disposition icu for now,"(8191, 8285)"
6,brother also gave permission to speak with his,wife,icu care nutrition glycemic control lines mul...,"(4052, 4157)"
7,with hiss fs social dnr dni lives with,wife,who is very hoh for phone consents please,"(1449, 1534)"
...,...,...,...,...
398,give bowel meds prn code full confirmed with,wife,comm with patient and family wife is disp,"(6834, 6925)"
399,ulcer ppi vap vap bundle chlorhexidine communi...,wife,status dnr do not resuscitate disposition icu,"(2982, 3090)"
403,may dicuss contact ip for ct placement social,wife,updated by resident dr she does plan on,"(1141, 1231)"
404,code full confirmed on admit to icu communicat...,wife,hcp family meeting want aggressive management...,"(5247, 5362)"


In [67]:
plot_ngram_counts(spouse_matches[spouse_matches['pattern'] == 'wife']['context_before'], 2)

In [68]:
plot_ngram_counts(spouse_matches[spouse_matches['pattern'] == 'wife']['context_after'], 2)

In [23]:
plot_ngram_counts(spouse_matches[spouse_matches['pattern'] == 'wife'][['context_before', 'context_after']].agg(' '.join, axis=1).rename('context_combined'), 2)

### Husband

In [11]:
spouse_matches[spouse_matches['pattern'] == 'husband']

Unnamed: 0,context_before,pattern,context_after,span
2,lips and handwriting oriented x expressing con...,husband,and son sister visited today and explained to,"(645, 754)"
3,patient,husband,and son support held this morning as it,"(755, 810)"
4,stress ulcer ppi vap comments communication co...,husband,status full code disposition,"(4944, 5038)"
10,know if there are other things to order,husband,not around sister updated did not get a,"(538, 625)"
15,if pt continues to improve ineffective coping ...,husband,has not been in to visit since sunday,"(959, 1061)"
...,...,...,...,...
217,piv dc d as routine change patient s,husband,in for visit updates given follow up electrol...,"(2089, 2183)"
218,ssri left leg erythema no change chronic per,husband,culture resolved new is post trach placement ...,"(3223, 3326)"
222,goals of care discussion today with patient and,husband,has shown some response to therapy on basis,"(5419, 5518)"
227,addendum family meetings i spoke with pt s,husband,and then again with about her respiratory status,"(1877, 1976)"


In [12]:
plot_ngram_counts(spouse_matches[spouse_matches['pattern'] == 'husband']['context_before'], 2)

In [13]:
plot_ngram_counts(spouse_matches[spouse_matches['pattern'] == 'husband']['context_after'], 2)

In [24]:
plot_ngram_counts(spouse_matches[spouse_matches['pattern'] == 'husband'][['context_before', 'context_after']].agg(' '.join, axis=1).rename('context_combined'), 2)

### Partner

In [408]:
spouse_matches[spouse_matches['pattern'] == 'partner']

Unnamed: 0,context_before,pattern,context_after,span
28,up with culture data of note patient s,partner,stated that he has developed fevers in the,"(644, 733)"
79,and ms he neurologist was called by her,partner,and eval was recommended in the osh she,"(703, 790)"
80,piv s code status presumed full emergency cont...,partner,son disposition icu care nutrition glycemic c...,"(5817, 5932)"
85,vap standard ventilator care comments communic...,partner,pt status full code disposition icu for now,"(4628, 4745)"
99,methylprednisolone tx afib with rvr per discus...,partner,patient has not had this previously most likely,"(3946, 4057)"
113,per his case manager through inn and his,partner,he has been sick for the last two,"(198, 280)"
114,that was unrevealing both his case manager and,partner,last saw him three days prior to admission,"(511, 608)"
115,for confusion neither his case manager nor his,partner,are aware of his home medications in the,"(776, 871)"
116,knows the month day yr and knew his,partner,but is still somewhat confused has poor memory,"(2406, 2496)"
140,patient is married she does have an abusive,partner,but states that she feels safe at home,"(3554, 3644)"


In [409]:
plot_ngram_counts(spouse_matches[spouse_matches['pattern'] == 'partner']['context_before'], 2)

In [410]:
plot_ngram_counts(spouse_matches[spouse_matches['pattern'] == 'partner']['context_after'], 2)

In [25]:
plot_ngram_counts(spouse_matches[spouse_matches['pattern'] == 'partner'][['context_before', 'context_after']].agg(' '.join, axis=1).rename('context_combined'), 2)

### Other Spouse/Partner Keywords

In [411]:
spouse_matches[spouse_matches['pattern'] == 'significant other']

Unnamed: 0,context_before,pattern,context_after,span
9,expected to visit today call brother with upda...,significant other,brother icu care nutrition glycemic control l...,"(5551, 5674)"
193,stable cr monitor dm iss code full contact,significant other,dispo icu icu care nutrition nutren pulmonary...,"(7628, 7739)"
194,dispo icu until extubated and stable comm sister,significant other,icu care nutrition glycemic control lines gau...,"(3539, 3656)"


In [412]:
spouse_matches[spouse_matches['pattern'] == 'spouse']

Unnamed: 0,context_before,pattern,context_after,span
97,this time social pt s has large family,spouse,and siblings dtr appropriately emotional and ...,"(2644, 2753)"
130,inneffective family coping r t disease process...,spouse,tearful at bedside some discord evident betwe...,"(1294, 1416)"
131,and,spouse,action offered support to family social worke...,"(1417, 1483)"


In [413]:
spouse_matches[spouse_matches['pattern'] == 'boyfriend']

Unnamed: 0,context_before,pattern,context_after,span
190,hospital patient fell down stairs at home per,boyfriend,pt with fracture and placed in torso brace,"(35, 133)"
191,pt on log roll precautions for spine per,boyfriend,tonight he reported that for the last two,"(351, 443)"


In [414]:
spouse_matches[spouse_matches['pattern'] == 'girlfriend']

Unnamed: 0,context_before,pattern,context_after,span
20,binge and now change in mental status his,girlfriend,called ems on after she went to his,"(187, 275)"
21,of this she brought him to ed his,girlfriend,also reports that he routinely binges on oxyc...,"(602, 697)"
22,from er history obtained from medical records ...,girlfriend,unable to provide history sedated allergies s...,"(2105, 2223)"
23,up unable to obtain occupation unemployed drug...,girlfriend,has had a problem with binging on prescription,"(2988, 3097)"
24,alprazolam and clonazepam tobacco occasional c...,girlfriend,uses occasional alcohol other has a daughter ...,"(3162, 3286)"
25,his longterm,girlfriend,review of systems unable to obtain flowsheet ...,"(3287, 3360)"
62,today but is net positive liters for los,girlfriend,called this am and updated on pt status,"(866, 957)"
152,anything beyond limited xrt course code full c...,girlfriend,dispo icu xrt today icu care nutrition replete,"(4042, 4152)"
174,full confirmed with mother hcp comm mother c,girlfriend,disp icu altered mental status not delirium f...,"(5207, 5312)"
208,line r picc code presumed full communication hcp,girlfriend,x disposition icu for now respiratory failure...,"(3783, 3894)"


## Child

In [71]:
CHILD_REGEX = r'(grand)?(child|son|daughter|kid|teen|boy|girl|y/o|year(-|\s)old)(ren)?(\'?e?s?\'?)'

In [73]:
window_size = 8

df_small = df.sample(n=2000, random_state=1)
df_small['REGEX_MATCH_CHILD_W_{}'.format(window_size)] = df_small['TEXT'].apply(preprocess_text)\
                                                                         .apply(lambda text: regex_match_with_window(CHILD_REGEX, text, window_size))

In [74]:
child_matches = pd.DataFrame.from_records(df_small['REGEX_MATCH_CHILD_W_{}'.format(window_size)].sum())
child_matches

Unnamed: 0,context_before,pattern,context_after,span
0,of foley trauma monitor hct will ultrasound bl...,kid,,"(3060, 3115)"
1,,son,for micu admission s p intubation for airway,"(46, 94)"
2,,sons,hypoventilating from delta ms doing well on ac,"(4807, 4858)"
3,,son,,"(39, 42)"
4,,sone,mg po daily budesonide mg po tid amitriptyline,"(2325, 2376)"
...,...,...,...,...
1476,hysterectomy when she was found down by her,son,and it is believed she has been down,"(193, 277)"
1477,gram stain assessment and plan pt is a,year old,male with many medical problems admitted init...,"(1859, 1962)"
1478,mg dl assessment and plan assessment and plan,year old,male with h o hepatitis c and head,"(1581, 1670)"
1479,sah multiple septic infarcts to brain liver sp...,kid,,"(1812, 1866)"


In [75]:
child_matches['pattern'].value_counts()

year old         397
son              396
sone             219
daughter         148
kid              148
sons              43
teens             30
children          27
girl              25
daughters         22
child              8
boy                6
granddaughter      4
grandchildren      3
grandson           3
kids               1
teen               1
Name: pattern, dtype: int64

### Son

In [76]:
child_matches[child_matches['pattern'] == 'son']

Unnamed: 0,context_before,pattern,context_after,span
1,,son,for micu admission s p intubation for airway,"(46, 94)"
3,,son,,"(39, 42)"
5,,son,,"(3164, 3167)"
7,,son,,"(7188, 7191)"
13,,son,,"(1646, 1649)"
...,...,...,...,...
1459,,son,lung sounds rhoncherous to upper lung fields bil,"(724, 776)"
1463,,son,for admission respratory failure sepsis hour ...,"(37, 99)"
1464,md intubated for apnea de sat central apnea,son,requesting possible nursing home currently ad...,"(366, 476)"
1474,daily wakeup in am and have pt s,son,converse with pt as she speaks portuguese only,"(1143, 1226)"


In [77]:
plot_ngram_counts(child_matches[child_matches['pattern'] == 'son']['context_before'], 2)

In [79]:
plot_ngram_counts(child_matches[child_matches['pattern'] == 'son']['context_after'], 2)

In [81]:
plot_ngram_counts(child_matches[child_matches['pattern'] == 'son'][['context_before', 'context_after']].agg(' '.join, axis=1).rename('context_combined'), 2)