# KWIC FOR INTERPRETATION

## SETUP

In [23]:
# import all required modules
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from tqdm import tqdm

tqdm.pandas()  # <- added this line

pd.set_option('display.max_rows', None)  ###
pd.set_option('display.max_columns', None)  ###
pd.set_option('display.width', None)  ###
pd.set_option('display.max_colwidth', None)  ###

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
import pickle
import random
import re
try:
  import textacy
except:
  !pip install textacy
  import textacy

if textacy.__version__ < '0.11': # as in printed book
    from textacy.text_utils import KWIC

else: # for textacy 0.11.x
    from textacy.extract.kwic import keyword_in_context

    def KWIC(*args, **kwargs):
        # call keyword_in_context with all params except 'print_only'
        return keyword_in_context(*args,
                           **{kw: arg for kw, arg in kwargs.items()
                            if kw != 'print_only'})

In [4]:
try:
  from google.colab import drive
  IN_COLAB=True
except:
  IN_COLAB=False

if IN_COLAB:
  print("We're running Colab")

if IN_COLAB:
  # Mount the Google Drive at mount
  mount='/content/gdrive'
  print("Colab: mounting Google drive on ", mount)
  # connect your colab with the drive
  drive.mount(mount)

 # Switch to the directory on the Google Drive that you want to use
  import os
  path_to_repo = mount + "/MyDrive/MIMIC-III Text Mining/LOS_FINAL/"

else:
  # Setup Repository
  with open("repo_info.txt", "r") as repo_info:
      path_to_repo = repo_info.readline()


print(path_to_repo)

path_to_data = f"{path_to_repo}data/"
path_to_raw = f"{path_to_data}raw/"
path_to_processed = f"{path_to_data}processed/"
path_to_lda = f"{path_to_data}lda/"
path_to_icd = f"{path_to_data}icd_codes/"
path_to_models = f"{path_to_repo}models/"
path_to_results = f"{path_to_repo}results/"

We're running Colab
Colab: mounting Google drive on  /content/gdrive
Mounted at /content/gdrive
/content/gdrive/MyDrive/MIMIC-III Text Mining/LOS_FINAL/


## LOAD OUR DATASET

In [16]:
preproc_tag = '_preproc_heavier'
lemma_tag = "_lemma_spacy"
df = pd.read_feather(f'{path_to_processed}df_los{preproc_tag}{lemma_tag}')
# restrict to just discharge notes
df = df[['los_cat', 'text']]
print('Dataframe Loaded')
# split the data into training and test
_, test = train_test_split(df, train_size=0.80, stratify = df['los_cat'], random_state=42)


preproc_tag = '_preproc_heavier'
lemma_tag = ""
df_stem = pd.read_feather(f'{path_to_processed}df_los{preproc_tag}{lemma_tag}')
# restrict to just discharge notes
df_stem = df_stem[['los_cat', 'text']]
print('Dataframe Loaded')
# split the data into training and test
_, test_stem = train_test_split(df_stem, train_size=0.80, stratify = df_stem['los_cat'], random_state=42)

Dataframe Loaded
Dataframe Loaded


## APPLY KWIC

In [19]:
def kwic(doc_series, keyword, window=35, print_samples=5):

    def add_kwic(text):
        kwic_list.extend(KWIC(text, keyword, ignore_case=True,
                              window_width=window, print_only=False))

    kwic_list = []
    doc_series.progress_map(add_kwic)

    if print_samples is None or print_samples==0:
        return kwic_list
    else:
        k = min(print_samples, len(kwic_list))
        print(f"{k} random samples out of {len(kwic_list)} " + \
              f"contexts for '{keyword}':")
        for sample in random.sample(list(kwic_list), k):
            print(re.sub(r'[\n\t]', ' ', sample[0])+'  '+ \
                  sample[1]+'  '+\
                  re.sub(r'[\n\t]', ' ', sample[2]))

In [29]:
def kwic_list(doc_series, keyword, window=35):

    def add_kwic(text):
        kwic_list.extend(KWIC(text, keyword, ignore_case=True,
                              window_width=window, print_only=False))

    kwic_list = []
    doc_series.progress_map(add_kwic)
    return kwic_list

In [21]:
def get_top_words(kwic_list):
  store1 = ""
  store2 = ""
  for i, _ in enumerate(kwic_list):
    store1 = store1 + kwic_list[i][0]
    store2 = store2 + kwic_list[i][2]
  store1 = store1.split(" ")
  store2 = store2.split(" ")
  return Counter(store1), Counter(store2)

### Histori/History

In [27]:
# Get most common words following/preceding a word
kwic_top = kwic_list(test['text'], '\shistory\s')
c1, c2 = get_top_words(kwic_top)
kwic_words = pd.DataFrame({'Preceding Words': c1.most_common(20), 'Following Words': c2.most_common(20)})
kwic_words

100%|██████████| 6153/6153 [00:00<00:00, 6783.93it/s]


Unnamed: 0,Preceding Words,Following Words
0,"(past, 8793)","(illness, 6564)"
1,"(history, 3891)","(history, 3912)"
2,"(procedure, 2017)","(physical, 3233)"
3,"(year, 1915)","(exam, 2923)"
4,"(medical, 1719)","(year, 2764)"
5,"(social, 1611)","(sp, 1930)"
6,"(family, 1574)","(yo, 1757)"
7,"(use, 1513)","(patient, 1749)"
8,"(drug, 1490)","(disease, 1643)"
9,"(invasive, 1466)","(old, 1502)"


In [32]:
# Get most common words following/preceding a word
kwic_top = kwic_list(test_stem['text'], '\shistori\s')
c1, c2 = get_top_words(kwic_top)
kwic_words = pd.DataFrame({'Preceding Words': c1.most_common(20), 'Following Words': c2.most_common(20)})
kwic_words

100%|██████████| 6153/6153 [00:00<00:00, 9619.78it/s]


Unnamed: 0,Preceding Words,Following Words
0,"(past, 8935)","(ill, 6623)"
1,"(histori, 4278)","(histori, 4298)"
2,"(procedur, 2262)","(physic, 3578)"
3,"(year, 2006)","(exam, 3214)"
4,"(medic, 2005)","(year, 2853)"
5,"(invas, 1852)","(sp, 2115)"
6,"(surgic, 1747)","(live, 2069)"
7,"(famili, 1712)","(present, 1906)"
8,"(social, 1709)","(patient, 1876)"
9,"(deni, 1601)","(diseas, 1826)"


In [None]:
kwic(test['text'], '\shistory\s', print_samples=100)

100%|██████████| 6922/6922 [00:01<00:00, 4576.62it/s]

100 random samples out of 44519 contexts for '\shistory\s':
il summary 87yearold lefthanded man   history   hypertension recent pulmonary embol
co drug abuse currently past family   history   fh diabetes type ii several family 
re ercp x2 intubation cvl placement   history   present illness 74 yo vietnamese sp
r seem pure hemorrhage past medical   history   ivda methadone x 10 month social hi
or surgical invasive procedure none   history   present illness cc fall hpi yo man 
ocedure egd leave ig line placement   history   present illness yr old female past 
ium albumin 25gm iv x1 past medical   history   know cad sp diskectomy sp tahbso ao
 syndrome hypothyroid past surgical   history   sp sigmoid colectomy w pouch cervic
perglycemia elevate cr past medical   history   pmhx 1 heart block junctional rhyth
ive procedure right hemicraniectomy   history   present illness pt 78 yearold right
lergie lasix attend chief complaint   history   present illness per ed staff note p
 lift recent sin




In [None]:
kwic(test_stem['text'], '\shistori\s', print_samples=100)

100%|██████████| 6922/6922 [00:01<00:00, 5899.25it/s]

100 random samples out of 44519 contexts for '\shistori\s':
 atrium without thrombus past medic   histori   signific hypertens mitral regurgit 
mcv98 mch324 mchc330 rdw147 ct head   histori   left mca stroke hemorrhag transform
t self detox humerus fractur social   histori   grew 3 sibl antiqu galleri live lon
ori anxieti depress also deni prior   histori   psychosi mania confirm depend opiat
 smoke occasion alcohol drug famili   histori   famili histori brother heart proble
esid fl brotherwhom resid oh famili   histori   noncontributori physic exam admiss 
bstruct 3 liver biopsi trucut needl   histori   present ill pt 39f develop central 
re metal stent place svgtolad graft   histori   present ill 71 yo w pmh stabl angin
ceiv medic care primarili bw famili   histori   noncontributori physic exam vital 9
ri present ill mrs 80 yearold femal   histori   chronic af diastol chf dm hyperlipi
rief hospit cours 48 year old femal   histori   hiv last cd4 114 hcv polysubst abus
iac recan stent 




### "1"

In [None]:
# Get most common words following/preceding a word
kwic_top = kwic_list(test['text'], '\s1\s', print_samples=100)
c1, c2 = get_top_words(kwic_top)
print(c1.most_common(20))
print(c2.most_common(20))

100%|██████████| 6922/6922 [00:01<00:00, 3844.87it/s]


[('sig', 41019), ('mg', 40261), ('tablet', 30706), ('one', 10074), ('1', 8731), ('discharg', 8320), ('2', 6684), ('medic', 6521), ('capsul', 6459), ('100', 4888), ('5', 4816), ('10', 4467), ('relea', 4466), ('daili', 4013), ('day', 3964), ('po', 3926), ('4', 3684), ('delay', 3488), ('20', 3303), ('3', 3178)]
[('po', 39858), ('daili', 26980), ('day', 24052), ('2', 19145), ('tablet', 13483), ('time', 11758), ('mg', 11114), ('1', 8829), ('bid', 7494), ('need', 7005), ('everi', 6729), ('3', 6660), ('hour', 5781), ('sig', 5379), ('4', 5375), ('relea', 4638), ('6', 4600), ('one', 4066), ('delay', 3912), ('5', 3736)]


In [None]:
kwic(test['text'], '\s1\s', print_samples=100)

100%|██████████| 6922/6922 [00:01<00:00, 3689.99it/s]

100 random samples out of 90778 contexts for '\s1\s':
acetaminophen 500 mg tablet sig one   1   tablet po q6h everi 6 hour 8 oxycod
 day 11 multivitamin tablet sig one   1   tablet po daili daili 12 sucralf 1 
nna mvi discharg medic 1 folic acid   1   mg tablet sig one 1 tablet po daili
ardiocentesi abort assess recommend   1   mix cardiogen vasodilatori shock 2 
eurosurgeri followup ct head within   1   month discharg day followup appoint
 docus sodium 100 mg capsul sig one   1   capsul po bid 2 time day need const
oin cathet sheath intact distal pul   1   extrem warm neuro nonfoc pertin res
ic 1 furosemid 20 mg tablet sig one   1   tablet po q12h everi 12 hour 7 day 
 tablet least 5 year discharg medic   1   ultram 50 mg tablet sig one 1 table
 18 mcg capsul winhal devic sig one   1   cap inhal daili daili 14 metoprolol
spirin 81 mg tablet chewabl sig one   1   tablet chewabl po daili daili disp3
 occlus aneurysm great 3 mm impress   1   evid acut intracrani hemorrhag terr
2 leave kn




### ed

In [None]:
kwic(test['text'], '\sed\s', print_samples=100)

100%|██████████| 6153/6153 [00:00<00:00, 7193.47it/s]

100 random samples out of 6307 contexts for '\sed\s':
e stool past five day last loose bm   ed   patient explain sometimes get confu
ial vital ed 976 98 12470 24 99 lab   ed   notable wbc 157 941n hct 379 plt 33
0 lethargy w sob send ed evaluation   ed   note black guiaic pos stool dark ng
lication feel well enough walk home   ed   initial vital 1003 bp 11289 hr 94 r
 illness ms 88 yo f htn hld present   ed   chest back pain per pts family epis
sob say resolve report ativan lasix   ed   improve cp sob bedside tee perform 
r evaluate obgyn hematology service   ed   feel process unrelated retain poc p
ailure cr 52 baseline cr20 transfer   ed   vq scan rule pulmonary emboli give 
dmit acute psychotic decompensation   ed   initial vs 98 p 144 bp 134881 r 33 
c total make 650cc uop transfer icu   ed   respiratory status stable still nit
hct 24 transfer bc icu bed per note   ed   patient unable give much history du
 infection forehead improve present   ed   able get chair weakness see ed ini


