# KWIC FOR INTERPRETATION

## SETUP

In [6]:
!pip install textacy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting textacy
  Downloading textacy-0.12.0-py3-none-any.whl (208 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m208.4/208.4 KB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Collecting cytoolz>=0.10.1
  Downloading cytoolz-0.12.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m28.2 MB/s[0m eta [36m0:00:00[0m
Collecting pyphen>=0.10.0
  Downloading pyphen-0.13.2-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m75.7 MB/s[0m eta [36m0:00:00[0m
Collecting jellyfish>=0.8.0
  Downloading jellyfish-0.9.0.tar.gz (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.6/132.6 KB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building whe

In [2]:
# import all required modules
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

# Remove every pandas display limit (row count, column count, total width,
# per-cell width) so long text columns are printed without truncation.
for _display_option in ('display.max_rows',
                        'display.max_columns',
                        'display.width',
                        'display.max_colwidth'):
    pd.set_option(_display_option, None)

import warnings
# Install a blanket "ignore" filter: no warnings are shown from here on.
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
import pickle
import random
import re
import textacy

# Locate textacy's keyword-in-context helper by feature detection rather than
# by comparing textacy.__version__ as a string: string comparison orders
# '0.9.x' *after* '0.11', which would select the wrong import for old releases.
try:  # for textacy 0.11.x and later
    from textacy.extract.kwic import keyword_in_context
except ImportError:  # older textacy, as in printed book
    from textacy.text_utils import KWIC
else:
    def KWIC(*args, **kwargs):
        """Adapter exposing ``keyword_in_context`` under the legacy ``KWIC`` name.

        All positional and keyword arguments are forwarded unchanged, except
        ``print_only``, which the newer function does not take and is
        therefore dropped.

        Returns:
            Whatever ``keyword_in_context`` returns for the given arguments.
        """
        # kwargs is this call's own dict, so removing the key has no side effects
        kwargs.pop('print_only', None)
        return keyword_in_context(*args, **kwargs)

In [3]:
# Detect whether the notebook runs on Google Colab: the google.colab package
# can only be imported there.
try:
    from google.colab import drive
    IN_COLAB = True
except ImportError:
    # Only a failed import means "not on Colab"; any other error should surface.
    IN_COLAB = False

if IN_COLAB:
    print("We're running Colab")

    # Mount the Google Drive at mount
    mount = '/content/gdrive'
    print("Colab: mounting Google drive on ", mount)
    # connect your colab with the drive
    drive.mount(mount)

    # Directory on the Google Drive that holds the project
    path_to_repo = mount + "/MyDrive/MIMIC-III Text Mining/LOS_FINAL/"

else:
    # Setup Repository: the first line of repo_info.txt is the repository root
    with open("repo_info.txt", "r") as repo_info:
        # readline() keeps the trailing newline; strip it so it does not end up
        # in the middle of every path derived below
        path_to_repo = repo_info.readline().strip()

    # The derived paths are built by plain string concatenation, so the root
    # has to end with a separator.
    if path_to_repo and not path_to_repo.endswith(("/", "\\")):
        path_to_repo += "/"

print(path_to_repo)

# Project sub-directories, all relative to the repository root
path_to_data = f"{path_to_repo}data/"
path_to_raw = f"{path_to_data}raw/"
path_to_processed = f"{path_to_data}processed/"
path_to_lda = f"{path_to_data}lda/"
path_to_icd = f"{path_to_data}icd_codes/"
path_to_models = f"{path_to_repo}models/"
path_to_results = f"{path_to_repo}results/"

We're running Colab
Colab: mounting Google drive on  /content/gdrive
Mounted at /content/gdrive
/content/gdrive/MyDrive/MIMIC-III Text Mining/LOS_FINAL/


## LOAD OUR DATASET

In [5]:
# Suffixes selecting which stored variant of the dataset is read
preproc_tag, lemma_tag = '_preproc_heavier', "_lemma_spacy"

# Read the feather file and keep only the label column and the note text
dataset_file = f'{path_to_processed}df_los{preproc_tag}{lemma_tag}'
df = pd.read_feather(dataset_file).loc[:, ['los_cat', 'text']]
print('Dataframe Loaded')

# 80/20 train/test split, stratified on the label, with a fixed seed
train, test = train_test_split(
    df,
    train_size=0.80,
    random_state=42,
    stratify=df['los_cat'],
)

Dataframe Loaded


## APPLY KWIC

In [9]:
def kwic(doc_series, keyword, window=35, print_samples=5):
    """Collect keyword-in-context hits for ``keyword`` over a series of texts.

    Each element of ``doc_series`` is passed to ``KWIC`` (case-insensitive,
    ``window_width=window``) and all resulting hits are gathered in one list.
    Each hit is indexed as (left context, match, right context).

    Args:
        doc_series: Object offering ``progress_map`` (available on pandas
            objects once ``tqdm.pandas()`` has been called) or ``map``.
        keyword: Keyword handed to ``KWIC`` unchanged.
        window: Context width forwarded to ``KWIC`` as ``window_width``.
        print_samples: Number of random hits to print. ``None`` or ``0``
            switches printing off and returns the hits instead.

    Returns:
        The list of all hits when ``print_samples`` is ``None`` or ``0``;
        otherwise ``None`` (the samples are printed, not returned).
    """

    def add_kwic(text):
        kwic_list.extend(KWIC(text, keyword, ignore_case=True,
                              window_width=window, print_only=False))

    def one_line(fragment):
        # newlines/tabs would break the one-sample-per-line layout
        return re.sub(r'[\n\t]', ' ', fragment)

    kwic_list = []
    # progress_map only exists after tqdm.pandas() has run; without it, fall
    # back to a plain map (same result, just no progress bar).
    mapper = getattr(doc_series, 'progress_map', None) or doc_series.map
    mapper(add_kwic)

    if print_samples is None or print_samples == 0:
        return kwic_list

    k = min(print_samples, len(kwic_list))
    print(f"{k} random samples out of {len(kwic_list)} "
          f"contexts for '{keyword}':")
    for left, match, right in random.sample(kwic_list, k):
        # The match is flattened too: a pattern such as \sed\s can itself
        # capture a newline or tab.
        print(one_line(left) + '  ' + one_line(match) + '  ' + one_line(right))

In [10]:
from tqdm import tqdm

# Register tqdm's pandas integration. kwic() calls progress_map on the series
# it is given, so this has to run before kwic() is called.
tqdm.pandas()

In [34]:
# Sample contexts of "ed" as a standalone token (whitespace on both sides).
# Raw string: '\s' in a normal string literal is an invalid escape sequence.
kwic(test['text'], r'\sed\s', print_samples=100)

100%|██████████| 6153/6153 [00:00<00:00, 7193.47it/s]

100 random samples out of 6307 contexts for '\sed\s':
e stool past five day last loose bm   ed   patient explain sometimes get confu
ial vital ed 976 98 12470 24 99 lab   ed   notable wbc 157 941n hct 379 plt 33
0 lethargy w sob send ed evaluation   ed   note black guiaic pos stool dark ng
lication feel well enough walk home   ed   initial vital 1003 bp 11289 hr 94 r
 illness ms 88 yo f htn hld present   ed   chest back pain per pts family epis
sob say resolve report ativan lasix   ed   improve cp sob bedside tee perform 
r evaluate obgyn hematology service   ed   feel process unrelated retain poc p
ailure cr 52 baseline cr20 transfer   ed   vq scan rule pulmonary emboli give 
dmit acute psychotic decompensation   ed   initial vs 98 p 144 bp 134881 r 33 
c total make 650cc uop transfer icu   ed   respiratory status stable still nit
hct 24 transfer bc icu bed per note   ed   patient unable give much history du
 infection forehead improve present   ed   able get chair weakness see ed ini


