## Import libraries

In [0]:
from google.colab import auth
import pandas as pd
import numpy as np
from pandas.core.common import flatten

import nltk
import re
import string
import itertools
import pickle

import spacy
nlp = spacy.load('en_core_web_sm')

import nltk
nltk.download('punkt')

from google.colab import drive
drive.mount('/content/drive')

from spacy.matcher import PhraseMatcher
import io
from google.colab import files

## Loading Data


In [0]:
# Authenticate this Colab session via google.colab.auth.
# NOTE(review): presumably required before the gsutil copy from the GCS bucket -- confirm.
auth.authenticate_user()

#### MIMIC Discharge Data

In [0]:
# Copy adult_notes.gz from the hst-956 GCS bucket into the current working directory.
# NOTE(review): nothing visible in this notebook reads adult_notes.gz afterwards -- confirm it is still needed.
!gsutil cp gs://hst-956/adult_notes.gz ./

In [0]:
# Load the pickled UMLS summary and the pre-processed discharge notes from the
# mounted Drive folder. The UMLS path previously started with '.drive/' (no
# slash), which does not point into the mounted './drive' directory used by
# every other path in this notebook.
# NOTE: pickle.load can execute arbitrary code -- only load trusted project files.
with open('./drive/My Drive/MLHC Final Project/umls_summary.pk', 'rb') as umls_file:
  umls_terms = pickle.load(umls_file)

# discharge_df supplies the ROW_ID and TEXT columns consumed by generateCSV
discharge_df = pd.read_csv('./drive/My Drive/MLHC Final Project/MIMIC Data - All Discharge Summaries/first_10000_processed_mimic_notes_data.csv')

In [0]:
# create spacy phrase matcher
# to do exact matching on MIMIC discharge notes to UMLS terms
def generatePhraseMatcher():
  """Build a spaCy PhraseMatcher holding the pickled UMLS term patterns.

  Patterns are stored on Drive in several pickle files per label (they were
  saved in pieces to avoid truncation). Each file is loaded, added to the
  matcher under its label ('problem', 'treatment' or 'test'), and released
  before the next one is read so only one piece is in memory at a time.

  Returns:
    PhraseMatcher built on ``nlp.vocab`` that matches on the LOWER attribute
    (i.e. case-insensitively).

  Raises:
    FileNotFoundError: if one of the pattern pickles is missing.
  """
  pattern_dir = './drive/My Drive/MLHC Final Project/Pickled Patterns/'

  # label -> pickle file stems, in the order they are added to the matcher
  pattern_files = (
    ('problem', ('prob_pattern_200', 'prob_pattern_400',
                 'prob_pattern_600', 'prob_pattern_800')),
    ('treatment', ('treat_pattern_250', 'treat_pattern_500',
                   'treat_pattern_750')),
    ('test', ('test_pattern_50', 'test_pattern_100')),
  )

  pm = PhraseMatcher(nlp.vocab, attr = 'LOWER')

  for label, stems in pattern_files:
    for stem in stems:
      # NOTE: pickle.load can execute arbitrary code -- these must be trusted files.
      # The context manager closes the handle, which the bare open() calls never did.
      with open(pattern_dir + stem + '.pickle', 'rb') as handle:
        patterns = pickle.load(handle)

      pm.add(label, None, *patterns)

      # drop the loaded piece right away to free up memory
      del patterns

  return pm

In [0]:
# Expects a dataframe that has an ID column and a text column named 'SENTENCE'
def apply_matcher(df, phrase_matcher):
  """Run the phrase matcher over every sentence and record what it found.

  The frame is modified in place (and also returned) with these new columns:
    MATCH           raw matcher output for the sentence
    MATCH_IDS       string label of each match, looked up in nlp.vocab.strings
    SPANS_START     start position of each match (second element of the match)
    SPANS_END       end position of each match (third element of the match)
    SENTENCE_LENGTH number of whitespace-separated words in the sentence
  """
  def run_matcher(sentence):
    return phrase_matcher(nlp(sentence))

  def match_labels(found):
    return [nlp.vocab.strings[match[0]] for match in found]

  def match_starts(found):
    return [match[1] for match in found]

  def match_ends(found):
    return [match[2] for match in found]

  df['MATCH'] = df['SENTENCE'].apply(run_matcher)
  df['MATCH_IDS'] = df['MATCH'].apply(match_labels)
  df['SPANS_START'] = df['MATCH'].apply(match_starts)
  df['SPANS_END'] = df['MATCH'].apply(match_ends)

  # Word count is based on whitespace splitting, not on the spaCy tokens
  df['SENTENCE_LENGTH'] = df['SENTENCE'].str.split().str.len()

  return df

In [0]:
# Input to this function is ONE ROW of the dataframe produced by "apply_matcher"
def apply_labels(df):
  """Assign a B-/I-/O label to every whitespace-separated word of a sentence.

  Intended to be applied row-wise:
    transformDF = matcherDF.apply(apply_labels, axis=1)

  Args:
    df: a single row (pandas Series or any mapping) providing 'SENTENCE',
      'MATCH_IDS', 'SPANS_START' and 'SPANS_END'.

  Returns:
    The same row with two entries added:
      ALL_INDICES  list of word-index lists, one per matched span
      LABELS       one label per word: 'O' when no span covers the word,
                   'B-<match_id>' for the first word of its span and
                   'I-<match_id>' for the remaining words of the span.
    When spans overlap, a word takes the first span (in matcher order) that
    covers it.
  """
  # Word indices covered by each matched span
  spans = [list(range(start, end))
           for start, end in zip(df['SPANS_START'], df['SPANS_END'])]
  df['ALL_INDICES'] = spans

  # Build the word-index -> span lookup once instead of re-scanning every
  # span (twice) for every word; setdefault keeps the first covering span.
  span_of_word = {}
  for span_pos, covered in enumerate(spans):
    for word_idx in covered:
      span_of_word.setdefault(word_idx, span_pos)

  labels = []
  for word_idx in range(len(df['SENTENCE'].split())):
    span_pos = span_of_word.get(word_idx)
    if span_pos is None:
      # no match covers this word
      labels.append('O')
      continue
    # first word of the span begins the entity, the rest continue it
    prefix = "B-" if word_idx == spans[span_pos][0] else "I-"
    labels.append(prefix + str(df['MATCH_IDS'][span_pos]))

  # One label per word of the sentence
  df['LABELS'] = labels
  return df

In [0]:
# cuts one sentence into consecutive pieces of at most n words
# e.g. with n = 64, a 100-word sentence becomes a 64-word piece and a 36-word piece
def getShorterSent(s, n):
    """Split sentence `s` on whitespace and regroup it into chunks of <= n words.

    Returns a list of strings; each string is a chunk whose words are joined
    by single spaces. A sentence without any words yields an empty list.
    """
    tokens = s.split()
    chunks = []
    for offset in range(0, len(tokens), n):
        chunks.append(" ".join(tokens[offset:offset + n]))
    return chunks

# shortens every sentence of a discharge summary
def preprocess(text, words):
    """Cap each sentence in `text` at `words` words.

    `text` is iterated element by element (getLabeledDF passes the list of
    lines of one note); every element is chunked with getShorterSent and the
    chunks are returned as one flat list of strings, in order.
    """
    flattened = []
    for sentence in text:
        flattened.extend(getShorterSent(sentence, words))
    return flattened

# strips punctuation and surplus spaces from a sentence
def clean_sentence(x):
  """Return `x` without punctuation, trimmed, with runs of spaces collapsed.

  All characters in string.punctuation are deleted (done because of the
  spaCy processing that follows), leading/trailing whitespace is stripped
  and consecutive spaces are reduced to a single space.
  """
  punctuation_table = str.maketrans('', '', string.punctuation)
  trimmed = x.translate(punctuation_table).strip()
  return re.sub(' +', ' ', trimmed)

# df holds the note identifiers (column 'ids') and note text (column 'notes')
# taken from the original MIMIC dataset
# words = max sentence length
def getLabeledDF(df, phrase_matcher, words):
  """Turn whole notes into one labeled sentence per row.

  Steps: split each note on newlines, cap every line at `words` words,
  reshape to one sentence per row, clean the sentence text, run
  apply_matcher, then apply_labels row by row.

  Returns a dataframe with columns ID, variable (position of the sentence
  within its note), SENTENCE, plus the columns added by apply_matcher and
  apply_labels.
  """
  # One list of length-capped sentences per note
  capped_notes = df.notes.apply(
    lambda note: preprocess(str(note).split('\n'), words))
  notes_by_id = pd.DataFrame({'ids': df.ids, 'notes': capped_notes})

  # Spread each note's sentences across columns, re-attach the ids, then
  # melt so that every sentence gets its own row next to its note id
  wide = notes_by_id.notes.apply(pd.Series)
  wide = wide.merge(notes_by_id, right_index = True, left_index = True)
  wide = wide.drop(["notes"], axis = 1)
  long_df = wide.melt(id_vars = ['ids'], value_name = "notes")
  long_df = long_df.sort_values(['ids', 'variable']).dropna()

  # free memory
  del notes_by_id
  del wide

  # column names expected by apply_matcher
  long_df = long_df.rename(columns={'ids':'ID', 'notes':'SENTENCE'})
  long_df.SENTENCE = long_df.SENTENCE.apply(clean_sentence)

  # find the UMLS phrases in every sentence
  matched = apply_matcher(long_df, phrase_matcher)
  del long_df

  # convert the matches into per-word labels
  labeled = matched.apply(apply_labels, axis=1)
  del matched

  return labeled


In [0]:
# split sentences into words
def splitSentence(x):
  """Split a sentence into its whitespace-separated words.

  Uses the same whitespace split as apply_labels so that the word list and
  the LABELS list of a sentence always have the same length; splitting on a
  literal ' ' produced extra or missing entries for tabs and other
  non-space whitespace, which misaligns words and labels when both columns
  are exploded side by side.
  """
  return x.split()

# build the word-level dataframe used for model training
# num_notes = which block of 1000 discharge notes to use (-1 = all notes)
# name = output file name
# words = max sentence length
def generateCSV(num_notes, name, words):
  """Label one block of discharge notes and return one row per word.

  Reads notes from the global `discharge_df` (columns ROW_ID and TEXT),
  builds a fresh phrase matcher, labels the sentences via getLabeledDF and
  explodes sentences and labels to word level.

  Returns a dataframe with columns SENTENCE_ID (sentence number within the
  note), NOTE_ID (original note id), WORD, LABELS and overall_sent (a group
  number per (NOTE_ID, SENTENCE_ID) pair).
  """
  # Target path of the CSV; the actual write is currently disabled (see end)
  file_name = './drive/My Drive/MLHC Final Project/Final Preprocessed Data/' + name

  # -1 selects every note, otherwise take the num_notes-th block of 1000
  if num_notes == -1:
    ids, notes = discharge_df.ROW_ID, discharge_df.TEXT
  else:
    block = slice(1000*num_notes, 1000*(num_notes+1))
    ids, notes = discharge_df.ROW_ID[block], discharge_df.TEXT[block]
  notes_to_try = pd.DataFrame({'ids': ids, 'notes': notes})

  # spaCy phrase matcher with the UMLS patterns
  pm = generatePhraseMatcher()

  # sentence-level frame with labels
  df = getLabeledDF(notes_to_try, pm, words)

  del notes_to_try

  # sentences become lists of words
  df.SENTENCE = df.SENTENCE.apply(splitSentence)

  # Explode words and labels twice -- once keyed by note id, once keyed by
  # the sentence number -- then put the two keys side by side
  word_columns = ['SENTENCE', 'LABELS']
  note_id = df[['ID'] + word_columns].set_index(['ID']).apply(pd.Series.explode).reset_index()
  sent_id = df[['variable'] + word_columns].set_index(['variable']).apply(pd.Series.explode).reset_index()

  final_df = pd.concat([sent_id['variable'], note_id], axis=1)
  final_df = final_df.rename(columns = {'ID':'NOTE_ID', 'SENTENCE':'WORD', 'variable':'SENTENCE_ID'})

  # overall_sent identifies each (note, sentence) pair within this frame
  final_df['overall_sent'] = final_df.groupby(['NOTE_ID', 'SENTENCE_ID']).ngroup()

  del df, note_id, sent_id
  # (disabled) final_df.to_csv(file_name, header=True)

  return final_df


In [0]:
# machinery for putting a time limit on a long-running call
import signal


class TimeoutException(Exception):
    """Raised from the SIGALRM handler when the alarm set by signal.alarm fires."""


def timeout_handler(signum, frame):
    """Signal handler: turn the delivered signal into a TimeoutException."""
    raise TimeoutException()


# Route SIGALRM to the handler above
signal.signal(signal.SIGALRM, timeout_handler)

In [0]:
# function to run generateCSV over groups of 1000 notes
def loopThrough(start, stop, num, timeout_seconds=3600):
  """Label note blocks start..stop-1 and return them as one dataframe.

  Args:
    start: index of the first block of 1000 notes (inclusive).
    stop: index of the last block (exclusive).
    num: max sentence length passed on to generateCSV.
    timeout_seconds: time limit per block; a block that exceeds it is
      skipped. Defaults to 3600 (one hour).

  Returns:
    The per-block dataframes concatenated in block order, or an empty
    dataframe when no block finished.
  """
  frames = []
  for i in range(start, stop):
    # SIGALRM is turned into TimeoutException by the registered handler
    signal.alarm(timeout_seconds)
    try:
      frames.append(generateCSV(i, '50_notes_discharge_umls_clean_length_126.csv', num))
    except TimeoutException:
      # skip this block but say so, instead of dropping it silently
      print("block %s skipped: exceeded %s seconds" % (i, timeout_seconds))
    finally:
      # Always cancel the alarm, including when generateCSV raised some
      # other exception; otherwise a stale SIGALRM would fire later
      signal.alarm(0)

  if not frames:
    return pd.DataFrame()
  # single concat at the end instead of re-copying the result every iteration
  return pd.concat(frames)

In [0]:
import time
# wall-clock start, used to report the total runtime below
start_time = time.time()

# loopThrough(start, stop, num):
#   start/stop select blocks of 1000 notes; block i covers notes 1000*i : 1000*(i+1)
#   Example: 0, 2 --> 2000 notes --> 0:1000, 1000:2000
#   num is the max sentence length (in words)
# The per-block dataframes are returned concatenated together.
# NOTE: the call below uses blocks 1 and 2, i.e. notes 1000:3000.
df_for_modeling = loopThrough(1, 3, 126)

# print out the elapsed runtime in seconds
print("--- %s seconds ---" % (time.time() - start_time))

In [0]:
# Recompute the sentence group number over the concatenated frame: generateCSV
# numbers (NOTE_ID, SENTENCE_ID) groups within each block separately.
df_for_modeling['overall_sent'] = df_for_modeling.groupby(['NOTE_ID', 'SENTENCE_ID']).ngroup()
# Replace the index with a fresh 0..n-1 range; the previous index is kept as a column.
# NOTE(review): re-running this cell resets the index again and adds another column -- run it once.
df_for_modeling = df_for_modeling.reset_index()

In [0]:
# Finally, drop rows that contain any missing value and write the result to a CSV on Drive
df_for_modeling.dropna().to_csv('./drive/My Drive/MLHC Final Project/Final Preprocessed Data/2000_notes_discharge_umls_clean_length_126.csv', header=True)