In [0]:
#@title
from google.colab import auth
import pandas as pd
import numpy as np
from pandas.core.common import flatten

In [0]:
import nltk
import re
import string
import itertools
import pickle

In [0]:
import spacy
# Load the spaCy 'en_core_web_sm' pipeline once; `nlp` is reused in later cells
# (via nlp.pipe) to parse the UMLS term strings.
nlp = spacy.load('en_core_web_sm')

In [0]:
import nltk
# Download the NLTK 'punkt' resource into this runtime.
# NOTE(review): no visible cell uses NLTK tokenization — confirm this is still needed.
nltk.download('punkt')

In [0]:
from google.colab import drive
# Mount Google Drive at /content/drive. Later cells read and write pickles under
# './drive/My Drive/...'; presumably that relative path resolves because the
# working directory is /content — TODO confirm.
drive.mount('/content/drive')

In [0]:
from spacy.matcher import PhraseMatcher
import io
from google.colab import files

In [0]:
# Authenticate the current Colab user (`auth` is imported from google.colab in the first cell).
# NOTE(review): none of the visible cells use these credentials — confirm this call is required.
auth.authenticate_user()

## Generating List of Words for Exact Matching


In [0]:
# read in UMLS terms
# Each entry is indexed positionally by later cells: 0=CUI, 1=First_Inst, 2=ID,
# 3=Semantic_Types, 4=Synonyms.
# NOTE: pickle.load can execute arbitrary code, so only load a file you trust.
# The `with` block guarantees the file handle is closed after loading.
with open('./drive/My Drive/MLHC Final Project/umls_summary.pk', 'rb') as umls_file:
    umls_terms = pickle.load(umls_file)

In [0]:
# UMLS semantic types used for exact matching, grouped by the label they are mapped to.

# Semantic types that mark a term as a "Medical Problem".
med_problems = [
    'Acquired Abnormality',
    'Anatomical Abnormality',
    'Cell or Molecular Dysfunction',
    'Congenital Abnormality',
    'Disease or Syndrome',
    'Experimental Model of Disease',
    'Finding',
    'Injury or Poisoning',
    'Mental or Behavioral Dysfunction',
    'Neoplastic Process',
    'Pathologic Function',
    'Sign or Symptom',
    'Organ or Tissue Function',
]

# Semantic types that mark a term as a "Treatment".
treatments = [
    'Therapeutic or Preventive Procedure',
    'Clinical Drug',
    'Health Care Activity',
]

# Semantic types that mark a term as a "Test".
tests = [
    'Diagnostic Procedure',
    'Laboratory Procedure',
    'Laboratory or Test Result',
]

In [0]:
# Build one row per UMLS entry. Each column is taken from a fixed position of the
# entry (column name -> position within the entry).
column_positions = {
    "CUI": 0,
    "First_Inst": 1,
    "ID": 2,
    "Semantic_Types": 3,
    "Synonyms": 4,
}
df = pd.DataFrame({
    column_name: [entry[position] for entry in umls_terms]
    for column_name, position in column_positions.items()
})
# Store the semantic types as plain strings so they can be searched with .str.contains.
df['Semantic_Types'] = df['Semantic_Types'].astype(str)

In [0]:
import numpy as np  # duplicate of the top-of-notebook import; kept so existing imports are not removed

# Map each output label to the UMLS semantic types that trigger it.
# Order matters: a later entry overwrites an earlier one, so a term whose semantic
# types span several groups ends up as "Test" over "Treatment" over "Medical Problem".
label_to_semantic_types = [
    ("Medical Problem", med_problems),
    ("Treatment", treatments),
    ("Test", tests),
]

# Start from an object column of None rather than float NaN: writing strings into a
# float64 column is an incompatible-dtype assignment that newer pandas deprecates.
df["Label"] = None
for label_name, semantic_types in label_to_semantic_types:
  for semantic_type in semantic_types:
    # regex=False: semantic type names are literal substrings, not regex patterns.
    df.loc[df['Semantic_Types'].str.contains(semantic_type, regex=False), "Label"] = label_name

# Keep only the terms that received a label.
df = df.dropna(subset=["Label"]).reset_index(drop=True)
df.head()

In [0]:
# Build the exact-matching lookup: label -> flat list of every synonym of every
# term carrying that label.
# Ex: Medical Problem: [list of terms indicating a problem]
dict_labeling = {}
for label in df.Label.unique():
  synonym_lists = df.loc[df.Label == label, "Synonyms"].tolist()
  # Concatenate the per-term synonym lists into one list, preserving order.
  dict_labeling[label] = list(itertools.chain.from_iterable(synonym_lists))

## Remove nonsense labels that appeared with high frequency in discharge notes

In [0]:
# removing terms from the medical problems indicator list
list_of_remove = ['to', '1', '2', '3', '4', '5', 'no']
# Drop every occurrence of each unwanted term. list.index() would only remove the
# first match (leaving duplicate synonyms behind) and raises ValueError when the
# term is absent, e.g. when this cell is run a second time.
dict_labeling["Medical Problem"] = [
    term for term in dict_labeling["Medical Problem"] if term not in list_of_remove
]

In [0]:
# removing term from the treatment indicator list
# Drop every occurrence of 'for'. list.index() would only remove the first match
# and raises ValueError when the term is absent (e.g. on a second run of this cell).
dict_labeling["Treatment"] = [term for term in dict_labeling["Treatment"] if term != 'for']

In [0]:
# removing term from the test indicator list
# Drop every occurrence of 'his'. list.index() would only remove the first match
# and raises ValueError when the term is absent (e.g. on a second run of this cell).
dict_labeling["Test"] = [term for term in dict_labeling["Test"] if term != 'his']

## Subsetting UMLS terms and creating PhraseMatcher patterns

In [0]:
# Parse the first 200,000 "Medical Problem" terms with spaCy and pickle the result to Drive.
# NOTE(review): nlp.pipe runs the whole loaded pipeline; if the downstream matcher only
# needs token text, a tokenizer-only pass may be enough — confirm before changing.
problem_terms = dict_labeling["Medical Problem"][:200000]
prob_pattern_200 = [doc for doc in nlp.pipe(problem_terms)]
with open('./drive/My Drive/MLHC Final Project/Pickled Patterns/prob_pattern_200.pickle', 'wb') as pattern_file:
    pickle.dump(prob_pattern_200, pattern_file)
# To reload the cached patterns instead of recomputing them:
# prob_pattern_200 = pickle.load(open('./drive/My Drive/MLHC Final Project/Pickled Patterns/prob_pattern_200.pickle', 'rb'))

In [0]:
# Parse "Medical Problem" terms at positions 200,000-399,999 with spaCy and pickle the result to Drive.
problem_terms = dict_labeling["Medical Problem"][200000:400000]
prob_pattern_400 = [doc for doc in nlp.pipe(problem_terms)]
with open('./drive/My Drive/MLHC Final Project/Pickled Patterns/prob_pattern_400.pickle', 'wb') as pattern_file:
    pickle.dump(prob_pattern_400, pattern_file)
# To reload the cached patterns instead of recomputing them:
# prob_pattern_400 = pickle.load(open('./drive/My Drive/MLHC Final Project/Pickled Patterns/prob_pattern_400.pickle', 'rb'))

In [0]:
# Parse "Medical Problem" terms at positions 400,000-599,999 with spaCy and pickle the result to Drive.
problem_terms = dict_labeling["Medical Problem"][400000:600000]
prob_pattern_600 = [doc for doc in nlp.pipe(problem_terms)]
with open('./drive/My Drive/MLHC Final Project/Pickled Patterns/prob_pattern_600.pickle', 'wb') as pattern_file:
    pickle.dump(prob_pattern_600, pattern_file)
# To reload the cached patterns instead of recomputing them:
# prob_pattern_600 = pickle.load(open('./drive/My Drive/MLHC Final Project/Pickled Patterns/prob_pattern_600.pickle', 'rb'))

In [0]:
# Parse the remaining "Medical Problem" terms (position 600,000 onward) with spaCy and pickle the result to Drive.
problem_terms = dict_labeling["Medical Problem"][600000:]
prob_pattern_800 = [doc for doc in nlp.pipe(problem_terms)]
with open('./drive/My Drive/MLHC Final Project/Pickled Patterns/prob_pattern_800.pickle', 'wb') as pattern_file:
    pickle.dump(prob_pattern_800, pattern_file)
# To reload the cached patterns instead of recomputing them:
# prob_pattern_800 = pickle.load(open('./drive/My Drive/MLHC Final Project/Pickled Patterns/prob_pattern_800.pickle', 'rb'))

In [0]:
# Drop the in-memory "Medical Problem" pattern lists; they have already been pickled above.
del prob_pattern_200, prob_pattern_400, prob_pattern_600, prob_pattern_800

In [0]:
# Parse the first 250,000 "Treatment" terms with spaCy and pickle the result to Drive.
treatment_terms = dict_labeling["Treatment"][:250000]
treat_pattern_250 = [doc for doc in nlp.pipe(treatment_terms)]
with open('./drive/My Drive/MLHC Final Project/Pickled Patterns/treat_pattern_250.pickle', 'wb') as pattern_file:
    pickle.dump(treat_pattern_250, pattern_file)
# To reload the cached patterns instead of recomputing them:
# treat_pattern_250 = pickle.load(open('./drive/My Drive/MLHC Final Project/Pickled Patterns/treat_pattern_250.pickle', 'rb'))

In [0]:
# Parse "Treatment" terms at positions 250,000-499,999 with spaCy and pickle the result to Drive.
treatment_terms = dict_labeling["Treatment"][250000:500000]
treat_pattern_500 = [doc for doc in nlp.pipe(treatment_terms)]
with open('./drive/My Drive/MLHC Final Project/Pickled Patterns/treat_pattern_500.pickle', 'wb') as pattern_file:
    pickle.dump(treat_pattern_500, pattern_file)
# To reload the cached patterns instead of recomputing them:
# treat_pattern_500 = pickle.load(open('./drive/My Drive/MLHC Final Project/Pickled Patterns/treat_pattern_500.pickle', 'rb'))

In [0]:
# Parse the remaining "Treatment" terms (position 500,000 onward) with spaCy and pickle the result to Drive.
treatment_terms = dict_labeling["Treatment"][500000:]
treat_pattern_750 = [doc for doc in nlp.pipe(treatment_terms)]
with open('./drive/My Drive/MLHC Final Project/Pickled Patterns/treat_pattern_750.pickle', 'wb') as pattern_file:
    pickle.dump(treat_pattern_750, pattern_file)
# To reload the cached patterns instead of recomputing them:
# treat_pattern_750 = pickle.load(open('./drive/My Drive/MLHC Final Project/Pickled Patterns/treat_pattern_750.pickle', 'rb'))

In [0]:
# Drop the in-memory "Treatment" pattern lists; they have already been pickled above.
del treat_pattern_250, treat_pattern_500, treat_pattern_750

In [0]:
# Parse the first 50,000 "Test" terms with spaCy and pickle the result to Drive.
test_terms = dict_labeling["Test"][0:50000]
test_pattern_50 = [doc for doc in nlp.pipe(test_terms)]
with open('./drive/My Drive/MLHC Final Project/Pickled Patterns/test_pattern_50.pickle', 'wb') as pattern_file:
    pickle.dump(test_pattern_50, pattern_file)
# To reload the cached patterns instead of recomputing them:
# test_pattern_50 = pickle.load(open('./drive/My Drive/MLHC Final Project/Pickled Patterns/test_pattern_50.pickle', 'rb'))

In [0]:
# Parse the remaining "Test" terms (position 50,000 onward) with spaCy and pickle the result to Drive.
test_terms = dict_labeling["Test"][50000:]
test_pattern_100 = [doc for doc in nlp.pipe(test_terms)]
with open('./drive/My Drive/MLHC Final Project/Pickled Patterns/test_pattern_100.pickle', 'wb') as pattern_file:
    pickle.dump(test_pattern_100, pattern_file)
# To reload the cached patterns instead of recomputing them:
# test_pattern_100 = pickle.load(open('./drive/My Drive/MLHC Final Project/Pickled Patterns/test_pattern_100.pickle', 'rb'))