/
utils.py
273 lines (259 loc) · 13 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
text_dir = "../text"
data_dir = "../data/"
import os
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer as Cvec
from sklearn.feature_extraction.text import TfidfVectorizer as Tvec
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.tree import DecisionTreeClassifier as DT
from sklearn.linear_model import LogisticRegression as LR
import gc
from copy import deepcopy
from collections import defaultdict
from datetime import datetime
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
from collections import defaultdict
from sklearn.metrics import roc_auc_score,classification_report,roc_curve
inter_dir = "..//intermediate_result//"
from collections import defaultdict
import pdb
label_field = "Cerebrovascular disease present"
def get_consolidated_doc_types():
    '''
    Return the mapping used to consolidate raw document-type names.
    INPUT:
    (none)
    OUTPUT:
    A newly built dict. Key: consolidated document-type identifier (string).
    Value: list of the raw document-type names grouped under that identifier.
    '''
    # Listed as ordered (key, raw names) pairs; dict() keeps this order.
    consolidated_groups = (
        ('adult_triage_note', ['Adult Triage Note']),
        ('discharge_summary', [
            'Discharge Summary - Medical',
            'Discharge Summary.',
            'Discharge Summary - General',
            'Discharge Summary - Surgery Short',
            'Discharge Summary - Orthopedic Surgery',
            'Discharge Summary - Stroke Neurology',
            'Discharge Summary Thoracic Surgery',
            'Transfer Summary.',
        ]),
        ('ed_physician_notes', ['ED Physician Handover Report']),
        ('history_and_physical_examination', [
            'History and Physical',
            'History & Physical Examination.',
            'Admitting Trauma History and Physical Assessment',
            'History & Physical Examination',
        ]),
        ('inpatient_consult_report', ['Inpatient Consult Report.']),
        ('inpatient_consultation', [
            'Inpatient Consultation',
            'Acute Pain Service Summary Note',
        ]),
        ('inpatient_operative~procedure_report', [
            'Inpatient Operative/Procedure Report',
        ]),
        ('med_surg_met~notmet_assessment', [
            'Med Surg MET/NOT-MET Assessment',
            'Med Surg MET/NOT-MET Assessment Flowsheet',
            'Mental Health MET/NOT-MET Assessment Flowsheet',
            'Mental Health MET/NOT-MET Assessment',
        ]),
        ('neurological_diagnostics', [
            'EEG Preliminary Report',
            'Neurological Diagnostics',
        ]),
        ('non-physician_progress_notes', ['MPR', 'MPR - OB']),
        ('nursing_notes', [
            'Clinical Record',
            'Pain Assessment',
            'Intake and Output',
            'Patient Care',
            'Patient Assessment',
            'AcuityPlus Inpatient Classification',
            'Patient Assessment Tools',
            'ED UCC - Intake and Output',
            'Neurological Observation',
            'Surgical Assessment and History - Nursing',
            'Patient Assessment Neuro',
            'Day Surgery / 24 Hour',
            'AcuityPlus Mental Health Patient Classification',
        ]),
        ('nursing_transfer_report', [
            'Nursing Transfer Report - ED to IP',
            'Nursing Transfer Report - IP to IP',
            'Nursing Transfer Report - PACU to IP',
            'Nursing Transfer Report - Mental Health',
        ]),
        ('outpatient_consultation', ['Outpatient Consultation']),
        ('pharmacy_care_plan', ['Pharmacy Care Plan']),
        ('picc_~_midline_record', ['PICC / Midline Record']),
        ('rehabilitation', ['Neuro Rehabilitation']),
        ('social_work_assessment', ['Social Work Assessment']),
        ('urological_diagnostics', ['Urological Diagnostics']),
    )
    return dict(consolidated_groups)
#VISITIDCODE
def get_group_ids(merged_df, id_field="VISITIDCODE"):
    '''
    Split the unique ids in a data frame by the value of the label column.
    INPUT:
    merged_df: data frame holding the label column (module-level label_field) and the id column
    id_field: name of the id column
    OUTPUT:
    pos_ids: list of unique ids from rows whose label equals "Yes"
    neg_ids: list of unique ids from rows whose label equals "No"
    '''
    is_positive = merged_df[label_field] == "Yes"
    is_negative = merged_df[label_field] == "No"
    # Rows whose label is neither "Yes" nor "No" contribute to neither list.
    pos_ids = list(merged_df[is_positive][id_field].unique())
    neg_ids = list(merged_df[is_negative][id_field].unique())
    return pos_ids, neg_ids
def prep_cui(merged_df, selected_types, pos_ids, neg_ids):
    '''
    Prepare the list of CUI strings as input and a list of 0/1 as the label for training supervised models.
    INPUT:
    merged_df: data frame that contains one column per document type (holding CUI strings), the label
    column (module-level label_field) and the 'VISITIDCODE' column
    selected_types: list of strings, the document-type columns to use
    pos_ids,neg_ids : list of ids
    OUTPUT:
    total_txt: list of strings, one per id in pos_ids followed by one per id in neg_ids. Each element is
    the space-joined non-missing CUI cells of that visit over the selected types, or " " when the
    visit has no row with the matching label.
    total_label: list of 0 and 1, aligned with total_txt (1 for pos_ids, 0 for neg_ids).
    '''
    # Copy so the caller's list is never modified.
    type_cols = list(selected_types)
    tmp_df = merged_df[type_cols + [label_field, 'VISITIDCODE']]
    pos_df = tmp_df[tmp_df[label_field] == "Yes"]
    neg_df = tmp_df[tmp_df[label_field] == "No"]

    pos_txt = _join_cui_per_visit(pos_df, pos_ids, type_cols)
    neg_txt = _join_cui_per_visit(neg_df, neg_ids, type_cols)

    total_txt = pos_txt + neg_txt
    total_label = [1] * len(pos_txt) + [0] * len(neg_txt)
    return total_txt, total_label


def _join_cui_per_visit(label_df, visit_ids, type_cols):
    '''Return one space-joined CUI string per id in visit_ids, or " " for ids absent from label_df.'''
    known_ids = set(label_df['VISITIDCODE'].unique())
    # Group once instead of re-scanning the whole frame with a boolean mask for every id.
    by_visit = label_df.groupby('VISITIDCODE')
    texts = []
    for guid in visit_ids:
        if guid not in known_ids:
            # Keep the output aligned with visit_ids even when the visit has no row.
            texts.append(" ")
            continue
        # Row-major flattening keeps the selected-type order within each row. A visit with
        # several rows contributes all of them instead of raising on an ambiguous truth value.
        cells = by_visit.get_group(guid)[type_cols].to_numpy().ravel()
        texts.append(' '.join([cell for cell in cells if not pd.isna(cell)]))
    return texts
def prep_txt(merged_df, selected_types, pos_ids, neg_ids, txt_name="no_neg_concept"):
    '''
    Prepare the list of documents as input and a list of 0/1 as the label for training supervised models.
    INPUT:
    merged_df: data frame with one row per document; uses the 'name' column (document type), the
    'visit_guid' column, the label column (module-level label_field) and the text column txt_name
    selected_types: list of strings, the document types to keep
    pos_ids,neg_ids: list of ids
    txt_name: The column name of the column in the dataframe that contains the target text
    OUTPUT:
    total_txt: list of strings, one per id in pos_ids followed by one per id in neg_ids. Each element is
    the space-joined text of that visit's documents of the selected types, or " " when the visit has
    no such document with the matching label.
    total_label: list of 0 and 1, aligned with total_txt (1 for pos_ids, 0 for neg_ids).
    '''
    # Bracket access: attribute access (merged_df.name) silently picks up a plain attribute
    # named "name" if one was ever set on the frame.
    tmp_df = merged_df[merged_df['name'].isin(set(selected_types))]
    # tmp_df only holds the selected document types
    pos_df = tmp_df[tmp_df[label_field] == "Yes"]
    neg_df = tmp_df[tmp_df[label_field] == "No"]

    pos_txt = _join_txt_per_visit(pos_df, pos_ids, txt_name)
    neg_txt = _join_txt_per_visit(neg_df, neg_ids, txt_name)

    total_txt = pos_txt + neg_txt
    total_label = [1] * len(pos_txt) + [0] * len(neg_txt)
    return total_txt, total_label


def _join_txt_per_visit(label_df, visit_ids, txt_name):
    '''Return one space-joined document string per id in visit_ids, or " " for ids absent from label_df.'''
    known_ids = set(label_df['visit_guid'].unique())
    by_visit = label_df.groupby('visit_guid')
    texts = []
    for guid in visit_ids:
        if guid in known_ids:
            docs = by_visit.get_group(guid)[txt_name].tolist()
            # NOTE(review): str() turns a missing text cell into the literal token "nan";
            # kept as-is so existing results do not change - confirm this is intended.
            texts.append(' '.join([str(doc) for doc in docs]))
        else:
            # Some visits have no document of the selected types; keep the output aligned.
            texts.append(" ")
    return texts
def get_word_imp(total_txt, total_label, tmp_vec, tmp_model, fpr_thres=0.015):
    '''
    Get feature importance from the 5 models produced in 5 fold cross validation, and report the
    performance of each model as a scikit-learn classification_report.
    INPUT:
    total_txt: input of models, list of string.
    total_label: label of models, list of binary value.
    tmp_vec: a vectorizer from scikit-learn (fit_transform, transform, get_feature_names_out are used)
    tmp_model: a classification model; it must provide predict_proba and feature_importances_
    fpr_thres: threshold of false positive rate used to pick the classification threshold in each
    fold. Defaults to 0.015, the value that was previously hard-coded.
    OUTPUT:
    word_imp_dict: A dictionary, the key is the feature, which may be a word or CUI, depending on the content in total_txt. Value is the
    sum of the importance of the 5 models generated in 5 fold cross validation.
    cr_list: a list where each element is a classification report in dictionary form, with an extra
    'auc' entry holding the ROC AUC of that fold
    '''
    n_splits = 5
    # Key: feature name. Value: importance summed over the folds.
    # int() yields the same 0 default as a lambda, but keeps the returned dict picklable.
    word_imp_dict = defaultdict(int)
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=123)
    cr_list = []
    for train_idx, test_idx in tqdm(kf.split(total_txt, total_label), total=n_splits):
        X_train = [total_txt[i] for i in train_idx]
        X_test = [total_txt[i] for i in test_idx]
        y_train = [total_label[i] for i in train_idx]
        y_test = [total_label[i] for i in test_idx]

        # The vectorizer is fitted on the training fold only.
        bow_train = tmp_vec.fit_transform(X_train)
        bow_test = tmp_vec.transform(X_test)
        tmp_model.fit(bow_train, y_train)
        pos_proba = tmp_model.predict_proba(bow_test)[:, 1]

        fpr, tpr, thresholds = roc_curve(y_test, pos_proba, pos_label=1)
        # Last ROC point whose false positive rate is still below fpr_thres.
        tpr_ind = np.max(np.where(fpr < fpr_thres)[0])
        tmp_dict = classification_report(
            y_test, pos_proba >= thresholds[tpr_ind], output_dict=True
        )
        tmp_dict['auc'] = roc_auc_score(y_test, pos_proba)
        cr_list.append(tmp_dict)

        vocab = tmp_vec.get_feature_names_out()
        score = tmp_model.feature_importances_
        for word, importance in zip(vocab, score):
            word_imp_dict[word] += importance
        # Sensitivity reached at the selected operating point of this fold.
        print(tpr[tpr_ind])
    return word_imp_dict, cr_list
def run_models(total_txt, total_label, candi_vec, candi_models, fpr_thres=0.005):
    '''
    Grid search over vectorizers and models. Sensitivity is observed after adjusting the
    classification threshold so that the false positive rate stays below fpr_thres. Performance is
    measured by 5 fold cross validation.
    INPUT:
    total_txt: list of text/cuis
    total_label: list of binary values
    candi_vec: list of scikit-learn's vectorizer
    candi_models: list of supervised models providing fit and predict_proba
    fpr_thres: threshold of false positive rate. Specificity = 1 - FPR. Must be greater than 0.
    OUTPUT:
    mean_sens_dict: a dict of dict of list, {vectorizer index : {model index : [list of sensitivity]}}.
    Stores one sensitivity per fold for every combination of vectorizer and model, keyed by their
    positions in candi_vec and candi_models.
    RAISES:
    ValueError: if fpr_thres is not greater than 0 (no ROC point can have a false positive rate below it).
    '''
    from functools import partial

    if fpr_thres <= 0:
        raise ValueError(
            "fpr_thres must be greater than 0, got {!r}".format(fpr_thres)
        )

    n_splits = 5
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=221)
    # {vectorizer index : {model index : [list of sensitivity]}}
    # partial/list factories behave like the nested lambdas but keep the result picklable.
    mean_sens_dict = defaultdict(partial(defaultdict, list))
    for train_idx, test_idx in tqdm(kf.split(total_txt, total_label), total=n_splits):
        X_train = [total_txt[i] for i in train_idx]
        X_test = [total_txt[i] for i in test_idx]
        y_train = [total_label[i] for i in train_idx]
        y_test = [total_label[i] for i in test_idx]
        for vec_idx, tmp_vec in enumerate(candi_vec):
            # Each vectorizer is fitted once per fold and shared by all candidate models.
            bow_train = tmp_vec.fit_transform(X_train)
            bow_test = tmp_vec.transform(X_test)
            for model_idx, tmp_model in enumerate(candi_models):
                tmp_model.fit(bow_train, y_train)
                y_pred = tmp_model.predict_proba(bow_test)
                fpr, tpr, _ = roc_curve(y_test, y_pred[:, 1])
                # Last ROC point whose false positive rate is still below fpr_thres.
                tpr_ind = np.max(np.where(fpr < fpr_thres)[0])
                mean_sens_dict[vec_idx][model_idx].append(tpr[tpr_ind])
    return mean_sens_dict
def merge_cr_dict(dict_list):
    '''
    Merge classification report (cr) dictionaries by averaging every metric over the reports in a list.
    INPUT:
    dict_list: non-empty list of cr dicts sharing the same keys. A value may be a nested dict of
    metrics (e.g. per-class entries) or a top-level number (e.g. 'accuracy', 'auc').
    OUTPUT:
    result_dict: a single cr dict with the keys of the first report. Nested metrics and top-level
    numbers hold the mean over all reports; any other value is copied from the first report.
    RAISES:
    IndexError: if dict_list is empty.
    KeyError: if a later report lacks a key present in the first one.
    '''
    first_report = dict_list[0]
    numeric_types = (int, float, np.number)
    result_dict = {}
    for k, v in first_report.items():
        if isinstance(v, dict):
            result_dict[k] = {
                sub_k: np.mean([tmp_dict[k][sub_k] for tmp_dict in dict_list])
                for sub_k in v
            }
        elif isinstance(v, numeric_types):
            # Top-level scalars such as 'accuracy' are averaged too; previously only 'auc'
            # was, and every other scalar silently kept the first report's value.
            result_dict[k] = np.mean([tmp_dict[k] for tmp_dict in dict_list])
        else:
            # Nothing sensible to average; copy so the input report is not shared.
            result_dict[k] = deepcopy(v)
    return result_dict