In [1]:
import pandas as pd

In [2]:
# Read the cleaned discharge summaries, then discard every row that has a
# missing value in any column.
codes = pd.read_csv('cleaned_discharge_summaries.csv')
codes = codes.dropna()

def has_label(x, positive_icd9) :
    """Report whether any ';'-separated entry of ``x`` contains ``positive_icd9``.

    Parameters
    ----------
    x : str
        Semicolon-delimited string of codes.
    positive_icd9 : str
        Text searched for inside each entry.

    Returns
    -------
    bool
        True when at least one entry contains ``positive_icd9`` as a substring,
        False otherwise (including when ``x`` is empty).
    """
    # The comparison is a substring test, not equality: '285.2' also matches
    # an entry such as '285.21'.
    for entry in x.split(';'):
        if positive_icd9 in entry:
            return True
    return False

# Flag each row by whether its ICD9_CODE string mentions either code of interest.
codes['has_c1'] = codes['ICD9_CODE'].apply(has_label, args=('285.1',))
codes['has_c2'] = codes['ICD9_CODE'].apply(has_label, args=('285.2',))

# Sanity check: row counts for every (has_c1, has_c2) combination. Printed
# explicitly because a bare expression that is not the last statement of a
# cell is evaluated but never displayed.
print(codes.groupby(['has_c1', 'has_c2']).size())

# Keep only rows that carry exactly one of the two codes, so the two classes
# are mutually exclusive: 285.1-only rows are positives, 285.2-only negatives.
data_for_pos_label = codes[codes['has_c1'] & ~codes['has_c2']]
data_for_neg_label = codes[codes['has_c2'] & ~codes['has_c1']]

# Negatives first, then positives, renumbered 0..n-1.
data = pd.concat([data_for_neg_label, data_for_pos_label]).reset_index(drop=True)
# Binary target: 1 for 285.1 rows, 0 for 285.2 rows.
data['target'] = data['has_c1'].astype('int64')

In [8]:
from sklearn.model_selection import train_test_split

# Two-stage stratified split over the row labels of `data`:
#   1. hold out 20% of all rows as 'test';
#   2. hold out 15% of the remaining rows as 'dev'; what is left is 'train'.
# Both stages stratify on `target` and use fixed seeds.
idxs = dict(zip(
    ('train', 'test'),
    train_test_split(data.index, stratify=data['target'],
                     test_size=0.2, random_state=12939),
))
remaining_targets = data.loc[idxs['train'], 'target']
idxs['train'], idxs['dev'] = train_test_split(
    idxs['train'], stratify=remaining_targets,
    test_size=0.15, random_state=13448,
)

In [9]:
keys = ['train', 'dev', 'test']
import numpy as np

# For every split, collect the note texts and their targets as plain lists.
# Rows are selected by membership in the split's index set, so within a split
# they stay in the row order of `data` rather than the shuffled split order.
texts, labels = {}, {}
for split_name in keys :
    in_split = data.index.isin(idxs[split_name])
    texts[split_name] = data.loc[in_split, 'TEXT'].tolist()
    labels[split_name] = data.loc[in_split, 'target'].tolist()

In [10]:
# Flatten the per-split lists into three parallel columns, visiting the splits
# in the order given by `keys`; `exp_split` records which split a row came from.
df_texts = [doc for split_name in keys for doc in texts[split_name]]
df_labels = [target for split_name in keys for target in labels[split_name]]
df_exp_split = [split_name for split_name in keys for _ in texts[split_name]]

df = pd.DataFrame({'text' : df_texts, 'label' : df_labels, 'exp_split' : df_exp_split})
# Write the combined dataset without the positional index column.
df.to_csv('mimic_anemia_dataset.csv', index=False)

In [11]:
# Execute the external script ../preprocess_data_BC.py inside this kernel,
# pointing it at the mimic_anemia_dataset.csv file written earlier in this
# notebook and asking it to write ./vec_anemia.p.
# NOTE(review): the script is not part of this notebook; `--word_vectors_type mimic`
# presumably selects pretrained embeddings and `--min_df 5` presumably a minimum
# document-frequency cutoff for the vocabulary — confirm against the script.
%run "../preprocess_data_BC.py" --data_file mimic_anemia_dataset.csv --output_file ./vec_anemia.p \
--word_vectors_type mimic --min_df 5

Vocabulary size :  16274
Found 16270 words in model out of 16274
