In [None]:
import os
from string import punctuation

import gensim
import numpy as np
import pandas as pd
from nltk import word_tokenize
from sklearn.model_selection import train_test_split


# Admissions Data

In [None]:
# ICD-9 codes for the strict COPD coding (kept as strings)
strict_icd9 = "49120 49121 49122 49320 49321 49322 496".split()

# ICD-9 codes for the regular COPD coding (kept as strings)
reg_icd9 = "4911 4920 4928".split()

# Diagnosis table: count distinct admissions overall and those carrying any
# of the strict or regular COPD codes.
df = pd.read_csv("~/Documents/data/mimic/DIAGNOSES_ICD.csv")
copd_codes = strict_icd9 + reg_icd9
has_copd_code = df["ICD9_CODE"].isin(copd_codes)
copd_hadmids = df.loc[has_copd_code, "HADM_ID"].unique()
n = df["HADM_ID"].nunique()
n_copd = len(copd_hadmids)
print(f"Admission: {n}")
print(f"COPD Admissions: {n_copd}")

# Patient table: distinct patients overall and distinct female patients.
patients = pd.read_csv(
    "~/Documents/data/mimic/PATIENTS.csv",
    parse_dates=['DOB', 'DOD', 'DOD_HOSP'],
)
female_subject_ids = patients.loc[patients['GENDER'] == 'F', 'SUBJECT_ID']
print(f"Num patients: {patients['SUBJECT_ID'].nunique()}")
print(f"Num female: {female_subject_ids.nunique()}")

# columns kept from the admissions table
admission_cols = [
    'HADM_ID',
    'ADMISSION_TYPE',
    'ADMITTIME',
    'DISCHTIME',
    'DEATHTIME',
    'EDREGTIME',
    'EDOUTTIME',
    'HOSPITAL_EXPIRE_FLAG',
    'HAS_CHARTEVENTS_DATA',
]

tmp = pd.read_csv(
    "~/Documents/data/mimic/ADMISSIONS.csv",
    parse_dates=['ADMITTIME', 'DISCHTIME', 'DEATHTIME', 'EDREGTIME', 'EDOUTTIME'],
)[admission_cols]

# concat primary dx onto admissions (this merge is also what brings SUBJECT_ID in,
# since it is not part of admission_cols)
# NOTE(review): drop_duplicates keeps the first diagnosis row per admission in file
# order; presumably that row is the primary diagnosis -- confirm against the table.
admits = tmp.merge(df, on=['HADM_ID']).drop_duplicates(subset=["HADM_ID"])

# get rid of spurious admissions and ignore newborns
admits = admits[(admits['DISCHTIME'] > admits['ADMITTIME']) & (admits.ADMISSION_TYPE != "NEWBORN")]

# add age information (whole years at admission)
# NOTE(review): computed row-wise on Python dates rather than vectorized; presumably
# this avoids nanosecond-timedelta overflow on very large ADMITTIME - DOB gaps --
# confirm before vectorizing.
admits = admits.merge(patients[['SUBJECT_ID', 'DOB']], on='SUBJECT_ID', how='left')
admits['age'] = admits.apply(lambda x: (x['ADMITTIME'].date() - x['DOB'].date()).days // 365.242, axis=1)

# tag copd admissions
admits['copd'] = admits.HADM_ID.isin(copd_hadmids)

# get the type and time of the next admission of the same subject
admits = admits.sort_values(by=['SUBJECT_ID', 'ADMITTIME'])
admits['next_admit_time'] = admits.groupby('SUBJECT_ID').ADMITTIME.shift(-1)
admits['next_admit_type'] = admits.groupby('SUBJECT_ID').ADMISSION_TYPE.shift(-1)

# if the next admission is elective, nullify and back fill so that each row points
# at the subject's next non-elective admission (if any)
next_is_elective = admits['next_admit_type'] == "ELECTIVE"
admits.loc[next_is_elective, 'next_admit_time'] = pd.NaT
admits.loc[next_is_elective, 'next_admit_type'] = np.nan
next_admit_cols = ['next_admit_time', 'next_admit_type']
# GroupBy.bfill() replaces the deprecated GroupBy.fillna(method='bfill')
backfilled = admits.groupby('SUBJECT_ID')[next_admit_cols].bfill()
admits['next_admit_time'] = backfilled['next_admit_time']
admits['next_admit_type'] = backfilled['next_admit_type']

# compute readmission stats; both columns live on the same row, so a plain
# column subtraction is enough (no per-subject apply needed)
admits['readmit_time'] = admits['next_admit_time'] - admits['DISCHTIME']
seconds_to_readmit = admits['readmit_time'].dt.total_seconds()
# rows without a following admission have NaN here and therefore compare False -> 0
admits['7d_readmit'] = (seconds_to_readmit < 7 * 24 * 3600).astype(int)
admits['30d_readmit'] = (seconds_to_readmit < 30 * 24 * 3600).astype(int)

In [None]:
def print_summary(df):
    """Print readmission and mortality rates for non-COPD vs. COPD admissions.

    Parameters
    ----------
    df : pandas.DataFrame
        Admission-level frame with the columns ``HADM_ID``, ``SUBJECT_ID``,
        ``copd`` (boolean flag), ``7d_readmit`` and ``30d_readmit`` (0/1 flags)
        and ``DEATHTIME``.

    Returns
    -------
    None
        The rates are written to stdout. A cohort with no rows is reported as
        ``nan%`` instead of raising.
    """
    # Split on the boolean flag explicitly; indexing a grouped result with the
    # integers 0/1 depends on how pandas treats ints on a boolean index and
    # raises KeyError when a cohort has no rows in one of the categories.
    is_copd = df['copd'].astype(bool)
    non_copd_rows = df[~is_copd]
    copd_rows = df[is_copd]

    def _share(numerator, denominator):
        # an empty cohort has no defined rate
        return numerator / denominator if denominator else float('nan')

    def _readmit_rate(rows, flag_col):
        # admissions flagged as readmitted / all admissions in the cohort
        readmitted = rows.loc[rows[flag_col] == 1, 'HADM_ID'].count()
        return _share(readmitted, rows['HADM_ID'].count())

    print("Non-COPD 7d readmit rate: {:.1%}".format(_readmit_rate(non_copd_rows, '7d_readmit')))
    print("COPD 7d readmit rate:     {:.1%}".format(_readmit_rate(copd_rows, '7d_readmit')))
    print('')

    print("Non-COPD 30d readmit rate: {:.1%}".format(_readmit_rate(non_copd_rows, '30d_readmit')))
    print("COPD 30d readmit rate:     {:.1%}".format(_readmit_rate(copd_rows, '30d_readmit')))
    print('')

    # Mortality: one row per subject with a recorded DEATHTIME, divided by the
    # number of admissions in the cohort (same definition as before).
    deceased = df[df['DEATHTIME'].notnull()].drop_duplicates(subset=['SUBJECT_ID'])
    deceased_is_copd = deceased['copd'].astype(bool)
    print("Non-COPD mortality rate: {:.1%}".format(_share((~deceased_is_copd).sum(), len(non_copd_rows))))
    print("COPD mortality rate:     {:.1%}".format(_share(deceased_is_copd.sum(), len(copd_rows))))

# Report the same summary for each age band and for the whole cohort.
age_groups = (
    ("<65 Admissions", lambda frame: frame[frame['age'] < 65]),
    ("\n\n65+ Admissions", lambda frame: frame[frame['age'] >= 65]),
    ("\n\nAll Admissions", lambda frame: frame),
)
for heading, select_rows in age_groups:
    print(heading)
    print("-"*25)
    print_summary(select_rows(admits))

# Discharge Notes Data

In [None]:
# subjects that died in the hosp
deceased_subj_ids = admits[admits.DEATHTIME.notnull()].SUBJECT_ID.unique()
# subjects w/ at least one copd related admission
copd_subj_ids = admits[admits.copd].SUBJECT_ID.unique()
# all admissions for subjects w/ at least one copd related admission
hadm_ids_w_copd = admits[admits.SUBJECT_ID.isin(copd_subj_ids)].HADM_ID.unique()


print('Loading medical notes...')

# read the notes file in chunks of 100,000 rows instead of all at once
chunk_reader = pd.read_csv(
    "~/Documents/data/mimic/NOTEEVENTS.csv",
    chunksize=100000,
    usecols=['SUBJECT_ID', 'HADM_ID', 'CHARTDATE', 'CATEGORY', 'DESCRIPTION', 'TEXT'],
)
chunk_li = []
for iteration, chunk in enumerate(chunk_reader):
    # progress message on every fifth chunk only
    if iteration % 5 == 0:
        print(f"Iteration {iteration}")
    # Keep just the discharge summaries -- from EVERY chunk. This append used to sit
    # inside the progress `if`, so four out of five chunks were silently discarded.
    # (Alternative filter: chunk['HADM_ID'].isin(hadm_ids_w_copd) keeps only
    # admissions of subjects with at least one copd admit.)
    chunk_li.append(chunk[chunk['CATEGORY'] == 'Discharge summary'])

print("Done.")

notes = pd.concat(chunk_li, ignore_index=True)
# keep only one discharge summary per admission: the last one after sorting by chart
# date. drop_duplicates keeps HADM_ID as a regular column on every pandas version,
# whereas groupby(...).nth(-1) changed its return shape in pandas 2.0. Rows without
# a HADM_ID cannot match an admission and are removed by the inner merge below.
notes = (
    notes.sort_values(by=['SUBJECT_ID', 'HADM_ID', 'CHARTDATE'])
    .drop_duplicates(subset=['HADM_ID'], keep='last')
)
cols = ['HADM_ID', 'SUBJECT_ID','age', 'copd', 'HOSPITAL_EXPIRE_FLAG','ADMISSION_TYPE', 'ADMITTIME', 'DISCHTIME', 'DEATHTIME','next_admit_time', 'next_admit_type','30d_readmit',]
# attach the admission-level features and labels to each note
notes = notes.merge(admits[cols], on=['SUBJECT_ID','HADM_ID'], how='inner')
notes.head()

# Pre-Process Notes

In [None]:
def preprocess_text(df):
    """Return a copy of ``df`` whose ``TEXT`` column has been cleaned.

    Missing texts are replaced by a single space, and every newline (``'\\n'``)
    and carriage return (``'\\r'``) is replaced by a space.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``TEXT`` column of strings (missing values allowed).

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is left untouched, so passing a filtered slice
        no longer triggers ``SettingWithCopyWarning``.
    """
    cleaned = df.copy()
    cleaned['TEXT'] = (
        cleaned['TEXT']
        .fillna(' ')
        # regex=False: literal replacement, independent of the pandas default
        .str.replace('\n', ' ', regex=False)
        .str.replace('\r', ' ', regex=False)
    )
    return cleaned

# Model inputs/labels come only from notes with HOSPITAL_EXPIRE_FLAG == 0.
non_expired_notes = notes[notes.HOSPITAL_EXPIRE_FLAG == 0]
X = preprocess_text(non_expired_notes).TEXT
y = non_expired_notes['30d_readmit']

# Generate Tokens

In [None]:
# Custom stop-word list (lowercase tokens), written as whitespace-separated text.
my_stop_words = """
    the and to of was with a on in for name is patient s he at
    as or one she his her am were you pt pm by be had your this
    date from there an that p are have has h but o namepattern which
    every also should if it been who during x
""".split()

# Translation table mapping every ASCII punctuation character and digit to a space;
# built once here instead of on every call.
_PUNCT_AND_DIGITS_TO_SPACE = str.maketrans(dict.fromkeys(punctuation + '0123456789', ' '))


def basic_tokenizer(text, stop_words=None):
    """Lowercase ``text``, blank out punctuation and digits, and tokenize it.

    Parameters
    ----------
    text : str
        Raw text to tokenize.
    stop_words : iterable of str, optional
        Tokens to drop from the result. Defaults to dropping nothing.

    Returns
    -------
    list of str
        Tokens produced by ``word_tokenize``, in order, without the stop words.
    """
    # a set gives constant-time membership tests, which matters for long notes
    excluded = frozenset(stop_words) if stop_words is not None else frozenset()
    normalized = text.lower().translate(_PUNCT_AND_DIGITS_TO_SPACE)
    return [token for token in word_tokenize(normalized) if token not in excluded]

# word2vec model

In [None]:
# Pre-trained Google News word2vec vectors in binary format. The path is built from
# the current user's home directory instead of being tied to one machine's
# /Users/<name> folder.
# NOTE(review): ~/gensim-data is presumably where gensim's downloader stored the
# file -- confirm, or point WORD2VEC_PATH at your local copy.
WORD2VEC_PATH = os.path.join(
    os.path.expanduser("~"),
    "gensim-data",
    "word2vec-google-news-300",
    "word2vec-google-news-300.gz",
)
wv = gensim.models.KeyedVectors.load_word2vec_format(WORD2VEC_PATH, binary=True)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F