In [1]:
import pandas as pd
import numpy as np
import re
import datetime
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
DIR = "D:/Workspace/MIMIC DATA/mimic-iii-clinical-database-1.4/"

In [3]:
notes_df = pd.read_csv(DIR + "NOTEEVENTS.csv", low_memory=False)
admissions_df = pd.read_csv(DIR + "ADMISSIONS.csv", low_memory=False)

# Data
## Admissions
Conversion of times to datetime type

In [4]:
admissions_df['ADMITTIME'] = admissions_df['ADMITTIME'].astype('datetime64[ns]')
admissions_df['DISCHTIME'] = admissions_df['DISCHTIME'].astype('datetime64[ns]')
admissions_df['DEATHTIME'] = admissions_df['DEATHTIME'].astype('datetime64[ns]')

Sort by subject and admission type and reset the data frame index.

In [5]:
admissions_df = admissions_df.sort_values(['SUBJECT_ID', 'ADMITTIME'])
admissions_df = admissions_df.reset_index(drop = True)

In [6]:
admissions_df['NEXT_ADMITTIME'] = admissions_df.groupby('SUBJECT_ID').ADMITTIME.shift(-1)
admissions_df['NEXT_ADMISSION_TYPE'] = admissions_df.groupby('SUBJECT_ID').ADMISSION_TYPE.shift(-1)

In [7]:
admissions_df[admissions_df['NEXT_ADMISSION_TYPE'] == 'ELECTIVE']['NEXT_ADMITTIME'] = pd.NaT
admissions_df[admissions_df['NEXT_ADMISSION_TYPE'] == 'ELECTIVE']['NEXT_ADMISSION_TYPE'] = np.NaN

Fill NA's with the next valid value. Previously sorted.

In [8]:
admissions_df[['NEXT_ADMITTIME','NEXT_ADMISSION_TYPE']] = admissions_df.groupby(['SUBJECT_ID'])[['NEXT_ADMITTIME','NEXT_ADMISSION_TYPE']].fillna(method = 'bfill')

Obtain days to readmission: from discharge to next readmission

In [9]:
admissions_df['DAYS_TO_READMISSION'] = (admissions_df['NEXT_ADMITTIME'] - admissions_df['DISCHTIME']).dt.days

## Notes

In [10]:
discharge_notes = notes_df[notes_df['CATEGORY'] == "Discharge summary"]

Take the last row per admission

In [11]:
discharge_notes_ordered = discharge_notes.groupby(['SUBJECT_ID', 'HADM_ID']).nth(-1).reset_index()

## Merge Notes and Admissions

In [12]:
admissions_notes = pd.merge(
    admissions_df[['SUBJECT_ID','HADM_ID','ADMITTIME','DISCHTIME','DAYS_TO_READMISSION','NEXT_ADMITTIME','ADMISSION_TYPE','DEATHTIME']],
    discharge_notes_ordered[['SUBJECT_ID', 'HADM_ID', 'TEXT']], 
    on = ['SUBJECT_ID', 'HADM_ID'], how='left'
)

Remove NEWBORN admissions and create the target variable

In [13]:
adm_notes = admissions_notes[admissions_notes['ADMISSION_TYPE'] != 'NEWBORN']
adm_notes['READM_WITHIN_30'] = (adm_notes['DAYS_TO_READMISSION'] < 30).astype('int')

Exlude patients that died during the admission

In [14]:
adm_notes = adm_notes.loc[adm_notes['DEATHTIME'].isnull()]
adm_notes = adm_notes.sample(n = len(adm_notes))
adm_notes = adm_notes.reset_index(drop = True)

# Natural Language

In [15]:
import string
import nltk
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [16]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [17]:
def clean_text(texts):
    texts = texts.fillna(' ')
    texts = texts.str.replace('\n',' ')
    texts = texts.str.replace('\r',' ')

    table = str.maketrans('', '', string.punctuation + '0123456789')
    texts = [text.lower().translate(table) for text in texts]

    return texts

In [18]:
adm_notes['TEXT'] = clean_text(adm_notes['TEXT'])

In [19]:
stop_words = stopwords.words('english')
stop_words = stop_words + ['patient', 'date', 'admission', 'discharge', 'lastname', 'firstname', 'sex']

In [20]:
porter = PorterStemmer()

In [21]:
def tokenize_stem(text):
    words = word_tokenize(text)
    words = [word for word in words if word not in stop_words]
    words = [porter.stem(word) for word in words]
    return words

In [22]:
print(sum(adm_notes['READM_WITHIN_30'] == 1) )
print(len(adm_notes['READM_WITHIN_30']))

3121
45321
