In [1]:
import datetime
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Locations of the MIMIC-III v1.4 CSV extract.
# DIR_ is an alternative location kept for reference; the cells in this
# notebook build their file paths from DIR.
DIR_ = "C:/Users/javie/Documents/Workspace/MIMIC DATA/mimic-iii-clinical-database-1.4/"
# DIR can be overridden without editing the notebook by setting the MIMIC_DIR
# environment variable; when it is unset the original local path is used.
DIR = os.environ.get("MIMIC_DIR", "D:/Workspace/MIMIC DATA/mimic-iii-clinical-database-1.4/")
# File names are appended with plain string concatenation (DIR + "X.csv"),
# so make sure the directory ends with a path separator.
if not DIR.endswith(("/", "\\")):
    DIR += "/"

In [3]:
# Load the two raw tables used by this notebook: the clinical notes and the
# hospital admissions. low_memory=False is passed through to pandas unchanged.
notes_csv = DIR + "NOTEEVENTS.csv"
admissions_csv = DIR + "ADMISSIONS.csv"

notes_df = pd.read_csv(notes_csv, low_memory=False)
admissions_df = pd.read_csv(admissions_csv, low_memory=False)

# Data
## Admissions
Conversion of times to datetime type

In [4]:
# Cast the admission, discharge and death timestamps to datetime64[ns] so
# they can be sorted chronologically and subtracted from one another.
for time_col in ('ADMITTIME', 'DISCHTIME', 'DEATHTIME'):
    admissions_df[time_col] = admissions_df[time_col].astype('datetime64[ns]')

Sort by subject and admission time, then reset the data frame index.

In [5]:
# Order each subject's admissions chronologically and renumber the rows 0..n-1
# (the old index is discarded).
admissions_df = (
    admissions_df
    .sort_values(by=['SUBJECT_ID', 'ADMITTIME'])
    .reset_index(drop=True)
)

In [6]:
# For every admission, look one row ahead within the same subject to get the
# time and type of that subject's following admission. shift(-1) leaves the
# last admission of each subject with missing values (NaT / NaN).
by_subject = admissions_df.groupby('SUBJECT_ID')
next_admittime = by_subject['ADMITTIME'].shift(-1)
next_admission_type = by_subject['ADMISSION_TYPE'].shift(-1)

admissions_df['NEXT_ADMITTIME'] = next_admittime
admissions_df['NEXT_ADMISSION_TYPE'] = next_admission_type

In [7]:
# Blank out the "next admission" fields on rows whose following admission is
# ELECTIVE, so that elective follow-ups count as missing values.
#
# The mask is computed once, up front: the second assignment overwrites the
# very column the mask is derived from.
#
# `.loc[mask, column] = value` writes into admissions_df itself. The chained
# form `admissions_df[mask][column] = value` assigns to a temporary copy and
# leaves admissions_df unchanged.
next_is_elective = admissions_df['NEXT_ADMISSION_TYPE'] == 'ELECTIVE'
admissions_df.loc[next_is_elective, 'NEXT_ADMITTIME'] = pd.NaT
# np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
admissions_df.loc[next_is_elective, 'NEXT_ADMISSION_TYPE'] = np.nan

Back-fill missing values with the next valid value within each subject. This relies on the rows having been sorted by subject and admission time above.

In [8]:
# Within each subject, replace missing "next admission" values with the next
# non-missing value further down (backward fill). Rows with no later valid
# value in the same subject stay missing.
# GroupBy.bfill() is used instead of GroupBy.fillna(method='bfill'), which is
# deprecated in recent pandas releases; the result is the same.
next_cols = ['NEXT_ADMITTIME', 'NEXT_ADMISSION_TYPE']
admissions_df[next_cols] = admissions_df.groupby(['SUBJECT_ID'])[next_cols].bfill()

Obtain days to readmission: from discharge to next readmission

In [9]:
# Gap between this admission's discharge and the next admission, expressed in
# whole days (.dt.days keeps only the day component of the timedelta).
# Rows without a next admission yield NaN.
time_to_next = admissions_df['NEXT_ADMITTIME'].sub(admissions_df['DISCHTIME'])
admissions_df['DAYS_TO_READMISSION'] = time_to_next.dt.days

## Notes

In [10]:
# Keep only the notes whose CATEGORY is exactly "Discharge summary".
is_discharge_summary = notes_df['CATEGORY'].eq("Discharge summary")
discharge_notes = notes_df.loc[is_discharge_summary]

Take the last row per admission

In [11]:
# Reduce to one discharge summary per (SUBJECT_ID, HADM_ID): nth(-1) picks the
# last row of each group in the frame's current row order (no explicit sort is
# applied in this cell).
# NOTE(review): the index/columns produced by nth().reset_index() depend on
# the installed pandas version - confirm the result still exposes SUBJECT_ID,
# HADM_ID and TEXT as columns.
notes_per_admission = discharge_notes.groupby(['SUBJECT_ID', 'HADM_ID'])
discharge_notes_ordered = notes_per_admission.nth(-1).reset_index()

## Merge Notes and Admissions

In [12]:
# Attach the discharge summary text to each admission. A left join keeps every
# admission, including those without a matching note (TEXT is then missing).
merge_keys = ['SUBJECT_ID', 'HADM_ID']
admission_cols = [
    'SUBJECT_ID', 'HADM_ID', 'ADMITTIME', 'DISCHTIME',
    'DAYS_TO_READMISSION', 'NEXT_ADMITTIME', 'ADMISSION_TYPE', 'DEATHTIME',
]
note_cols = ['SUBJECT_ID', 'HADM_ID', 'TEXT']

admissions_notes = admissions_df[admission_cols].merge(
    discharge_notes_ordered[note_cols],
    how='left',
    on=merge_keys,
)

Remove NEWBORN admissions and create the target variable

In [13]:
# Drop NEWBORN admissions. The explicit .copy() makes adm_notes an independent
# frame, so adding a column below is an unambiguous write and does not trigger
# pandas' SettingWithCopyWarning.
adm_notes = admissions_notes[admissions_notes['ADMISSION_TYPE'] != 'NEWBORN'].copy()

# Binary target: 1 when the next admission starts fewer than 30 days after
# discharge, else 0. A missing DAYS_TO_READMISSION (no next admission)
# compares as False and therefore becomes 0.
adm_notes['READM_WITHIN_30'] = (adm_notes['DAYS_TO_READMISSION'] < 30).astype('int')

Exclude patients who died during the admission

In [14]:
# Keep only admissions with no recorded DEATHTIME.
adm_notes = adm_notes.loc[adm_notes['DEATHTIME'].isnull()]

# Shuffle all rows. A fixed seed makes the row order (and the exported file)
# identical on every Restart & Run All; without it each run produces a
# different ordering.
SHUFFLE_SEED = 42
adm_notes = adm_notes.sample(frac=1, random_state=SHUFFLE_SEED)

# Renumber the shuffled rows 0..n-1, discarding the pre-shuffle index.
adm_notes = adm_notes.reset_index(drop = True)

In [15]:
# Display the final frame (pandas abbreviates long frames in the rendered output).
adm_notes

Unnamed: 0,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DAYS_TO_READMISSION,NEXT_ADMITTIME,ADMISSION_TYPE,DEATHTIME,TEXT,READM_WITHIN_30
0,78100,149085,2143-09-10 18:03:00,2143-09-18 15:10:00,4.0,2143-09-22 21:42:00,EMERGENCY,NaT,Admission Date: [**2143-9-10**] ...,1
1,3030,123465,2114-07-05 12:00:00,2114-07-14 19:20:00,,NaT,ELECTIVE,NaT,Admission Date: [**2114-7-5**] Discharg...,0
2,31520,199872,2132-03-12 15:01:00,2132-06-12 16:10:00,2.0,2132-06-15 01:15:00,ELECTIVE,NaT,Admission Date: [**2132-3-12**] ...,1
3,25382,137283,2194-08-04 16:35:00,2194-08-13 13:45:00,,NaT,EMERGENCY,NaT,"Name: [**Known lastname 9275**], [**Known fir...",0
4,15453,168028,2110-12-07 23:49:00,2110-12-13 10:56:00,1.0,2110-12-14 22:44:00,EMERGENCY,NaT,Admission Date: [**2110-12-7**] Dischar...,1
...,...,...,...,...,...,...,...,...,...,...
45316,59505,139500,2120-03-14 14:22:00,2120-03-16 14:00:00,,NaT,EMERGENCY,NaT,Admission Date: [**2120-3-14**] ...,0
45317,7427,161280,2136-03-26 15:23:00,2136-04-04 11:47:00,1896.0,2141-06-13 23:13:00,EMERGENCY,NaT,Admission Date: [**2136-3-27**] Dischar...,0
45318,19691,129760,2115-09-22 05:02:00,2115-10-10 15:05:00,,NaT,EMERGENCY,NaT,Admission Date: [**2115-9-22**] Dischar...,0
45319,7129,191192,2177-02-23 17:00:00,2177-02-26 15:30:00,,NaT,EMERGENCY,NaT,Admission Date: [**2177-2-23**] ...,0


In [16]:
# Write the prepared dataset next to the source tables; index=False leaves
# the row index out of the file.
readmission_csv = DIR + 'readmission.csv'
adm_notes.to_csv(readmission_csv, index=False)