# Getting started

In [94]:
import pandas as pd
import numpy as np
import random

import akiFlagger

### Let's start off by creating some toy data

In [95]:
# Seed NumPy's global RNG so the toy data below is identical on every run
np.random.seed(0)

# Admission dates are drawn from Jan 1, 2020 to July 1, 2020 (a 6-month window).
# The bounds are stored as whole seconds since the Unix epoch (ns value // 10**9).
date_range = tuple(pd.to_datetime(day).value // 10**9 for day in ('2020-01-01', '2020-07-01'))

# 100 random MRNs and 100 random admission timestamps (one per patient)
mrns = np.random.randint(10000, 20000, 100)
admns = pd.to_datetime(np.random.randint(*date_range, 100), unit='s')

# Each patient gets 3 to 9 random encounter IDs (randint's upper bound is exclusive).
# The encounter count is drawn first, then that many IDs.
encs = [np.random.randint(10000, 99999, size=np.random.randint(3, 10)) for _ in range(len(mrns))]

# d1: one row per patient (mrn, admn); d2: one row per patient, one enc_<k> column
# per encounter slot (patients with fewer encounters are padded with NaN)
d1 = pd.DataFrame([mrns, admns]).T.rename(columns={0: 'mrn', 1: 'admn'})
d2 = pd.DataFrame(encs).add_prefix('enc_')

# Put the two frames side by side, then reshape wide -> long so each row is one encounter
df = (
    pd.concat([d1, d2], axis=1)
    .melt(id_vars=['mrn', 'admn'], value_name='enc')
    .drop(columns='variable')
)

# Keep only real (non-null) encounter IDs, first occurrence only, and renumber the rows
df = df[df.enc.notnull() & ~df.enc.duplicated()].reset_index(drop=True)
print('Number of patients: {}\nNumber of admissions: {}\nNumber of encounters: {}'.format(
    *(df[col].nunique() for col in ('mrn', 'admn', 'enc'))
))
df.head()

Number of patients: 100
Number of admissions: 100
Number of encounters: 606


Unnamed: 0,mrn,admn,enc
0,12732,2020-02-27 11:42:42,25741.0
1,19845,2020-05-12 06:02:54,81382.0
2,13264,2020-01-10 11:16:57,89464.0
3,14859,2020-06-07 12:27:38,75180.0
4,19225,2020-03-26 21:16:49,94917.0


In [102]:
# Candidate offsets from admission: every 6 hours from -5 days to +5 days.
# Defined here so this cell runs on a fresh kernel; the notebook otherwise only
# defines `time_deltas` in a later cell, which makes Restart & Run All fail here.
time_deltas = pd.timedelta_range(start='-5 days', end='5 days', freq='6h')

# `random.choice` uses the stdlib RNG, which np.random.seed() does not touch,
# so seed it as well to keep the generated times reproducible.
random.seed(0)

# Give every encounter row a timestamp: its admission time plus one randomly
# chosen offset from `time_deltas`.
df['time'] = df.admn + np.array([random.choice(time_deltas) for _ in range(df.shape[0])])
df

Unnamed: 0,mrn,admn,enc,time
0,12732,2020-02-27 11:42:42,25741.0,2020-02-29 05:42:42
1,19845,2020-05-12 06:02:54,81382.0,2020-05-15 12:02:54
2,13264,2020-01-10 11:16:57,89464.0,2020-01-07 11:16:57
3,14859,2020-06-07 12:27:38,75180.0,2020-06-06 00:27:38
4,19225,2020-03-26 21:16:49,94917.0,2020-03-29 03:16:49
...,...,...,...,...
601,18448,2020-03-09 17:00:54,55387.0,2020-03-05 17:00:54
602,11634,2020-02-20 11:04:24,57456.0,2020-02-23 11:04:24
603,16521,2020-03-10 23:00:27,83735.0,2020-03-14 17:00:27
604,17742,2020-06-25 21:38:55,39764.0,2020-06-30 15:38:55


In [93]:
# Offsets used to jitter timestamps around the admission time: every 6 hours
# from 5 days before to 5 days after (both ends included, 41 values).
# Lowercase 'h' is used because the uppercase 'H' alias is deprecated in
# pandas >= 2.2; both spell the same hourly frequency.
time_deltas = pd.timedelta_range(start='-5 days', end='5 days', freq='6h')
time_deltas

TimedeltaIndex(['-5 days +00:00:00', '-5 days +06:00:00', '-5 days +12:00:00',
                '-5 days +18:00:00', '-4 days +00:00:00', '-4 days +06:00:00',
                '-4 days +12:00:00', '-4 days +18:00:00', '-3 days +00:00:00',
                '-3 days +06:00:00', '-3 days +12:00:00', '-3 days +18:00:00',
                '-2 days +00:00:00', '-2 days +06:00:00', '-2 days +12:00:00',
                '-2 days +18:00:00', '-1 days +00:00:00', '-1 days +06:00:00',
                '-1 days +12:00:00', '-1 days +18:00:00',   '0 days 00:00:00',
                  '0 days 06:00:00',   '0 days 12:00:00',   '0 days 18:00:00',
                  '1 days 00:00:00',   '1 days 06:00:00',   '1 days 12:00:00',
                  '1 days 18:00:00',   '2 days 00:00:00',   '2 days 06:00:00',
                  '2 days 12:00:00',   '2 days 18:00:00',   '3 days 00:00:00',
                  '3 days 06:00:00',   '3 days 12:00:00',   '3 days 18:00:00',
                  '4 days 00:00:00',   '4 days 06:00