In [1]:
import pandas as pd
import numpy as np
import random
from datetime import datetime

In [2]:
from pydbgen import pydbgen

In [3]:
faker = pydbgen.pydb()

In [4]:
# def genName(size=1):
#     return (np.array([faker.fake.name() for i in range(size)]))

def gender(size = 1, levels = ['M','F','O'], prob = [0.65, 0.34,0.01]):
    arr = np.random.choice(levels, size = size, p = prob)              # decide levels based on input probabilities
    return (arr)

def Age(size = 1, m = 44, std = 26):
    arr = np.random.normal(m, std, size).astype(int)                   # generate random integers of mean m and std spec
    arr = np.clip(arr, 8, 94)                                         # clip respecting boundaries
    return (arr)                                                       # generate the arrays of past dates

def pref_time(size = 1, m = 19, std = 2):
    arr = np.random.normal(m, std, size).astype(int)
    arr = np.clip(arr, 7, 23)
    return (arr)

def pref_day(size = 1, levels = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday'], prob = [0.04,0.14,0.10,0.08,0.26,0.21,0.17]):
    arr = np.random.choice(levels, size = size, p = prob)              # decide levels based on input probabilities
    return (arr)

In [5]:
def consulting_date(size):
    date_lst=pd.date_range(start="2019-01-01",end="2020-12-31").to_pydatetime().tolist()
    arr=np.random.choice(date_lst,size=size)
    return (arr)

In [6]:
#     """ Generate dummy Health Service ID numbers similar to NHS 10 digit format
#     See: https://www.nhs.uk/using-the-nhs/about-the-nhs/what-is-an-nhs-number/
#     """
import string
def patient_id(size):
    health_service_id_numbers = []
    for _ in range(size): 
        health_service_id = ''.join(random.choices(string.ascii_uppercase + string.digits, k=5)) 
        health_service_id_numbers.append(health_service_id)
    return health_service_id_numbers

In [7]:
def generate_arrival_times(size) -> list:
    #Generate and return arrival times.
        
    arrival_times = []

    # first 7 days in April 2019
    days_dates = [1, 2, 3, 4, 5, 6, 7]
    # have more people come in at the weekend - higher weights 
    day_weights = [0.5, 0.6, 0.7, 0.8, 0.9, 1, 0.5]
    days = random.choices(days_dates, day_weights, k=size)
    # this is just so each day has a different peak time
    days_time_modes = {day: random.random() for day in days_dates}

    for day in days:
        start = datetime(2019, 1, day, 00, 00, 00)
        end = datetime(2020, 12, day, 23, 59, 59)

        random_num = random.triangular(0, 1, days_time_modes[day])
        random_datetime = start + (end - start) * random_num
        arrival_times.append(random_datetime.strftime('%Y-%m-%d %H:%M:%S'))

    return arrival_times

In [8]:
def genDataset(size):
    data = {'patient_id':patient_id(size),
            'patient_age': Age(size),
            'patient_gender':gender(size),
            'pref_day':pref_day(size),
            'pref_time':pref_time(size), 
            'consulting_timestamp':generate_arrival_times(size)           
           }
    return (pd.DataFrame(data))

In [9]:
df = genDataset(3300000)

In [10]:
df.head()

Unnamed: 0,patient_id,patient_age,patient_gender,pref_day,pref_time,consulting_timestamp
0,Q65C7,67,M,Friday,18,2020-02-29 05:11:01
1,NTW6V,57,M,Friday,20,2020-09-30 20:15:32
2,MPK61,48,M,Thursday,22,2020-08-06 20:39:43
3,BB07J,29,M,Tuesday,16,2020-09-15 04:17:20
4,DZ5QM,30,F,Tuesday,19,2019-10-25 02:32:52


In [11]:
df['consulting_date'] = df['consulting_timestamp'].apply(lambda x: x.split(' ')[0])
df['consulting_time'] = df['consulting_timestamp'].apply(lambda x: x.split(' ')[1])
df.head()

Unnamed: 0,patient_id,patient_age,patient_gender,pref_day,pref_time,consulting_timestamp,consulting_date,consulting_time
0,Q65C7,67,M,Friday,18,2020-02-29 05:11:01,2020-02-29,05:11:01
1,NTW6V,57,M,Friday,20,2020-09-30 20:15:32,2020-09-30,20:15:32
2,MPK61,48,M,Thursday,22,2020-08-06 20:39:43,2020-08-06,20:39:43
3,BB07J,29,M,Tuesday,16,2020-09-15 04:17:20,2020-09-15,04:17:20
4,DZ5QM,30,F,Tuesday,19,2019-10-25 02:32:52,2019-10-25,02:32:52


In [12]:
df['pref_time'] = df['pref_time'].astype(str)
df['pref_time'] = df['pref_time'].apply(lambda x: x + ":00:00")
df.head()

Unnamed: 0,patient_id,patient_age,patient_gender,pref_day,pref_time,consulting_timestamp,consulting_date,consulting_time
0,Q65C7,67,M,Friday,18:00:00,2020-02-29 05:11:01,2020-02-29,05:11:01
1,NTW6V,57,M,Friday,20:00:00,2020-09-30 20:15:32,2020-09-30,20:15:32
2,MPK61,48,M,Thursday,22:00:00,2020-08-06 20:39:43,2020-08-06,20:39:43
3,BB07J,29,M,Tuesday,16:00:00,2020-09-15 04:17:20,2020-09-15,04:17:20
4,DZ5QM,30,F,Tuesday,19:00:00,2019-10-25 02:32:52,2019-10-25,02:32:52


In [13]:
df['pref_time'].unique()

array(['18:00:00', '20:00:00', '22:00:00', '16:00:00', '19:00:00',
       '21:00:00', '17:00:00', '14:00:00', '15:00:00', '23:00:00',
       '13:00:00', '10:00:00', '12:00:00', '11:00:00', '9:00:00'],
      dtype=object)

In [14]:
df['pref_time'].value_counts()

19:00:00    631900
18:00:00    631790
20:00:00    495121
17:00:00    494657
21:00:00    303036
16:00:00    302666
22:00:00    145547
15:00:00    145334
23:00:00     74899
14:00:00     54516
13:00:00     16039
12:00:00      3732
11:00:00       655
10:00:00        99
9:00:00          9
Name: pref_time, dtype: int64

In [15]:
df['consulting_date'] = pd.to_datetime(df['consulting_date'])
df['consulting_day'] = df['consulting_date'].dt.day_name()
df.head()

Unnamed: 0,patient_id,patient_age,patient_gender,pref_day,pref_time,consulting_timestamp,consulting_date,consulting_time,consulting_day
0,Q65C7,67,M,Friday,18:00:00,2020-02-29 05:11:01,2020-02-29,05:11:01,Saturday
1,NTW6V,57,M,Friday,20:00:00,2020-09-30 20:15:32,2020-09-30,20:15:32,Wednesday
2,MPK61,48,M,Thursday,22:00:00,2020-08-06 20:39:43,2020-08-06,20:39:43,Thursday
3,BB07J,29,M,Tuesday,16:00:00,2020-09-15 04:17:20,2020-09-15,04:17:20,Tuesday
4,DZ5QM,30,F,Tuesday,19:00:00,2019-10-25 02:32:52,2019-10-25,02:32:52,Friday


In [16]:
df = df.sort_values(by="consulting_timestamp")
df.head()

Unnamed: 0,patient_id,patient_age,patient_gender,pref_day,pref_time,consulting_timestamp,consulting_date,consulting_time,consulting_day
1097345,OIGYV,75,M,Tuesday,19:00:00,2019-01-01 03:58:31,2019-01-01,03:58:31,Tuesday
1694679,O35C2,27,M,Sunday,18:00:00,2019-01-01 15:38:30,2019-01-01,15:38:30,Tuesday
1312304,X62IX,38,M,Tuesday,19:00:00,2019-01-02 02:27:11,2019-01-02,02:27:11,Wednesday
477426,7RCSX,94,M,Sunday,16:00:00,2019-01-02 05:13:50,2019-01-02,05:13:50,Wednesday
2475586,3KIPU,70,F,Sunday,22:00:00,2019-01-02 05:18:12,2019-01-02,05:18:12,Wednesday


In [17]:
df.columns

Index(['patient_id', 'patient_age', 'patient_gender', 'pref_day', 'pref_time',
       'consulting_timestamp', 'consulting_date', 'consulting_time',
       'consulting_day'],
      dtype='object')

In [18]:
df = df[['patient_id', 'patient_age', 'patient_gender', 'pref_day', 'pref_time',
        'consulting_date', 'consulting_time', 'consulting_day', 'consulting_timestamp']]
df = df.reset_index(drop = True)
df.head()

Unnamed: 0,patient_id,patient_age,patient_gender,pref_day,pref_time,consulting_date,consulting_time,consulting_day,consulting_timestamp
0,OIGYV,75,M,Tuesday,19:00:00,2019-01-01,03:58:31,Tuesday,2019-01-01 03:58:31
1,O35C2,27,M,Sunday,18:00:00,2019-01-01,15:38:30,Tuesday,2019-01-01 15:38:30
2,X62IX,38,M,Tuesday,19:00:00,2019-01-02,02:27:11,Wednesday,2019-01-02 02:27:11
3,7RCSX,94,M,Sunday,16:00:00,2019-01-02,05:13:50,Wednesday,2019-01-02 05:13:50
4,3KIPU,70,F,Sunday,22:00:00,2019-01-02,05:18:12,Wednesday,2019-01-02 05:18:12


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3300000 entries, 0 to 3299999
Data columns (total 9 columns):
 #   Column                Dtype         
---  ------                -----         
 0   patient_id            object        
 1   patient_age           int32         
 2   patient_gender        object        
 3   pref_day              object        
 4   pref_time             object        
 5   consulting_date       datetime64[ns]
 6   consulting_time       object        
 7   consulting_day        object        
 8   consulting_timestamp  object        
dtypes: datetime64[ns](1), int32(1), object(7)
memory usage: 214.0+ MB


In [20]:
df['patient_id'].nunique()

3211537

In [21]:
# new_data=doctor_data.drop_duplicates(subset=["NPI"," Ind_PAC_ID"],keep='first',inplace=False)
df = df.drop_duplicates(subset=['patient_id'], keep='first', inplace=False)

In [22]:
df.shape

(3211537, 9)

In [23]:
df.to_csv(r"D:\Appto_Healthcare\Main_Datasets\Patient_Data_Generated.csv",index=False)