### Import libraries

In [2]:
import pandas as pd
import numpy as np
import datetime
from itertools import cycle
import geopandas as gpd
from shapely.geometry import Point
import random
import matplotlib.pyplot as plt

### Load Geodata

In [3]:
# Read the sample GeoJSON into `polygon`; later cells use its bounds and
# containment test to place random points.
polygon = gpd.read_file("Gurugram_sample_Polygon.geojson")
# Preview the first rows (5 is the default preview length).
print(polygon.head(n=5))

                                            geometry
0  POLYGON ((77.05980 28.46807, 77.03740 28.44785...


In [4]:
# inspect which columns the loaded geodata provides
polygon.columns

Index(['geometry'], dtype='object')

### Set constants

In [43]:
# Services that a phlebotomist can be skilled in and that an order can request.
skills = ['vaccination', 'pathology', 'artTest']

# Price charged per service (currency unit is not specified in this notebook).
service_pricing_dict = dict(vaccination=100, pathology=200, artTest=50)

# Time needed to carry out each service (presumably minutes -- TODO confirm).
service_duration_dict = dict(vaccination=20, pathology=25, artTest=20)

# Number of rows to simulate: phlebotomists, orders and catchment areas.
phleb_size, order_size, catchment_size = 30, 100, 5

# Fixed seed for the simulation; it is also embedded in the output CSV names.
# To draw a fresh seed instead, use: random.randint(0, 9999)
seed = 1576

### Functions

In [66]:
def generate_coords(size=phleb_size+order_size+catchment_size, seed=seed):
    """Sample random points that lie inside the loaded ``polygon`` geodata.

    Candidate points are drawn uniformly from the bounding box of ``polygon``
    and kept only when at least one geometry in ``polygon`` contains them
    (rejection sampling). Sampling continues until ``size`` points are accepted.

    Parameters
    ----------
    size : int
        Number of points to return. The default is enough for all orders,
        phlebotomist homes and catchment areas combined.
    seed : int or None
        Seed for the private ``random.Random`` generator used for sampling, so
        the same seed always yields the same coordinates. ``None`` seeds the
        generator from system entropy (non-reproducible).

    Returns
    -------
    pandas.DataFrame
        ``size`` rows with columns ``x`` and ``y`` (later cells use ``x`` as
        longitude and ``y`` as latitude).

    Raises
    ------
    ValueError
        If ``polygon`` has no non-empty geometry, because no candidate point
        could ever be accepted and the sampling loop would never finish.
    """
    # A dedicated generator keeps the draw reproducible without touching the
    # state of the global `random` module.
    rng = random.Random(seed)

    if size > 0 and polygon.geometry.is_empty.all():
        raise ValueError("polygon contains no non-empty geometry to sample from")

    # bounds of geodata
    x_min, y_min, x_max, y_max = polygon.total_bounds

    points_x = []
    points_y = []
    while len(points_x) < size:
        # candidate inside the bounding box; may still fall outside the polygon
        point = Point(rng.uniform(x_min, x_max), rng.uniform(y_min, y_max))
        if polygon.contains(point).any():
            points_x.append(point.x)
            points_y.append(point.y)

    return pd.DataFrame({'x': points_x, 'y': points_y})

In [67]:
# Draw every coordinate in a single call, then carve the rows into three
# consecutive, non-overlapping groups (by position):
#   orders -> phlebotomist homes -> catchment areas (whatever rows remain)
coords_df = generate_coords()
order_coords = coords_df.iloc[:order_size]
phleb_home_coords = coords_df.iloc[order_size:order_size + phleb_size]
catchment_coords = coords_df.iloc[order_size + phleb_size:]

# show the coordinates assigned to phlebotomist homes
print(phleb_home_coords)

             x          y
100  77.085279  28.472285
101  77.086427  28.454162
102  77.077763  28.462192
103  77.089217  28.493514
104  77.097885  28.473147
105  77.056774  28.437709
106  77.121323  28.442278
107  77.088354  28.445468
108  77.127587  28.443850
109  77.065886  28.446775
110  77.120500  28.424896
111  77.109093  28.475805
112  77.066458  28.423196
113  77.045181  28.440902
114  77.066494  28.449487
115  77.095467  28.479540
116  77.136098  28.431897
117  77.048966  28.438462
118  77.098434  28.442533
119  77.099578  28.450525
120  77.075982  28.457121
121  77.136052  28.466068
122  77.129927  28.427133
123  77.123847  28.440567
124  77.084433  28.426712
125  77.132149  28.457912
126  77.125033  28.482960
127  77.057026  28.459014
128  77.091510  28.442086
129  77.077785  28.443168


In [68]:
def create_phleb_df(size=phleb_size, seed=seed):
    """Simulate the phlebotomist table, print it and save it as a CSV file.

    Parameters
    ----------
    size : int
        Number of phlebotomists to simulate. It must equal the number of rows
        in the module-level ``phleb_home_coords``, which supplies the homes.
    seed : int
        Seed for the NumPy random generator; also embedded in the CSV filename.

    Returns
    -------
    pandas.DataFrame
        One row per phlebotomist with columns ``shift_start``, ``break_start``,
        ``shift_end``, one ``expertise_<skill>`` 0/1 column for every entry of
        ``skills``, ``capacity``, ``cost``, ``service_rating``, ``home_long``,
        ``home_lat`` and ``phleb_id``.

    Raises
    ------
    ValueError
        If ``size`` differs from the length of ``phleb_home_coords`` (raised by
        pandas when the coordinate columns are assigned).
    """
    # random generator
    rng = np.random.default_rng(seed)

    df = pd.DataFrame()
    # shift starts at either 6 or 7 (the upper bound of rng.integers is exclusive)
    df['shift_start'] = rng.integers(6, 8, size)

    # break starts after 4 hours of work
    df['break_start'] = df['shift_start'] + 4

    # shift ends after 8 hours
    df['shift_end'] = df['shift_start'] + 8

    # number of skills per phlebotomist; each one has at least 1 skill
    num_skills = rng.integers(1, len(skills)+1, size)
    # draw that many distinct skills per phlebotomist, joined by single spaces
    skillsets = pd.Series(
        [" ".join(rng.choice(skills, n, replace=False, shuffle=True)) for n in num_skills],
        index=df.index,
        dtype=object,
    )
    # One-hot encode the skills. get_dummies only creates columns for values
    # that actually occur, so reindex to guarantee one column per known skill
    # (all zeros when nobody drew it). sorted() keeps get_dummies' column order.
    dummies = (
        skillsets.str.get_dummies(sep=" ")
        .add_prefix('expertise_')
        .reindex(columns=[f"expertise_{skill}" for skill in sorted(skills)], fill_value=0)
    )
    # join df with one-hot encoded columns
    df = pd.concat([df, dummies], axis=1)

    # carrying capacity
    df['capacity'] = 20

    # cost of hiring: base pay of 800, compounded by 10% for each additional skill
    df['cost'] = 800 * (1.1**(num_skills - 1))

    # service rating: log-normal draws rescaled so the best rating is exactly
    # 5.0, then rounded to one decimal place
    df['service_rating'] = rng.lognormal(mean=4.5, sigma=0.1, size=size)
    df['service_rating'] = df['service_rating'] / df['service_rating'].max() *5.0
    df['service_rating'] = df['service_rating'].round(decimals=1)

    # coordinates of the phlebotomist's home
    df['home_long'] = phleb_home_coords['x'].to_numpy()
    df['home_lat'] = phleb_home_coords['y'].to_numpy()

    # phleb id is the row position
    df['phleb_id'] = df.index

    print(df)
    df.to_csv(f"../Simulated Data/phleb_data_{seed}.csv", index=False)
    return df

In [69]:
def create_catchment_df(size=catchment_size, seed=seed):
    """Build the catchment-area table, print it and save it as a CSV file.

    The coordinates are taken as-is from the module-level ``catchment_coords``;
    nothing is drawn at random here.

    Parameters
    ----------
    size : int
        Currently unused: the number of rows is determined by
        ``catchment_coords``. Kept so the signature matches the other
        ``create_*_df`` functions.
    seed : int
        Only used to name the output CSV file.

    Returns
    -------
    pandas.DataFrame
        One row per catchment area with columns ``long`` and ``lat``.
    """
    # coordinates
    df = pd.DataFrame({
        'long': catchment_coords['x'].to_numpy(),
        'lat': catchment_coords['y'].to_numpy(),
    })

    print(df)
    df.to_csv(f"../Simulated Data/catchment_data_{seed}.csv", index=False)
    return df

In [70]:
def create_orders_df(size=order_size, seed=seed):
    """Simulate the orders table, print it and save it as a CSV file.

    Parameters
    ----------
    size : int
        Number of orders to simulate. It must equal the number of rows in the
        module-level ``order_coords``, which supplies the order locations.
    seed : int
        Seed for the NumPy random generator; also embedded in the CSV filename.

    Returns
    -------
    pandas.DataFrame
        One row per order with columns ``order_start``, one ``service_<skill>``
        0/1 column for every entry of ``skills``, ``duration``, ``price``,
        ``buffer``, ``capacity_needed``, ``long``, ``lat`` and ``order_id``.

    Raises
    ------
    ValueError
        If ``size`` differs from the length of ``order_coords`` (raised by
        pandas when the coordinate columns are assigned).
    KeyError
        If a skill has no entry in ``service_duration_dict`` or
        ``service_pricing_dict``.
    """
    # random generator
    rng = np.random.default_rng(seed)

    df = pd.DataFrame()

    # orders start at hours 6 through 14 (6am to 2pm); np.arange excludes 15
    hours = np.arange(6,15)
    # relative weight of each hour: the first five hours are favoured
    p_hour = np.array([0.6] * 5 + [0.4] * 4)
    # the weights are scaled so that they sum to 1
    p_hour /= p_hour.sum()
    df['order_start'] = rng.choice(hours, size=size, p=p_hour)

    # number of services per order; each order has at least 1 service
    num_services = rng.integers(1, len(skills)+1, size)
    # draw that many distinct services per order, joined by single spaces
    services = pd.Series(
        [" ".join(rng.choice(skills, n, replace=False, shuffle=True)) for n in num_services],
        index=df.index,
        dtype=object,
    )
    # One-hot encode the services. get_dummies only creates columns for values
    # that actually occur, so reindex to guarantee one column per known service;
    # otherwise the lookups below fail with KeyError whenever a service happens
    # not to be drawn. sorted() keeps get_dummies' column order.
    dummies = (
        services.str.get_dummies(sep=" ")
        .add_prefix('service_')
        .reindex(columns=[f"service_{skill}" for skill in sorted(skills)], fill_value=0)
    )
    # join df with one-hot encoded columns
    df = pd.concat([df, dummies], axis=1)

    # order duration and price: sum over the services included in the order
    df['duration'] = 0 # initialise column
    df['price'] = 0 # initialise column
    for skill in skills:
        df['duration'] += service_duration_dict[skill] * df[f"service_{skill}"]
        df['price'] += service_pricing_dict[skill] * df[f"service_{skill}"]

    # buffer time between 10 and 20mins (upper bound of rng.integers is exclusive)
    df['buffer'] = rng.integers(10, 21, size)

    # capacity needed mirrors the artTest flag: 1 if the order includes it, else 0
    df['capacity_needed'] = df['service_artTest']

    # coordinates
    df['long'] = order_coords['x'].to_numpy()
    df['lat'] = order_coords['y'].to_numpy()

    # order id is the row position
    df['order_id'] = df.index

    # TODO: simulate last-minute cancellations. An earlier, disabled draft drew
    # a cancellation probability between 10% and 25% and a random cancel time
    # before the appointment, but it assumed a datetime `order_start` (this
    # function stores an integer hour), so it was not kept.

    print(df)
    df.to_csv(f"../Simulated Data/order_data_{seed}.csv", index=False)
    return df

### Generate CSV files

In [71]:
# Generate and save all three datasets.
# The seed is passed explicitly: each function's default `seed` was bound when
# its cell was run, so relying on the default could use a stale value if `seed`
# was changed afterwards. Passing it keeps the printed seed, the random draws
# and the CSV filenames in agreement.
print(f"Seed: {seed}")
phleb = create_phleb_df(seed=seed)
orders = create_orders_df(seed=seed)
catchment_areas = create_catchment_df(seed=seed)

Seed: 1576
    shift_start  break_start  shift_end  expertise_artTest  \
0             7           11         15                  1   
1             7           11         15                  0   
2             7           11         15                  0   
3             6           10         14                  1   
4             6           10         14                  1   
5             6           10         14                  1   
6             6           10         14                  1   
7             7           11         15                  0   
8             7           11         15                  0   
9             7           11         15                  0   
10            7           11         15                  1   
11            6           10         14                  0   
12            7           11         15                  0   
13            7           11         15                  0   
14            6           10         14                  1 

In [73]:
# fraction (0-1, not a percentage) of orders whose start hour is 10 or earlier;
# order hours begin at 6, so this covers hours 6 through 10 inclusive
len(orders[orders['order_start'] <= 10]) / len(orders)

0.74

In [None]:
# display the generated orders table
orders

Unnamed: 0,order_start_hour,service_artTest,service_pathology,service_vaccination,duration,price,buffer,capacity_needed,long,lat,order_id
0,14,1,1,0,45,250,12,1,77.052425,28.432725,0
1,9,1,0,1,40,150,17,1,77.094098,28.421330,1
2,8,0,0,1,20,100,12,0,77.116251,28.428015,2
3,14,1,0,1,40,150,13,1,77.110315,28.443369,3
4,14,1,1,0,45,250,11,1,77.086019,28.425266,4
...,...,...,...,...,...,...,...,...,...,...,...
95,8,0,1,0,25,200,13,0,77.059369,28.432651,95
96,9,0,1,0,25,200,13,0,77.066566,28.442033,96
97,12,0,0,1,20,100,14,0,77.139943,28.442091,97
98,13,1,1,1,65,350,12,1,77.089099,28.479187,98


In [None]:
# display the generated catchment-area table
catchment_areas

Unnamed: 0,long,lat
0,77.090442,28.491796
1,77.048918,28.455809
2,77.133091,28.471413
3,77.089667,28.450325
4,77.108883,28.47532


In [None]:
# display the generated phlebotomist table
phleb

Unnamed: 0,shift_start_hour,break_start_hour,shift_end_hour,expertise_artTest,expertise_pathology,expertise_vaccination,capacity,cost,rating,home_long,home_lat,phleb_id
0,7,11,15,1,1,1,20,968.0,3.8,77.086466,28.500081,0
1,7,11,15,0,0,1,20,800.0,4.2,77.096946,28.447814,1
2,7,11,15,0,1,0,20,800.0,4.5,77.133868,28.468962,2
3,6,10,14,1,1,1,20,968.0,5.0,77.10153,28.436697,3
4,6,10,14,1,1,1,20,968.0,3.5,77.074016,28.432366,4
5,6,10,14,1,1,1,20,968.0,4.2,77.070115,28.436855,5
6,6,10,14,1,1,1,20,968.0,4.1,77.09099,28.437849,6
7,7,11,15,0,0,1,20,800.0,4.4,77.132982,28.440888,7
8,7,11,15,0,1,0,20,800.0,4.2,77.072617,28.431548,8
9,7,11,15,0,0,1,20,800.0,3.7,77.108537,28.464879,9
