### Import libraries

In [1]:
import pandas as pd
import numpy as np
import datetime
from itertools import cycle
import geopandas as gpd
from shapely.geometry import Point
import random
import matplotlib.pyplot as plt

### Load Geodata

In [2]:
# load geodata: read the GeoJSON file into a GeoDataFrame; this polygon is
# what generate_coords() later uses to constrain the random coordinates
polygon = gpd.read_file("Gurugram_sample_Polygon.geojson")
# preview the first rows as plain text
print(polygon.head())

                                            geometry
0  POLYGON ((77.05980 28.46807, 77.03740 28.44785...


In [3]:
# show the column labels of the loaded GeoDataFrame
polygon.columns

Index(['geometry'], dtype='object')

### Set constants

In [4]:
# services that a phleb can be skilled in and that an order can request
skills = ['vaccination', 'pathology', 'artTest']

# price charged for each service
service_pricing_dict = dict(vaccination=100, pathology=200, artTest=50)

# time taken to carry out each service (presumably minutes -- TODO confirm)
service_duration_dict = dict(vaccination=15, pathology=15, artTest=15)

# number of rows to generate for each simulated table
phleb_size, order_size, catchment_size = 30, 100, 5

# fixed seed so a run can be reproduced;
# use random.randint(0, 9999) instead to draw a fresh one
seed = 1576

### Functions

In [7]:
def generate_coords(size=phleb_size+order_size+catchment_size, seed=seed):
    """Sample ``size`` distinct random points lying inside ``polygon``.

    Candidate points are drawn uniformly from the bounding box of the
    module-level ``polygon`` GeoDataFrame and kept only when at least one of
    its geometries contains them (rejection sampling). A candidate whose
    exact (x, y) pair was already accepted is rejected as well, so the result
    is duplicate-free without a separate clean-up pass.

    Parameters
    ----------
    size : int
        Number of coordinate pairs to return. Defaults to one pair per phleb,
        order and catchment area.
    seed : int or None
        Seed for a private ``random.Random`` instance, so the same seed always
        yields the same coordinates without touching the global ``random``
        state. Pass ``None`` for a different sample on every call.

    Returns
    -------
    pandas.DataFrame
        ``size`` rows with columns ``x`` and ``y`` (used downstream as
        longitude and latitude respectively).

    Raises
    ------
    RuntimeError
        If too many candidates are rejected, e.g. because ``polygon`` is empty
        or covers a negligible part of its bounding box.
    """
    sampler = random.Random(seed)

    # bounds of geodata
    x_min, y_min, x_max, y_max = polygon.total_bounds

    points_x = []
    points_y = []
    accepted = set()  # exact (x, y) pairs already kept, for O(1) duplicate checks

    # generous cap so a degenerate polygon fails loudly instead of looping forever
    max_attempts = 10_000 * max(size, 1)
    attempts = 0

    while len(points_x) < size:
        if attempts >= max_attempts:
            raise RuntimeError(
                f"Only {len(points_x)} of {size} points found inside the polygon "
                f"after {max_attempts} attempts"
            )
        attempts += 1

        # generate random data within the bounds
        point = Point(sampler.uniform(x_min, x_max), sampler.uniform(y_min, y_max))
        pair = (point.x, point.y)
        if pair in accepted or not polygon.contains(point).any():
            continue

        accepted.add(pair)
        points_x.append(point.x)
        points_y.append(point.y)

    coords_df = pd.DataFrame({'x': points_x, 'y': points_y})

    # sanity check kept from the original notebook output; always False by construction
    has_duplicates = bool(coords_df.duplicated(subset=['x', 'y'], keep='first').any())
    print(f"Presence of duplicates: {has_duplicates}")

    return coords_df

In [9]:
# one shared pool of coordinates, carved into three consecutive positional slices:
# orders first, then phleb homes, then whatever is left for catchment areas
coords_df = generate_coords()
phleb_end = order_size + phleb_size
order_coords = coords_df.iloc[:order_size]
phleb_home_coords = coords_df.iloc[order_size:phleb_end]
catchment_coords = coords_df.iloc[phleb_end:]

Presence of duplicates: False


In [29]:
def create_phleb_df(size=phleb_size, seed=seed):
    """Simulate the phleb table and write it to CSV.

    Parameters
    ----------
    size : int
        Number of phlebs to generate. It must match the number of rows in the
        module-level ``phleb_home_coords``; otherwise assigning the home
        coordinates fails with a length mismatch.
    seed : int
        Seed for the NumPy generator; also used in the output file name.

    Returns
    -------
    pandas.DataFrame
        One row per phleb with shift hours, one ``expertise_<skill>`` 0/1
        column for every entry of ``skills``, capacity, cost, service rating,
        home coordinates and ``phleb_id``.

    Side effects
    ------------
    Writes ``../Simulated Data/phleb_data_<seed>.csv`` (without the index).
    """
    # random generator
    rng = np.random.default_rng(seed)

    df = pd.DataFrame()
    # start shift either at 6 or 7 (the upper bound of rng.integers is exclusive)
    df['shift_start'] = rng.integers(6, 8, size)

    # break starts 4 hours into the shift
    df['break_start'] = df['shift_start'] + 4

    # shift ends after 8 hours
    df['shift_end'] = df['shift_start'] + 8

    # random number of skills; each phleb must have at least 1 skill
    num_skills = rng.integers(1, len(skills) + 1, size)
    # randomly choose that many distinct skills per phleb
    # (one rng.choice call per row, in row order, so a given seed keeps producing the same data)
    chosen_skills = [
        set(rng.choice(skills, count, replace=False, shuffle=True))
        for count in num_skills
    ]
    # one-hot encode against the full skill list so every expertise_* column is
    # always present, even when a skill happens never to be drawn;
    # alphabetical order keeps the established column layout
    for skill in sorted(skills):
        df[f"expertise_{skill}"] = [int(skill in chosen) for chosen in chosen_skills]

    # carrying capacity
    df['capacity'] = 20

    # cost of hiring: base pay of 800, compounded 10% increase per additional skill
    df['cost'] = 800 * (1.1 ** (num_skills - 1))

    # service rating: log-normal draw rescaled so that the best phleb scores 5.0
    df['service_rating'] = rng.lognormal(mean=4.5, sigma=0.1, size=size)
    df['service_rating'] = df['service_rating'] / df['service_rating'].max() * 5.0
    df['service_rating'] = df['service_rating'].round(decimals=1)

    # coordinates of phleb's home
    df['home_long'] = phleb_home_coords['x'].to_numpy()
    df['home_lat'] = phleb_home_coords['y'].to_numpy()

    # set phleb id
    df['phleb_id'] = df.index

    df.to_csv(f"../Simulated Data/phleb_data_{seed}.csv", index=False)
    return df

In [30]:
def create_catchment_df(size=catchment_size, seed=seed):
    """Build the catchment-area table and write it to CSV.

    Parameters
    ----------
    size : int
        Accepted for signature parity with the other ``create_*`` functions
        but not used: the table always has one row per entry of the
        module-level ``catchment_coords``.
    seed : int
        Only used in the output file name; nothing random is drawn here.

    Returns
    -------
    pandas.DataFrame
        Columns ``long`` and ``lat`` taken from ``catchment_coords``.

    Side effects
    ------------
    Writes ``../Simulated Data/catchment_data_<seed>.csv`` (without the index).
    """
    # coordinates
    df = pd.DataFrame({
        'long': catchment_coords['x'].to_numpy(),
        'lat': catchment_coords['y'].to_numpy(),
    })

    df.to_csv(f"../Simulated Data/catchment_data_{seed}.csv", index=False)
    return df

In [31]:
def create_orders_df(size=order_size, seed=seed):
    """Simulate the order table and write it to CSV.

    Parameters
    ----------
    size : int
        Number of orders to generate. It must match the number of rows in the
        module-level ``order_coords``; otherwise assigning the coordinates
        fails with a length mismatch.
    seed : int
        Seed for the NumPy generator; also used in the output file name.

    Returns
    -------
    pandas.DataFrame
        One row per order with start hour, one ``service_<skill>`` 0/1 column
        for every entry of ``skills``, total duration and price, buffer time,
        capacity needed, coordinates and ``order_id``.

    Side effects
    ------------
    Writes ``../Simulated Data/order_data_<seed>.csv`` (without the index).
    """
    # random generator
    rng = np.random.default_rng(seed)

    df = pd.DataFrame()

    # array of hours to choose from
    hours = np.arange(6, 15)  # orders start from 6am to 2pm
    # relative weight of each hour being chosen (earlier hours are more likely)
    p_hour = np.array([0.6, 0.6, 0.6, 0.6, 0.6, 0.4, 0.4, 0.4, 0.4])
    # the weights are scaled so that they sum to 1
    p_hour /= p_hour.sum()
    df['order_start'] = rng.choice(hours, size=size, p=p_hour)

    # number of services per order; each order must have at least 1 service
    num_services = rng.integers(1, len(skills) + 1, size)
    # randomly choose that many distinct services per order
    # (one rng.choice call per row, in row order, so a given seed keeps producing the same data)
    chosen_services = [
        set(rng.choice(skills, count, replace=False, shuffle=True))
        for count in num_services
    ]
    # one-hot encode against the full skill list so every service_* column is
    # always present (the lookups below would otherwise raise KeyError when a
    # service happens never to be drawn); alphabetical order keeps the
    # established column layout
    for skill in sorted(skills):
        df[f"service_{skill}"] = [int(skill in chosen) for chosen in chosen_services]

    # calculate order duration and price as the sum over the selected services
    df['duration'] = 0  # initialise column
    df['price'] = 0  # initialise column
    for skill in skills:
        df['duration'] += service_duration_dict[skill] * df[f"service_{skill}"]
        df['price'] += service_pricing_dict[skill] * df[f"service_{skill}"]

    # buffer time between 10 and 15mins (the upper bound of rng.integers is exclusive)
    df['buffer'] = rng.integers(10, 16, size)

    # capacity needed mirrors the artTest flag
    df['capacity_needed'] = df['service_artTest']

    # coordinates
    df['long'] = order_coords['x'].to_numpy()
    df['lat'] = order_coords['y'].to_numpy()

    df['order_id'] = df.index

    # # generate order cancellation
    # p_cancellation = rng.integers(low=10, high=26) / 100
    # df['cancel'] = rng.binomial(n=1, p=p_cancellation, size=size) # refers to last minute cancellations
    # df['cancel_time'] = 1 # initialise with 1 so that rng will choose 0 if order is not cancelled
    # # calculate amount of time customer has to cancel an order in seconds
    # df.loc[df['cancel'] == 1, 'cancel_time'] = (df['order_start'] - pd.to_datetime(df['order_start'].dt.date)).dt.total_seconds()
    # # randomly choose number of seconds before appointment to cancel
    # df['cancel_time'] = rng.integers(0, df['cancel_time'], size)
    # df.loc[df['cancel'] == 1, 'cancel_time'] = df['order_start'] - pd.to_timedelta(df['cancel_time'], unit='s')

    df.to_csv(f"../Simulated Data/order_data_{seed}.csv", index=False)
    return df

### Generate csv

In [32]:
# run functions
# each create_* call returns its table and also writes it as a CSV
# (named with the seed) under "../Simulated Data/"
print(f"Seed: {seed}")
phleb = create_phleb_df()
orders = create_orders_df()
catchment_areas = create_catchment_df()

Seed: 1576


In [33]:
# fraction of all orders whose start hour is 10 or earlier
len(orders.loc[orders['order_start'].le(10)]) / len(orders)

0.74

In [34]:
# display the generated orders table
orders

Unnamed: 0,order_start,service_artTest,service_pathology,service_vaccination,duration,price,buffer,capacity_needed,long,lat,order_id
0,14,1,1,0,30,250,11,1,77.099900,28.426140,0
1,9,1,0,1,30,150,13,1,77.057073,28.450975,1
2,8,0,0,1,15,100,11,0,77.107310,28.434628,2
3,14,1,0,1,30,150,12,1,77.081583,28.433803,3
4,14,1,1,0,30,250,10,1,77.050092,28.440830,4
...,...,...,...,...,...,...,...,...,...,...,...
95,8,0,1,0,15,200,11,0,77.099552,28.443505,95
96,9,0,1,0,15,200,11,0,77.101782,28.462423,96
97,12,0,0,1,15,100,12,0,77.075426,28.431033,97
98,13,1,1,1,45,350,11,1,77.076850,28.484576,98


In [35]:
# display the generated catchment-area table
catchment_areas

Unnamed: 0,long,lat
0,77.087845,28.484314
1,77.097013,28.427854
2,77.070605,28.423955
3,77.12535,28.425828
4,77.132666,28.470201


In [36]:
# display the generated phleb table
phleb

Unnamed: 0,shift_start,break_start,shift_end,expertise_artTest,expertise_pathology,expertise_vaccination,capacity,cost,service_rating,home_long,home_lat,phleb_id
0,7,11,15,1,1,1,20,968.0,3.8,77.086423,28.443423,0
1,7,11,15,0,0,1,20,800.0,4.2,77.046575,28.447796,1
2,7,11,15,0,1,0,20,800.0,4.5,77.083636,28.437283,2
3,6,10,14,1,1,1,20,968.0,5.0,77.057376,28.446999,3
4,6,10,14,1,1,1,20,968.0,3.5,77.095458,28.464691,4
5,6,10,14,1,1,1,20,968.0,4.2,77.064161,28.42926,5
6,6,10,14,1,1,1,20,968.0,4.1,77.0829,28.424911,6
7,7,11,15,0,0,1,20,800.0,4.4,77.139349,28.455121,7
8,7,11,15,0,1,0,20,800.0,4.2,77.123466,28.43695,8
9,7,11,15,0,0,1,20,800.0,3.7,77.08428,28.473185,9
