In [17]:
import pandas as pd
import numpy as np

Let us use NumPy's $random$ module to create a synthetic dataset for our routing problem. This will involve creating a dataframe with $n$ samples and three columns; for each node the first two columns will represent its Euclidean coordinates, $x$ and $y$, while the third column will represent the demand $d$. We will represent the depot as the first entry, with coordinates $(0,0)$ and demand $0$.

We will create two dataset-generating functions, $generateUniformSyntheticDataset$ and $generateNormalSyntheticDataset$, which create the dataset described above for the total area specified by the user, with the data points generated uniformly or normally, respectively. We first consider the uniform case.

Let $X, Y$ be the vectors associated with the $x,y$ coordinates of each node, and let $D$ be the vector associated with the demand $d$ for each node.

In [18]:
def generateUniformSyntheticDataset(total_area, number_of_cities, seed=None):
    """Generate a synthetic routing instance with uniformly distributed cities.

    City coordinates are drawn uniformly from a square of area ``total_area``
    centred on the origin, and every city receives a demand drawn uniformly
    between 0 and 1. All values are rounded to 3 decimals.

    Parameters
    ----------
    total_area : float
        Area of the square region; must be non-negative.
    number_of_cities : int
        Number of cities to generate, not counting the depot; must be >= 0.
    seed : int, optional
        When given, a dedicated ``np.random.default_rng(seed)`` generator is
        used so the same seed always yields the same dataset. When omitted,
        the global ``np.random`` state is used.

    Returns
    -------
    pandas.DataFrame
        ``number_of_cities + 1`` rows with columns ``x``, ``y`` and ``d``.
        Row 0 is the depot, with coordinates (0, 0) and demand 0.

    Raises
    ------
    ValueError
        If ``total_area`` or ``number_of_cities`` is negative.
    """
    if total_area < 0:
        raise ValueError("total_area must be non-negative")
    if number_of_cities < 0:
        raise ValueError("number_of_cities must be non-negative")

    #The np.random module and a Generator both expose .uniform, so either can serve as the source
    rng = np.random if seed is None else np.random.default_rng(seed)

    #Determining the maximum distance from the origin based on the total_area
    distance = np.sqrt(total_area) / 2

    #One extra row is drawn because row 0 is overwritten by the depot below
    number_of_rows = number_of_cities + 1

    synthetic_data_uniform = pd.DataFrame({
        'x': rng.uniform(-distance, distance, size=number_of_rows),
        'y': rng.uniform(-distance, distance, size=number_of_rows),
        'd': rng.uniform(0, 1, size=number_of_rows),
    }).round(3)

    #Setting origin to be home depot
    synthetic_data_uniform.iloc[0] = 0

    return synthetic_data_uniform

In [19]:
#Smoke test: 25 cities plus the depot on a square of total area 100
uniform_10_25 = generateUniformSyntheticDataset(total_area=100, number_of_cities=25)
uniform_10_25

Unnamed: 0,x,y,d
0,0.0,0.0,0.0
1,1.751,3.825,0.763
2,-4.705,-0.814,0.371
3,-1.721,0.159,0.98
4,-0.884,-0.974,0.292
5,-3.794,2.627,0.344
6,2.436,1.869,0.463
7,-4.497,-4.863,0.374
8,1.496,-2.91,0.945
9,-0.035,1.548,0.653


Exporting the dataset as a csv file

In [20]:
#Write the uniform dataset to disk; the row index is kept as the first csv column
uniform_10_25.to_csv(path_or_buf="uniform_10_25.csv", index=True)

We need a copy of the dataset without the origin.

In [21]:
def removeorigin(city_coordinates):
    """Return a copy of the dataset without its first row (the depot).

    Parameters
    ----------
    city_coordinates : pandas.DataFrame
        Dataset whose first row is the depot.

    Returns
    -------
    pandas.DataFrame
        All rows after the first, with their original index labels. The
        result is an independent copy, so modifying it does not touch
        ``city_coordinates``.
    """
    #.copy() detaches the slice from the parent frame
    return city_coordinates.iloc[1:].copy()

In [22]:
#Depot-free version of the uniform dataset
uniform_10_25_w = removeorigin(city_coordinates=uniform_10_25)
#Export, keeping the row index as the first csv column
uniform_10_25_w.to_csv(path_or_buf="uniform_10_25_w.csv", index=True)

In [23]:
#Display the uniform dataset with the depot row removed
uniform_10_25_w

Unnamed: 0,x,y,d
1,1.751,3.825,0.763
2,-4.705,-0.814,0.371
3,-1.721,0.159,0.98
4,-0.884,-0.974,0.292
5,-3.794,2.627,0.344
6,2.436,1.869,0.463
7,-4.497,-4.863,0.374
8,1.496,-2.91,0.945
9,-0.035,1.548,0.653
10,-0.309,-3.32,0.173


We now consider the normal case. 
Let $X, Y$ be the vectors associated with the $x,y$ coordinates of each node, and let $D$ be the vector associated with the demand $d$ for each node. We can generate standard normally distributed data using the numpy.random module, but in order to generate data points that fit within a specified area, we will then need to use a scaling transformation.

In [24]:
def generateNormalSyntheticDataset(total_area, number_of_cities, seed=None):
    """Generate a synthetic routing instance with normally distributed cities.

    Coordinates are drawn from a standard normal distribution and then each
    axis is multiplied by ``sqrt(total_area) / (max - min)`` of the drawn
    values, so the drawn samples span exactly ``sqrt(total_area)`` along each
    axis. The coordinates are only scaled, not re-centred, so an individual
    coordinate can lie further than ``sqrt(total_area) / 2`` from the origin.
    Demands are drawn uniformly between 0 and 1. All values are rounded to
    3 decimals.

    Parameters
    ----------
    total_area : float
        Area of the square the scaled samples should span; must be
        non-negative.
    number_of_cities : int
        Number of cities to generate, not counting the depot; must be >= 0.
    seed : int, optional
        When given, a dedicated ``np.random.default_rng(seed)`` generator is
        used so the same seed always yields the same dataset. When omitted,
        the global ``np.random`` state is used.

    Returns
    -------
    pandas.DataFrame
        ``number_of_cities + 1`` rows with columns ``x``, ``y`` and ``d``.
        Row 0 is the depot, with coordinates (0, 0) and demand 0.

    Raises
    ------
    ValueError
        If ``total_area`` or ``number_of_cities`` is negative.
    """
    if total_area < 0:
        raise ValueError("total_area must be non-negative")
    if number_of_cities < 0:
        raise ValueError("number_of_cities must be non-negative")

    #The np.random module and a Generator both expose the two methods used below
    rng = np.random if seed is None else np.random.default_rng(seed)

    #One extra row is drawn because row 0 is overwritten by the depot below
    number_of_rows = number_of_cities + 1

    #Let X, Y be continuous standard normal distributions.
    X = rng.standard_normal(number_of_rows)
    Y = rng.standard_normal(number_of_rows)
    #Defining demand column
    D = rng.uniform(0, 1, size=number_of_rows)

    length_of_square_side = np.sqrt(total_area)

    #Determining the range of values generated (measured over every drawn
    #sample, including the first one that the depot later replaces)
    range_x = X.max() - X.min()
    range_y = Y.max() - Y.min()

    #Normalizing with respect to the total area; a zero range (only the depot
    #row requested) would divide by zero, so those samples are collapsed to 0
    scaling_factor_X = length_of_square_side / range_x if range_x > 0 else 0.0
    scaling_factor_Y = length_of_square_side / range_y if range_y > 0 else 0.0
    normalized_X = X * scaling_factor_X
    normalized_Y = Y * scaling_factor_Y

    #Creating the dataframe
    n_synthetic_data_normal = pd.DataFrame(
        {'x': normalized_X, 'y': normalized_Y, 'd': D}
    ).round(3)
    #Setting origin to be home depot
    n_synthetic_data_normal.iloc[0] = 0

    return n_synthetic_data_normal

In [25]:
#Smoke test: 25 cities plus the depot, scaled for a total area of 100
normal_10_25 = generateNormalSyntheticDataset(total_area=100, number_of_cities=25)
normal_10_25

Unnamed: 0,x,y,d
0,0.0,0.0,0.0
1,1.531,-4.595,0.185
2,0.171,-1.782,0.492
3,1.591,-0.779,0.938
4,1.921,-0.786,0.005
5,5.901,0.18,0.448
6,-2.414,-4.067,0.343
7,-2.543,5.01,0.566
8,-0.334,1.29,0.377
9,-2.841,3.858,0.951


Exporting the dataset as a csv file

In [26]:
#Write the normal dataset to disk; the row index is kept as the first csv column
normal_10_25.to_csv(path_or_buf="normal_10_25.csv", index=True)

We need a copy of the dataset without the origin.

In [27]:
#Depot-free version of the normal dataset
normal_10_25_w = removeorigin(normal_10_25)
#Export under the normal dataset's own name so the uniform export
#"uniform_10_25_w.csv" is not overwritten
normal_10_25_w.to_csv("normal_10_25_w.csv", index = True)

In [28]:
#Display the normal dataset with the depot row removed
normal_10_25_w

Unnamed: 0,x,y,d
1,1.531,-4.595,0.185
2,0.171,-1.782,0.492
3,1.591,-0.779,0.938
4,1.921,-0.786,0.005
5,5.901,0.18,0.448
6,-2.414,-4.067,0.343
7,-2.543,5.01,0.566
8,-0.334,1.29,0.377
9,-2.841,3.858,0.951
10,2.291,-0.707,0.037


I will generate 10 copies using the uniform dataset generating process with an area of $1km^2$, with number of cities 25.

For cities = 25

In [29]:
#Ten uniform instances (area 1, 25 cities), each saved with and without the depot.
#NOTE(review): ':' in these file names is not accepted by Windows file systems - confirm the target platform.
for i in range(10):
    df = generateUniformSyntheticDataset(total_area=1, number_of_cities=25)
    #Full instance, depot included; the row index is not written
    filename = f'uniform_1_25:{i}.csv'
    df.to_csv(path_or_buf=filename, index=False)
    #Same instance with the depot row dropped
    df = removeorigin(city_coordinates=df)
    filename = f'uniform_1_25_wo:{i}.csv'
    df.to_csv(path_or_buf=filename, index=False)

Now, repeating the process of generating 10 copies using the uniform dataset generating process, with number of cities 25 but within an area of $30km^2$.

For cities = 25

In [30]:
#Ten uniform instances (area 30, 25 cities), each saved with and without the depot.
#NOTE(review): ':' in these file names is not accepted by Windows file systems - confirm the target platform.
for i in range(10):
    df = generateUniformSyntheticDataset(total_area=30, number_of_cities=25)
    #Full instance, depot included; the row index is not written
    filename = f'uniform_30_25:{i}.csv'
    df.to_csv(path_or_buf=filename, index=False)
    #Same instance with the depot row dropped
    df = removeorigin(city_coordinates=df)
    filename = f'uniform_30_25_wo:{i}.csv'
    df.to_csv(path_or_buf=filename, index=False)

I will generate 10 copies using the normal dataset generating process with an area of $1km^2$, with number of cities 25.

In [31]:
#Ten normal instances (area 1, 25 cities), each saved with and without the depot.
#NOTE(review): ':' in these file names is not accepted by Windows file systems - confirm the target platform.
for i in range(10):
    df = generateNormalSyntheticDataset(total_area=1, number_of_cities=25)
    #Full instance, depot included; the row index is not written
    filename = f'normal_1_25:{i}.csv'
    df.to_csv(path_or_buf=filename, index=False)
    #Same instance with the depot row dropped
    df = removeorigin(city_coordinates=df)
    filename = f'normal_1_25_wo:{i}.csv'
    df.to_csv(path_or_buf=filename, index=False)

Now, repeating the process of generating 10 copies using the normal dataset generating process, with number of cities 25, 50, 100, 200 and 1000, but within an area of $30km^2$.

For cities = 25

In [32]:
#Ten normal instances (area 30, 25 cities), each saved with and without the depot.
#NOTE(review): ':' in these file names is not accepted by Windows file systems - confirm the target platform.
for i in range(10):
    df = generateNormalSyntheticDataset(total_area=30, number_of_cities=25)
    #Full instance, depot included; the row index is not written
    filename = f'normal_30_25:{i}.csv'
    df.to_csv(path_or_buf=filename, index=False)
    #Same instance with the depot row dropped
    df = removeorigin(city_coordinates=df)
    filename = f'normal_30_25_wo:{i}.csv'
    df.to_csv(path_or_buf=filename, index=False)

I will need to convert the csv files containing the Euclidean coordinates to csv files containing the distance matrices, which will contain the distance between each pair of cities. These will be saved in the folder "distance dataframes", while the csv files for the Euclidean city coordinates will be saved in the folder "datasets".