In [9]:

# NOTE: the wildcard import stays first so that the explicit imports below
# take precedence if `utils` happens to export clashing names.
from utils import *

from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split


## 1. Generate the data with target (speed) and features ($\overline{s}_K$ and coordinates of 10 nearest neighbors)

### Data loading

In [3]:
# Bottleneck recordings: four runs read from the "Bottleneck_Data" folder.
(
    bottleneck_070,
    bottleneck_095,
    bottleneck_120,
    bottleneck_180,
) = (
    load_data("Bottleneck_Data", run_name)
    for run_name in ("uo-180-070", "uo-180-095", "uo-180-120", "uo-180-180")
)

# Corridor recordings: eight runs read from the "Corridor_Data" folder.
(
    corridor_015,
    corridor_030,
    corridor_060,
    corridor_085,
    corridor_095,
    corridor_110,
    corridor_140,
    corridor_230,
) = (
    load_data("Corridor_Data", run_name)
    for run_name in (
        "ug-180-015",
        "ug-180-030",
        "ug-180-060",
        "ug-180-085",
        "ug-180-095",
        "ug-180-110",
        "ug-180-140",
        "ug-180-230",
    )
)

### Data generation

In [4]:
# Add a speed column to every loaded table (computed by the difference method).

# "B_" prefix: bottleneck recordings.
B_070_speed, B_095_speed, B_120_speed, B_180_speed = map(
    add_speed,
    (bottleneck_070, bottleneck_095, bottleneck_120, bottleneck_180),
)

# "R_" prefix: corridor recordings (the corridor is a ring, hence "R").
(
    R_015_speed,
    R_030_speed,
    R_060_speed,
    R_085_speed,
    R_095_speed,
    R_110_speed,
    R_140_speed,
    R_230_speed,
) = map(
    add_speed,
    (
        corridor_015,
        corridor_030,
        corridor_060,
        corridor_085,
        corridor_095,
        corridor_110,
        corridor_140,
        corridor_230,
    ),
)

In [5]:
# Example: inspect the table add_speed returned for the bottleneck run "uo-180-070".
B_070_speed

Unnamed: 0,ID,FRAME,X,Y,speed
0,1,219,130.771,772.562,2.318080
1,1,220,132.569,758.186,2.318080
2,1,221,134.659,745.493,2.058227
3,1,222,134.878,734.659,1.733794
4,1,223,133.482,724.252,1.680034
...,...,...,...,...,...
75330,148,1581,132.408,-572.917,1.439601
75331,148,1582,133.420,-582.078,1.474676
75332,148,1583,133.959,-590.682,1.379339
75333,148,1584,134.090,-598.287,1.216981


In [6]:
# R_015 (corridor run "ug-180-015") is excluded from the generated data: in this
# dataset the number of neighbors is always less than 10, and the call below
# fails with "ValueError: need at least one array to concatenate".
# The call is kept commented out for reference only.
# R_015 = generate_data(R_015_speed)

ValueError: need at least one array to concatenate

In [7]:
# Build the feature/target arrays used for training and testing.
# R_015 is intentionally absent from this list.

# Bottleneck arrays.
B_070, B_095, B_120, B_180 = map(
    generate_data,
    (B_070_speed, B_095_speed, B_120_speed, B_180_speed),
)

# Corridor arrays.
R_030, R_060, R_085, R_095, R_110, R_140, R_230 = map(
    generate_data,
    (
        R_030_speed,
        R_060_speed,
        R_085_speed,
        R_095_speed,
        R_110_speed,
        R_140_speed,
        R_230_speed,
    ),
)

The shape of the dataset (containing features and targets) is  (74461, 22) .
The shape of the dataset (containing features and targets) is  (71682, 22) .
The shape of the dataset (containing features and targets) is  (54292, 22) .
The shape of the dataset (containing features and targets) is  (50903, 22) .
The shape of the dataset (containing features and targets) is  (11160, 22) .
The shape of the dataset (containing features and targets) is  (38620, 22) .
The shape of the dataset (containing features and targets) is  (53098, 22) .
The shape of the dataset (containing features and targets) is  (152072, 22) .
The shape of the dataset (containing features and targets) is  (113126, 22) .
The shape of the dataset (containing features and targets) is  (246617, 22) .
The shape of the dataset (containing features and targets) is  (203029, 22) .


In [8]:
# Example: show the generated B_120 array together with its shape.
B_120, B_120.shape

(array([[ 4.42924892e+02, -3.44820000e+01,  1.87905000e+02, ...,
          1.06440000e+02,  6.13636000e+02,  1.61242480e+00],
        [ 2.33938664e+02, -6.39146000e+01,  9.16570000e+01, ...,
          2.85910000e+01,  3.51322000e+02,  1.54424670e+00],
        [ 2.84922238e+02,  1.12331000e+02,  7.44090000e+01, ...,
          1.40922000e+02,  4.25731000e+02,  1.75549167e+00],
        ...,
        [ 1.77242017e+02,  4.91553000e+01,  4.15140000e+01, ...,
          1.82490600e+02, -3.48997000e+02,  6.87244058e-01],
        [ 2.13447505e+02, -5.05753000e+01, -1.84820000e+01, ...,
          8.27600000e+01, -4.08993000e+02,  6.00618774e-01],
        [ 1.34248556e+02,  4.11666000e+01,  2.94250000e+01, ...,
          1.42584600e+02, -2.19237000e+02,  8.69087049e-01]]),
 (54292, 22))

In [9]:
# Example: show the generated R_230 array together with its shape.
R_230, R_230.shape

(array([[ 1.23161376e+02,  4.96312600e+01, -4.23871000e+01, ...,
          9.77401600e+01, -1.38354400e+02,  1.25690566e-02],
        [ 8.87130328e+01, -3.43075000e+01, -3.36562000e+01, ...,
          1.38014000e+01, -1.29623500e+02,  2.26189584e-02],
        [ 9.45078455e+01,  3.75800000e+00, -3.24495000e+01, ...,
         -1.39273160e+02,  1.98039000e+01,  3.34547106e-02],
        ...,
        [ 1.72466762e+02, -5.85712000e+01,  4.03910000e+01, ...,
          1.14371000e+02, -3.09413000e+02,  6.46219777e-01],
        [ 2.14974691e+02,  5.85712000e+01, -4.03910000e+01, ...,
          1.72942200e+02, -3.49804000e+02,  1.01256444e+00],
        [ 2.36756214e+02, -6.19752000e+01, -4.43890000e+01, ...,
          1.10967000e+02, -3.94193000e+02,  8.61863818e-01]]),
 (203029, 22))

### Save the processed data

In [None]:
# Write every generated array to ../processed_data/<name>.csv as
# comma-separated text (no header row).
processed_dir = Path("../processed_data")

# np.savetxt does not create missing directories; without this the cell fails
# with FileNotFoundError on a fresh checkout.
processed_dir.mkdir(parents=True, exist_ok=True)

# Output file stem -> array; insertion order matches the original save order.
arrays_to_save = {
    "B_070": B_070,
    "B_095": B_095,
    "B_120": B_120,
    "B_180": B_180,
    "R_030": R_030,
    "R_060": R_060,
    "R_085": R_085,
    "R_095": R_095,
    "R_110": R_110,
    "R_140": R_140,
    "R_230": R_230,
}

for name, array in arrays_to_save.items():
    np.savetxt(processed_dir / f"{name}.csv", array, delimiter=",")

## 2. Generate the dataset for training and testing

In [6]:
# Reload one bottleneck file and one corridor file as DataFrames.
# header=None: the first row of each CSV is data, not column names.
dataset_B, dataset_R = (
    pd.read_csv(csv_path, header=None)
    for csv_path in ("../processed_data/B_120.csv", "../processed_data/R_230.csv")
)

In [10]:
# Shuffle each dataset and split it 50/50 into train and test parts.
# The fixed random_state makes the split reproducible across runs.
(train_B, test_B), (train_R, test_R) = (
    train_test_split(dataset, test_size=0.5, random_state=42, shuffle=True)
    for dataset in (dataset_B, dataset_R)
)

In [11]:
# Persist the four splits as CSV files; index=False leaves the row index out.
for split_frame, csv_path in (
    (train_B, '../processed_data/train_B.csv'),
    (test_B, '../processed_data/test_B.csv'),
    (train_R, '../processed_data/train_R.csv'),
    (test_R, '../processed_data/test_R.csv'),
):
    split_frame.to_csv(csv_path, index=False)