# preprocess_data.py explanation and flow

In [3]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2020-05-27 15:00
# @Author  : Xiaoke Huang
# @Email   : xiaokehuang@foxmail.com
# %%
from utils.feature_utils import compute_feature_for_one_seq, encoding_features, save_features
from argoverse.data_loading.argoverse_forecasting_loader import ArgoverseForecastingLoader
from argoverse.map_representation.map_api import ArgoverseMap
import matplotlib.pyplot as plt
import os
from utils.config import DATA_DIR, LANE_RADIUS, OBJ_RADIUS, OBS_LEN, INTERMEDIATE_DATA_DIR
from tqdm import tqdm
import pickle
# %matplotlib inline


if __name__ == "__main__":
    # Preprocess every split directory ('train', 'val', 'test') found under
    # DATA_DIR: each sequence is converted to an encoded feature table that is
    # handed to `save_features`, and the normalization center of every
    # sequence is collected and pickled once per split.
    am = ArgoverseMap()
    for folder in os.listdir(DATA_DIR):
        if folder not in ['train', 'val', 'test']:
            continue

        print(f"folder: {folder}")
        afl = ArgoverseForecastingLoader(os.path.join(DATA_DIR, folder))
        norm_center_dict = {}
        for seq_path in tqdm(afl.seq_list):
            afl_ = afl.get(seq_path)
            # sequence name = file name without its directory and extension
            name = os.path.splitext(os.path.basename(seq_path))[0]

            agent_feature, obj_feature_ls, lane_feature_ls, norm_center = compute_feature_for_one_seq(
                afl_.seq_df, am, OBS_LEN, LANE_RADIUS, OBJ_RADIUS, viz=False, mode='nearby')
            feature_df = encoding_features(
                agent_feature, obj_feature_ls, lane_feature_ls)
            save_features(feature_df, name, os.path.join(
                INTERMEDIATE_DATA_DIR, f"{folder}_intermediate"))
            # tqdm.write keeps the progress bar intact; a bare print() inside
            # the loop interleaves with the bar.
            tqdm.write(f"Processed and saved features for {name + '.csv'}")

            norm_center_dict[name] = norm_center

        # The pickle is written directly into INTERMEDIATE_DATA_DIR, so make
        # sure that directory exists even when the split had no sequences.
        os.makedirs(INTERMEDIATE_DATA_DIR, exist_ok=True)
        with open(os.path.join(INTERMEDIATE_DATA_DIR, f"{folder}-norm_center_dict.pkl"), 'wb') as f:
            pickle.dump(norm_center_dict, f, pickle.HIGHEST_PROTOCOL)


# %%


# %%


folder: train


100%|█████████████████████████████████████████████| 5/5 [00:00<00:00, 81.56it/s]

Processed and saved features for 2645.csv
Processed and saved features for 4791.csv
Processed and saved features for 3700.csv
Processed and saved features for 3861.csv
Processed and saved features for 3828.csv





## 1. compute_feature_for_one_seq in feature_utils.py function explanation
```
compute_feature_for_one_seq(afl_.seq_df, am, OBS_LEN, LANE_RADIUS, OBJ_RADIUS, viz=False, mode='nearby')
```

- It is called inside the preprocess_data.py
- This function converts the raw trajectory data of one sequence (plus the map) into feature representations: agent features, nearby-object features, and lane features.
- In the VectorNet paper, the first two seconds are used as the observation and the remaining three seconds (2 - 5 s) are the trajectory to predict. So OBS_LEN is 20: each scenario consists of 5 seconds sampled at 10 Hz, meaning consecutive timestamps are 0.1 second apart --> 20 timestamps cover 2 seconds

In [None]:
def compute_feature_for_one_seq(traj_df: pd.DataFrame, am: ArgoverseMap, obs_len: int = 20, lane_radius: int = 5, obj_radius: int = 10, viz: bool = False, mode='rect', query_bbox=None) -> List[List]:
    """
    Build the agent, object and lane features of one trajectory sequence.

    args:
        traj_df: trajectory table of one sequence; the columns read here are
            TIMESTAMP, CITY_NAME, OBJECT_TYPE, X and Y.
            NOTE: its TIMESTAMP column is shifted in place so that the
            sequence starts at 0.
        am: map object forwarded to `get_nearby_lane_feature_ls`
        obs_len: number of observed timestamps; rows after the agent's
            `obs_len`-th timestamp are pruned before object features are built
        lane_radius: forwarded to `get_nearby_lane_feature_ls`
        obj_radius: accepted for interface compatibility; it is not referenced
            anywhere in this function
        viz: when True, plot lanes, object tracks and the agent track
        mode: 'rect' or 'nearby', forwarded to `get_nearby_lane_feature_ls`
        query_bbox: forwarded to `get_nearby_lane_feature_ls`; None means
            [-100, 100, -100, 100]
    returns:
        [agent_feature, obj_feature_ls, lane_feature_ls, norm_center] where
        agent_feature_ls:
            list of (doubled_track, object_type, timestamp, track_id, not_doubled_groundtruth_feature_trajectory)
        obj_feature_ls:
            list of list of (doubled_track, object_type, timestamp, track_id)
        lane_feature_ls:
            list of list of lane a segment feature, formatted in [left_lane, right_lane, is_traffic_control, is_intersection, lane_id]
        norm_center np.ndarray: (2, ), the agent position at index obs_len-1
    raises:
        ValueError: if the sequence contains no 'AGENT' rows
    """
    # A None sentinel replaces the former list default so that one list object
    # is not shared between calls.
    if query_bbox is None:
        query_bbox = [-100, 100, -100, 100]

    # normalize timestamps so the sequence starts at 0
    traj_df['TIMESTAMP'] -= np.min(traj_df['TIMESTAMP'].values)
    seq_ts = np.unique(traj_df['TIMESTAMP'].values)

    city_name = traj_df['CITY_NAME'].iloc[0]

    # agent traj & its start/end point.
    # Select the agent rows directly instead of relying on 'AGENT' being the
    # first key produced by a groupby over OBJECT_TYPE.
    agent_df = traj_df[traj_df['OBJECT_TYPE'] == 'AGENT']
    if agent_df.empty:
        raise ValueError(f"cannot find 'agent' object type")

    agent_xy = agent_df[['X', 'Y']].values
    start_x, start_y = agent_xy[0]
    agent_x_end, agent_y_end = agent_xy[-1]
    # the last observed agent position is the origin of the normalized frame
    query_x, query_y = agent_xy[obs_len-1]
    norm_center = np.array([query_x, query_y])

    # prune points after "obs_len" timestamp
    # [FIXED] test set length is only `obs_len`
    traj_df = traj_df[traj_df['TIMESTAMP'] <=
                      agent_df['TIMESTAMP'].values[obs_len-1]]

    assert (np.unique(traj_df["TIMESTAMP"].values).shape[0]
            == obs_len), "Obs len mismatch"

    # search nearby lane from the last observed point of agent
    lane_feature_ls = get_nearby_lane_feature_ls(
        am, agent_df, obs_len, city_name, lane_radius, norm_center, mode=mode, query_bbox=query_bbox)

    # search nearby moving objects from the last observed point of agent
    obj_feature_ls = get_nearby_moving_obj_feature_ls(
        agent_df, traj_df, obs_len, seq_ts, norm_center)
    # get agent features
    agent_feature = get_agent_feature_ls(agent_df, obs_len, norm_center)

    # vis
    if viz:
        for features in lane_feature_ls:
            show_doubled_lane(
                np.vstack((features[0][:, :2], features[0][-1, 3:5])))
            show_doubled_lane(
                np.vstack((features[1][:, :2], features[1][-1, 3:5])))
        for features in obj_feature_ls:
            show_traj(
                np.vstack((features[0][:, :2], features[0][-1, 2:])), features[1])
        show_traj(np.vstack(
            (agent_feature[0][:, :2], agent_feature[0][-1, 2:])), agent_feature[1])

        # agent end point, normalization origin and agent start point, all
        # expressed relative to the last observed agent position
        plt.plot(agent_x_end - query_x, agent_y_end - query_y, 'o',
                 color=color_dict['AGENT'], markersize=7)
        plt.plot(0, 0, 'x', color='blue', markersize=4)
        plt.plot(start_x-query_x, start_y-query_y,
                 'x', color='blue', markersize=4)
        plt.show()

    return [agent_feature, obj_feature_ls, lane_feature_ls, norm_center]

In [42]:
import numpy as np

In [15]:
# Notebook walk-through: rebuild by hand the arguments that
# compute_feature_for_one_seq receives, so its body can be run step by step.
print('OBS_LEN:', OBS_LEN)
print('LANE_RADIUS:', LANE_RADIUS)
print('OBJ_RADIUS:', OBJ_RADIUS)
    
# NOTE(review): `folder` is not defined in this cell; presumably it is the
# value left over from the preprocessing loop in an earlier cell.
afl = ArgoverseForecastingLoader(os.path.join(DATA_DIR, folder))
# The loop overwrites afl_ on every iteration, so only the loader of the LAST
# sequence in seq_list is kept for the cells that follow.
for name in tqdm(afl.seq_list):
    afl_ = afl.get(name) # loader positioned on the sequence `name`
    
traj_df = afl_.seq_df # DataFrame of that sequence
am = ArgoverseMap()
obs_len = OBS_LEN
lane_radius = LANE_RADIUS
obj_radius = OBJ_RADIUS
mode = 'nearby' # 'nearby' or 'rect'
query_bbox = [-100, 100, -100, 100]

print(traj_df)

OBS_LEN: 20
LANE_RADIUS: 30
OBJ_RADIUS: 30
      TIMESTAMP                              TRACK_ID OBJECT_TYPE           X  \
0       0.00000  00000000-0000-0000-0000-000000000000          AV  598.663624   
1       0.00000  00000000-0000-0000-0000-000000051661      OTHERS  618.404171   
2       0.00000  00000000-0000-0000-0000-000000051798      OTHERS  582.502438   
3       0.00000  00000000-0000-0000-0000-000000051757      OTHERS  650.240829   
4       0.00000  00000000-0000-0000-0000-000000051599      OTHERS  595.241947   
...         ...                                   ...         ...         ...   
1038    4.89916  00000000-0000-0000-0000-000000051894      OTHERS  596.621870   
1039    4.89916  00000000-0000-0000-0000-000000051639       AGENT  602.637295   
1040    4.89916  00000000-0000-0000-0000-000000051889      OTHERS  597.087316   
1041    4.89916  00000000-0000-0000-0000-000000051771      OTHERS  594.939561   
1042    4.89916  00000000-0000-0000-0000-000000051632      OTHERS 

In [22]:
# Time Normalization so that the sequence starts at 0
traj_df['TIMESTAMP'] -= np.min(traj_df['TIMESTAMP'].values)
seq_ts = np.unique(traj_df['TIMESTAMP'].values)
seq_len = seq_ts.shape[0]

print('Normalized timestamp: \n', seq_ts)
print('sequence length:', seq_len)

Normalized timestamp: 
 [0.         0.10002953 0.19654495 0.29651338 0.39852524 0.50251335
 0.60243654 0.69699281 0.79970425 0.89752984 1.00244039 1.09576762
 1.19675922 1.29768014 1.39789224 1.49737501 1.59750032 1.69619334
 1.8003788  1.90153521 2.0120312  2.09697974 2.19333982 2.29298574
 2.40065807 2.49518925 2.59894401 2.69540894 2.79657811 2.89352387
 3.00350517 3.09673125 3.19999939 3.29636294 3.40343642 3.49622393
 3.59391522 3.69679397 3.80828524 3.89221054 3.99273527 4.08895117
 4.18673015 4.29252023 4.39606696 4.49331379 4.59667361 4.69836873
 4.80369681 4.89915991]
sequence length: 50


In [57]:
# Notebook walk-through of the body of compute_feature_for_one_seq.
city_name = traj_df['CITY_NAME'].iloc[0]
agent_df = None
agent_x_end, agent_y_end, start_x, start_y, query_x, query_y, norm_center = [None] * 7

# Find the rows of the AGENT track and record its first, last and last-observed
# positions. Unlike the function version, this loop neither breaks nor raises,
# so agent_df stays None when there is no AGENT group.
for obj_type, remain_df in traj_df.groupby('OBJECT_TYPE'):
    if obj_type == 'AGENT':
        agent_df = remain_df
        start_x, start_y = agent_df[['X', 'Y']].values[0]
        agent_x_end, agent_y_end = agent_df[['X', 'Y']].values[-1]
        query_x, query_y = agent_df[['X', 'Y']].values[obs_len-1]
        # the last observed agent position is the normalization origin
        norm_center = np.array([query_x, query_y])
# keep only the rows up to the agent's obs_len-th timestamp (the observation window)
traj_df = traj_df[traj_df['TIMESTAMP'] <= agent_df['TIMESTAMP'].values[obs_len-1]]
assert (np.unique(traj_df["TIMESTAMP"].values).shape[0] == obs_len), "Obs len mismatch"
# this function is explained below
lane_feature_ls = get_nearby_lane_feature_ls(am, agent_df, obs_len, city_name, lane_radius, norm_center, mode=mode, query_bbox=query_bbox)
# this function is explained below
obj_feature_ls = get_nearby_moving_obj_feature_ls(agent_df, traj_df, obs_len, seq_ts, norm_center)
# this function is explained below
agent_feature = get_agent_feature_ls(agent_df, obs_len, norm_center)

### 1.1 get_nearby_lane_feature_ls function in lane_utils.py
```
get_nearby_lane_feature_ls(am, agent_df, obs_len, city_name, lane_radius, norm_center, mode=mode, query_bbox=query_bbox)
```
- This is called inside compute_feature_for_one_seq function as it computes lane features
- computes lane features
- centerlane is an array of (x, y, z) points where z is nan, so it is actually a list of 2D coordinates. The centerlane coordinates are fetched from argoverse-api using lane_id as the key. Think of centerlane as a polyline: an ordered list of [x, y] points along the center of the lane, so this list represents the "way" the lane follows.

In [59]:
def get_nearby_lane_feature_ls(am, agent_df, obs_len, city_name, lane_radius, norm_center, has_attr=False, mode='nearby', query_bbox=None):
    '''
    compute lane features around the agent's last observed position
    args:
        am: map object; queried for lane ids, traffic control, intersection
            flags and centerlines
        agent_df: agent rows; the X/Y values at index obs_len-1 are the query point
        obs_len: number of observed timestamps
        city_name: city key passed to every map query
        lane_radius: search range passed to am.get_lane_ids_in_xy_bbox
        norm_center: np.ndarray, subtracted from the x/y columns of each centerline
        has_attr: not referenced in this function
        mode: 'nearby' return nearby lanes within the radius. Only 'nearby' is
            implemented in this function; any other value (including 'rect')
            yields an empty list and a warning.
        query_bbox: List[int, int, int, int]; not referenced in this function
    returns:
        list of list of lane a segment feature, formatted in [left_lane, right_lane, is_traffic_control, is_intersection, lane_id]
    '''
    lane_feature_ls = []
    if mode != 'nearby':
        # Previously this case returned [] without any notice, which silently
        # produced samples with no lane features.
        import warnings
        warnings.warn(
            f"get_nearby_lane_feature_ls: mode {mode!r} is not implemented; "
            "returning no lane features",
            stacklevel=2,
        )
        return lane_feature_ls

    query_x, query_y = agent_df[['X', 'Y']].values[obs_len-1]
    # lane ids found around (query_x, query_y) with search range `lane_radius`
    # (per the argoverse-api docs this is a Manhattan range, i.e. a bbox of
    # +/- lane_radius in x and y -- TODO confirm against the installed version)
    nearby_lane_ids = am.get_lane_ids_in_xy_bbox(query_x, query_y, city_name, lane_radius)
    for lane_id in nearby_lane_ids:
        traffic_control = am.lane_has_traffic_control_measure(lane_id, city_name)
        is_intersection = am.lane_is_in_intersection(lane_id, city_name)
        # Work on a copy so the in-place shift below can never write through
        # to an array the map object may still be holding.
        centerlane = am.get_lane_segment_centerline(lane_id, city_name).copy()
        # normalize to last observed timestamp point of agent
        centerlane[:, :2] -= norm_center
        halluc_lane_1, halluc_lane_2 = get_halluc_lane(centerlane, city_name)
        lane_feature_ls.append([halluc_lane_1, halluc_lane_2, traffic_control, is_intersection, lane_id])
    return lane_feature_ls

In [71]:
centerlane # normalized based on last observed timestamp point of agent

array([[-28.58607131,   6.75708774,          nan],
       [-26.86963019,   6.82157199,          nan],
       [-25.15344902,   6.89267061,          nan],
       [-23.43726786,   6.96376923,          nan],
       [-21.72108669,   7.03486786,          nan],
       [-20.00490553,   7.10596648,          nan],
       [-18.28927696,   7.18927595,          nan],
       [-16.57365677,   7.27277024,          nan],
       [-14.85803657,   7.35626453,          nan],
       [-13.14241637,   7.43975882,          nan]])

#### 1.1.1 get_halluc_lane function in lane_utils.py

```
get_halluc_lane(centerlane, city_name)
```
- This is called inside the get_nearby_lane_feature_ls function
- It creates hallucinated coordinates that serve as the left and right boundaries (edges) of the lane
- These edge coordinates are calculated as follows:
    - calculate dx by subtracting two consecutive centerline coordinates: the direction from st to en
    - normalize dx by norm(dx): the unit vector in that direction
    - rotate_quat: 90 degree counter clockwise rotation matrix
    - e1 = rotate_quat @ dx / norm: 90 degree counter clockwise rotation of the unit vector (used to infer the left edge)
    - e2 = rotate_quat.T @ dx / norm: 90 degree clockwise rotation, same as -e1
    - lane_1: adding e1 * half_width to st and en, giving the left lane boundary
    - lane_2: adding e2 * half_width to st and en, giving the right lane boundary

In [None]:
from utils.config import LANE_WIDTH
def get_halluc_lane(centerlane, city_name):
    """
    return left & right lane based on centerline

    Every pair of consecutive centerline points (st, en) becomes one row of
    each output; the two boundaries are the segment shifted by half a lane
    width along the two unit normals of the segment direction.

    args:
        centerlane: array of N points whose first three columns are x, y, z
        city_name: key into LANE_WIDTH
    returns:
        doubled_left_halluc_lane, doubled_right_halluc_lane, each shaped
        (N-1, 6) with columns [x_st, y_st, z_st, x_en, y_en, z_en]; the z
        values are copied unchanged from the centerline
    raises:
        ValueError: if the centerline has fewer than two points
    """
    if centerlane.shape[0] <= 1:
        raise ValueError('shape of centerlane error.')

    half_width = LANE_WIDTH[city_name] / 2

    # All N-1 segments are processed at once instead of growing the result
    # with np.vstack inside a Python loop.
    st, en = centerlane[:-1, :2], centerlane[1:, :2]
    z_st, z_en = centerlane[:-1, 2:3], centerlane[1:, 2:3]

    dx = en - st  # direction of every segment, from st to en
    norm = np.linalg.norm(dx, axis=1, keepdims=True)  # segment lengths

    # Unit normals: rotating (a, b) by +90 degrees gives (-b, a), which is what
    # [[0, -1], [1, 0]] @ (a, b) computes; e2 is the opposite normal.
    # A zero-length segment yields nan here, as the per-segment division did.
    e1 = np.column_stack((-dx[:, 1], dx[:, 0])) / norm
    e2 = np.column_stack((dx[:, 1], -dx[:, 0])) / norm

    halluc_lane_1 = np.hstack(
        (st + e1 * half_width, z_st, en + e1 * half_width, z_en))
    halluc_lane_2 = np.hstack(
        (st + e2 * half_width, z_st, en + e2 * half_width, z_en))
    # The previous implementation stacked onto float64 buffers, so keep the
    # result dtype at float64 regardless of the input dtype.
    return (halluc_lane_1.astype(np.float64, copy=False),
            halluc_lane_2.astype(np.float64, copy=False))

In [78]:
rotate_quat = np.array([[0.0, -1.0], [1.0, 0.0]])
st, en = centerlane[0][:2], centerlane[0+1][:2]

In [99]:
print('st:', st)
print('end:', en)

st: [-28.58607131   6.75708774]
end: [-26.86963019   6.82157199]


In [97]:
en

array([-26.86963019,   6.82157199])

In [74]:
dx = en - st
dx

array([1.71644112, 0.06448424])

In [75]:
norm = np.linalg.norm(dx)
norm

1.7176519818435505

In [None]:
rotate_quat = np.array([[0.0, -1.0], [1.0, 0.0]])

In [81]:
e1, e2 = rotate_quat @ dx / norm, rotate_quat.T @ dx / norm

In [98]:
print('e1: ', e1)
print('e2: ', e2)

e1:  [-0.03754209  0.99929505]
e2:  [ 0.03754209 -0.99929505]


In [100]:
dx / norm

array([0.99929505, 0.03754209])

### 1.2 get_nearby_moving_obj_feature_ls function in object_utils.py
```
get_nearby_moving_obj_feature_ls(agent_df, traj_df, obs_len, seq_ts, norm_center)
```
- this function is called inside compute_feature_for_one_seq
- computes features of nearby moving objects
- I changed EXIST_THRESHOLD from 50 to 10. traj_df has already been pruned to the first 20 timesteps (the first 2 seconds are used as observation), so a threshold of 50 timesteps would always return an empty list: no moving object can have more than 20 timesteps. Requiring an object to appear in at least 50% of the observed timesteps, i.e. 10, is a reasonable threshold by convention

In [124]:
from utils.config import EXIST_THRESHOLD, OBJ_RADIUS

def get_nearby_moving_obj_feature_ls(agent_df, traj_df, obs_len, seq_ts, norm_center):
    """
    Build the features of every non-agent track that is long enough, not
    stationary, and close to the agent's last observed position.

    args:
        agent_df: agent rows; the X/Y values at index obs_len-1 are the
            reference point for the distance filter
        traj_df: rows of all tracks, grouped here by TRACK_ID
        obs_len: number of observed timestamps
        seq_ts: not referenced in this function
        norm_center: subtracted from every kept track's coordinates
    returns: list of list, (doubled_track, object_type, timestamp, track_id)
        doubled_track has one row [xs, ys, xe, ye] per pair of consecutive
        points; timestamp holds the mean timestamp of each pair.
    """
    obj_feature_ls = []
    query_x, query_y = agent_df[['X', 'Y']].values[obs_len-1]
    p0 = np.array([query_x, query_y])
    for track_id, remain_df in traj_df.groupby('TRACK_ID'):
        if remain_df['OBJECT_TYPE'].iloc[0] == 'AGENT':
            continue

        # the length check comes first so the stationary test is only run on
        # tracks that are long enough
        if len(remain_df) < EXIST_THRESHOLD or get_is_track_stationary(remain_df):
            continue

        xys = remain_df[['X', 'Y']].values  # other object coordinates
        ts = remain_df["TIMESTAMP"].values

        # The distance filter uses the un-normalized coordinates and the
        # module-level OBJ_RADIUS (this function takes no radius argument).
        p1 = xys[-1]  # most recent coordinate
        if np.linalg.norm(p0 - p1) > OBJ_RADIUS:
            continue

        # normalize to last observed timestamp point of agent.
        # Subtract out of place: the array obtained from `.values` can be
        # read-only (pandas Copy-on-Write), in which case `xys -= ...` raises.
        xys = xys - norm_center
        xys = np.hstack((xys[:-1], xys[1:]))  # [[first xy, second xy], [second xy, third xy], ...]

        ts = (ts[:-1] + ts[1:]) / 2  # mean timestamp of each consecutive pair

        obj_feature_ls.append(
            [xys, remain_df['OBJECT_TYPE'].iloc[0], ts, track_id])
    return obj_feature_ls

In [141]:
EXIST_THRESHOLD

10

In [108]:
remain_df

Unnamed: 0,TIMESTAMP,TRACK_ID,OBJECT_TYPE,X,Y,CITY_NAME
371,1.696193,00000000-0000-0000-0000-000000051876,OTHERS,595.366076,855.053742,MIA
391,1.800379,00000000-0000-0000-0000-000000051876,OTHERS,595.432209,854.998254,MIA
412,1.901535,00000000-0000-0000-0000-000000051876,OTHERS,595.231849,855.070501,MIA


In [130]:
OBJ_RADIUS

30

In [132]:
xys

array([[-6.68409285, 39.74868874, -6.6179599 , 39.69320097],
       [-6.6179599 , 39.69320097, -6.81831994, 39.76544813]])

#### 1.2.1 get_is_track_stationary function in object_utils.py
```
get_is_track_stationary(remain_df)
```
- this function is called inside the get_nearby_moving_obj_feature_ls
- this function checks if the track is stationary

In [122]:
from utils.config import VELOCITY_THRESHOLD
def get_is_track_stationary(track_df: pd.DataFrame) -> bool:
    """Decide whether a track counts as stationary.

    The velocities returned by `compute_velocity` are sorted and the element
    at index ``len // 2`` (roughly the median) is compared against
    VELOCITY_THRESHOLD.

    Args:
        track_df (pandas Dataframe): Data for the track
    Return:
        _ (bool): True if that velocity is below VELOCITY_THRESHOLD, else False

    """
    velocities = compute_velocity(track_df)
    ranked = sorted(velocities)
    middle_idx = len(velocities) // 2
    if ranked[middle_idx] < VELOCITY_THRESHOLD:
        return True
    return False

In [123]:
VELOCITY_THRESHOLD

1.0

In [120]:
vel

[8958.634324644798, 9258.071129962122]

### 1.3 get_agent_feature_ls function in agent_utils.py
```
get_agent_feature_ls(agent_df, obs_len, norm_center)
```
- this function is called inside compute_feature_for_one_seq function
- computes agent features
- returns [xys, object_type, ts, track_id, gt_xys] where xys is in the form of [xs, ys, xe, ye] for vector representation, ts is the average timestamp of the two consecutive coordinates that make up each segment ([xs, ys, xe, ye]), and gt_xys holds the normalized [x, y] coordinates of the last 3 seconds (not doubled)

In [None]:
def get_agent_feature_ls(agent_df, obs_len, norm_center):
    """
    Build the agent's observed vector features and its ground-truth future.

    args:
        agent_df: agent rows; the columns read are X, Y, TIMESTAMP,
            OBJECT_TYPE and TRACK_ID
        obs_len: number of observed points; rows from obs_len onwards are the
            ground truth
        norm_center: subtracted from both the observed and the future coordinates
    returns:
        list of (doubled_track, object_type, timestamp, track_id, not_doubled_groundtruth_feature_trajectory)
        doubled_track has one row [xs, ys, xe, ye] per pair of consecutive
        observed points, timestamp is the mean timestamp of each pair, and the
        ground truth keeps one [x, y] row per future point.
    """
    agent_xy = agent_df[["X", "Y"]].values
    # normalize to last observed timestamp point of agent.
    # Subtract out of place: the array obtained from `.values` can be
    # read-only (pandas Copy-on-Write), in which case `xys -= ...` raises, and
    # an out-of-place result can never alias the DataFrame's data.
    xys = agent_xy[:obs_len] - norm_center
    gt_xys = agent_xy[obs_len:] - norm_center
    xys = np.hstack((xys[:-1], xys[1:]))

    ts = agent_df['TIMESTAMP'].values[:obs_len]
    ts = (ts[:-1] + ts[1:]) / 2

    return [xys, agent_df['OBJECT_TYPE'].iloc[0], ts, agent_df['TRACK_ID'].iloc[0], gt_xys]

## 2. encoding_features in feature_utils.py function explanation
```
encoding_features(agent_feature, obj_feature_ls, lane_feature_ls)
```

- It is called inside the preprocess_data.py
- agent_feature: list of (doubled_track, object_type, timestamp, track_id, not_doubled_groundtruth_feature_trajectory)
    - [ [xs, ys, xe, ye], 'AGENT', ts, track_id, gt_xys ]
- obj_feature_ls: list of list of (doubled_track, object_type, timestamp, track_id)
    - [ [object 1 features], [object 2 features] ] where each object features = [ [xs, ys, xe, ye], 'object_type', ts, track_id ]
- lane_feature_ls: list of list of lane a segment feature, formatted in [left_lane, right_lane, is_traffic_control, is_intersection, lane_id]
- This function takes the above arguments and creates polyline features.
    - for agent and objects: array(xs, ys, xe, ye, object_type, ts, polyline_id)
    - for lane: array(xs, ys, zs, xe, ye, ze, polyline_id)
- Then, to concatenate these two kinds of polyline features, which have different columns, do the following trick:
    - change agent and object polyline features to (xs, ys, xe, ye, timestamp, NULL, NULL, polyline_id). Here object_type no longer exists
    - change lane polyline features to (xs, ys, xe, ye, NULL, zs, ze, polyline_id)
    - Then concat in vertical stack way
- Finally, returns dataframe of [polyline_features, offset_gt, traj_id2mask, lane_id2mask, traj_nd.shape[0], lane_nd.shape[0]]
    - offset_gt holds the last 3 seconds of ground-truth coordinates as per-step offsets starting from the last observed point: the first row is gt[0] itself, followed by gt[1] - gt[0], gt[2] - gt[1], ...
    - traj_id2mask is a dict that takes polyline_id as a key and (previous length, current length) as value. previous length is agent and object polyline features shape[0] without current object shape[0] added. current length is after current object shape[0] added
    - lane_id2mask is the same but for lane
    - traj_nd.shape[0] is total polyline features length for all the objects and ego vehicle(agent)
    - lane_nd.shape[0] is the same for lane

In [None]:
def encoding_features(agent_feature, obj_feature_ls, lane_feature_ls):
    """
    Pack the agent, object and lane features of one sequence into a single
    polyline feature matrix plus the bookkeeping needed to split it again.

    args:
        agent_feature_ls:
            list of (doubled_track, object_type, timestamp, track_id, not_doubled_groundtruth_feature_trajectory)
        obj_feature_ls:
            list of list of (doubled_track, object_type, timestamp, track_id)
        lane_feature_ls:
            list of list of lane a segment feature, formatted in [left_lane, right_lane, is_traffic_control, is_intersection, lane_id]
    returns:
        one-row pd.DataFrame with the columns
            POLYLINE_FEATURES: float32 vstack[
                (xs, ys, xe, ye, timestamp, NULL, NULL, polyline_id),
                (xs, ys, xe, ye, NULL, zs, ze, polyline_id)
                ]
            GT: result of trans_gt_offset_format on the agent ground truth
            TRAJ_ID_TO_MASK: Dict[int, Tuple[int, int]], polyline_id ->
                (row count before, row count after) appending that track to
                the trajectory rows
            LANE_ID_TO_MASK: Dict[int, Tuple[int, int]], the same for lane
                polylines, counted within the lane rows only
            TARJ_LEN: number of trajectory rows
            LANE_LEN: number of lane rows
        The intermediate trajectory rows carry an object_type column
        (1 - agent, 0 - others); it is dropped from POLYLINE_FEATURES.
    raises:
        ValueError: if an object's timestamp vector does not have one entry
            per row of its doubled track
    """
    polyline_id = 0
    traj_id2mask, lane_id2mask = {}, {}
    gt = agent_feature[-1]
    traj_nd, lane_nd = np.empty((0, 7)), np.empty((0, 7))

    # encoding agent feature: (xs, ys, xe, ye, 1, timestamp, polyline_id)
    pre_traj_len = traj_nd.shape[0]
    agent_len = agent_feature[0].shape[0]
    agent_nd = np.hstack((agent_feature[0], np.ones(
        (agent_len, 1)), agent_feature[2].reshape((-1, 1)), np.ones((agent_len, 1)) * polyline_id))
    assert agent_nd.shape[1] == 7, "obj_traj feature dim 1 is not correct"

    traj_nd = np.vstack((traj_nd, agent_nd))
    traj_id2mask[polyline_id] = (pre_traj_len, traj_nd.shape[0])
    pre_traj_len = traj_nd.shape[0]
    polyline_id += 1

    # encoding obj feature: (xs, ys, xe, ye, 0, timestamp, polyline_id)
    for obj_feature in obj_feature_ls:
        obj_len = obj_feature[0].shape[0]
        # Fail with a clear error instead of dropping into a pdb breakpoint,
        # which would hang a non-interactive preprocessing run.
        if obj_feature[2].shape[0] != obj_len:
            raise ValueError(
                f"object track has {obj_len} vectors but "
                f"{obj_feature[2].shape[0]} timestamps")
        obj_nd = np.hstack((obj_feature[0], np.zeros(
            (obj_len, 1)), obj_feature[2].reshape((-1, 1)), np.ones((obj_len, 1)) * polyline_id))
        assert obj_nd.shape[1] == 7, "obj_traj feature dim 1 is not correct"
        traj_nd = np.vstack((traj_nd, obj_nd))

        traj_id2mask[polyline_id] = (pre_traj_len, traj_nd.shape[0])
        pre_traj_len = traj_nd.shape[0]
        polyline_id += 1

    # encoding lane feature: every lane contributes two polylines (left and
    # right boundary), each with its own polyline_id
    pre_lane_len = lane_nd.shape[0]
    for lane_feature in lane_feature_ls:
        l_lane_len = lane_feature[0].shape[0]
        l_lane_nd = np.hstack(
            (lane_feature[0], np.ones((l_lane_len, 1)) * polyline_id))
        assert l_lane_nd.shape[1] == 7, "obj_traj feature dim 1 is not correct"
        lane_nd = np.vstack((lane_nd, l_lane_nd))
        lane_id2mask[polyline_id] = (pre_lane_len, lane_nd.shape[0])
        _tmp_len_1 = pre_lane_len - lane_nd.shape[0]
        pre_lane_len = lane_nd.shape[0]
        polyline_id += 1

        r_lane_len = lane_feature[1].shape[0]
        r_lane_nd = np.hstack(
            (lane_feature[1], np.ones((r_lane_len, 1)) * polyline_id)
        )
        assert r_lane_nd.shape[1] == 7, "obj_traj feature dim 1 is not correct"
        lane_nd = np.vstack((lane_nd, r_lane_nd))
        lane_id2mask[polyline_id] = (pre_lane_len, lane_nd.shape[0])
        _tmp_len_2 = pre_lane_len - lane_nd.shape[0]
        pre_lane_len = lane_nd.shape[0]
        polyline_id += 1

        # both boundaries of one lane must contribute the same number of rows
        assert _tmp_len_1 == _tmp_len_2, "left, right lane vector length contradict"

    # FIXME: handling `nan` in lane_nd
    # If any column has no finite mean (e.g. every z value is nan), zero the
    # two z columns; otherwise replace each nan by the mean of its column.
    col_mean = np.nanmean(lane_nd, axis=0)
    if np.isnan(col_mean).any():
        lane_nd[:, 2].fill(.0)
        lane_nd[:, 5].fill(.0)
    else:
        inds = np.where(np.isnan(lane_nd))
        lane_nd[inds] = np.take(col_mean, inds[1])

    # transform gt to offset_gt
    offset_gt = trans_gt_offset_format(gt)

    # now the features are:
    # (xs, ys, xe, ye, object_type, timestamp, polyline_id) for object
    # (xs, ys, zs, xe, ye, ze, polyline_id) for lanes

    # change lanes feature to (xs, ys, xe, ye, NULL, zs, ze, polyline_id):
    # append one zero column (index 7) and reorder
    lane_nd = np.hstack(
        [lane_nd, np.zeros((lane_nd.shape[0], 1), dtype=lane_nd.dtype)])
    lane_nd = lane_nd[:, [0, 1, 3, 4, 7, 2, 5, 6]]
    # change object features to (xs, ys, xe, ye, timestamp, NULL, NULL, polyline_id):
    # append two zero columns (indices 7, 8) and reorder; the object_type
    # column (index 4) is not selected
    traj_nd = np.hstack(
        [traj_nd, np.zeros((traj_nd.shape[0], 2), dtype=traj_nd.dtype)])
    traj_nd = traj_nd[:, [0, 1, 2, 3, 5, 7, 8, 6]]

    # don't ignore the id
    polyline_features = np.vstack((traj_nd, lane_nd))
    data = [[polyline_features.astype(
        np.float32), offset_gt, traj_id2mask, lane_id2mask, traj_nd.shape[0], lane_nd.shape[0]]]

    return pd.DataFrame(
        data,
        columns=["POLYLINE_FEATURES", "GT",
                 "TRAJ_ID_TO_MASK", "LANE_ID_TO_MASK", "TARJ_LEN", "LANE_LEN"]
    )

In [159]:
# Notebook walk-through of encoding_features: initial state.
polyline_id = 0
traj_id2mask, lane_id2mask = {}, {}
gt = agent_feature[-1] # last element of agent_feature: the ground-truth x, y coordinates
traj_nd, lane_nd = np.empty((0, 7)), np.empty((0, 7)) # zero-row, 7-column arrays that feature rows are stacked onto

In [166]:
print('gt shape: ', gt.shape)
print('traj_nd shape: ',traj_nd.shape)

gt shape:  (30, 2)
traj_nd shape:  (0, 7)


In [182]:
# Notebook walk-through of encoding_features: encode the agent track.
pre_traj_len = traj_nd.shape[0]
agent_len = agent_feature[0].shape[0]
# agent_feature[0] : xys, [xs, ys, xe, ye]
# agent_feature[2] : representative timestamp for each segment
# resulting columns: (xs, ys, xe, ye, 1, timestamp, polyline_id)
agent_nd = np.hstack((agent_feature[0], np.ones((agent_len, 1)), agent_feature[2].reshape((-1, 1)), np.ones((agent_len, 1)) * polyline_id))
assert agent_nd.shape[1] == 7, "obj_traj feature dim 1 is not correct"

traj_nd = np.vstack((traj_nd, agent_nd))
# remember which rows of traj_nd belong to this polyline: (rows before, rows after)
traj_id2mask[polyline_id] = (pre_traj_len, traj_nd.shape[0])
pre_traj_len = traj_nd.shape[0]
polyline_id += 1

In [181]:
agent_nd

array([[ 0.87084125, 20.21932158,  0.87308075, 19.28813237,  1.        ,
         0.05001476,  0.        ],
       [ 0.87308075, 19.28813237,  0.86913281, 18.27315992,  1.        ,
         0.14828724,  0.        ],
       [ 0.86913281, 18.27315992,  0.82474593, 17.49473437,  1.        ,
         0.24652916,  0.        ],
       [ 0.82474593, 17.49473437,  0.76565395, 16.66880652,  1.        ,
         0.34751931,  0.        ],
       [ 0.76565395, 16.66880652,  0.76006989, 15.6344186 ,  1.        ,
         0.45051929,  0.        ],
       [ 0.76006989, 15.6344186 ,  0.70801954, 14.76012334,  1.        ,
         0.55247495,  0.        ],
       [ 0.70801954, 14.76012334,  0.66453049, 13.81127604,  1.        ,
         0.64971468,  0.        ],
       [ 0.66453049, 13.81127604,  0.61970006, 12.87656481,  1.        ,
         0.74834853,  0.        ],
       [ 0.61970006, 12.87656481,  0.56796328, 11.894135  ,  1.        ,
         0.84861705,  0.        ],
       [ 0.56796328, 11.8941

In [180]:
agent_nd.shape

(19, 7)

In [191]:
# encoding obj feature
# obj_feature_ls: a list of each object features, [[object1 features], [object2 features], ...]
for obj_feature in obj_feature_ls:
    obj_len = obj_feature[0].shape[0] # number of [xs, ys, xe, ye] rows of this object
    # if not obj_feature[2].shape[0] == obj_len:
    #     from pdb import set_trace;set_trace()
    # resulting columns: (xs, ys, xe, ye, 0, timestamp, polyline_id)
    obj_nd = np.hstack((obj_feature[0], np.zeros(
        (obj_len, 1)), obj_feature[2].reshape((-1, 1)), np.ones((obj_len, 1)) * polyline_id))
    assert obj_nd.shape[1] == 7, "obj_traj feature dim 1 is not correct"
    traj_nd = np.vstack((traj_nd, obj_nd))

    # (rows before, rows after) appending this object to traj_nd
    traj_id2mask[polyline_id] = (pre_traj_len, traj_nd.shape[0])
    pre_traj_len = traj_nd.shape[0]
    polyline_id += 1

# encoding lane feature
# lane_feature_ls: a list of lane features, [[lane1 features], [lane2 features], ...]
# each lane adds two polylines (lane_feature[0] and lane_feature[1]), each with its own polyline_id
pre_lane_len = lane_nd.shape[0]
for lane_feature in lane_feature_ls:
    l_lane_len = lane_feature[0].shape[0]
    l_lane_nd = np.hstack(
        (lane_feature[0], np.ones((l_lane_len, 1)) * polyline_id))
    assert l_lane_nd.shape[1] == 7, "l_lane_traj feature dim 1 is not correct"
    lane_nd = np.vstack((lane_nd, l_lane_nd))
    lane_id2mask[polyline_id] = (pre_lane_len, lane_nd.shape[0])
    _tmp_len_1 = pre_lane_len - lane_nd.shape[0] # negative row count added by the first boundary
    pre_lane_len = lane_nd.shape[0]
    polyline_id += 1

    r_lane_len = lane_feature[1].shape[0]
    r_lane_nd = np.hstack(
        (lane_feature[1], np.ones((r_lane_len, 1)) * polyline_id)
    )
    assert r_lane_nd.shape[1] == 7, "r_lane_traj feature dim 1 is not correct"
    lane_nd = np.vstack((lane_nd, r_lane_nd))
    lane_id2mask[polyline_id] = (pre_lane_len, lane_nd.shape[0])
    _tmp_len_2 = pre_lane_len - lane_nd.shape[0] # negative row count added by the second boundary
    pre_lane_len = lane_nd.shape[0]
    polyline_id += 1

    # both boundaries of one lane must have added the same number of rows
    assert _tmp_len_1 == _tmp_len_2, f"left, right lane vector length contradict"

In [223]:
print('hallucinated_lane_1 shape per lane_id: ', lane_feature_ls[0][0].shape)
print('total number of lane_ids: ', len(lane_feature_ls))
print('length of lane_nd should be: ', lane_feature_ls[0][0].shape[0] * len(lane_feature_ls) * 2) # 2 because hallucinated_lane_1 and hallucinated_lane_2

hallucinated_lane_1 shape per lane_id:  (9, 6)
total number of lane_ids:  27
length of lane_nd should be:  486


In [224]:
lane_nd.shape

(486, 7)

In [225]:
lane_nd

array([[-43.43728575,   7.99502789,          nan, ...,   8.07058635,
                 nan,   1.        ],
       [-41.79696723,   8.07058635,          nan, ...,   8.14614481,
                 nan,   1.        ],
       [-40.15664872,   8.14614481,          nan, ...,   8.22170327,
                 nan,   1.        ],
       ...,
       [-18.19594656,   5.27154567,          nan, ...,   5.35503996,
                 nan,  54.        ],
       [-16.48032636,   5.35503996,          nan, ...,   5.43853425,
                 nan,  54.        ],
       [-14.76470616,   5.43853425,          nan, ...,   5.52202854,
                 nan,  54.        ]])

In [229]:
# Notebook walk-through of the nan handling in encoding_features.
# np.nanmean yields nan (and a RuntimeWarning) for a column that is entirely nan.
col_mean = np.nanmean(lane_nd, axis=0)
if np.isnan(col_mean).any():
    # some column has no finite mean: zero the two z columns (indices 2 and 5)
    lane_nd[:, 2].fill(.0)
    lane_nd[:, 5].fill(.0)
else:
    # otherwise replace every nan by the mean of its column
    inds = np.where(np.isnan(lane_nd))
    lane_nd[inds] = np.take(col_mean, inds[1])

  col_mean = np.nanmean(lane_nd, axis=0)


In [227]:
col_mean

array([-2.05269328, 11.65601556,         nan, -1.88652533, 10.5182771 ,
               nan, 27.5       ])

In [230]:
lane_nd

array([[-43.43728575,   7.99502789,   0.        , ...,   8.07058635,
          0.        ,   1.        ],
       [-41.79696723,   8.07058635,   0.        , ...,   8.14614481,
          0.        ,   1.        ],
       [-40.15664872,   8.14614481,   0.        , ...,   8.22170327,
          0.        ,   1.        ],
       ...,
       [-18.19594656,   5.27154567,   0.        , ...,   5.35503996,
          0.        ,  54.        ],
       [-16.48032636,   5.35503996,   0.        , ...,   5.43853425,
          0.        ,  54.        ],
       [-14.76470616,   5.43853425,   0.        , ...,   5.52202854,
          0.        ,  54.        ]])

In [None]:
# trans_gt_offset_format function is explained below
offset_gt = trans_gt_offset_format(gt) # returns offsets. [ [gt[1] - gt[0]], [ gt[2] -g t[1] ], ...]

In [253]:
# now the features are:
# (xs, ys, xe, ye, object_type, timestamp, polyline_id) for object
# (xs, ys, zs, xe, ye, ze, polyline_id) for lanes

# change lanes feature to (xs, ys, xe, ye, NULL, zs, ze, polyline_id):
# append one zero column (index 7), then reorder the columns
lane_nd = np.hstack([lane_nd, np.zeros((lane_nd.shape[0], 1), dtype=lane_nd.dtype)])
lane_nd = lane_nd[:, [0, 1, 3, 4, 7, 2, 5, 6]]
# change object features to (xs, ys, xe, ye, timestamp, NULL, NULL, polyline_id):
# append two zero columns (indices 7, 8), then reorder; the object_type column (index 4) is not selected
traj_nd = np.hstack([traj_nd, np.zeros((traj_nd.shape[0], 2), dtype=traj_nd.dtype)])
traj_nd = traj_nd[:, [0, 1, 2, 3, 5, 7, 8, 6]]

# don't ignore the id
# trajectory rows come first, lane rows after them
polyline_features = np.vstack((traj_nd, lane_nd))
data = [[polyline_features.astype(np.float32), offset_gt, traj_id2mask, lane_id2mask, traj_nd.shape[0], lane_nd.shape[0]]]

In [255]:
polyline_features

array([[  0.87084125,  20.21932158,   0.87308075, ...,   0.        ,
          0.        ,   0.        ],
       [  0.87308075,  19.28813237,   0.86913281, ...,   0.        ,
          0.        ,   0.        ],
       [  0.86913281,  18.27315992,   0.82474593, ...,   0.        ,
          0.        ,   0.        ],
       ...,
       [-18.19594656,   5.27154567, -16.48032636, ...,   0.        ,
          0.        ,  54.        ],
       [-16.48032636,   5.35503996, -14.76470616, ...,   0.        ,
          0.        ,  54.        ],
       [-14.76470616,   5.43853425, -13.04908596, ...,   0.        ,
          0.        ,  54.        ]])

In [259]:
len(data[0])

6

### 2.1 trans_gt_offset_format function in feature_utils.py
```
trans_gt_offset_format(gt)
```
- this function is called inside encoding_features function
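- returns offset_gt, which represents per-step coordinate offsets starting from the last observed location. The paper additionally rotates the coordinate system based on the heading of the target vehicle at the last observed location; note that the function shown below only computes the per-step differences and performs no rotation.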

In [None]:
def trans_gt_offset_format(gt):
    """Convert absolute ground-truth positions into per-step offsets.

    The first output row is ``gt[0]`` itself; every following row ``i`` is
    ``gt[i] - gt[i - 1]``, so ``offset_gt.cumsum(axis=0)`` reproduces ``gt``.
    No rotation or other coordinate transform is applied here; the input is
    used exactly as given.

    Args:
        gt: ndarray of shape ``(30, 2)``, or ``(0, 2)`` when no ground truth
            is available (e.g. the test split).

    Returns:
        ndarray with the same shape as ``gt`` holding the per-step offsets.
        A ``(0, 2)`` input is returned unchanged.

    Raises:
        AssertionError: if ``gt`` has an unexpected shape, or if the offsets
            do not reconstruct ``gt`` when accumulated.
    """
    assert gt.shape == (30, 2) or gt.shape == (0, 2), f"{gt.shape} is wrong"

    # for test, no gt, just return a (0, 2) ndarray
    if gt.shape == (0, 2):
        return gt

    offset_gt = np.vstack((gt[0], gt[1:] - gt[:-1]))

    # Sanity check: accumulating the offsets must give back the input.
    # Compare element-wise; a signed sum of the residual would let positive
    # and negative errors cancel and would accept any negative total.
    rebuilt = offset_gt.cumsum(axis=0)
    assert np.allclose(rebuilt, gt, atol=1e-6), (
        f"max abs reconstruction error: {np.abs(rebuilt - gt).max()}"
    )

    return offset_gt

In [248]:
gt

array([[-1.41191360e-01, -7.25470306e-01],
       [-1.94483338e-01, -2.09854082e+00],
       [-2.63776742e-01, -3.21874101e+00],
       [-2.95501797e-01, -4.53486013e+00],
       [-3.16634870e-01, -5.82386836e+00],
       [-3.68211708e-01, -7.21845016e+00],
       [-3.39067141e-01, -8.55146026e+00],
       [-3.58316458e-01, -9.95174055e+00],
       [-3.40818869e-01, -1.14272523e+01],
       [-3.53501353e-01, -1.29405419e+01],
       [-3.22657885e-01, -1.43041238e+01],
       [-2.88234570e-01, -1.57339501e+01],
       [-3.01689465e-01, -1.72973049e+01],
       [-2.70060718e-01, -1.88551640e+01],
       [-2.29726641e-01, -2.03349161e+01],
       [-1.84780898e-01, -2.18480218e+01],
       [-1.61741405e-01, -2.34479923e+01],
       [-1.13788993e-01, -2.49812366e+01],
       [-4.65114205e-02, -2.65540997e+01],
       [ 1.25489165e-03, -2.80560183e+01],
       [ 1.64995721e-02, -2.96354387e+01],
       [ 8.82846529e-02, -3.13419149e+01],
       [ 1.62935407e-01, -3.30333472e+01],
       [ 2.

In [249]:
# First row is the starting point gt[0]; np.diff along axis 0 supplies the
# consecutive differences gt[i] - gt[i - 1] for the remaining rows.
step_deltas = np.diff(gt, axis=0)
offset_gt = np.vstack((gt[0], step_deltas))
offset_gt

array([[-0.14119136, -0.72547031],
       [-0.05329198, -1.37307051],
       [-0.0692934 , -1.12020019],
       [-0.03172505, -1.31611912],
       [-0.02113307, -1.28900823],
       [-0.05157684, -1.3945818 ],
       [ 0.02914457, -1.3330101 ],
       [-0.01924932, -1.40028029],
       [ 0.01749759, -1.47551175],
       [-0.01268248, -1.51328964],
       [ 0.03084347, -1.36358189],
       [ 0.03442331, -1.4298263 ],
       [-0.01345489, -1.56335479],
       [ 0.03162875, -1.55785907],
       [ 0.04033408, -1.47975212],
       [ 0.04494574, -1.51310568],
       [ 0.02303949, -1.59997052],
       [ 0.04795241, -1.53324427],
       [ 0.06727757, -1.57286311],
       [ 0.04776631, -1.50191862],
       [ 0.01524468, -1.57942041],
       [ 0.07178508, -1.7064762 ],
       [ 0.07465075, -1.69143224],
       [ 0.06149034, -1.50239741],
       [ 0.04606056, -1.68129941],
       [ 0.05104938, -1.55928098],
       [ 0.06366017, -1.65998015],
       [ 0.08116147, -1.76263002],
       [ 0.05207737,

In [252]:
# Signed sum of the difference between the accumulated offsets and gt.
np.sum(np.cumsum(offset_gt, axis=0) - gt)

0.0

## 3. save_features in feature_utils.py function explanation
```
save_features(feature_df, name, dir_=None)
```

- It is called inside the preprocess_data.py
- saves the feature_df into .pkl
- Last step of preprocessing data
- These preprocessed data (.pkl files) are later loaded by the GraphDataset object in dataset.py, where they are converted into tensors and batched for feeding to the model

In [None]:
def save_features(df, name, dir_=None):
    """Pickle ``df`` to ``<dir_>/features_<name>.pkl``.

    Args:
        df: Object exposing ``to_pickle(path)`` (the encoded feature frame).
        name: Identifier embedded in the output file name.
        dir_: Output directory. Defaults to ``./input_data``. It is created,
            including missing parents, if it does not exist yet.

    Returns:
        None. The only effect is the file written to disk.
    """
    if dir_ is None:
        dir_ = './input_data'
    # exist_ok=True replaces the separate exists() check, so a directory that
    # appears between the check and the creation no longer raises.
    os.makedirs(dir_, exist_ok=True)

    df.to_pickle(os.path.join(dir_, f"features_{name}.pkl"))