In [44]:
#!pip install tqdm
#!conda install shapely descartes geopandas -y

Solving environment: done


  current version: 4.5.12
  latest version: 4.6.14

Please update conda by running

    $ conda update -n base -c defaults conda



# All requested packages already installed.



In [11]:
# general tools
import warnings
import requests
import pickle
import math
import re

# visualization tools
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import seaborn as sns

# data preprocessing tools
import pandas as pd
from shapely.geometry import Point
import numpy as np
from scipy.spatial.distance import cdist


tqdm.pandas()
plt.style.use('seaborn')
warnings.filterwarnings("ignore")

#%run ../src/utils.py

In [12]:
traffic = pd.read_csv('../data/external/Traffic_Published_2016.csv')
traffic.shape

(94258, 16)

In [13]:
traffic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94258 entries, 0 to 94257
Data columns (total 16 columns):
ROUTE_ID                94258 non-null object
FROM_MILEPOINT          94258 non-null float64
TO_MILEPOINT            94258 non-null float64
COUNTY_COD              91713 non-null float64
COUNTY_NAME             91713 non-null object
TC_NUMBER               41239 non-null object
AADT                    94258 non-null int64
AADT_SINGLE_UNIT        44070 non-null float64
PCT_PEAK_SINGLE         42724 non-null float64
AADT_COMBINATION        43403 non-null float64
PCT_PEAK_COMBINATION    37597 non-null float64
K_FACTOR                83781 non-null float64
D_Factor                41095 non-null float64
FUTURE_AADT             94239 non-null float64
Lat                     40984 non-null float64
Long                    40984 non-null float64
dtypes: float64(12), int64(1), object(3)
memory usage: 11.5+ MB


In [14]:
traffic = traffic.dropna(subset=['Lat'])
traffic.shape

(40984, 16)

In [15]:
train = pd.read_csv('../data/raw/data_train.zip', index_col='Unnamed: 0', low_memory=True)
test = pd.read_csv('../data/raw/data_test.zip', index_col='Unnamed: 0', low_memory=True)

train.shape, test.shape

((814262, 11), (202937, 11))

In [16]:
data = pd.concat([train, test], axis=0)

data.shape

(1017199, 11)

In [17]:
import pyproj

converter = pyproj.Proj("+proj=merc +lat_ts=0 +lat_0=0 +lon_0=0 +x_0=0 \
                    +y_0=0 +ellps=WGS84 +datum=WGS84 +units=m +no_defs")

data['lat_lon_entry'] = [converter(x, y, inverse=True) for x, y in zip(data.x_entry, data.y_entry)]

data['lat_entry'] = data.lat_lon_entry.apply(lambda row: row[0])
data['lon_entry'] = data.lat_lon_entry.apply(lambda row: row[1])

data['lat_lon_exit'] = [converter(x, y, inverse=True) for x, y in zip(data.x_exit, data.y_exit)]

data['lat_exit'] = data.lat_lon_exit.apply(lambda row: row[0])
data['lon_exit'] = data.lat_lon_exit.apply(lambda row: row[1])

In [18]:
def euclidean(x_one, y_one, x_two, y_two):
    """Distance as defined by the Euclidean formula for the ((x1, y1), (x2, y2)) case."""
    return np.sqrt(np.power((x_one-x_two), 2) + np.power((y_one-y_two), 2))

data['euclidean_distance'] = euclidean(data.x_entry.values, data.y_entry.values,
                                      data.x_exit.values, data.y_exit.values)

In [19]:
from math import hypot
from scipy.spatial.distance import cdist
from tqdm import tqdm

traffic = traffic.reset_index(drop=True)
coords_traff = list(zip(traffic.Lat.values, traffic.Long.values))
data['idx_traffic'] = np.zeros(data.shape[0])

df_copy = data.copy()
df_copy = df_copy[df_copy.euclidean_distance!=0]
df_copy = df_copy.reset_index(drop=True)

def minimum_distance(data, row_type='entry'):
    for idx, (lat, long) in tqdm(enumerate(list(zip(data['lat_'+row_type].values, data['lon_'+row_type].values)))):
        minimum_dist = 0

        idx_traffic = cdist([(lat, long)], coords_traff).argmin()
        data.loc[idx, 'idx_traffic'] = idx_traffic
        
    return data

df_copy = minimum_distance(df_copy, row_type='exit')

491966it [1:40:30, 81.58it/s]


In [21]:
df_copy['idx_traffic'] = df_copy.idx_traffic.astype(int)

In [22]:
df_copy.head(4)

Unnamed: 0,hash,trajectory_id,time_entry,time_exit,vmax,vmin,vmean,x_entry,y_entry,x_exit,y_exit,lat_lon_entry,lat_entry,lon_entry,lat_lon_exit,lat_exit,lon_exit,euclidean_distance,idx_traffic
0,0000a8602cf2def930488dee7cdad104_1,traj_0000a8602cf2def930488dee7cdad104_1_0,07:04:31,07:08:32,,,,3751014.0,-19093980.0,3750326.0,-19136340.0,"(33.695930000000004, -84.3014476969898)",33.69593,-84.301448,"(33.689750000000004, -84.33910917018206)",33.68975,-84.339109,42364.863118,17316
1,0000a8602cf2def930488dee7cdad104_1,traj_0000a8602cf2def930488dee7cdad104_1_1,07:20:34,07:25:42,,,,3743937.0,-19322470.0,3744975.0,-19319660.0,"(33.63236000000001, -84.50167564792125)",33.63236,-84.501676,"(33.64167999999999, -84.49926159813707)",33.64168,-84.499262,2989.643628,39609
2,0000a8602cf2def930488dee7cdad104_1,traj_0000a8602cf2def930488dee7cdad104_1_2,07:53:32,08:03:25,,,,3744868.0,-19293560.0,3744816.0,-19292840.0,"(33.640719999999995, -84.47674038855256)",33.64072,-84.47674,"(33.640250000000016, -84.47611452311658)",33.64025,-84.476115,725.658561,21659
3,0000a8602cf2def930488dee7cdad104_1,traj_0000a8602cf2def930488dee7cdad104_1_3,08:17:50,08:37:23,,,,3744880.0,-19292290.0,3744809.0,-19290490.0,"(33.640829999999994, -84.47563767324873)",33.64083,-84.475638,"(33.640190000000004, -84.47407797673429)",33.64019,-84.474078,1804.576849,22050


In [27]:
traffic_cols = traffic.columns.tolist()

traffic = traffic.reset_index(drop=False)
#traffic.columns = ['idx_traffic']+[traffic_cols]

df_copy['index'] = df_copy.idx_traffic.values

df_final = df_copy.merge(traffic, on='index')
df_final.head(4)

Unnamed: 0,hash,trajectory_id,time_entry,time_exit,vmax,vmin,vmean,x_entry,y_entry,x_exit,...,AADT,AADT_SINGLE_UNIT,PCT_PEAK_SINGLE,AADT_COMBINATION,PCT_PEAK_COMBINATION,K_FACTOR,D_Factor,FUTURE_AADT,Lat,Long
0,0000a8602cf2def930488dee7cdad104_1,traj_0000a8602cf2def930488dee7cdad104_1_0,07:04:31,07:08:32,,,,3751014.0,-19093980.0,3750326.0,...,370,,,,,,100.0,440.0,33.6943,-84.3442
1,004f2f3b69825c9a91ffcfb2d41bd5bc_31,traj_004f2f3b69825c9a91ffcfb2d41bd5bc_31_1,02:29:10,02:35:35,,,,3753847.0,-19149530.0,3750332.0,...,370,,,,,,100.0,440.0,33.6943,-84.3442
2,053952433b521f11fa16cffdc99d7e1e_9,traj_053952433b521f11fa16cffdc99d7e1e_9_4,08:26:31,09:35:34,20.409084,20.409084,20.409084,3754351.0,-19143910.0,3751110.0,...,370,,,,,,100.0,440.0,33.6943,-84.3442
3,053952433b521f11fa16cffdc99d7e1e_9,traj_053952433b521f11fa16cffdc99d7e1e_9_7,11:26:05,11:31:00,,,,3749769.0,-19146680.0,3751115.0,...,370,,,,,,100.0,440.0,33.6943,-84.3442


['hash',
 'trajectory_id',
 'AADT_exit',
 'FUTURE_AADT_exit',
 'PCT_PEAK_SINGLE_exit',
 'COUNTY_COD_exit',
 'COUNTY_NAME_exit',
 'Long_exit',
 'AADT_COMBINATION_exit',
 'K_FACTOR_exit',
 'FROM_MILEPOINT_exit',
 'TC_NUMBER_exit',
 'Lat_exit',
 'ROUTE_ID_exit',
 'PCT_PEAK_COMBINATION_exit',
 'TO_MILEPOINT_exit',
 'AADT_SINGLE_UNIT_exit',
 'D_Factor_exit']

In [38]:
final_columns = list(set(traffic.columns.tolist()) - set(['level_0', 'index']))
final_columns += ['hash', 'trajectory_id']

for col in final_columns:
    if col not in ['hash', 'trajectory_id']:
        df_final = df_final.rename(index=str, columns={col: col+'_exit'})

df_final.head(4)

Unnamed: 0,hash,trajectory_id,time_entry,time_exit,vmax,vmin,vmean,x_entry,y_entry,x_exit,...,AADT_exit,AADT_SINGLE_UNIT_exit,PCT_PEAK_SINGLE_exit,AADT_COMBINATION_exit,PCT_PEAK_COMBINATION_exit,K_FACTOR_exit,D_Factor_exit,FUTURE_AADT_exit,Lat_exit,Long_exit
0,0000a8602cf2def930488dee7cdad104_1,traj_0000a8602cf2def930488dee7cdad104_1_0,07:04:31,07:08:32,,,,3751014.0,-19093980.0,3750326.0,...,370,,,,,,100.0,440.0,33.6943,-84.3442
1,004f2f3b69825c9a91ffcfb2d41bd5bc_31,traj_004f2f3b69825c9a91ffcfb2d41bd5bc_31_1,02:29:10,02:35:35,,,,3753847.0,-19149530.0,3750332.0,...,370,,,,,,100.0,440.0,33.6943,-84.3442
2,053952433b521f11fa16cffdc99d7e1e_9,traj_053952433b521f11fa16cffdc99d7e1e_9_4,08:26:31,09:35:34,20.409084,20.409084,20.409084,3754351.0,-19143910.0,3751110.0,...,370,,,,,,100.0,440.0,33.6943,-84.3442
3,053952433b521f11fa16cffdc99d7e1e_9,traj_053952433b521f11fa16cffdc99d7e1e_9_7,11:26:05,11:31:00,,,,3749769.0,-19146680.0,3751115.0,...,370,,,,,,100.0,440.0,33.6943,-84.3442


In [39]:
final_columns = ['hash', 'trajectory_id'] + [col+'_exit' for col in final_columns if col not in ['hash', 'trajectory_id']]

In [41]:
df_final[final_columns].head(4)

df_final = df_final.drop('COUNTY_NAME_exit', axis=1)

In [43]:
final_columns = list(set(final_columns) - set(['COUNTY_NAME_exit']))

In [44]:
df_final[final_columns].to_hdf('../data/external/traffic_exit_features.hdf', key='exit', mode='w')