In [7]:
import os
from pathlib import Path

import geopandas as gpd
import numpy as np
import osmnx as ox
import pandas
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType

In [8]:
# Run-mode switches
databricks = False  # True selects the /dbfs mount as the data directory
overwrite = False   # when True, check_file_exist() reports existing files as absent
is_yellow = False   # chooses the dataset name and the year bounds

# Trip-end labels
drop_off = "dropoff"
pick_up = "pickup"

# Dataset name interpolated into the raw/cleaned parquet path templates
if is_yellow:
    yellow = "yellow"
else:
    yellow = "foil"

# %%

# Resolve the data root for the current environment.
if not databricks:
    # Local run: the data root can be overridden through the LSDE_DATA_DIR
    # environment variable; the original developer path remains the default.
    data_dir = os.environ.get("LSDE_DATA_DIR", "/Users/kzmain/LSDE/data")
    spark = SparkSession.builder.getOrCreate()
else:
    # NOTE(review): no SparkSession is created in this branch — presumably the
    # Databricks runtime already provides `spark`; confirm.
    data_dir = "/dbfs/mnt/group01"

# Path templates for raw ("raw") and cleaned ("cln") parquet data. The two
# remaining {} placeholders are filled later via .format(year, month).
fr_file = fr_dbfs = (data_dir + "/{}".format(yellow) + "/raw/{}/{}.gz.parquet")
to_file = to_dbfs = (data_dir + "/{}".format(yellow) + "/cln/{}/{}.gz.parquet")

if databricks:
    # The *_dbfs variants are the same paths without the leading /dbfs mount
    # prefix; count=1 keeps any later "/dbfs" substring in the path intact.
    fr_dbfs = fr_dbfs.replace("/dbfs", "", 1)
    to_dbfs = to_dbfs.replace("/dbfs", "", 1)

# %%

# Year bounds of the selected dataset (both ends are included in year_range)
start_year, end_year = (2009, 2016) if is_yellow else (2010, 2013)

# %%

# Every calendar month (1-12) and every year from start_year through end_year
month_range = range(1, 13)
year_range = range(start_year, end_year + 1)


# %%

def check_file_exist(_path):
    """Tell whether `_path` is already present and should be left alone.

    Returns True (after printing a notice) when `_path` exists on disk and the
    module-level `overwrite` flag is falsy; returns False in every other case.
    """
    _already_there = os.path.exists(_path) and not overwrite
    if _already_there:
        print("[SYSTEM]: File exists: {}".format(_path))
    return _already_there


# %% Data comes from osmnx map

# Bounding-box limits: longitudes (west, east) and latitudes (north, south)
w_lon, e_lon = -74.2463, -73.7141  # left bound, right bound
n_lat, s_lat = 40.9166, 40.4767  # up bound, down bound

# Names of the coordinate columns in the trip data
pick_lon, pick_lat = "pickup_longitude", "pickup_latitude"
drop_lon, drop_lat = "dropoff_longitude", "dropoff_latitude"

# %%

# Locations of the zone data and the map file, both resolved under data_dir.
shp_path = os.path.join(data_dir, "nyc/zone")
map_path = os.path.join(data_dir, "nyc/map/NYC.mph")
# Loading is currently disabled; the calls below are kept for reference only.
# NOTE(review): nothing in this section reads shp_path or map_path while these
# stay commented out — confirm whether a later cell still needs them.
# nyc_shp = gpd.read_file(shp_path)
# nyc_shp = nyc_shp.to_crs(epsg=4326)
# nyc_map = ox.load_graphml(map_path)


# %%

def get_file_list(_path):
    """Collect every ``.parquet`` file found anywhere below `_path`.

    The tree is walked recursively and a file qualifies when its final suffix
    is exactly ``.parquet`` (so ``part.gz.parquet`` matches as well).

    Parameters
    ----------
    _path : str
        Directory to search. A path that does not exist yields an empty list.

    Returns
    -------
    list of str
        Matching paths, each joined onto the directory it was found in, in
        sorted order. Directory listings come back in arbitrary order, so
        sorting makes the result (and anything concatenated from it)
        reproducible across runs and file systems.
    """
    _parquet_file_list = [
        os.path.join(_root, _name)
        for _root, _dirs, _files in os.walk(_path, topdown=False)
        for _name in _files
        if Path(_name).suffix == '.parquet'
    ]
    _parquet_file_list.sort()
    return _parquet_file_list

In [9]:
# Year/month partition to inspect
year = 2010
month = 1
# Get file location of the parquet
raw_file_path = fr_file.format(year, month)
raw_file_dbfs = fr_dbfs.format(year, month)

# Read in parquet file by year-month
_file_list = get_file_list(raw_file_path)
if not _file_list:
    # pandas.concat fails with a bare "No objects to concatenate" on empty
    # input; stop early and report the path that was searched instead.
    raise FileNotFoundError("No parquet files found under {}".format(raw_file_path))
_full_df = pandas.concat(pandas.read_parquet(_parquet_file) for _parquet_file in _file_list)

In [10]:
# Round the four pickup/dropoff coordinate columns to 4 decimal places
_full_df = _full_df.round(dict.fromkeys((drop_lat, drop_lon, pick_lat, pick_lon), 4))

In [13]:
# Show the dtype of every column in the loaded frame
_full_df.dtypes

medallion                     int32
hack_license                  int32
pickup_datetime      datetime64[ns]
dropoff_latitude            float64
dropoff_longitude           float64
pickup_latitude             float64
pickup_longitude            float64
trip_distance               float64
trip_time_in_secs           float64
dropoff_datetime     datetime64[ns]
rate_code                     int16
tip_amount                  float64
total_amount                float64
dtype: object

In [15]:
# Elapsed time between pickup and dropoff, stored as a timedelta column
_full_df["duration"] = _full_df["dropoff_datetime"].sub(_full_df["pickup_datetime"])

In [20]:
# Express the timedelta "duration" column as a float number of seconds by
# dividing it by a one-second unit (numpy is imported in the top import cell).
_full_df["dd"] = _full_df['duration'] / np.timedelta64(1, 's')

In [21]:
# Display the frame to check the new "duration" and "dd" columns
_full_df

Unnamed: 0,medallion,hack_license,pickup_datetime,dropoff_latitude,dropoff_longitude,pickup_latitude,pickup_longitude,trip_distance,trip_time_in_secs,dropoff_datetime,rate_code,tip_amount,total_amount,duration,dd
0,2010000001,2010000001,2010-01-21 10:18:00,40.7176,-74.0129,40.6486,-73.7841,20.90,52.0,2010-01-21 11:10:00,1,9.0,59.07,0 days 00:52:00,3120.0
1,2010000001,2010000001,2010-01-26 13:20:00,40.7595,-73.9767,40.7746,-73.9539,2.07,10.0,2010-01-26 13:30:00,1,3.0,12.10,0 days 00:10:00,600.0
2,2010000001,2010016645,2010-01-05 05:35:00,40.7072,-74.0046,40.7465,-73.9722,4.15,11.0,2010-01-05 05:46:00,1,0.0,12.60,0 days 00:11:00,660.0
3,2010000001,2010016645,2010-01-12 02:43:00,40.7376,-74.0004,40.7469,-74.0024,0.90,7.0,2010-01-12 02:50:00,1,0.0,5.80,0 days 00:07:00,420.0
4,2010000002,2010000002,2010-01-07 15:02:00,40.6837,-73.7796,40.7015,-73.7761,5.02,21.0,2010-01-07 15:23:00,1,0.0,16.30,0 days 00:21:00,1260.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74301,2010013327,2010011183,2010-01-31 05:41:00,40.7523,-73.9749,40.7674,-73.9594,1.53,7.0,2010-01-31 05:48:00,1,0.0,7.00,0 days 00:07:00,420.0
74302,2010013331,2010030427,2010-01-31 04:20:53,40.7505,-73.9913,40.7598,-73.9805,0.60,5.0,2010-01-31 04:26:22,1,0.0,5.00,0 days 00:05:29,329.0
74303,2010013332,2010030430,2010-01-28 07:22:14,40.7313,-74.0068,40.7614,-73.9914,1.30,47.0,2010-01-28 08:09:37,1,0.0,45.50,0 days 00:47:23,2843.0
74304,2010013335,2010023022,2010-01-31 03:37:15,40.7985,-73.9532,40.7866,-73.9683,1.40,5.0,2010-01-31 03:42:25,1,1.5,7.70,0 days 00:05:10,310.0


In [22]:
# Re-check column dtypes now that "duration" and "dd" have been added
_full_df.dtypes

medallion                      int32
hack_license                   int32
pickup_datetime       datetime64[ns]
dropoff_latitude             float64
dropoff_longitude            float64
pickup_latitude              float64
pickup_longitude             float64
trip_distance                float64
trip_time_in_secs            float64
dropoff_datetime      datetime64[ns]
rate_code                      int16
tip_amount                   float64
total_amount                 float64
duration             timedelta64[ns]
dd                           float64
dtype: object