In [1]:
from datatable import dt, f, by, as_type
from datetime import datetime as d
from shapely.geometry import Point, shape, Polygon
import os, shutil, glob
import pandas as pd
import matplotlib.pyplot as plt
import geopandas
import warnings
warnings.filterwarnings('ignore')




In [2]:
# Glob pattern selecting the input files to load.
fold = '/BigData/BigData/HW1/*'
# glob returns matches in arbitrary order; sorting makes the load order (and
# therefore the row order of the combined frame) reproducible across runs.
file = sorted(glob.glob(fold))

In [3]:
# Read every input file and append it to one combined Frame, matching columns
# by name (bynames=True) and forcing the append (force=True).
frame = dt.Frame()
for input_path in file:
    # A plain loop is used because rbind is called purely for its side effect;
    # the loop name is deliberately not `f`, which is datatable's f-expression
    # namespace used by later cells.
    frame.rbind(dt.fread(input_path, fill=True), force=True, bynames=True)
columns = frame.names

In [4]:
# Report the dimensions of the combined frame (columns, then rows).
print('Number of features:', frame.ncols, sep='')
print('Number of identity:', frame.nrows, sep='')

Number of features:18
Number of identity:41859909


In [5]:
# Drop the three columns the original note describes as all-NA, then remove
# every row in which all of the remaining columns are missing.
for na_column in ('store_and_forward', 'Rate_Code', 'mta_tax'):
    del frame[:, na_column]
all_missing = dt.rowall(f[:] == None)
del frame[all_missing, :]

In [6]:
# Overwrite both timestamp columns in place with their hour component (0~23).
# NOTE: this mutates `frame`, so the cell is not meant to be run twice.
frame[:, dt.update(**{
    column: dt.time.hour(f[column])
    for column in ('Trip_Pickup_DateTime', 'Trip_Dropoff_DateTime')
})]
frame

Unnamed: 0_level_0,vendor_name,Trip_Pickup_DateTime,Trip_Dropoff_DateTime,Passenger_Count,Trip_Distance,Start_Lon,Start_Lat,End_Lon,End_Lat,Payment_Type,Fare_Amt,surcharge,Tip_Amt,Tolls_Amt,Total_Amt
Unnamed: 0_level_1,▪▪▪▪,▪▪▪▪,▪▪▪▪,▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪
0,CMT,15,15,1,0.3,−73.9707,40.7964,−73.9736,40.7921,Cash,4.1,0,0,0,4.1
1,CMT,0,0,1,1.6,−74.0073,40.74,−74.0047,40.7519,Cash,7,0,0,0,7
2,DDS,19,20,1,2,−73.9764,40.7567,−73.9544,40.7672,CASH,8.1,0.5,0,0,8.6
3,CMT,22,22,3,1.6,−73.9906,40.7515,−74.0084,40.7466,Cash,7,0,0,0,7
4,CMT,22,23,2,3.8,−73.9938,40.7342,−73.9588,40.7699,Cash,12.6,0,0,0,12.6
5,CMT,1,1,1,0.7,−73.9674,40.7809,−73.9681,40.7719,Cash,5.4,0,0,0,5.4
6,CMT,22,22,1,0,−73.9616,40.7565,−73.9616,40.7566,No Charge,3,0,0,0,3
7,CMT,7,7,1,0.9,−73.9338,40.854,−73.9396,40.8414,Cash,4.5,0,0,0,4.5
8,CMT,13,13,1,1.1,−73.9645,40.7704,−73.9819,40.7737,Cash,5.7,0,0,0,5.7
9,CMT,21,21,1,2.5,−73.9673,40.7635,−73.9846,40.7326,Cash,8.6,0,0,0,8.6


In [7]:
# Map longitude/latitude columns to taxi-zone regions via a spatial join.
from pyproj import CRS
crs = CRS('epsg:4326')
def cor2loc(df, lon_var, lat_var, name_var):
    """Attach taxi-zone attributes to each row of ``df``.

    A point is built from each row's ``lon_var``/``lat_var`` pair and
    left-joined (``within``) against the polygons read from
    ``taxi_zones/taxi_zones.shp``, both expressed in EPSG:4326.

    Parameters
    ----------
    df : pandas.DataFrame
        Input rows; copied, never modified.
    lon_var, lat_var : str
        Names of the longitude and latitude columns in ``df``.
    name_var : str
        Prefix for the added columns: the shapefile's ``zone``,
        ``LocationID`` and ``borough`` columns are renamed to
        ``{name_var}_Zone``, ``{name_var}_ID`` and ``{name_var}_borough``.

    Returns
    -------
    The joined frame with the helper ``geometry`` and ``index_right`` columns
    removed. If the join raises ``ValueError``, the error and its traceback
    are printed and ``df[lon_var]`` is returned instead.
    NOTE(review): that fallback is a Series, not a frame, so callers that
    concatenate or re-map the results will misbehave; confirm whether
    re-raising would be preferable.
    """
    # Imported locally because it is only needed on the error path.
    import traceback

    local_df = df.copy()
    # The shapefile is re-read on every call; each worker process loads its own copy.
    shape_df = geopandas.read_file('taxi_zones/taxi_zones.shp')
    shape_df.drop(['OBJECTID', 'Shape_Area', 'Shape_Leng'], axis=1, inplace=True)
    shape_df = shape_df.to_crs(crs)

    try:
        local_gdf = geopandas.GeoDataFrame(local_df, crs=crs,
                                          geometry=[Point(xy) for xy in zip(local_df[lon_var], local_df[lat_var])])
        # NOTE(review): newer geopandas releases renamed `op` to `predicate`;
        # confirm against the installed version before upgrading.
        local_gdf = geopandas.sjoin(local_gdf, shape_df, how='left', op='within')
        local_gdf = local_gdf.drop('geometry', axis=1)
        local_gdf = local_gdf.drop('index_right', axis=1)
        local_gdf.rename(columns={'zone':f'{name_var}_Zone', 'LocationID':f'{name_var}_ID', 'borough':f'{name_var}_borough'}, inplace=True)
        return local_gdf
    except ValueError as ve:
        print(ve)
        # Exceptions have no .stacktrace() method; the old call raised
        # AttributeError here and hid the real error.
        print(traceback.format_exc())
        series = df[lon_var]
        return series

In [8]:
# Partition the data into at most `num_of_pool` contiguous chunks.
num_of_pool = 32
# Convert only when needed so re-running this cell does not fail on a frame
# that is already a pandas DataFrame.
if not isinstance(frame, pd.DataFrame):
    frame = frame.to_pandas()
# Ceiling division keeps the chunk count <= num_of_pool (floor division left a
# tiny extra remainder chunk), and max(1, ...) avoids a zero step in range()
# when there are fewer rows than workers.
n = max(1, -(-frame.shape[0] // num_of_pool))
list_frame = [frame.iloc[i:i + n] for i in range(0, frame.shape[0], n)]

In [9]:
# Run the mapping in multiple processes because the dataset is large.
from multiprocessing import Pool
from itertools import repeat
def multiple_mapping(num_of_pool, list_df, lon_var, lat_var, name_var):
    """Apply ``cor2loc`` to every chunk of ``list_df`` in a process pool.

    Parameters
    ----------
    num_of_pool : int
        Number of worker processes to start.
    list_df : iterable
        Chunks passed one at a time as the first argument of ``cor2loc``.
    lon_var, lat_var, name_var : str
        Forwarded unchanged to ``cor2loc`` for every chunk.

    Returns
    -------
    list
        One ``cor2loc`` result per chunk, in input order.
    """
    # Every task shares the same column arguments; only the chunk differs.
    tasks = [(chunk, lon_var, lat_var, name_var) for chunk in list_df]
    with Pool(num_of_pool) as workers:
        mapped = workers.starmap(cor2loc, tasks)
    return mapped

In [10]:
# Map pick-up coordinates first, then drop-off coordinates; each pass feeds
# the chunks produced by the previous one.
for lon_col, lat_col, prefix in (
    ('Start_Lon', 'Start_Lat', 'pick_up'),
    ('End_Lon', 'End_Lat', 'drop_off'),
):
    list_frame = multiple_mapping(num_of_pool, list_frame, lon_col, lat_col, prefix)

In [11]:
# Stitch the mapped chunks back into one frame and write it out as CSV.
new_frame = pd.concat(objs=list_frame)
new_frame.to_csv(path_or_buf='Finish_mapping.csv')