In [1]:
import warnings
import os

import pandas as pd
import numpy as np

from scipy.stats import binned_statistic_2d
from tqdm import tqdm

# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas warnings (e.g. chained-assignment or
# deprecation warnings) raised by the cells below - consider narrowing it.
warnings.filterwarnings('ignore')
# Directory that all input CSV files are read from and all output CSV files
# are written to by the cells below.
PATH_TO_DATA = '../data'

In [11]:
# Bounding box used by clean_data to keep only pickups inside the study area.
# Despite the names, "bottom"/"top" hold the minimum/maximum longitude and
# "left"/"right" hold the minimum/maximum latitude.
longitude_bottom = -74.25559
longitude_top = -73.70001

latitude_left = 40.49612
latitude_right = 40.91553

def clean_data(df):
    """Drop implausible trips and truncate pickup times to the hour.

    A trip is kept only when all of the following hold:

    * ``passenger_count`` is not 0,
    * ``trip_distance`` is not 0,
    * pickup and drop-off timestamps differ,
    * the pickup point lies inside the bounding box defined by the
      module-level longitude/latitude limits (both ends inclusive).

    Parameters
    ----------
    df : pandas.DataFrame
        Trips with ``passenger_count``, ``trip_distance``,
        ``pickup_longitude``, ``pickup_latitude`` and datetime-typed
        ``tpep_pickup_datetime`` / ``tpep_dropoff_datetime`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is left untouched) holding the surviving rows,
        with ``tpep_pickup_datetime`` floored to the start of its hour.
    """
    keep = (
        (df['passenger_count'] != 0)
        & (df['trip_distance'] != 0)
        & (df['tpep_pickup_datetime'] != df['tpep_dropoff_datetime'])
        & df['pickup_longitude'].between(longitude_bottom, longitude_top)
        & df['pickup_latitude'].between(latitude_left, latitude_right)
    )

    # Explicit copy: assigning into a filtered slice is chained assignment.
    cleaned = df[keep].copy()

    # Vectorised floor; unlike replace(minute=0, second=0) it also clears any
    # sub-second component, so every timestamp really lands on the hour.
    cleaned['tpep_pickup_datetime'] = cleaned['tpep_pickup_datetime'].dt.floor('h')

    return cleaned

In [16]:
def aggregate_df(df, regions_df=None):
    """Count pickups per grid region and per distinct pickup timestamp.

    Parameters
    ----------
    df : pandas.DataFrame
        Trips with ``pickup_longitude``, ``pickup_latitude`` and
        ``tpep_pickup_datetime`` columns. Every distinct pickup timestamp
        becomes one output column, so timestamps should already be truncated
        to the hour (``clean_data`` does that). Pickups are expected to lie
        inside the grid's bounding box. The frame is not modified.
    regions_df : pandas.DataFrame, optional
        Grid definition with ``west`` and ``south`` columns, indexed by
        region id. Defaults to the notebook-level ``regions`` frame.

    Returns
    -------
    pandas.DataFrame
        Float matrix with one row per region id (in the grid's index order)
        and one column per distinct pickup timestamp, sorted ascending.
        Timestamps with no pickups at all do not get a column.
    """
    grid = regions if regions_df is None else regions_df

    if len(df) == 0:
        # Nothing to count: every region is present, there are no time columns.
        return pd.DataFrame(index=grid.index.values)

    # The lowest edge is dropped on purpose: scipy adds one outlier bin below
    # the first edge and one above the last, and those two outlier bins then
    # stand in for the first and last grid column/row. With n cells per axis
    # this yields per-axis bin indices 0..n-1.
    west_edges = np.sort(grid.west.unique())[1:]
    south_edges = np.sort(grid.south.unique())[1:]

    # Only the bin numbers are needed; for 'count' the values argument is
    # not referenced, so None is passed instead of real data.
    _, _, _, cell_numbers = binned_statistic_2d(
        df.pickup_longitude.values,
        df.pickup_latitude.values,
        None,
        'count',
        bins=[west_edges, south_edges],
    )

    # Linearised bin numbers are longitude_index * n_latitude_bins + latitude_index.
    # NOTE(review): the +1 assumes region ids in the grid file start at 1 and
    # follow that same ordering - confirm against regions.csv.
    region_ids = cell_numbers + 1

    # Sorted distinct timestamps and, for every trip, the position of its
    # timestamp in that sorted array.
    hour_labels, hour_pos = np.unique(df['tpep_pickup_datetime'].values, return_inverse=True)

    # Row position of every trip's region id inside the grid index; ids that
    # are not part of the grid map to -1 and are left out of the counts.
    row_pos = grid.index.get_indexer(region_ids)
    known = row_pos >= 0

    n_rows, n_cols = len(grid.index), len(hour_labels)
    flat_pos = row_pos[known] * n_cols + hour_pos[known]
    counts = np.bincount(flat_pos, minlength=n_rows * n_cols).reshape(n_rows, n_cols)

    # Counts are returned as floats, matching what binned_statistic_2d produced.
    aggregated_df = pd.DataFrame(counts.astype(float), index=grid.index.values, columns=hour_labels)

    return aggregated_df

In [13]:
# Grid definition used by aggregate_df: a semicolon-separated file whose first
# column becomes the index (the region id) and which provides at least the
# `west` and `south` columns read there.
regions = pd.read_csv(
    os.path.join(PATH_TO_DATA, 'regions.csv'),
    sep=';',
    index_col=0,
)

In [17]:
# Columns parsed as datetimes, and the full set of columns loaded from each
# raw monthly file (everything else is skipped to save memory).
dates = ['tpep_pickup_datetime', 'tpep_dropoff_datetime']
columns = ['passenger_count', 'trip_distance', 'pickup_longitude', 'pickup_latitude'] + dates

# Every month from January to June is aggregated: the cell that builds
# train.csv reads the aggregated files for months 01-05, so May must not be
# skipped.
# NOTE(review): June is presumably the source of test.csv - confirm.
monthes = [1, 2, 3, 4, 5, 6]
for i in tqdm(monthes):
    # %02d keeps the zero-padded file name valid for two-digit months as well.
    file = 'yellow_tripdata_2016-%02d.csv' % i
    data = pd.read_csv(os.path.join(PATH_TO_DATA, file), parse_dates=dates, usecols=columns)
    data = clean_data(data)
    data = aggregate_df(data)
    data.to_csv(os.path.join(PATH_TO_DATA, 'aggregated_' + file))


  0%|                                                                                                           | 0/1 [00:00<?, ?it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [02:50<00:00, 170.31s/it]


In [26]:
# Build the training matrix by placing the aggregated monthly matrices for
# January-May side by side (regions as rows, timestamps as columns).
# Each month is read exactly once and concatenated in a single call, so no
# month's columns appear twice in train.csv.
monthly_frames = []
for i in tqdm(range(1, 6)):
    file = 'aggregated_yellow_tripdata_2016-%02d.csv' % i
    df = pd.read_csv(os.path.join(PATH_TO_DATA, file), index_col=0)
    monthly_frames.append(df)

data = pd.concat(monthly_frames, axis=1)

data.to_csv(os.path.join(PATH_TO_DATA, 'train.csv'))

100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:02<00:00,  1.67it/s]


In [50]:
# Pick the region (row) of the training matrix whose mean over all columns is
# the largest, then export that single region's series - transposed, so that
# timestamps become rows - from both the training and the test matrix.
train_matrix_path = os.path.join(PATH_TO_DATA, 'train.csv')
data = pd.read_csv(train_matrix_path, index_col=0)

mean_per_region = data.mean(axis=1)
region = mean_per_region.sort_values(ascending=False).index[0]

train_df = data.loc[[region]].T
train_df.to_csv(os.path.join(PATH_TO_DATA, 'train_%d.csv' % region))

# NOTE(review): test.csv is not written by any cell shown here - confirm
# where it is produced before relying on Restart & Run All.
test_matrix_path = os.path.join(PATH_TO_DATA, 'test.csv')
data = pd.read_csv(test_matrix_path, index_col=0)

test_df = data.loc[[region]].T
test_df.to_csv(os.path.join(PATH_TO_DATA, 'test_%d.csv' % region))