In [1]:
import pandas as pd
import os.path

Loading taxi and road lengths datasets

In [2]:
# Trip records are streamed as an iterator of 10,000-row chunks rather than
# loaded in one piece; note that the reader can only be consumed once.
taxi_data = pd.read_csv(
    '2018_Yellow_Taxi_Trip_Data_Clean.csv',
    iterator=True,
    chunksize=10000,
)

# Road-length table; later cells use its `zone_id` and `road_length` columns.
road_lengths = pd.read_csv('roads_length.csv')

Creating empty dataframes that will hold the pickup and dropoff counts for each zone, for every unique combination of day of week and pickup/dropoff time

In [3]:
# Empty accumulators: one row per (zone, day-of-week sin/cos, time sin/cos)
# combination, plus the number of trips observed for that combination.
taxi_pickup_count = pd.DataFrame(
    columns=[
        'pickup_location',
        'day_of_week_pickup_sin',
        'day_of_week_pickup_cos',
        'pickup_time_sin',
        'pickup_time_cos',
        'counts',
    ]
)
taxi_dropoff_count = pd.DataFrame(
    columns=[
        'dropoff_location',
        'day_of_week_dropoff_sin',
        'day_of_week_dropoff_cos',
        'dropoff_time_sin',
        'dropoff_time_cos',
        'counts',
    ]
)

Grouping the rows of each taxi data chunk and counting the pickups/dropoffs per group

In [4]:
# Count trips per (zone, day-of-week, time) key inside every chunk, keep the
# partial results in plain lists and concatenate them once at the end.
# Concatenating onto the accumulator inside the loop re-copies all rows
# gathered so far on every iteration, and concatenating onto an empty,
# column-only frame is handled inconsistently across pandas versions.
pickup_keys = ['pickup_location', 'day_of_week_pickup_sin', 'day_of_week_pickup_cos', 'pickup_time_sin', 'pickup_time_cos']
dropoff_keys = ['dropoff_location', 'day_of_week_dropoff_sin', 'day_of_week_dropoff_cos', 'dropoff_time_sin', 'dropoff_time_cos']

pickup_parts = []
dropoff_parts = []
for chunk in taxi_data:
    pickup_parts.append(chunk.groupby(pickup_keys).size().reset_index(name='counts'))
    dropoff_parts.append(chunk.groupby(dropoff_keys).size().reset_index(name='counts'))

# `pd.concat` raises on an empty list. If the reader produced no chunks
# (empty file, or the single-use iterator was already consumed) the frames
# defined earlier in the notebook are left untouched.
if pickup_parts:
    taxi_pickup_count = pd.concat(pickup_parts, ignore_index=True)
if dropoff_parts:
    taxi_dropoff_count = pd.concat(dropoff_parts, ignore_index=True)

Getting rid of duplicated rows: the same zone/day/time combination can appear in several chunks, so the per-chunk pickup/dropoff counts are summed once more

In [5]:
# Collapse rows that share the same key by summing their `counts`, so each
# key combination ends up on a single row; `as_index=False` keeps the keys
# as ordinary columns.
taxi_pickup_count = (
    taxi_pickup_count
    .groupby(
        ['pickup_location', 'day_of_week_pickup_sin', 'day_of_week_pickup_cos', 'pickup_time_sin', 'pickup_time_cos'],
        as_index=False,
    )
    .sum()
)
taxi_dropoff_count = (
    taxi_dropoff_count
    .groupby(
        ['dropoff_location', 'day_of_week_dropoff_sin', 'day_of_week_dropoff_cos', 'dropoff_time_sin', 'dropoff_time_cos'],
        as_index=False,
    )
    .sum()
)

Merging both dataframes with the road lengths

In [6]:
# Attach the road-length columns to every count row by matching the location
# id against `zone_id`. This is an inner join (the pandas default), so rows
# whose location has no match in `road_lengths` are dropped.
taxi_pickup_count = taxi_pickup_count.merge(
    road_lengths,
    how='inner',
    left_on='pickup_location',
    right_on='zone_id',
)
taxi_dropoff_count = taxi_dropoff_count.merge(
    road_lengths,
    how='inner',
    left_on='dropoff_location',
    right_on='zone_id',
)

Computing the probability of a pickup/dropoff for each zone at a given time and day of week, by dividing the number of pickups/dropoffs in that period by the total road length of the chosen zone

In [7]:
# Divide the trip count of every row by the road length of its zone.
# Whole-column division replaces the row-wise `apply`: it avoids one Python
# call per row and also works on an empty frame, where `apply(..., axis=1)`
# returns a DataFrame that cannot be assigned to a single column.
# `counts` is cast to float so the result is numeric even if the column
# arrived with object dtype.
# NOTE(review): the ratio is trips per unit of `road_length` and is not
# bounded to [0, 1] - confirm that consumers of the output expect this.
taxi_pickup_count['probability_pickup'] = (
    taxi_pickup_count['counts'].astype(float) / taxi_pickup_count['road_length']
)
taxi_dropoff_count['probability_dropoff'] = (
    taxi_dropoff_count['counts'].astype(float) / taxi_dropoff_count['road_length']
)

Getting rid of redundant columns

In [8]:
# Only the key columns and the computed probability are kept; the raw count
# and the columns used for the merge are dropped.
helper_columns = ['counts', 'road_length', 'zone_id']
taxi_pickup_count = taxi_pickup_count.drop(columns=helper_columns)
taxi_dropoff_count = taxi_dropoff_count.drop(columns=helper_columns)

Saving both dataframes to csv files, so that they can be used in other scripts

In [9]:
# Write each result table to its own CSV file, without the row index.
for result_frame, csv_path in (
    (taxi_pickup_count, 'taxi_pickup_prob.csv'),
    (taxi_dropoff_count, 'taxi_dropoff_prob.csv'),
):
    result_frame.to_csv(csv_path, index=False)