In [1]:
import pandas as pd
import numpy as np
import os.path

First of all, we list the columns of the raw dataset that are needed to build the final dataset.

In [2]:
# Columns to read from the raw trip data; every other column is ignored.
cols = [
    'tpep_pickup_datetime',
    'tpep_dropoff_datetime',
    'passenger_count',
    'trip_distance',
    'PULocationID',
    'DOLocationID',
]

Then, we open the dataset for reading in chunks, loading only the columns specified above.

In [17]:
# Open the raw CSV as a chunked reader instead of loading it all at once:
# only the columns in `cols` are parsed, and iterating over the reader
# yields DataFrames of up to 20000 rows each.
data_iter = pd.read_csv(
    '../resources/2018-taxi-trip-data.csv',
    usecols=cols,
    iterator=True,
    chunksize=20000,
)

Finally, we do the data processing by:
- renaming the columns.
- encoding both the pickup and the dropoff day of the week as a sine/cosine pair, so that the cyclical nature of the week is preserved. [Here is the article that describes that transformation.](https://www.mikulskibartosz.name/time-in-machine-learning/)
- encoding the pickup and dropoff hour of the day the same way we did with the days of the week.

Each processed chunk is written to a single CSV file, so that the result can be used in other scripts.

In [18]:
# Destination of the cleaned dataset; every processed chunk ends up in this file.
output_path = '../processed_data/2018-taxi-trip-data-clean.csv'

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

for chunk_number, df in enumerate(data_iter):
    # rename() returns a new frame, so the chunk yielded by the reader is not mutated.
    df = df.rename(columns={'PULocationID': 'pickup_location', 'DOLocationID': 'dropoff_location'})

    pickup = pd.to_datetime(df['tpep_pickup_datetime'])
    dropoff = pd.to_datetime(df['tpep_dropoff_datetime'])

    # Cyclical encoding: map the day of the week (0-6) onto a circle so that
    # the last and the first day of the week end up next to each other.
    pickup_day_angle = pickup.dt.dayofweek * (2 * np.pi / 7)
    dropoff_day_angle = dropoff.dt.dayofweek * (2 * np.pi / 7)

    df['day_of_week_pickup_sin'] = np.sin(pickup_day_angle)
    df['day_of_week_pickup_cos'] = np.cos(pickup_day_angle)

    df['day_of_week_dropoff_sin'] = np.sin(dropoff_day_angle)
    df['day_of_week_dropoff_cos'] = np.cos(dropoff_day_angle)

    # Same encoding for the time of day; only the hour (0-23) is used,
    # minutes and seconds are discarded.
    pickup_hour_angle = 2 * np.pi * pickup.dt.hour / 24.0
    dropoff_hour_angle = 2 * np.pi * dropoff.dt.hour / 24.0

    df['pickup_time_sin'] = np.sin(pickup_hour_angle)
    df['pickup_time_cos'] = np.cos(pickup_hour_angle)

    df['dropoff_time_sin'] = np.sin(dropoff_hour_angle)
    df['dropoff_time_cos'] = np.cos(dropoff_hour_angle)

    # The raw timestamps are fully replaced by the encoded features.
    df = df.drop(columns=['tpep_pickup_datetime', 'tpep_dropoff_datetime'])

    # The first chunk truncates the file and writes the header; later chunks
    # append without a header. Deciding by chunk number instead of checking
    # whether the file exists keeps a re-run from appending duplicate rows to
    # the output of a previous run.
    is_first_chunk = chunk_number == 0
    df.to_csv(
        output_path,
        mode='w' if is_first_chunk else 'a',
        index=False,
        header=is_first_chunk,
    )