In [1]:
import dask.dataframe as dd
# using dask here for faster data analysis and preprocessing
# similar to pandas
# comparison, further references: https://medium.com/featurepreneur/pandas-vs-dask-the-power-of-parallel-computing-994a202a74bd

import pandas as pd
# still using pandas at some point for smaller dataframes where speed is not so relevant

import pyarrow.parquet as pq
# pyarrow for reading large parquet file
import datetime

import time

import numpy as np


# pip install fastparquet for reading parquet files

import os
# os for working with different files in one directory and print information about files

import warnings
# the warnings module lets us silence noisy warnings, e.g. those raised while loading dataframes
warnings.filterwarnings("ignore")

In [2]:
# We have collected all yellow taxi trip data from Jan 2019 to Dec 2019.
# Build a per-file summary (name, size on disk, row count) of the parquet files.
raw_parquet_dir = '../data/raw/parquet/'

# Collect one record per file and build the DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and re-created the frame on every call.
file_records = []
# os.listdir returns entries in arbitrary order; sorting by name keeps the table
# deterministic (and chronological for yellow_tripdata_YYYY-MM file names).
for filename in sorted(os.listdir(raw_parquet_dir)):
    if not filename.endswith(".parquet"):
        continue
    parquet_path = os.path.join(raw_parquet_dir, filename)
    # File size on disk, converted from bytes to megabytes with two decimals.
    size = str(round(os.stat(parquet_path).st_size / 1024 / 1024, 2)) + ' mb'
    file_records.append({
        'file_name': filename,
        'file_size': size,
        # The row count lives in the parquet footer, so no column data is loaded.
        'number_of_records': pq.read_metadata(parquet_path).num_rows,
    })

# Passing `columns` keeps the schema stable even if no parquet file was found.
df_file_summary = pd.DataFrame(
    file_records, columns=['file_name', 'file_size', 'number_of_records']
)
print(df_file_summary)

                          file_name  file_size  number_of_records
0   yellow_tripdata_2019-01.parquet  105.32 mb            7696617
1   yellow_tripdata_2019-02.parquet   98.57 mb            7049370
2   yellow_tripdata_2019-03.parquet  110.64 mb            7866620
3   yellow_tripdata_2019-04.parquet  105.04 mb            7475949
4   yellow_tripdata_2019-05.parquet  106.31 mb            7598445
5   yellow_tripdata_2019-06.parquet   98.14 mb            6971560
6   yellow_tripdata_2019-07.parquet   89.53 mb            6310419
7   yellow_tripdata_2019-08.parquet   85.83 mb            6073357
8   yellow_tripdata_2019-09.parquet   92.61 mb            6567788
9   yellow_tripdata_2019-10.parquet  101.37 mb            7213891
10  yellow_tripdata_2019-11.parquet    96.2 mb            6878111
11  yellow_tripdata_2019-12.parquet   96.36 mb            6896317


In [None]:
# Before preparing the data we want to understand the individual features,
# so we start with a single month (January 2019) to get some first insights.
# To do so, that month's parquet file is written out again as a CSV file.
january_parquet_path = '../data/raw/parquet/' + 'yellow_tripdata_2019-01.parquet'
january_csv_path = '../data/raw/csv/' + 'yellow_tripdata_2019-01' + '.csv'

df = pd.read_parquet(january_parquet_path)
df.to_csv(january_csv_path, index=False)

In [None]:
# Load the January CSV with dask and, as a first step, list its column labels.
january_csv = '../data/raw/csv/yellow_tripdata_2019-01.csv'
month = dd.read_csv(january_csv)
print(month.columns)

In [None]:
# Preview the first 5 rows of the January data loaded above.
month.head(5)

In [None]:
# to filter for specific rows we need to add some values to the dataframe
def convert_to_unix(s):
    """Convert a ``'%Y-%m-%d %H:%M:%S'`` timestamp string to Unix seconds.

    The string is parsed as a naive datetime and handed to ``time.mktime``,
    which interprets it in the local timezone of the machine running the
    kernel. Absolute values therefore depend on that timezone; differences
    between two timestamps are only affected around local DST transitions.

    Parameters
    ----------
    s : str
        Timestamp formatted as ``YYYY-MM-DD HH:MM:SS``.

    Returns
    -------
    float
        Seconds since the epoch.

    Raises
    ------
    ValueError
        If ``s`` does not match the expected format.
    """
    return time.mktime(datetime.datetime.strptime(s, '%Y-%m-%d %H:%M:%S').timetuple())


def return_with_trip_times(month):
    """Materialise a month of trips and add duration, pickup time and speed columns.

    Parameters
    ----------
    month : lazy dataframe
        Object exposing ``.compute()`` (the notebook passes the dask dataframe
        read from the monthly CSV). The computed frame must contain
        ``tpep_pickup_datetime`` and ``tpep_dropoff_datetime`` as
        ``'%Y-%m-%d %H:%M:%S'`` strings, plus ``trip_distance``.

    Returns
    -------
    The computed frame with three additional columns:

    * ``trip_times (min)``    - dropoff minus pickup, in minutes
    * ``pickup_times (unix)`` - pickup time as Unix seconds (see ``convert_to_unix``)
    * ``Speed (mph)``         - ``60 * trip_distance / trip_times (min)``

    Notes
    -----
    The division is not guarded: trips with a duration of zero minutes yield
    ``inf`` (or ``NaN`` when the distance is zero as well), so such rows have
    to be filtered out afterwards.
    """
    # Materialise the lazy frame exactly once; every extra .compute() would
    # re-read and re-parse the whole source file.
    new_frame = month.compute()

    # pickups and dropoffs to unix time
    pickup_unix = [convert_to_unix(x) for x in new_frame['tpep_pickup_datetime'].values]
    dropoff_unix = [convert_to_unix(x) for x in new_frame['tpep_dropoff_datetime'].values]

    # duration of each trip: seconds -> minutes
    durations = (np.array(dropoff_unix) - np.array(pickup_unix)) / float(60)

    # attach trip duration, pickup time and speed to the materialised frame
    new_frame['trip_times (min)'] = durations
    new_frame['pickup_times (unix)'] = pickup_unix
    new_frame['Speed (mph)'] = 60 * (new_frame['trip_distance'] / new_frame['trip_times (min)'])

    return new_frame

# Build the enriched January frame (adds trip duration, pickup Unix time and speed columns).
frame_with_durations = return_with_trip_times(month)
# Show the first 10 rows to sanity-check the new columns.
frame_with_durations.head(10)

In [None]:
# Persist the enriched January frame as an interim CSV (the row index is not written).
frame_with_durations.to_csv('../data/interim/yellow_tripdata_2019-01_durations.csv', index=False)

In [None]:
# next we look at our features to find outliers and values we want to sort out
# First we look at the speed