# CSV to Parquet Conversion

Parquet files have a much smaller footprint than the equivalent CSV, which reduces storage space and speeds up loading the data into memory. The conversion is particularly valuable for larger-than-memory (out-of-core) computation on big datasets, where disk I/O dominates. Parquet is a column-oriented storage format, which makes column-wise operations particularly efficient.

## Initialization

In [1]:
import dask.dataframe as dd
import numpy as np
import fastparquet
import dask.delayed

In [2]:
from pathlib import PurePath

# Where the raw CSV export lives and how it is named.
input_directory = "../data/"
filename = '2018_Yellow_Taxi_Trip_Data'
extension = '.csv'
csv_sep = ','
input_file = PurePath(input_directory) / (filename + extension)

# The Parquet output goes into a directory named after the CSV, inside the
# same data directory.
output_filename_base = filename
output_directory = PurePath(input_directory) / filename

## Start local Dask client

In [4]:
from dask.distributed import Client, LocalCluster
# Reuse the client left over from a previous run of this cell when there is
# one; otherwise start a new local cluster and connect to it.
try:
    if client:
        print('Restarting client')
        client.restart()
except Exception:
    # Reached on the first run (``client`` is not defined yet, so the lookup
    # raises NameError) or when restarting the existing client fails.
    # ``except Exception`` instead of a bare ``except`` lets
    # KeyboardInterrupt and SystemExit propagate.
    # Alternative with a memory cap:
    #     cluster = LocalCluster(dashboard_address=':20100', memory_limit='4G')
    cluster = LocalCluster(dashboard_address=':20100')
    print('Setting new client')
    client = Client(cluster)
    print(client)
# Last expression of the cell, so the notebook renders the client summary.
client

Restarting client


0,1
Client  Scheduler: tcp://127.0.0.1:33639  Dashboard: http://127.0.0.1:20100/status,Cluster  Workers: 5  Cores: 10  Memory: 25.97 GB


## Get all available columns

In [5]:
# Open the CSV without any dtype hints just to list the available columns
# together with their positions.
ddf = dd.read_csv(input_file, sep=csv_sep)
columns = ddf.columns.values
for position, name in enumerate(columns):
    print('{}: {}'.format(position, name))

0: VendorID
1: tpep_pickup_datetime
2: tpep_dropoff_datetime
3: passenger_count
4: trip_distance
5: RatecodeID
6: store_and_fwd_flag
7: PULocationID
8: DOLocationID
9: payment_type
10: fare_amount
11: extra
12: mta_tax
13: tip_amount
14: tolls_amount
15: improvement_surcharge
16: total_amount


## Data Interface

In [6]:
# Columns grouped by how they are interpreted when the CSV is loaded.
categorical_features = [
    'VendorID',
    'RatecodeID',
    'PULocationID',
    'DOLocationID',
    'payment_type',
]

datetime_features = [
    'tpep_pickup_datetime',
    'tpep_dropoff_datetime',
]

numerical_features = [
    'passenger_count',
    'trip_distance',
    'fare_amount',
    'extra',
    'mta_tax',
    'tip_amount',
    'tolls_amount',
    'improvement_surcharge',
    'total_amount',
]

string_features = ['store_and_fwd_flag']

# Explicit dtype per column, handed to read_csv: categorical columns first,
# then the numerical ones. Datetime and string columns are not listed here.
dtypes = dict.fromkeys(categorical_features, 'category')
dtypes.update(dict.fromkeys(numerical_features, 'float64'))

In [7]:
# Full list of columns requested from the CSV, in data-interface order.
columns_to_load = [
    *categorical_features,
    *datetime_features,
    *numerical_features,
    *string_features,
]

In [8]:
# Re-read the CSV with the data interface applied: only the listed columns,
# explicit dtypes, and the two timestamp columns parsed as datetimes.
ddf = dd.read_csv(
    input_file,
    usecols=columns_to_load,
    dtype=dtypes,
    sep=csv_sep,
    parse_dates=datetime_features,
    # 32 MB per partition. dask expects an int (bytes) or a byte-size string
    # here; the float literal 32e6 is not accepted by newer dask versions.
    blocksize=32000000,
)

In [9]:
# Row count
# print(ddf.shape[0].compute())

## Preprocessing

In [10]:
# ddf.head()
# Transforming with Pandas

In [11]:
# Convert datetime columns to a python datetime format
# import dateutil.parser as dparser

In [12]:
# result = ddf['tpep_pickup_datetime'].head()

In [13]:
# ddf['tpep_dropoff_datetime'] = ddf['tpep_dropoff_datetime'].apply(dparser.parse, meta='datetime.datetime')

In [14]:
# Test 
# single_result = result.iloc[0]
# print(type(dparser.parse(single_result)))

In [15]:
# Map store and fwd flag to boolean N -> False, Y -> True
# def bool_conversion(val):
#     if val == 'Y': return True
#     elif val == 'N': return False
#     else: return None
    
# ddf['store_and_fwd_flag'] = ddf['store_and_fwd_flag'].apply(bool_conversion, meta='bool')

## Divide files by category

In [16]:
def create_parquet_file(ddf, output_filepath):
    """Write the Dask DataFrame ``ddf`` as Parquet to ``output_filepath``.

    Thin wrapper around ``dd.to_parquet`` using its default options.
    Returns ``None``.
    """
    dd.to_parquet(df=ddf, path=output_filepath)

In [17]:
# Convert the typed CSV frame into a Parquet dataset under output_directory.
create_parquet_file(ddf=ddf, output_filepath=output_directory)

## Test performance in Dask file reading

In [18]:
# output_directory = PurePath('datetime_yellow_taxi_parquet')
# Read the written dataset back and print the first pickup timestamps as a
# quick check of the round trip.
test_ddf = dd.read_parquet(output_directory)
pickup_preview = test_ddf['tpep_pickup_datetime'].head()
print(pickup_preview)
# output_directory

index
0   2018-12-03 09:58:01
1   2018-12-03 09:41:32
2   2018-12-03 08:54:36
3   2018-12-03 09:02:08
4   2018-12-03 09:10:10
Name: tpep_pickup_datetime, dtype: datetime64[ns]


## Parquet vs CSV reading speed test

In [19]:
# The two frames used by the speed comparison below: the same columns, once
# backed by the CSV file and once by the Parquet dataset.
csv_ddf = dd.read_csv(input_file, usecols=columns_to_load, dtype=dtypes, sep=csv_sep)
# read_parquet selects columns with ``columns=``; ``usecols`` and ``dtype`` are
# read_csv options. Column dtypes are taken from the Parquet schema.
pq_ddf = dd.read_parquet(output_directory, columns=columns_to_load)

In [20]:
def average_result(ddf):
    """Return the mean ``trip_distance`` per ``VendorID``.

    ``ddf`` is a DataFrame (Dask in this notebook) with ``VendorID`` and
    ``trip_distance`` columns. The input frame is left untouched: the float64
    cast is applied to a derived frame instead of being assigned back into
    ``ddf``.

    The result is whatever the groupby mean yields for the frame type; for a
    Dask frame it is lazy and the caller runs ``.compute()``.
    """
    # Have to transform to float64 to prevent overflow and inf outcomes
    distance_as_float = ddf['trip_distance'].astype(np.float64)
    widened = ddf.assign(trip_distance=distance_as_float)
    return widened.groupby('VendorID')['trip_distance'].mean()

### CSV 

In [21]:
# The source CSV file is ~10 GB on disk.
# %time (IPython magic) reports CPU and wall-clock time for computing the
# per-vendor mean trip distance from the CSV-backed frame.
%time result = average_result(csv_ddf).compute()    
print(result)

CPU times: user 7.88 s, sys: 613 ms, total: 8.49 s
Wall time: 1min 10s
VendorID
1    2.791033
2    3.031729
4    2.703563
Name: trip_distance, dtype: float64


### Parquet

In [22]:
# The Parquet directory is ~2.9 GB on disk.
# Same computation as the CSV cell, timed against the Parquet-backed frame.
%time result = average_result(pq_ddf).compute()    
print(result)

CPU times: user 4.84 s, sys: 389 ms, total: 5.23 s
Wall time: 26.8 s
VendorID
1    2.791033
2    3.031729
4    2.703563
Name: trip_distance, dtype: float64
