# CSV to Parquet Conversion

Parquet has a much smaller footprint than CSV, which reduces storage space and improves performance when loading the data into memory. This conversion is particularly important for out-of-core computation on large datasets, where I/O is the bottleneck. Parquet is a column-based storage format, which makes column-wise operations particularly efficient.

## Initialization

In [1]:
import dask.dataframe as dd
import numpy as np
import pyarrow as pa

In [2]:
from pathlib import PurePath

# --- Source CSV -----------------------------------------------------------
input_directory = "../data/"
filename = ''        # base name of the CSV file, without its extension
extension = '.csv'
csv_sep = '\t'       # column delimiter used by the source file
input_file = PurePath(input_directory) / f'{filename}{extension}'

# --- Parquet output -------------------------------------------------------
# The dataset is written next to the CSV, in "<filename>_parquet".
output_directory = PurePath(input_directory) / f'{filename}_parquet'
output_filename_base = filename

## Start local Dask client

In [3]:

# from dask.distributed import Client, LocalCluster
# try:
#     if client:
#         print('Restarting client')
#         client.restart()
# except:
#     cluster = LocalCluster(dashboard_address=':20100', memory_limit='4G')
#     print('Setting new client')
#     client = Client(cluster)
#     print(client)
# client

Setting new client
<Client: 'tcp://127.0.0.1:36981' processes=5 threads=10, memory=20.00 GB>


0,1
Client  Scheduler: tcp://127.0.0.1:36981  Dashboard: http://127.0.0.1:20100/status,Cluster  Workers: 5  Cores: 10  Memory: 20.00 GB


## Get all available columns

In [None]:
# Open the CSV with the configured delimiter (`csv_sep`) so that every
# available column can be listed with its positional index.
ddf = dd.read_csv(input_file, sep=csv_sep)
columns = ddf.columns.values
for i, column in enumerate(columns):
    print(f'{i}: {column}')

## Data Interface

In [1]:
# Column groups to convert; replace the '' placeholders with real column names.
categorical_features = ['']
datetime_features = ['']
numerical_features = ['']
target_columns = ['']

# Explicit dtype per column, so the CSV reader does not have to infer types.
# Datetime columns are absent here: they are handed to read_csv via parse_dates.
# Groups are merged in order, so a column listed twice takes the later dtype.
dtypes = {}
dtypes.update(dict.fromkeys(categorical_features, 'category'))
dtypes.update(dict.fromkeys(numerical_features, 'float64'))
dtypes.update(dict.fromkeys(target_columns, 'category'))

In [6]:
# Every column the conversion reads, concatenated group by group.
columns_to_load = [
    *categorical_features,
    *datetime_features,
    *numerical_features,
    *target_columns,
]

In [7]:
# Load only the selected columns with explicit dtypes; datetime columns are
# parsed by the reader. The delimiter comes from the `csv_sep` setting.
ddf = dd.read_csv(
    input_file,
    usecols=columns_to_load,
    dtype=dtypes,
    sep=csv_sep,
    parse_dates=datetime_features,
)

In [None]:
# Preview the first rows to sanity-check the selected columns before writing.
ddf.head()

## Write the Parquet dataset

In [9]:
import dask.delayed
        
def create_parquet_file(ddf, output_filepath):
    """Write a Dask dataframe to ``output_filepath`` in Parquet format.

    Args:
        ddf: Dask dataframe to persist.
        output_filepath: Destination path handed to Dask's Parquet writer.
    """
    ddf.to_parquet(output_filepath)
                            

In [10]:
# Write the typed dataframe loaded from the CSV to the Parquet output directory.
create_parquet_file(ddf, output_directory)

## Test performance in Dask file reading

In [None]:
# Read the freshly written dataset back and show its first rows.
# No delimiter is passed: `sep` is a CSV option, not a read_parquet option.
test_ddf = dd.read_parquet(output_directory)
print(test_ddf.head())

## Parquet vs CSV reading speed test

In [12]:
# Build both benchmark inputs over the same set of columns.
# The CSV reader needs the delimiter and explicit dtypes.
csv_ddf = dd.read_csv(input_file, usecols=columns_to_load, dtype=dtypes, sep=csv_sep)
# read_parquet selects columns with `columns=`; the CSV-only keywords
# (`usecols`, `dtype`, `sep`) are not passed to it.
pq_ddf = dd.read_parquet(output_directory, columns=columns_to_load)

In [13]:
def average_result(ddf, group_column='', value_column=''):
    """Return the per-group mean of one column without modifying ``ddf``.

    Args:
        ddf: Dataframe to aggregate (the notebook passes Dask dataframes).
        group_column: Name of the column to group by. Defaults to the
            notebook's ``''`` placeholder.
        value_column: Name of the column to average. Defaults to the
            notebook's ``''`` placeholder.

    Returns:
        The grouped mean of ``value_column``. For a Dask dataframe the result
        is lazy until ``.compute()`` is called on it.
    """
    # Have to transform to float64 to prevent overflow and inf outcomes.
    # assign() returns a new frame, so the caller's dataframe keeps its dtypes.
    widened = ddf.assign(**{value_column: ddf[value_column].astype(np.float64)})
    return widened.groupby(group_column)[value_column].mean()

### CSV 

In [None]:
# File size is ~ G  (TODO: fill in the measured CSV size)
# %time is an IPython magic that reports how long the statement takes to run;
# .compute() is called on the value returned by average_result.
%time result = average_result(csv_ddf).compute()    
print(result)

### Parquet

In [None]:
# Parquet directory size ~ G  (TODO: fill in the measured directory size)
# Same timed aggregation as the CSV cell, run against the Parquet-backed
# dataframe so the two timings can be compared.
%time result = average_result(pq_ddf).compute()    
print(result)