# Analyzing Pandas Serialization Formats

In [15]:
import pandas as pd
import numpy as np
import random
import time
import os

In [16]:
def generate(rows=1000, cols=10, min_value=0, max_value=1000):
    columns = list(range(1, cols + 1))
    df = pd.DataFrame(data=np.random.randint(min_value, max_value,
                                             size=(rows, cols)),
                      columns=columns)
    return df

In [17]:
df = generate(rows=1000, cols=10)
df.shape

(1000, 10)

In [19]:
df.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
0,711,453,437,463,597,503,5,957,714,568
1,922,925,303,789,181,255,693,126,584,19
2,480,248,103,975,241,496,721,850,259,341
3,448,121,711,970,305,469,631,119,922,489
4,429,739,256,264,197,124,666,284,846,35


In [5]:
formats = ['csv', 'pickle', 'parquet', 'hdf', 'json', 'feather', 'html'] # xml
df_sizes = [1000, 2000, 5000, 7000, 10000, 20000]
metrics_columns = ['df_size', 'format', 'saving_time', 'loading_time', 'file_size']

df_metrics = pd.DataFrame(data=None, columns=metrics_columns, dtype="object")
df_metrics

Unnamed: 0,df_size,format,saving_time,loading_time,file_size


In [6]:
def get_metrics(df_format, size, df):
    metrics = [size, df_format]
    
    file_path = "../data/test.obj"
    
    if df_format == 'hdf':
        file_path += '.h5'

    ## SAVING
    t0 = time.time()
    
    if df_format == 'csv':
        df.to_csv(file_path)
    elif df_format == 'pickle':
        df.to_pickle(file_path)
    elif df_format == 'parquet':
        df.to_parquet(file_path)
    elif df_format == 'hdf':
        df.to_hdf(file_path, key='main')
    elif df_format == 'json':
        df.to_json(file_path)
    elif df_format == 'xml':
        df.to_xml(file_path)
    elif df_format == 'feather':
        df.to_feather(file_path)
    elif df_format == 'html':
        df.to_html(file_path)
    else:
        print("Undefined format")
        
    t1 = time.time()
    saving_time = t1 - t0

    metrics.append(saving_time)

    
    ## LOADING
    t0 = time.time()
    
    if df_format == 'csv':
        df = pd.read_csv(file_path)
    elif df_format == 'pickle':
        df = pd.read_pickle(file_path)
    elif df_format == 'parquet':
        df = pd.read_parquet(file_path)
    elif df_format == 'hdf':
        df = pd.read_hdf(file_path, key='main')
    elif df_format == 'json':
        df = pd.read_json(file_path)
    elif df_format == 'xml':
        df = pd.read_xml(file_path)
    elif df_format == 'feather':
        df = pd.read_feather(file_path)
    elif df_format == 'html':
        df = pd.read_html(file_path)
    else:
        print("Undefined format")
    
    t1 = time.time()
    loading_time = t1 - t0

    metrics.append(loading_time)

    ## FILE SIZE
    file_size = (os.stat(file_path).st_size) / (1024 ** 2)
    metrics.append(file_size)
    
    return metrics
    

In [7]:
for serial_format in formats:
    for size in df_sizes:
        print(f"Starting {serial_format} for size {size} is done.")
        df = generate(rows=size, cols=10)
        metrics = get_metrics(serial_format, size, df)

        df_metrics_records = pd.DataFrame(data=[metrics], columns=metrics_columns)
        df_metrics = pd.concat([df_metrics, df_metrics_records])
        print(f"Format {serial_format} for size {size} is done.")
        print("")

Starting csv for size 1000 is done.
Format csv for size 1000 is done.

Starting csv for size 2000 is done.


  df_metrics = pd.concat([df_metrics, df_metrics_records])


Format csv for size 2000 is done.

Starting csv for size 5000 is done.
Format csv for size 5000 is done.

Starting csv for size 7000 is done.
Format csv for size 7000 is done.

Starting csv for size 10000 is done.
Format csv for size 10000 is done.

Starting csv for size 20000 is done.
Format csv for size 20000 is done.

Starting pickle for size 1000 is done.
Format pickle for size 1000 is done.

Starting pickle for size 2000 is done.
Format pickle for size 2000 is done.

Starting pickle for size 5000 is done.
Format pickle for size 5000 is done.

Starting pickle for size 7000 is done.
Format pickle for size 7000 is done.

Starting pickle for size 10000 is done.
Format pickle for size 10000 is done.

Starting pickle for size 20000 is done.
Format pickle for size 20000 is done.

Starting parquet for size 1000 is done.
Format parquet for size 1000 is done.

Starting parquet for size 2000 is done.
Format parquet for size 2000 is done.

Starting parquet for size 5000 is done.
Format parque

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->integer,key->block0_values] [items->Index([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype='int64')]

  df.to_hdf(file_path + '.h5', key='main')


Format hdf for size 1000 is done.

Starting hdf for size 2000 is done.


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->integer,key->block0_values] [items->Index([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype='int64')]

  df.to_hdf(file_path + '.h5', key='main')


Format hdf for size 2000 is done.

Starting hdf for size 5000 is done.


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->integer,key->block0_values] [items->Index([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype='int64')]

  df.to_hdf(file_path + '.h5', key='main')


Format hdf for size 5000 is done.

Starting hdf for size 7000 is done.


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->integer,key->block0_values] [items->Index([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype='int64')]

  df.to_hdf(file_path + '.h5', key='main')


Format hdf for size 7000 is done.

Starting hdf for size 10000 is done.


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->integer,key->block0_values] [items->Index([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype='int64')]

  df.to_hdf(file_path + '.h5', key='main')


Format hdf for size 10000 is done.

Starting hdf for size 20000 is done.


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->integer,key->block0_values] [items->Index([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype='int64')]

  df.to_hdf(file_path + '.h5', key='main')


Format hdf for size 20000 is done.

Starting json for size 1000 is done.
Format json for size 1000 is done.

Starting json for size 2000 is done.
Format json for size 2000 is done.

Starting json for size 5000 is done.
Format json for size 5000 is done.

Starting json for size 7000 is done.
Format json for size 7000 is done.

Starting json for size 10000 is done.
Format json for size 10000 is done.

Starting json for size 20000 is done.
Format json for size 20000 is done.

Starting feather for size 1000 is done.
Format feather for size 1000 is done.

Starting feather for size 2000 is done.
Format feather for size 2000 is done.

Starting feather for size 5000 is done.
Format feather for size 5000 is done.

Starting feather for size 7000 is done.
Format feather for size 7000 is done.

Starting feather for size 10000 is done.
Format feather for size 10000 is done.

Starting feather for size 20000 is done.
Format feather for size 20000 is done.

Starting html for size 1000 is done.
Format 

In [8]:
df_metrics

Unnamed: 0,df_size,format,saving_time,loading_time,file_size
0,1000,csv,0.002341,0.001214,0.040798
0,2000,csv,0.003147,0.001331,0.082651
0,5000,csv,0.006992,0.002786,0.208302
0,7000,csv,0.009675,0.002857,0.292174
0,10000,csv,0.013166,0.003411,0.417576
0,20000,csv,0.027689,0.006094,0.845834
0,1000,pickle,0.000588,0.000345,0.026903
0,2000,pickle,0.000791,0.000863,0.053178
0,5000,pickle,0.001206,0.001362,0.131677
0,7000,pickle,0.00131,0.001642,0.183928
