# Storage Benchmark

### Loading Libraries

In [4]:
# Data Manipulation
import pandas as pd

# Numerical Computing
import numpy as np

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Random Test-Data Generation
import random
import string

# Path & File Access
from pathlib import Path

# Warnings
import warnings

In [5]:
# Silence library warnings for the whole notebook.
warnings.filterwarnings('ignore')

# White background with grid lines for the result charts.
sns.set_style('whitegrid')

In [6]:
# Collects one entry per storage format, each holding 'read', 'write' and 'size'.
results = dict()

### Generate Test Data

In [7]:
def generate_test_data(nrows=100000, numerical_cols=2000, text_cols=0, text_length=10, seed=None):
    """Build a synthetic DataFrame used as benchmark input.

    Parameters
    ----------
    nrows : int
        Number of rows.
    numerical_cols : int
        Number of columns filled with uniform random floats in [0, 1).
    text_cols : int
        Number of columns in which every cell holds the same random
        ASCII-letter string.
    text_length : int
        Length of that string.
    seed : int, optional
        When given, private random generators are seeded with it so the
        result is reproducible. When None (default), the global ``random``
        and ``np.random`` state is used.

    Returns
    -------
    pd.DataFrame
        Frame of shape ``(nrows, numerical_cols + text_cols)`` whose column
        labels are the strings '0', '1', ...
    """
    if seed is None:
        pick_letter, draw_floats = random.choice, np.random.random
    else:
        pick_letter = random.Random(seed).choice
        draw_floats = np.random.default_rng(seed).random

    s = "".join(pick_letter(string.ascii_letters) for _ in range(text_length))
    data = pd.concat([pd.DataFrame(draw_floats(size=(nrows, numerical_cols))),
                      pd.DataFrame(np.full(shape=(nrows, text_cols), fill_value=s))],
                     axis=1, ignore_index=True)
    # ignore_index yields integer labels; use strings as column labels instead.
    data.columns = [str(i) for i in data.columns]
    return data

In [8]:
# Label for this run: used as the results file name and as the 'Data' column.
# It must describe the frame generated in the next cell; that call requests
# text columns, so this run is the mixed data set. Use 'Numeric' only together
# with text_cols=0.
data_type = 'Mixed'

In [9]:
# Benchmark input: 1000 random-float columns plus 1000 constant-string columns.
df = generate_test_data(
    text_cols=1000,
    numerical_cols=1000,
)

# Summarise shape, dtypes and memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Columns: 2000 entries, 0 to 1999
dtypes: float64(1000), object(1000)
memory usage: 1.5+ GB


### Parquet

#### Size

In [10]:
# Scratch file (relative to the working directory) for the Parquet benchmark.
parquet_file = Path('test.parquet')

In [15]:
# Write once so the file exists for the size measurement and the read benchmark.
df.to_parquet(parquet_file)

# On-disk size in bytes.
size = parquet_file.stat().st_size

#### Read

In [16]:
%%timeit -o
# -o makes the magic return a TimeitResult, which the next cell captures.
df = pd.read_parquet(parquet_file)

804 ms ± 3.32 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


<TimeitResult : 804 ms ± 3.32 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)>

In [17]:
# `_` is the previous cell's output: the TimeitResult of the read benchmark.
read = _

In [18]:
# Remove the file so the write benchmark starts without an existing file.
parquet_file.unlink()

#### Write

In [19]:
%%timeit -o
# The unlink is part of the timed body so every run writes a fresh file.
df.to_parquet(parquet_file)
parquet_file.unlink()

7.43 s ± 103 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


<TimeitResult : 7.43 s ± 103 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)>

In [20]:
# `_` is the previous cell's output: the TimeitResult of the write benchmark.
write = _

#### Results

In [21]:
# Mean read/write time over all timeit runs, plus the recorded file size.
results['Parquet'] = dict(
    read=np.mean(read.all_runs),
    write=np.mean(write.all_runs),
    size=size,
)

### HDF5

In [22]:
# Scratch HDF5 file (relative to the working directory) shared by all HDF benchmarks.
test_store = Path('index.h5')

### Fixed Format

#### Size

In [23]:
# Store the frame under key 'file' via put(), then measure the file.
with pd.HDFStore(test_store) as store:
    store.put('file', df)
    
# On-disk size in bytes.
size = test_store.stat().st_size

#### Read

In [24]:
%%timeit -o
# Opening and closing the store is included in the timed body.
with pd.HDFStore(test_store) as store:
    store.get('file')

51.4 s ± 799 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


<TimeitResult : 51.4 s ± 799 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)>

In [25]:
# `_` is the previous cell's output: the TimeitResult of the read benchmark.
read = _

In [26]:
# Remove the store so the write benchmark starts without an existing file.
test_store.unlink()

#### Write

In [27]:
%%timeit -o
# The unlink is part of the timed body so every run writes a fresh file.
with pd.HDFStore(test_store) as store:
    store.put('file', df)
test_store.unlink()

14.2 s ± 187 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


<TimeitResult : 14.2 s ± 187 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)>

In [28]:
# `_` is the previous cell's output: the TimeitResult of the write benchmark.
write = _

#### Results

In [29]:
# Mean read/write time over all timeit runs, plus the recorded file size.
results['HDF Fixed'] = dict(
    read=np.mean(read.all_runs),
    write=np.mean(write.all_runs),
    size=size,
)

### Table Format

#### Size

In [30]:
# Store the frame under key 'file' via append() with format='t', then measure the file.
with pd.HDFStore(test_store) as store:
    store.append('file', df, format='t')
    
# On-disk size in bytes.
size = test_store.stat().st_size    

#### Read

In [31]:
%%timeit -o
# Opening and closing the store is included in the timed body.
with pd.HDFStore(test_store) as store:
    df = store.get('file')

15.9 s ± 912 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


<TimeitResult : 15.9 s ± 912 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)>

In [32]:
# `_` is the previous cell's output: the TimeitResult of the read benchmark.
read = _

In [33]:
# Remove the store so the write benchmark starts without an existing file.
test_store.unlink()

#### Write

In [34]:
%%timeit -o
# The unlink is part of the timed body so every run appends to a fresh file.
with pd.HDFStore(test_store) as store:
    store.append('file', df, format='t')
test_store.unlink()    

29.7 s ± 1.87 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


<TimeitResult : 29.7 s ± 1.87 s per loop (mean ± std. dev. of 7 runs, 1 loop each)>

In [35]:
# `_` is the previous cell's output: the TimeitResult of the write benchmark.
write = _

#### Results

In [36]:
# Mean read/write time over all timeit runs, plus the recorded file size.
results['HDF Table'] = dict(
    read=np.mean(read.all_runs),
    write=np.mean(write.all_runs),
    size=size,
)

### Table Select

#### Size

In [38]:
# Store the frame under key 'file' via append() with format='t' and data_columns.
# NOTE(review): generate_test_data labels its columns '0', '1', ...; 'company' and
# 'form' are not among them, so these data_columns look left over from another
# data set — confirm what was intended here.
with pd.HDFStore(test_store) as store:
    store.append('file', df, format='t', data_columns=['company', 'form'])
    
# On-disk size in bytes.
size = test_store.stat().st_size 

#### Read

In [39]:
# NOTE(review): not referenced by any visible cell — presumably the value a
# select query was meant to filter on; confirm or remove.
company = 'APPLE INC'

In [40]:
%%timeit
with pd.HDFStore(test_store) as store:
    s = store.get('file')

15.3 s ± 1.34 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [41]:
# NOTE(review): `_` is only this benchmark's TimeitResult if the preceding cell
# ran with `%%timeit -o`; otherwise it holds whatever output was displayed last.
read = _

In [42]:
# Remove the store so the write benchmark starts without an existing file.
test_store.unlink()

#### Write

In [43]:
%%timeit
with pd.HDFStore(test_store) as store:
    store.append('file', df, format='t', data_columns=['company', 'form'])
test_store.unlink() 

29.7 s ± 1.88 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [44]:
# NOTE(review): `_` is only this benchmark's TimeitResult if the preceding cell
# ran with `%%timeit -o`; otherwise it holds whatever output was displayed last.
write = _

#### Results

In [45]:
# Mean read/write time over all timeit runs, plus the recorded file size.
results['HDF Select'] = dict(
    read=np.mean(read.all_runs),
    write=np.mean(write.all_runs),
    size=size,
)

### CSV

In [48]:
# Scratch file (relative to the working directory) for the CSV benchmark.
test_csv = Path('test.csv')

#### Size

In [49]:
# Write once so the file exists for the size measurement and the read benchmark.
df.to_csv(test_csv)

# Record the CSV size in bytes. Without this assignment the CSV results entry
# would reuse the `size` left over from the preceding HDF section.
size = test_csv.stat().st_size
size

3027570280

#### Read

In [50]:
%%timeit -o
# -o makes the magic return a TimeitResult, which the next cell captures.
df = pd.read_csv(test_csv)

9.75 s ± 353 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


<TimeitResult : 9.75 s ± 353 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)>

In [51]:
# `_` is the previous cell's output: the TimeitResult of the read benchmark.
read = _

In [52]:
# Remove the file so the write benchmark starts without an existing file.
test_csv.unlink()  

#### Write

In [53]:
%%timeit -o
# The unlink is part of the timed body so every run writes a fresh file.
df.to_csv(test_csv)
test_csv.unlink()

1min 4s ± 429 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


<TimeitResult : 1min 4s ± 429 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)>

In [54]:
# `_` is the previous cell's output: the TimeitResult of the write benchmark.
write = _

#### Results

In [55]:
# Mean read/write time over all timeit runs, plus the current value of `size`.
results['CSV'] = dict(
    read=np.mean(read.all_runs),
    write=np.mean(write.all_runs),
    size=size,
)

### Store Results

In [56]:
# One row per storage format, with 'read', 'write' and 'size' as columns.
# The display cells below rely on this orientation: they name the index
# 'Storage' and access the capitalised Read/Write/Size columns.
# pd.DataFrame(results) would instead put the formats in the columns.
(pd.DataFrame.from_dict(results, orient='index')
 .assign(Data=data_type)
 .to_csv(f'{data_type}.csv'))

### Display Results

In [58]:
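# Earlier version of the next cell, kept for reference only. It relied on
# DataFrame.append, which was removed in pandas 2.0; the next cell does the
# same with pd.concat.
# df = (pd.read_csv('Numeric.csv', index_col=0)
#       .append(pd.read_csv('Mixed.csv', index_col=0))
#       .rename(columns=str.capitalize))
# df.index.name='Storage'
# df = df.set_index('Data', append=True).unstack()
# df.Size /= 1e9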

In [60]:
# Load the stored summaries of both runs; the first CSV column becomes the index.
df1 = pd.read_csv('Numeric.csv', index_col=0)
df2 = pd.read_csv('Mixed.csv', index_col=0)

# Stack both runs and capitalise the column labels.
df = pd.concat([df1, df2])
df = df.rename(columns=str.capitalize)

# Name the index, then pivot the 'Data' label into a second column level.
df = df.rename_axis(index='Storage')
df = df.set_index('Data', append=True).unstack()

# Scale the Size columns by 1e9 (plotted as GB below).
df['Size'] /= 1e9

In [63]:
# One horizontal bar chart per metric, side by side.
fig, axes = plt.subplots(ncols=3, figsize=(16, 4))
for ax, op in zip(axes, ['Read', 'Write', 'Size']):
    # Read and Write are timings and use a log-scaled x axis; Size is linear.
    is_timing = op != 'Size'
    df.loc[:, op].plot.barh(title=op, ax=ax, logx=is_timing)
    ax.set_xlabel('seconds (log scale)' if is_timing else 'GB')
fig.tight_layout()
fig.savefig('storage', dpi=300);