In [2]:
import numpy as np
import pandas as pd

def get_dataset(size: int = 0, seed: "int | None" = None) -> pd.DataFrame:
    """Build a synthetic invoice DataFrame with ``size`` randomly generated rows.

    Columns, in order:
        NFe:    random integer in [10_000_000, 99_999_999) (upper bound exclusive).
        Estado: one of "PB", "PE", "SP", "RJ".
        Date:   a day drawn from 2023-01-01 .. 2024-12-31 (both ends included).
        Item:   one of "Coca-Cola", "Água", "Pepsi".
        Price:  uniform float in [1, 5).

    Args:
        size: Number of rows to generate. The default 0 yields an empty frame
            that still has all five columns.
        seed: Optional seed for reproducible data. When ``None`` (default) the
            values are drawn from NumPy's global legacy random state, so a
            preceding ``np.random.seed(...)`` call is still honoured.

    Returns:
        A new DataFrame with a default integer index.
    """
    # RandomState exposes the same randint/choice/uniform API as the np.random
    # module, so both paths share the code below; RandomState(seed) produces the
    # same stream as np.random.seed(seed) followed by the module-level calls.
    rng = np.random if seed is None else np.random.RandomState(seed)
    dates = pd.date_range('2023-01-01', '2024-12-31')
    # Dict entries are evaluated top to bottom, which fixes the draw order
    # (NFe, Estado, Date, Item, Price) and therefore the seeded output.
    return pd.DataFrame({
        "NFe": rng.randint(10_000_000, 99_999_999, size),
        "Estado": rng.choice(["PB", "PE", "SP", "RJ"], size),
        "Date": rng.choice(dates, size),
        "Item": rng.choice(["Coca-Cola", "Água", "Pepsi"], size),
        "Price": rng.uniform(1, 5, size),
    })

def set_dtypes(df: "pd.DataFrame | None" = None) -> pd.DataFrame:
    """Cast the benchmark columns to compact, explicit dtypes.

    ``NFe`` becomes int64, ``Estado`` and ``Item`` become category, and
    ``Price`` becomes float32. Other columns are left untouched.

    Args:
        df: Frame containing the columns ``NFe``, ``Estado``, ``Item`` and
            ``Price``. It is modified in place.

    Returns:
        The same ``df`` object, after the casts, so the call can be chained
        or reassigned.

    Raises:
        TypeError: If no frame is passed.
        KeyError: If one of the four columns is missing from a DataFrame.
    """
    # The default is None rather than a mutable list: a list can never be a
    # valid frame and only produced an obscure indexing error further down.
    if df is None:
        raise TypeError("set_dtypes() requires a DataFrame, but none was given")

    target_dtypes = {
        "NFe": "int64",
        "Estado": "category",
        "Item": "category",
        "Price": "float32",
    }
    for column, dtype in target_dtypes.items():
        df[column] = df[column].astype(dtype)
    return df

# Generate the 1,000,000-row benchmark frame and cast its columns in a single
# step; `df` is the frame every I/O cell below writes out.
df = set_dtypes(get_dataset(1_000_000))
# Preview the first rows as the cell output.
df.head()



Unnamed: 0,NFe,Estado,Date,Item,Price
0,22520198,SP,2023-12-30,Coca-Cola,2.062974
1,87499691,SP,2024-11-10,Coca-Cola,2.495694
2,91615650,RJ,2024-06-20,Água,3.314404
3,96223808,PB,2024-08-25,Pepsi,3.125198
4,17905966,PE,2023-02-14,Coca-Cola,4.43034


# CSV

In [21]:
%%timeit
df.to_csv('../data/df.csv', index=True)

3.27 s ± 40 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [27]:
%%timeit
# Time parsing ../data/df.csv (the path the to_csv cell writes) back into a DataFrame.
# NOTE(review): no dtype/parse_dates arguments are passed, so the dtypes applied by
# set_dtypes are presumably not restored on load — confirm this is the intended comparison.
aux = pd.read_csv('../data/df.csv')

366 ms ± 13.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [24]:
# Shell out to `du` to show the CSV file's on-disk size in human-readable units.
!du -sh ../data/df.csv

 45M	../data/df.csv


# Pickle

In [25]:
%%timeit
# Time serialising the whole frame to a pickle file.
df.to_pickle('../data/df.pkl')

6.97 ms ± 638 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [26]:
%%timeit
# Time loading the frame back from ../data/df.pkl (the path the to_pickle cell writes).
# Unpickling can execute arbitrary code, so read_pickle should only be pointed at
# files from a trusted source such as this notebook's own output.
aux = pd.read_pickle('../data/df.pkl')

2.52 ms ± 29.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [28]:
# Shell out to `du` to show the pickle file's on-disk size in human-readable units.
!du -sh ../data/df.pkl

 21M	../data/df.pkl


# Parquet

In [31]:
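# to_parquet / read_parquet need a Parquet engine (pyarrow or fastparquet).
# If neither is installed, run one of these once in its own cell:
# %pip install pyarrow
# %pip install fastparquet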

In [32]:
%%timeit
# Time writing the whole frame to a Parquet file; no engine or compression
# argument is given, so pandas' defaults apply.
df.to_parquet('../data/df.parquet')

94.7 ms ± 6.72 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [33]:
%%timeit
# Time loading the frame back from ../data/df.parquet (the path the to_parquet cell writes).
aux = pd.read_parquet('../data/df.parquet')

16.7 ms ± 2.54 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


# Feather

In [3]:
%%timeit
# Time writing the whole frame to a Feather file.
df.to_feather('../data/df.feather')

27.8 ms ± 638 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [4]:
%%timeit
# Time loading the frame back from ../data/df.feather (the path the to_feather cell writes).
aux = pd.read_feather('../data/df.feather')

10.9 ms ± 451 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [9]:
# Summarise the on-disk size of every entry under ../data, largest first
# (`sort -hr` orders the human-readable sizes in descending order).
!du -sh ../data/* | sort -hr

 45M	../data/df.csv
 21M	../data/df.pkl
 13M	../data/df.feather
 12M	../data/df.parquet
