In [2]:
import pandas as pd
import numpy as np

In [5]:
def get_dataset(size):
    # Create fake dataset
    df = pd.DataFrame()
    df["size"] = np.random.choice(["big", "medium", "small"], size)
    df["age"] = np.random.randint(1, 50, size)
    df["team"] = np.random.choice(["red", "blue", "yellow", "green"], size)
    df["win"]: np.random.choice(["yes", "no"], size)
    dates = pd.date_range("2023-01-01", "2023-12-31")
    df["date"] = np.random.choice(dates, size)
    df["prob"] = np.random.uniform(0, 1, size)
    
    return df


def set_dtypes(df):
    df["size"] = df["size"].astype("category")
    df["team"] = df["team"].astype("category")
    df["age"] = df["age"].astype("int16")
    df["dq"] = df["dq"].map({"yes": True, "no": False})
    df["prob"] = df["prob"].astype("float16")
    
    return df


In [9]:
df = get_dataset(1_000_000)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 5 columns):
 #   Column  Non-Null Count    Dtype         
---  ------  --------------    -----         
 0   size    1000000 non-null  object        
 1   age     1000000 non-null  int64         
 2   team    1000000 non-null  object        
 3   date    1000000 non-null  datetime64[ns]
 4   prob    1000000 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int64(1), object(2)
memory usage: 38.1+ MB


In [11]:
df.head()

Unnamed: 0,size,age,team,date,prob
0,big,34,green,2023-05-14,0.83037
1,medium,12,blue,2023-05-17,0.406155
2,medium,28,yellow,2023-06-06,0.197815
3,medium,4,blue,2023-06-20,0.625448
4,big,3,green,2023-12-06,0.008122


# To CSV

In [26]:
%%timeit
df = get_dataset(1_000_000)
df.to_csv("test_csv.csv", index=False)

1.46 s ± 26.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [17]:
!ls -GFlash test_csv.csv

100368 -rw-r--r--@ 1 julio  staff    48M Oct  2 06:52 test_csv.csv


In [27]:
%%timeit
df = pd.read_csv("test_csv.csv")

245 ms ± 3.47 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [24]:
df.head()

Unnamed: 0,size,age,team,date,prob
0,small,23,yellow,2023-08-09,0.630798
1,small,40,yellow,2023-02-02,0.6764
2,big,48,blue,2023-04-17,0.365593
3,big,44,green,2023-06-12,0.780306
4,medium,48,green,2023-12-22,0.88955


# Pickle

In [29]:
df2 = get_dataset(1_000_000)
%timeit df2.to_pickle("test.pickle")
%timeit df_pickle = pd.read_pickle("test.pickle")

212 ms ± 4.96 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
86.9 ms ± 3.22 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [30]:
df2.head()


Unnamed: 0,size,age,team,date,prob
0,small,35,red,2023-09-29,0.410612
1,medium,28,green,2023-06-21,0.880576
2,big,3,blue,2023-08-26,0.667634
3,big,9,green,2023-06-11,0.988966
4,small,28,yellow,2023-06-24,0.480027


In [31]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 5 columns):
 #   Column  Non-Null Count    Dtype         
---  ------  --------------    -----         
 0   size    1000000 non-null  object        
 1   age     1000000 non-null  int64         
 2   team    1000000 non-null  object        
 3   date    1000000 non-null  datetime64[ns]
 4   prob    1000000 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int64(1), object(2)
memory usage: 38.1+ MB


# Parquet

In [35]:
df3 = get_dataset(1_000_000)
%timeit df3.to_parquet("test.parquet")


124 ms ± 985 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [49]:
%timeit df_parquet = pd.read_parquet("test.parquet")
df_parquet = pd.read_parquet("test.parquet")
df_parquet.info()

46.5 ms ± 138 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 5 columns):
 #   Column  Non-Null Count    Dtype         
---  ------  --------------    -----         
 0   size    1000000 non-null  object        
 1   age     1000000 non-null  int64         
 2   team    1000000 non-null  object        
 3   date    1000000 non-null  datetime64[ns]
 4   prob    1000000 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int64(1), object(2)
memory usage: 38.1+ MB


In [50]:
df_parquet.head()

Unnamed: 0,size,age,team,date,prob
0,small,40,green,2023-07-20,0.293874
1,big,24,blue,2023-08-05,0.085979
2,small,47,red,2023-07-25,0.576642
3,big,3,blue,2023-05-21,0.821238
4,small,4,blue,2023-11-13,0.236005


In [42]:
!ls -GFlash test.parquet

20888 -rw-r--r--@ 1 julio  staff    10M Oct  2 10:03 test.parquet


In [43]:
pd.read_parquet("test.parquet", columns=["date", "size", "age"])

Unnamed: 0,date,size,age
0,2023-07-20,small,40
1,2023-08-05,big,24
2,2023-07-25,small,47
3,2023-05-21,big,3
4,2023-11-13,small,4
...,...,...,...
999995,2023-07-15,small,1
999996,2023-05-21,small,12
999997,2023-07-02,big,30
999998,2023-06-01,small,10


# Feather

In [44]:
df4 = get_dataset(1_000_000)
%timeit df4.to_feather("test.feather")
%timeit df4_feather = pd.read_feather("test.feather")

58.5 ms ± 3.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
36.5 ms ± 110 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [46]:
df4_feather = pd.read_feather("test.feather")

In [48]:
df4_feather.info()
df4_feather.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 5 columns):
 #   Column  Non-Null Count    Dtype         
---  ------  --------------    -----         
 0   size    1000000 non-null  object        
 1   age     1000000 non-null  int64         
 2   team    1000000 non-null  object        
 3   date    1000000 non-null  datetime64[ns]
 4   prob    1000000 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int64(1), object(2)
memory usage: 38.1+ MB


Unnamed: 0,size,age,team,date,prob
0,small,15,blue,2023-06-13,0.065698
1,big,40,yellow,2023-08-13,0.348198
2,medium,28,yellow,2023-03-05,0.220668
3,big,39,yellow,2023-09-20,0.487917
4,medium,1,blue,2023-03-21,0.348706
