<h1>When not to use Dask Dataframe</h1>

Loading many files, small files, or especially many small files is often best done directly in pandas. The extra complexity and overhead of parallelizing operations are not always worth it.

In [None]:
import pandas as pd
import numpy as np
import datetime as dt
import glob
import os

# Hourly files of LBNL SGP in-situ data, 2-3 lines per file.
sdir='/ccg/non-gmd/lbnl/icp/insitu/2020-10-15/'
# Directory that receives the monthly-mean CSV output.
odir='output/monthly_means/'
# pandas' to_csv does not create missing parent directories, so make sure the
# output directory exists before any later cell writes into it.
os.makedirs(odir,exist_ok=True)
# Glob pattern selecting the 2020 SGP_PGSISO csv files under sdir.
file_filter=sdir+"SGP_PGSISO*2020*csv"
# Not used by the cells visible here; kept in case other cells rely on it.
out=[];

# glob returns matches in arbitrary order; sort so every run sees the same sequence.
files=sorted(glob.glob(file_filter))
print("Processing ",len(files)," files.")

In [None]:
! ls -lh $file_filter

In [None]:
! cat /ccg/non-gmd/lbnl/icp/insitu/2020-10-15/SGP_PGSISO_b1_20200101_0000.csv

<h3>Using traditional Pandas dataframes</h3>Loop through a directory, concatenate the dataframes, and generate monthly averages.

In [None]:
%%time
n=0;frames=[]
for f in files:
    frames.append(pd.read_csv(f,sep=',',comment='#',usecols=['TIMESTAMP','CO2_AVG_CORR'],parse_dates=['TIMESTAMP']))
    n+=1
df=pd.concat(frames)
a=df.groupby(pd.Grouper(key='TIMESTAMP',freq='M'))['CO2_AVG_CORR'].mean()
a.to_csv(odir+'looped_pandas.csv')
print("Rows:",n)
    

<h3>Dask dataframe</h3> Extends Pandas to add parallel processing. Internally, a Dask dataframe is a collection of Pandas dataframes (one per partition), and it delegates operations to them.

In [None]:
# Start a local Dask cluster: four single-threaded worker processes, with
# memory usage restricted through memory_limit.
from dask.distributed import Client, progress
client = Client(
    n_workers=4,
    threads_per_worker=1,
    processes=True,
    memory_limit='1GB',
)
# Bare expression so the notebook renders the client summary.
client

In [None]:
%%time
import dask.dataframe as dd
df=dd.read_csv(file_filter,sep=',',comment='#',usecols=['TIMESTAMP','CO2_AVG_CORR'],parse_dates=['TIMESTAMP'])

In [None]:
# Preview the first rows of the Dask dataframe to check the parsed columns.
df.head()

In [None]:
# Monthly mean of CO2_AVG_CORR on the Dask dataframe, using the same grouping
# expression as the pandas version. Dask evaluates lazily, so this line only
# describes the computation.
a = (
    df.groupby(pd.Grouper(key='TIMESTAMP', freq='M'))['CO2_AVG_CORR']
    .mean()
)

In [None]:
%%time
a.to_csv(odir+'dask_dataframe*.csv')

In [None]:
# Shut down the Dask client created earlier in the notebook now that the
# comparison is finished.
client.shutdown()

<h3>Dask has overhead</h3>Pandas is highly optimized and generally faster when dealing with many small files that easily fit into memory.