<h1>Extending the Pandas dataframe: Dask dataframe</h1>

Pandas is a very powerful library for doing data analytics on tabular data such as CSV files.

In [None]:
import pandas as pd
import numpy as np
import datetime as dt
import glob
import os

# Hourly files of LBNL SGP in-situ data, 2-3 lines per file.
# NOTE(review): sdir is an absolute, user-specific path; adjust it when running elsewhere.
sdir='/home/ccg/mund/tmp/lns/'
# Output directory; results are written below it.
odir='output/monthly_means/'
# Glob patterns selecting the input CSV files under data1/ and data2/.
file_filter=sdir+"data1/*-1.csv"
file_filter2=sdir+"data2/*-1.csv"
# NOTE(review): `out` is not used by any visible cell; kept in case other cells rely on it.
out=[]

# Create the output directory up front so writing results into it cannot
# fail just because the directory is missing.
os.makedirs(odir,exist_ok=True)

# glob returns matches in arbitrary order; sort for a reproducible file list.
files=sorted(glob.glob(file_filter))
print("Processing ",len(files)," files.")

In [None]:
# List the files matched by the glob pattern; IPython substitutes $file_filter before the shell runs.
! ls -lh $file_filter

In [None]:
# Show the first lines of one sample CSV file to see its layout.
# NOTE(review): hard-coded absolute path that does not use sdir — confirm it has the same layout as the files matched by file_filter.
! head /ccg/non-gmd/en/icp/insitu/2021-01-26/ghg40-b-cal-1.csv

<h3>Using traditional Pandas dataframes</h3>Loop through the files in a directory, concatenate the resulting dataframes, and generate monthly averages.

In [None]:
%%time
n=0;frames=[]
for f in files:
    frames.append(pd.read_csv(f,usecols=['time',' co2_dry'],parse_dates=['time'],na_values=" nan"))
    n+=1
df=pd.concat(frames)
df=df.dropna()
a=df.groupby(pd.Grouper(key='time',freq='M'))[' co2_dry'].mean()
a.to_csv(odir+'looped_pandas.csv')
print("Rows:",n)

#Rows: 76
#CPU times: user 52.7 s, sys: 6.86 s, total: 59.6 s
#Wall time: 1min 12s

In [None]:
# Call info() so the column/dtype/memory summary is printed; a bare
# `df.info` only displays the bound method instead of running it.
df.info()

<h3>Dask dataframe</h3> Dask extends Pandas to add parallel processing. Internally, a Dask dataframe is a collection of Pandas dataframes (partitions), and it delegates operations to them.

In [None]:
# Start a local Dask cluster and connect a client to it: four worker
# processes, one thread each, with memory_limit='1GB' to restrict memory usage.
from dask.distributed import Client, progress

client = Client(
    n_workers=4,
    threads_per_worker=1,
    processes=True,
    memory_limit='1GB',
)
# Last expression of the cell: display the client summary.
client

In [None]:
%%time
import dask.dataframe as dd
df=dd.read_csv(file_filter2,usecols=['time',' co2_dry'],parse_dates=['time'],na_values=" nan",blocksize=64 * 1024 * 1024)
df=df.dropna()
a=df.groupby(pd.Grouper(key='time',freq='M'))[' co2_dry'].mean()

In [None]:
# Preview the first rows of df (the Dask dataframe, if the preceding cell has run).
df.head()

In [None]:
%%time
# Write the monthly means below odir and time the call.
# NOTE(review): with Dask, the '*' in the file name is presumably replaced by a
# partition number, and this call is where the lazy computation actually runs —
# confirm against the Dask to_csv documentation.
a.to_csv(odir+'dask_dataframe*.csv')

In [None]:
# Shut down the Dask client started earlier so its worker processes do not linger.
client.shutdown()