<h1>Extending the Pandas dataframe; Dask dataframe</h2>

Pandas is a very powerful library for doing data analytics on tabular data like csv files.

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import glob
import os

#Hourly files of lbnl sgp insitu data, 2-3 lines per file
sdir='/home/ccg/mund/tmp/lns/'
odir='output/monthly_means/'
file_filter=sdir+"data1/*-1.csv"    
file_filter2=sdir+"data2/*-1.csv"
out=[];

files=glob.glob(file_filter)
print("Processing ",len(files)," files.")

Processing  76  files.


In [2]:
! ls -lh $file_filter

-rwx------ 1 ccg7 data 111M Jan 26 04:59 /home/ccg/mund/tmp/lns/data1/ghg03-2012-1.csv
-rwx------ 1 ccg7 data  29M Jan 26 04:59 /home/ccg/mund/tmp/lns/data1/ghg03-2013-1.csv
-rwx------ 1 ccg7 data  28M Jan 26 04:59 /home/ccg/mund/tmp/lns/data1/ghg03-2014-1.csv
-rwx------ 1 ccg7 data 105M Jan 26 04:59 /home/ccg/mund/tmp/lns/data1/ghg03-2016-1.csv
-rwx------ 1 ccg7 data 9.1M Jan 26 05:00 /home/ccg/mund/tmp/lns/data1/ghg03-cal-1.csv
-rwx------ 1 ccg7 data 114M Jan 26 05:00 /home/ccg/mund/tmp/lns/data1/ghg06-2012-1.csv
-rwx------ 1 ccg7 data 117M Jan 26 05:00 /home/ccg/mund/tmp/lns/data1/ghg06-2013-1.csv
-rwx------ 1 ccg7 data  97M Jan 26 05:01 /home/ccg/mund/tmp/lns/data1/ghg06-2015-1.csv
-rwx------ 1 ccg7 data 118M Jan 26 05:01 /home/ccg/mund/tmp/lns/data1/ghg06-2017-1.csv
-rwx------ 1 ccg7 data 120M Jan 26 05:01 /home/ccg/mund/tmp/lns/data1/ghg06-2018-1.csv
-rwx------ 1 ccg7 data 149M Jan 26 05:02 /home/ccg/mund/tmp/lns/data1/ghg06-2019-1.csv
-rwx------ 1 ccg7 data 138M Jan 26 05:02 /ho

In [3]:
! head /ccg/non-gmd/en/icp/insitu/2021-01-26/ghg40-b-cal-1.csv

time, type, sample, standard, port, inlet_press, sample_press, sample_temp, ambient_temp, cavity_temp, cavity_press, h2o, co2_Ctank, co2_C, co2_wet, co2_dry, co2_stdev, co2_N, co2_target_error, ch4_Ctank, ch4_C, ch4_wet, ch4_dry, ch4_stdev, ch4_N, ch4_target_error, co_Ctank, co_C, co_wet, co_dry, co_stdev, co_N, co_target_error
2016-03-23 06:07, std, JB03880-A, JB03880-A, 4, 756.8992, 589.9939, 37.9452, 20.6386, 44.9998, 140.1239, 0.1058, 397.2400, nan, nan, nan, 3.4692, 0.0, nan, 1863.5699, nan, nan, nan, 3.1292, 0.0, nan, 99.1600, nan, nan, nan, nan, 0.0, nan
2016-03-23 06:08, std, JB03880-A, JB03880-A, 4, 788.9189, 600.0610, 37.9212, 20.9211, 44.9997, 140.0058, 0.0885, 397.2400, nan, nan, nan, 0.0416, 0.0, nan, 1863.5699, nan, nan, nan, 0.1778, 0.0, nan, 99.1600, nan, nan, nan, nan, 0.0, nan
2016-03-23 06:09, std, JB03880-A, JB03880-A, 4, 788.6281, 600.0565, 37.9143, 21.1759, 44.9990, 140.0018, 0.0883, 397.2400, nan, nan, nan, 0.0411, 0.0, nan, 1863.5699, nan, nan, nan, 0.2112, 0.0,

<h3>Using traditional Pandas dataframes</h3>Loop through a directory, concatenate dataframes and generate monthly averages

In [None]:
%%time
n=0;frames=[]
for f in files:
    frames.append(pd.read_csv(f,usecols=['time',' co2_dry'],parse_dates=['time'],na_values=" nan"))
    n+=1
df=pd.concat(frames)
df=df.dropna()
a=df.groupby(pd.Grouper(key='time',freq='M'))[' co2_dry'].mean()
a.to_csv(odir+'looped_pandas.csv')
print("Rows:",n)

#Rows: 76
#CPU times: user 52.7 s, sys: 6.86 s, total: 59.6 s
#Wall time: 1min 12s

In [None]:
df.info

<h3>Dask dataframe</h3> Extends Pandas to add parallel processing.  Internally, the dask dataframe is a series of Pandas dataframes and it delegates operations to them.

In [4]:
#Import distributed and create a local 4 process client with restricted memory usage.  
from dask.distributed import Client, progress
client = Client(processes=True, threads_per_worker=1,
                n_workers=4, memory_limit='1GB')
client

0,1
Client  Scheduler: tcp://127.0.0.1:46295  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 4  Memory: 4.00 GB


In [5]:
%%time
import dask.dataframe as dd
df=dd.read_csv(file_filter2,usecols=['time',' co2_dry'],parse_dates=['time'],na_values=" nan",blocksize=64 * 1024 * 1024)
df=df.dropna()
a=df.groupby(pd.Grouper(key='time',freq='M'))[' co2_dry'].mean()

CPU times: user 377 ms, sys: 113 ms, total: 490 ms
Wall time: 1.62 s


In [6]:
df.head()

Unnamed: 0,time,co2_dry
0,2012-01-01 00:00:00,394.741
1,2012-01-01 00:01:00,394.736
2,2012-01-01 00:02:00,394.747
5,2012-01-01 00:04:00,394.886
6,2012-01-01 00:05:00,394.805


In [7]:
%%time
a.to_csv(odir+'dask_dataframe*.csv')

CPU times: user 4.02 s, sys: 429 ms, total: 4.45 s
Wall time: 34.3 s


['/home/ccg/mund/dev/python_class/AtmosphericPythonCourse/dask/output/monthly_means/dask_dataframe0.csv']

In [8]:
client.shutdown()

distributed.client - ERROR - Failed to reconnect to scheduler after 10.00 seconds, closing client
_GatheringFuture exception was never retrieved
future: <_GatheringFuture finished exception=CancelledError()>
asyncio.exceptions.CancelledError
