In [1]:
from datetime import datetime, timedelta
import pandas as pd
import numpy as np
import xarray as xr
from IPython.display import display

In [2]:
# Read the wildfire records; every later cell reads from / writes to `df`.
# NOTE(review): the recorded output of this cell shows a truncated warning.
# If it is pandas' DtypeWarning (mixed-type columns), consider passing
# explicit `dtype=` for those columns -- confirm against a fresh run.
df = pd.read_csv(filepath_or_buffer="wildfires.csv")
print("loaded df")

loaded df


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [3]:
# Dataset settings: one entry per weather dataset to join onto the fires.
#   'name' -> file prefix, used to build 'mnet/<name>_<year>.nc'
#   'col'  -> data variable to read from that file
#   'ops'  -> reductions to compute; each becomes a '<name>_<op>' column
mnet_datasets = [
    # Disabled entry, kept so it can be toggled back on:
    #{
    #    'name': 'pdsi',
    #    'col': 'palmer_drought_severity_index',
    #    'ops': ['min', 'max', 'avg', 'count']
    #},
    {
        'name': 'pr',
        'col': 'precipitation_amount',
        'ops': ['min', 'max', 'avg', 'count']
    },
    {
        'name': 'rmax',
        'col': 'relative_humidity',
        'ops': ['min', 'max', 'avg', 'count']
    },
    {
        'name': 'rmin',
        'col': 'relative_humidity',
        'ops': ['min', 'max', 'avg', 'count']
    },
    {
        'name': 'tmmn',
        'col': 'air_temperature',
        'ops': ['min', 'max', 'avg', 'count']
    },
    {
        'name': 'tmmx',
        'col': 'air_temperature',
        'ops': ['min', 'max', 'avg', 'count']
    },
    {
        'name': 'vs',
        # Fixed copy/paste slip: the wind-speed files do not contain
        # 'air_temperature'; their data variable is 'wind_speed'.
        'col': 'wind_speed',
        'ops': ['min', 'max', 'avg', 'count']
    }
]
# constant values
# Reference date used to turn calendar dates into integer day offsets that are
# compared against the datasets' `day` coordinate.
jan1st1900 = datetime(1900, 1, 1)
# Refresh the progress display once every this many processed rows.
progress_count = 100

In [4]:
# main function for executing the different operations for each series
def execute_op(op, entries_df):
    """Reduce one series of selected values to a single number.

    Parameters
    ----------
    op : str
        One of 'min', 'max', 'avg' or 'count'.
    entries_df : pandas.Series
        The values selected for a single record.

    Returns
    -------
    scalar
        'count' returns ``len(entries_df)``. 'min' / 'max' / 'avg' return the
        corresponding reduction, or NaN when the series is empty. Any other
        ``op`` returns NaN.
    """
    n_entries = len(entries_df)
    if op == 'count':
        return n_entries
    if op not in ('min', 'max', 'avg'):
        return np.nan
    if n_entries == 0:
        # nothing selected: report "no value" instead of raising from iloc[0]
        return np.nan
    if n_entries == 1:
        # optimization: a single entry is its own min / max / mean
        return entries_df.iloc[0]
    # Two entries are NOT interchangeable in general (e.g. values taken on two
    # different days), so they must be reduced like any longer series.
    if op == 'min':
        return entries_df.min()
    if op == 'max':
        return entries_df.max()
    return entries_df.mean()

In [6]:
# process the data & output progress
# For every fire year and every configured dataset: open that year's NetCDF
# file, look up the values at each fire's location on its discovery and
# containment days, reduce them with execute_op, and write the results into
# new '<dataset>_<op>' columns of `df`. The augmented frame is saved as CSV.
years = df['FIRE_YEAR'].unique()
for y in years:
    # go through each dataset per year
    for ds_metadata in mnet_datasets:
        mnet_ds_filename = 'mnet/%s_%d.nc' % (ds_metadata['name'], y)
        processed_count, invalid_count = 0, 0
        df_y = df[df['FIRE_YEAR'] == y]

        # The context manager closes the file handle once the year is done
        # (or if a row raises); previously every opened dataset stayed open.
        # NOTE(review): decode_cf=False keeps `day` as a plain number, but per
        # the xarray docs it also disables scale_factor / add_offset /
        # _FillValue decoding -- confirm the stored values are meant to be raw.
        with xr.open_dataset(mnet_ds_filename, decode_cf=False) as mnet_ds:
            time_start = datetime.now().timestamp()
            print("opened up %s" % mnet_ds_filename)
            dh = display('opened up %s' % mnet_ds_filename, display_id=True)
            # loop thru rows
            for index, row in df_y.iterrows():
                f_year, disc_doy, cont_doy = row['FIRE_YEAR'], row['DISCOVERY_DOY'], row['CONT_DOY']
                latitude, longitude = row['LATITUDE'], row['LONGITUDE']
                if np.isnan(f_year) or np.isnan(disc_doy) or np.isnan(latitude) or np.isnan(longitude):
                    invalid_count += 1
                    print("\tskipping invalid index %d" % (index), f_year, disc_doy, cont_doy, latitude, longitude)
                    continue

                # date from year + day of year.
                # Day-of-year is taken to follow the usual 1-based convention
                # (day 1 == Jan 1), so the offset from Jan 1 is doy - 1; adding
                # the full doy shifted every date one day forward.
                year_start = datetime(int(f_year), 1, 1)
                # fall back to the discovery day when no containment day exists
                end_doy = disc_doy if np.isnan(cont_doy) else cont_doy
                # NOTE(review): a containment day smaller than the discovery
                # day (possibly a fire contained in the following year) still
                # resolves inside this year -- confirm how such rows should be
                # treated.
                d_start = year_start + timedelta(days=float(disc_doy) - 1)
                d_end = year_start + timedelta(days=float(end_doy) - 1)
                delta_start = (d_start - jan1st1900).days
                delta_end = (d_end - jan1st1900).days
                lat = np.float64(latitude)
                lon = np.float64(longitude)
                # select relevant entries in mnet_dataset
                # NOTE(review): a two-element list selects exactly the start
                # and end days, not the days in between; if the whole duration
                # is intended, a slice over `day` would be needed -- confirm.
                mnet_entries = mnet_ds.sel(day=[delta_start, delta_end], lon=lon, lat=lat, method="nearest")
                mnet_df = mnet_entries.to_dataframe()
                selected_values = mnet_df[ds_metadata['col']]
                # compute each operation and store it in the matching column
                for op in ds_metadata['ops']:
                    col_key = "%s_%s" % (ds_metadata['name'], op)
                    df.at[index, col_key] = execute_op(op, selected_values)
                processed_count += 1
                if processed_count % progress_count == 0:
                    seen_count = processed_count + invalid_count
                    percentage = seen_count / df_y.shape[0]
                    dh.update("[%d] '%s' dataset progress: %d / %d [%.2f] (%d invalids)" % (datetime.now().timestamp(), mnet_ds_filename, seen_count, df_y.shape[0], percentage, invalid_count))
        time_end = datetime.now().timestamp()
        time_delta = time_end - time_start
        print("finished with %s, processed = %d, invalid = %d : took %d seconds" % (mnet_ds_filename, processed_count, invalid_count, time_delta))

output_f = "wildfire_augmented_%d.csv" % datetime.now().timestamp() 
print("done processing entries! output is %s" % output_f)
df.to_csv(output_f)

opened up mnet/pdsi_2005.nc


"[1606584785] 'mnet/pdsi_2005.nc' dataset progress: 88600 / 88604 [1.00] (0 invalids)"

finished with mnet/pdsi_2005.nc, processed = 88604, invalid = 0 : took 341 seconds
opened up mnet/pdsi_2004.nc


"[1606585035] 'mnet/pdsi_2004.nc' dataset progress: 69200 / 69279 [1.00] (0 invalids)"

finished with mnet/pdsi_2004.nc, processed = 69279, invalid = 0 : took 250 seconds
opened up mnet/pdsi_2006.nc


"[1606586196] 'mnet/pdsi_2006.nc' dataset progress: 114000 / 114004 [1.00] (0 invalids)"

finished with mnet/pdsi_2006.nc, processed = 114004, invalid = 0 : took 1160 seconds
opened up mnet/pdsi_2008.nc


"[1606586681] 'mnet/pdsi_2008.nc' dataset progress: 85300 / 85378 [1.00] (0 invalids)"

finished with mnet/pdsi_2008.nc, processed = 85378, invalid = 0 : took 485 seconds
opened up mnet/pdsi_2002.nc


"[1606587725] 'mnet/pdsi_2002.nc' dataset progress: 75600 / 75656 [1.00] (0 invalids)"

finished with mnet/pdsi_2002.nc, processed = 75656, invalid = 0 : took 1044 seconds
opened up mnet/pdsi_2007.nc


"[1606589056] 'mnet/pdsi_2007.nc' dataset progress: 95500 / 95573 [1.00] (0 invalids)"

finished with mnet/pdsi_2007.nc, processed = 95573, invalid = 0 : took 1330 seconds
opened up mnet/pdsi_2009.nc


"[1606589613] 'mnet/pdsi_2009.nc' dataset progress: 78300 / 78325 [1.00] (0 invalids)"

finished with mnet/pdsi_2009.nc, processed = 78325, invalid = 0 : took 556 seconds
opened up mnet/pdsi_2001.nc


"[1606589931] 'mnet/pdsi_2001.nc' dataset progress: 86500 / 86587 [1.00] (0 invalids)"

finished with mnet/pdsi_2001.nc, processed = 86587, invalid = 0 : took 318 seconds
opened up mnet/pdsi_2003.nc


"[1606590162] 'mnet/pdsi_2003.nc' dataset progress: 68200 / 68261 [1.00] (0 invalids)"

finished with mnet/pdsi_2003.nc, processed = 68261, invalid = 0 : took 230 seconds
opened up mnet/pdsi_1992.nc


"[1606602105] 'mnet/pdsi_1992.nc' dataset progress: 67900 / 67975 [1.00] (0 invalids)"

finished with mnet/pdsi_1992.nc, processed = 67975, invalid = 0 : took 11943 seconds
opened up mnet/pdsi_1993.nc


"[1606602343] 'mnet/pdsi_1993.nc' dataset progress: 61900 / 61989 [1.00] (0 invalids)"

finished with mnet/pdsi_1993.nc, processed = 61989, invalid = 0 : took 237 seconds
opened up mnet/pdsi_1994.nc


"[1606602613] 'mnet/pdsi_1994.nc' dataset progress: 75900 / 75955 [1.00] (0 invalids)"

finished with mnet/pdsi_1994.nc, processed = 75955, invalid = 0 : took 270 seconds
opened up mnet/pdsi_1995.nc


"[1606602879] 'mnet/pdsi_1995.nc' dataset progress: 71400 / 71472 [1.00] (0 invalids)"

finished with mnet/pdsi_1995.nc, processed = 71472, invalid = 0 : took 265 seconds
opened up mnet/pdsi_1996.nc


"[1606603165] 'mnet/pdsi_1996.nc' dataset progress: 75500 / 75574 [1.00] (0 invalids)"

finished with mnet/pdsi_1996.nc, processed = 75574, invalid = 0 : took 286 seconds
opened up mnet/pdsi_1997.nc


"[1606603425] 'mnet/pdsi_1997.nc' dataset progress: 61400 / 61450 [1.00] (0 invalids)"

finished with mnet/pdsi_1997.nc, processed = 61450, invalid = 0 : took 259 seconds
opened up mnet/pdsi_1998.nc


"[1606603707] 'mnet/pdsi_1998.nc' dataset progress: 68300 / 68370 [1.00] (0 invalids)"

finished with mnet/pdsi_1998.nc, processed = 68370, invalid = 0 : took 282 seconds
opened up mnet/pdsi_1999.nc


"[1606604072] 'mnet/pdsi_1999.nc' dataset progress: 89300 / 89363 [1.00] (0 invalids)"

finished with mnet/pdsi_1999.nc, processed = 89363, invalid = 0 : took 364 seconds
opened up mnet/pdsi_2000.nc


"[1606604446] 'mnet/pdsi_2000.nc' dataset progress: 96400 / 96416 [1.00] (0 invalids)"

finished with mnet/pdsi_2000.nc, processed = 96416, invalid = 0 : took 373 seconds
opened up mnet/pdsi_2010.nc


"[1606604748] 'mnet/pdsi_2010.nc' dataset progress: 79800 / 79889 [1.00] (0 invalids)"

finished with mnet/pdsi_2010.nc, processed = 79889, invalid = 0 : took 302 seconds
opened up mnet/pdsi_2011.nc


"[1606605272] 'mnet/pdsi_2011.nc' dataset progress: 90500 / 90552 [1.00] (0 invalids)"

finished with mnet/pdsi_2011.nc, processed = 90552, invalid = 0 : took 524 seconds
opened up mnet/pdsi_2012.nc


"[1606605582] 'mnet/pdsi_2012.nc' dataset progress: 72700 / 72769 [1.00] (0 invalids)"

finished with mnet/pdsi_2012.nc, processed = 72769, invalid = 0 : took 309 seconds
opened up mnet/pdsi_2013.nc


"[1606605926] 'mnet/pdsi_2013.nc' dataset progress: 64700 / 64780 [1.00] (0 invalids)"

finished with mnet/pdsi_2013.nc, processed = 64780, invalid = 0 : took 344 seconds
opened up mnet/pdsi_2014.nc


"[1606606328] 'mnet/pdsi_2014.nc' dataset progress: 67700 / 67753 [1.00] (0 invalids)"

finished with mnet/pdsi_2014.nc, processed = 67753, invalid = 0 : took 401 seconds
opened up mnet/pdsi_2015.nc


"[1606606735] 'mnet/pdsi_2015.nc' dataset progress: 74400 / 74491 [1.00] (0 invalids)"

finished with mnet/pdsi_2015.nc, processed = 74491, invalid = 0 : took 407 seconds
done processing entries! output is wildfire_augmented_1606606735.csv


In [12]:
# Open one wind-speed file without CF decoding and show its repr, to inspect
# its dimensions, coordinates and data variables.
ds = xr.open_dataset(filename_or_obj="mnet/vs_1992.nc", decode_cf=False)
ds