## Create daily netcdf files for the period 1991 - 2015
There are two types of input files. The new ones carry extra data variables (semimajor, semiminor, ratio, angle, chi_squared, nsensors, cloud_ground), while the old ones have only time, lat, lon, amplitude, and strokes. 

In [1]:
import os
import numpy as np
import pandas as pd
import xarray as xr

In [51]:
# Directory of the old-format source files (1991-2009); read elsewhere as old_path+fname.
old_path = '/run/media/jsignell/WRF/Data/LIGHT/Data_1991-2009/'
# Directory of the new-format raw daily files.
new_path = '/run/media/jsignell/WRF/Data/LIGHT/raw/'
# Directory that holds the daily netCDF output files.
# NOTE(review): other cells assign out_path without the trailing 'US/' component — confirm which one is current.
# All three paths end in '/' because they are joined to file names by plain string concatenation.
out_path = '/home/jsignell/erddapData/Cloud_to_Ground_Lightning/US/'

In [2]:
# Load the names of the old-format files that failed conversion, one per line,
# stripped of surrounding whitespace and sorted. The `with` block closes the
# file even if reading raises.
with open('messed_up_old_files.txt') as f:
    l = sorted(fname.strip() for fname in f)

In [3]:
def fname_to_ncfile(fname, old=False, new=True):
    """Return the daily netCDF file name for a new-format raw file name.

    Parameters
    ----------
    fname : str
        Raw file name with the year in characters 6-9 and the day of year in
        characters 11-13, e.g. ``'Nflash2010.336_daily_v3_lit.raw'``.
    old : bool, optional
        Accepted for backward compatibility; not used.
    new : bool, optional
        When false no name is derived and ``None`` is returned.

    Returns
    -------
    str or None
        ``'YYYY_MM_DD.nc'`` for the calendar date of that year/day-of-year,
        or ``None`` when ``new`` is false.
    """
    if not new:
        return None
    tstr = '{y}-{doy}'.format(y=fname[6:10], doy=fname[11:14])
    # pd.to_datetime replaces the pd.datetime alias, which current pandas no longer ships.
    day = pd.to_datetime(tstr, format='%Y-%j')
    return day.strftime('%Y_%m_%d') + '.nc'

Let's check how we are doing: which days in the period still have no output file, and how many files exist per year?

In [53]:
# List every date in the record that has no 'YYYY_MM_DD.nc' file in out_path.
# The directory is listed once into a set instead of once per date.
existing = set(os.listdir(out_path))
[d for d in pd.date_range('1991-01-01','2015-09-30').astype(str) if (d+'.nc').replace('-','_') not in existing]

['1991-02-01',
 '1991-12-05',
 '1992-01-17',
 '1992-12-18',
 '1992-12-21',
 '1993-10-31',
 '1994-07-31',
 '1997-10-12',
 '1997-10-13',
 '1998-11-23',
 '2000-01-06',
 '2002-02-25',
 '2003-12-31',
 '2004-02-08',
 '2006-02-08',
 '2007-01-16',
 '2007-02-06',
 '2007-12-18',
 '2008-12-05',
 '2009-01-17',
 '2013-01-16']

In [54]:
# For each year 1991-2015, count the output files whose name contains that year.
d = dict(
    (year, sum(1 for name in os.listdir(out_path) if str(year) in name))
    for year in range(1991, 2016)
)
d

{1991: 363,
 1992: 363,
 1993: 364,
 1994: 364,
 1995: 365,
 1996: 366,
 1997: 363,
 1998: 364,
 1999: 365,
 2000: 365,
 2001: 365,
 2002: 364,
 2003: 364,
 2004: 365,
 2005: 365,
 2006: 364,
 2007: 362,
 2008: 365,
 2009: 364,
 2010: 365,
 2011: 365,
 2012: 366,
 2013: 364,
 2014: 365,
 2015: 273}

Chunking doesn't work well if there are too few records: in those cases only the time values were written to the files, for reasons that are still unclear. To make up for this, we find all the really small output files (under 8000 bytes) and work out the year and day of year for each of them. Using this info we can rewrite just the files that got messed up. 

In [8]:
import os
out_path = '/home/jsignell/erddapData/Cloud_to_Ground_Lightning/'
# Names of the output files smaller than 8000 bytes.
little = [name for name in os.listdir(out_path)
          if os.path.getsize(out_path+name) < 8000]

In [10]:
import pandas as pd
new_path = '/run/media/jsignell/WRF/Data/LIGHT/raw/'

# Work out the 'YYYY.DDD' tag (year and zero-padded day of year) of every
# undersized output file once, rather than re-parsing it for each raw file.
little_tags = set()
for ncname in little:
    t = pd.Timestamp(ncname.partition('.')[0].replace('_','-'))
    little_tags.add('{y}.{doy:03d}'.format(y=t.year, doy=t.dayofyear))

# Raw files whose name contains one of those tags, each listed once.
# The loop variables avoid the name `l`, which holds a file list in another cell.
fnames = [fname for fname in os.listdir(new_path)
          if any(tag in fname for tag in little_tags)]

In [16]:
# Pick the entry at index 4 of `l` as the file to repair by hand.
# NOTE(review): assumes `l` is still the sorted list read from messed_up_old_files.txt — confirm it has not been rebound.
fname = l[4]

In [17]:
# Read the selected old-format file as whitespace-delimited text with no header row;
# the six columns are named explicitly and the date (D) and time (T) columns are kept separate.
df = pd.read_csv(old_path+fname, delim_whitespace=True, header=None, names=['D', 'T','lat','lon','amplitude','strokes'])

In [30]:
# Show the time string stored in the row labelled 7430175.
df.loc[7430175, 'T']

'00:00:00'

In [31]:
# Remove the row labelled 7430175 from the frame.
# NOTE(review): the label is specific to the file currently loaded in `df` — confirm before reusing this cell.
df = df.drop(7430175)

In [36]:
# Join the date and time strings with a space and parse them into timestamps;
# errors='coerce' is requested so unparseable values do not raise.
s = pd.to_datetime(df['D']+' '+df['T'], errors='coerce')

In [44]:
# Attach the parsed timestamps in `s` as the `time` column before selecting:
# no other cell shown here creates that column, so the selection would fail on a fresh run.
df = df.assign(time=s)[['time', 'lat', 'lon', 'amplitude', 'strokes']]

In [45]:
# Preview the first rows of the frame after the column selection.
df.head()

Unnamed: 0,time,lat,lon,amplitude,strokes
0,2009-08-01,39.032,-119.729,-14.7,6
1,2009-08-01,36.763,-102.928,-14.9,1
2,2009-08-01,33.577,-84.241,-34.9,2
3,2009-08-01,41.409,-99.739,-9.6,1
4,2009-08-01,38.925,-102.605,-15.8,2


In [46]:
# Show the rows whose time value is null.
df.loc[df['time'].isnull()]

Unnamed: 0,time,lat,lon,amplitude,strokes


In [79]:
# Convert the strokes column to integers.
# NOTE(review): this cell carries execution count 79, after the cells (77, 78) that locate and drop
# the row holding the string '503/12/08'; it presumably has to run after them — confirm.
df['strokes'] = df['strokes'].astype(int)

In [77]:
# Locate the strokes entries equal to the malformed string '503/12/08'.
df.loc[df['strokes'] == '503/12/08', 'strokes']

724017    503/12/08
Name: strokes, dtype: object

In [78]:
# Remove the row labelled 724017 from the frame.
# NOTE(review): the label is specific to the file currently loaded in `df` — confirm before reusing this cell.
df = df.drop(724017)

In [49]:
# Split the frame into calendar days in a single pass and write one file per day.
# Grouping on the midnight-normalized timestamp replaces one full-frame mask per
# day and avoids comparing the datetime column against datetime.date objects.
for day, df0 in df.groupby(df.time.dt.normalize()):
    # reset_index keeps the original row label in an 'index' column, which write_day drops.
    df0 = df0.reset_index()
    df0.index.name = 'record'
    write_day(df0, out_path)
    print(day.date())

2009-08-01
2009-08-02
2009-08-03
2009-08-04
2009-08-05
2009-08-06
2009-08-07
2009-08-08
2009-08-09
2009-08-10
2009-08-11
2009-08-12
2009-08-13
2009-08-14
2009-08-15
2009-08-16
2009-08-17
2009-08-18
2009-08-19
2009-08-20
2009-08-21
2009-08-22
2009-08-23
2009-08-24
2009-08-25
2009-08-26
2009-08-27
2009-08-28
2009-08-29
2009-08-30
2009-08-31


In [135]:
import os
import numpy as np
import pandas as pd
import xarray as xr

# Directory of the new-format raw daily files passed to new_files.
new_path = '/run/media/jsignell/WRF/Data/LIGHT/raw/'
# Directory prefix for the netCDF output; must end in '/' because it is joined by string formatting.
out_path = '/home/jsignell/erddapData/Cloud_to_Ground_Lightning/'

def new_files(path, fname, out_path):
    """Convert one new-format raw daily file into a netCDF file.

    The raw file is read as whitespace-delimited text without a header. Its
    first two columns are parsed together into ``time``, raw column 5 is
    discarded and the remaining columns are named. Variable attributes and
    on-disk encodings are attached before the dataset is written.

    Parameters
    ----------
    path : str
        Directory of the raw file. It is joined to ``fname`` by plain string
        concatenation, so it must end with a path separator.
    fname : str
        Name of the raw file inside ``path``.
    out_path : str
        Output directory prefix, also joined by plain concatenation.

    Returns
    -------
    None
        The result is the file ``<out_path>YYYY_MM_DD.nc``, named after the
        date of the middle record.
    """
    df = pd.read_csv(path+fname, delim_whitespace=True, header=None, parse_dates={'time':[0,1]})
    df = df.drop(5, axis=1)

    df.columns = ['time', 'lat', 'lon', 'amplitude','strokes',
                  'semimajor','semiminor','ratio','angle','chi_squared','nsensors','cloud_ground']
    df.index.name = 'record'

    attrs = {'semimajor': {'long_name': 'Semimajor Axis of 50% probability ellipse for each flash',
                           'units': 'km'},
             'semiminor': {'long_name': 'Semiminor Axis of 50% probability ellipse for each flash',
                           'units': 'km'},
             'ratio': {'long_name': 'Ratio of Semimajor to Semiminor'},
             'angle': {'long_name': 'Angle of 50% probability ellipse from North',
                       'units': 'Deg'},
             'chi_squared': {'long_name': 'Chi-squared value of statistical calculation'},
             'nsensors': {'long_name': 'Number of sensors reporting the flash'},
             'cloud_ground': {'long_name': 'Cloud_to_Ground or In_Cloud Discriminator'}}

    # set_coords returns a new Dataset; the result is used directly instead of
    # relying on the inplace= keyword, which current xarray no longer accepts.
    ds = df.to_xarray().set_coords(['time','lat','lon'])

    # Chunk size: a fifth of the records, capped at 1000, and 1 for tiny files.
    # Floor division keeps it an integer on Python 3 as well as Python 2.
    n_records = df.shape[0]
    chunk = 1 if n_records < 5 else min(n_records // 5, 1000)
    packed = {'chunksizes': (chunk,), 'zlib': True}

    for k, v in attrs.items():
        ds[k].attrs.update(v)
        if k == 'cloud_ground':
            # Single-character flag: stored as 'S1' with no chunking or compression settings.
            ds[k].encoding.update({'dtype': 'S1'})
        elif k == 'nsensors':
            ds[k].encoding.update(dict(packed, dtype=np.int32))
        else:
            ds[k].encoding.update(dict(packed, dtype=np.double))

    ds.amplitude.attrs.update({'units': 'kA',
                               'long_name': 'Polarity and strength of strike'})
    ds.amplitude.encoding.update(dict(packed, dtype=np.double))
    ds.strokes.attrs.update({'long_name': 'multiplicity of flash'})
    ds.strokes.encoding.update(dict(packed, dtype=np.int32))
    ds.lat.attrs.update({'units': 'degrees_north',
                         'axis': 'Y',
                         'long_name': 'latitude',
                         'standard_name': 'latitude'})
    ds.lat.encoding.update(dict(packed, dtype=np.double))
    ds.lon.attrs.update({'units': 'degrees_east',
                         'axis': 'X',
                         'long_name': 'longitude',
                         'standard_name': 'longitude'})
    ds.lon.encoding.update(dict(packed, dtype=np.double))
    ds.time.encoding.update(dict(packed, units='seconds since 1970-01-01',
                                 calendar='gregorian', dtype=np.double))

    # pd.Timestamp.now() replaces the pd.datetime alias, which current pandas no longer ships.
    ds.attrs.update({ 'title': 'Cloud to Ground Lightning',
                      'institution': 'Data from NLDN, hosted by Princeton University',
                      'references': 'https://ghrc.nsstc.nasa.gov/uso/ds_docs/vaiconus/vaiconus_dataset.html',
                      'featureType': 'point',
                      'Conventions': 'CF-1.6',
                      'history': 'Created by Princeton University Hydrometeorology Group at {now} '.format(now=pd.Timestamp.now()),
                      'author': 'jsignell@princeton.edu',
                      'keywords': 'lightning'})

    # The file is named after the date of the middle record (positional lookup, integer index).
    date = df.time.iloc[n_records // 2]
    ds.to_netcdf('{out_path}{y}_{m:02d}_{d:02d}.nc'.format(out_path=out_path, y=date.year, m=date.month, d=date.day),
                 format='netCDF4', engine='netcdf4')

# Convert every raw file selected in `fnames`. A failure does not stop the run:
# the failing file name is appended to messed_up_new_files.txt instead.
for fname in fnames:
    try:
        new_files(new_path, fname, out_path)
        print(fname)
    except Exception:
        # Exception rather than a bare except, so a keyboard/kernel interrupt still stops the loop.
        with open('messed_up_new_files.txt', 'a') as f:
            f.write(fname+'\n')


Nflash2010.336_daily_v3_lit.raw
Nflash2010.350_daily_v3_lit.raw
Nflash2011.048_daily_v3_lit.raw
Nflash2010.304_daily_v3_lit.raw
Nflash2010.310_daily_v3_lit.raw
Nflash2010.311_daily_v3_lit.raw
Nflash2010.312_daily_v3_lit.raw
Nflash2010.313_daily_v3_lit.raw
Nflash2010.318_daily_v3_lit.raw
Nflash2010.323_daily_v3_lit.raw
Nflash2010.324_daily_v3_lit.raw
Nflash2010.325_daily_v3_lit.raw
Nflash2010.330_daily_v3_lit.raw
Nflash2010.331_daily_v3_lit.raw
Nflash2010.332_daily_v3_lit.raw
Nflash2010.335_daily_v3_lit.raw
Nflash2010.337_daily_v3_lit.raw
Nflash2010.338_daily_v3_lit.raw
Nflash2010.343_daily_v3_lit.raw
Nflash2010.344_daily_v3_lit.raw
Nflash2010.349_daily_v3_lit.raw
Nflash2010.351_daily_v3_lit.raw
Nflash2010.355_daily_v3_lit.raw
Nflash2010.356_daily_v3_lit.raw
Nflash2010.357_daily_v3_lit.raw
Nflash2010.361_daily_v3_lit.raw
Nflash2010.362_daily_v3_lit.raw
Nflash2011.003_daily_v3_lit.raw


In [48]:
import os
import numpy as np
import pandas as pd
import xarray as xr

# Directory of the old-format source files passed to old_files.
old_path = '/run/media/jsignell/WRF/Data/LIGHT/Data_1991-2009/'
# Directory prefix for the netCDF output; must end in '/' because it is joined by string formatting.
out_path = '/home/jsignell/erddapData/Cloud_to_Ground_Lightning/'
    
def write_day(df, out_path):
    """Write one day of old-format flash records to a netCDF file.

    Parameters
    ----------
    df : pandas.DataFrame
        Records for one day with the columns ``index`` (dropped before
        writing), ``time``, ``lat``, ``lon``, ``amplitude`` and ``strokes``.
        Callers in this file pass a frame that was ``reset_index()``-ed and
        whose index is named ``record``.
    out_path : str
        Output directory prefix. It is joined to the file name by plain
        string formatting, so it must end with a path separator.

    Returns
    -------
    None
        The result is the file ``<out_path>YYYY_MM_DD.nc``, named after the
        date of the middle record.
    """
    # set_coords returns a new Dataset; the result is used directly instead of
    # relying on the inplace= keyword, which current xarray no longer accepts.
    ds = df.drop('index', axis=1).to_xarray().set_coords(['time','lat','lon'])

    ds.amplitude.attrs.update({'units': 'kA',
                               'long_name': 'Polarity and strength of strike'})
    ds.amplitude.encoding.update({'dtype': np.double})
    ds.strokes.attrs.update({'long_name': 'multiplicity of flash'})
    ds.strokes.encoding.update({'dtype': np.int32})
    ds.lat.attrs.update({'units': 'degrees_north',
                         'axis': 'Y',
                         'long_name': 'latitude',
                         'standard_name': 'latitude'})
    ds.lat.encoding.update({'dtype': np.double})
    ds.lon.attrs.update({'units': 'degrees_east',
                         'axis': 'X',
                         'long_name': 'longitude',
                         'standard_name': 'longitude'})
    ds.lon.encoding.update({'dtype': np.double})
    ds.time.encoding.update({'units':'seconds since 1970-01-01',
                             'calendar':'gregorian',
                             'dtype': np.double})

    # pd.Timestamp.now() replaces the pd.datetime alias, which current pandas no longer ships.
    ds.attrs.update({ 'title': 'Cloud to Ground Lightning',
                      'institution': 'Data from NLDN, hosted by Princeton University',
                      'references': 'https://ghrc.nsstc.nasa.gov/uso/ds_docs/vaiconus/vaiconus_dataset.html',
                      'featureType': 'point',
                      'Conventions': 'CF-1.6',
                      'history': 'Created by Princeton University Hydrometeorology Group at {now} '.format(now=pd.Timestamp.now()),
                      'author': 'jsignell@princeton.edu',
                      'keywords': 'lightning'})

    # Middle record by position; floor division keeps the index an integer on Python 3.
    date = df.time.iloc[len(df.index) // 2]

    ds.to_netcdf('{out_path}{y}_{m:02d}_{d:02d}.nc'.format(out_path=out_path, y=date.year, m=date.month, d=date.day),
                 format='netCDF4', engine='netcdf4')

def old_files(path, fname, out_path):
    """Split one old-format file into calendar days and write each day out.

    Parameters
    ----------
    path : str
        Directory of the source file. It is joined to ``fname`` by plain
        string concatenation, so it must end with a path separator.
    fname : str
        Name of the old-format file inside ``path``.
    out_path : str
        Output directory prefix handed on to ``write_day``.

    Returns
    -------
    None
        One netCDF file per calendar day present in the source file is
        written through ``write_day``.
    """
    df = pd.read_csv(path+fname, delim_whitespace=True, header=None, names=['D', 'T','lat','lon','amplitude','strokes'],
                     parse_dates={'time':[0,1]})

    # One groupby pass on the midnight-normalized timestamp replaces a
    # full-frame mask per day and avoids comparing the datetime column
    # against datetime.date objects.
    for _, df0 in df.groupby(df.time.dt.normalize()):
        # reset_index keeps the original row label in an 'index' column, which write_day drops.
        df0 = df0.reset_index()
        df0.index.name = 'record'
        write_day(df0, out_path)
        
# Disabled driver: the loop below is wrapped in a string literal, so it does not run.
# As written it would call old_files on every entry of os.listdir(old_path) and
# append the names of the files that raise to messed_up_old_files.txt.
'''
for fname in os.listdir(old_path):
    try:
        old_files(old_path, fname, out_path)
    except:
        f = open('messed_up_old_files.txt', 'a')
        f.write(fname+'\n')
        f.close()
'''

"\nfor fname in os.listdir(old_path):\n    try:\n        old_files(old_path, fname, out_path)\n    except:\n        f = open('messed_up_old_files.txt', 'a')\n        f.write(fname+'\n')\n        f.close()\n"

In [None]:
    # NOTE(review): unexecuted fragment (the cell has no execution count). It uses both
    # `df0` and `ds`, so it presumably belongs inside the per-day writing code — confirm.
    # Frames with more than 1000 rows get 1000-record chunks plus zlib compression;
    # smaller frames get no extra encoding.
    if df0.shape[0] >1000:
        chunks={'chunksizes':(1000,),'zlib': True}
    else:
        chunks={}
    # Apply the encoding to every data variable and coordinate except 'strokes'.
    for v in ds.data_vars.keys()+ds.coords.keys():
        if v =='strokes':
            continue
        ds[v].encoding.update(chunks)