In [1]:
from __init__ import *

In [2]:
import glob

In [3]:
cd example_data/2007

C:\Users\Julia\Documents\GitHub\campbellsci-tools\example_data\2007


In [4]:
files = glob.glob('*.dat')

Given a campbellsci format text file, convert it to a pandas.DataFrame object

In [195]:
def createDF(input_dict, attrs, numeric=True, local_time=True, split=None):
    '''Create a list of dataframes in UTC from a .dat file.'''
    input_file = posixpath.join(input_dict['path'], input_dict['filename'])
    kw = dict(parse_dates=True, index_col=0, iterator=True,
              chunksize=100000, low_memory=False)
    if input_dict['has_header'] is True:
        df_ = pd.read_csv(input_file, skiprows=[0, 2, 3], **kw)
    else:
        df_header = pd.read_csv(input_dict['header_file'],
                                skiprows=[0], nrows=1)
        header = list(df_header.columns.values)
        df_ = pd.read_csv(input_file, header=None, names=header, **kw)
    df = pd.concat(df_)
    df.index.name = 'time'
    if numeric is True:
        df = df.apply(manage_dtypes)           # try to make all fields numeric
    if local_time is True:
        df = df.tz_localize(attrs['local_timezone'])  # explain the local tz
        df = df.tz_convert('UTC')                  # convert df to UTC

    df = df.drop_duplicates()
    
    if split == 'daily':
        DFList = [g[1] for g in df.groupby([df.index.year, df.index.month, df.index.day])]
    elif split == 'monthly':
        DFList = [g[1] for g in df.groupby([df.index.year, df.index.month])]
    elif split =='yearly':
        DFList = [g[1] for g in df.groupby([df.index.year])]
    else:
        DFList = [df]
            
    return DFList

In [196]:
def get_ncnames(DFList, split=None, root=datafiles[0]):
    '''Create a list of netcdf filenames correspoding to days in DFList.'''
    ncfilenames = []
    for i, df in enumerate(DFList):
        y = int(round(df.index.year.mean()))
        m = int(round(df.index.month.mean()))
        d = int(round(df.index.day.mean()))
        if split == 'daily':
            ncfilenames.append('%s_%04d_%02d_%02d.nc' % (root, y, m, d))
        elif split == 'monthly':
            ncfilenames.append('%s_%04d_%02d.nc' % (root, y, m))
        elif split == 'yearly':
            ncfilenames.append('%s_%04d.nc' % (root, y))   
        else:
            ncfilenames.append(guess(df, root, y, m, d))
    return ncfilenames
        

In [197]:
def guess(df, root, y, m, d):
    dates = [g[0] for g in df.groupby([df.index.year, df.index.month, df.index.day])]
    if 0<len(dates)<=2:
        ncfilename = '%s_%04d_%02d_%02d.nc' % (root, y, m, d)
    elif 27<len(dates)<33:
        i = 0
        for dd, mm, yy in dates:
            if mm == m:
                i+=1
        if i >= len(dates)-3:
            ncfilename = '%s_%04d_%02d.nc' % (root, y, m)
    else:
        ncfilename = '%s_%04d_%02d_%02d_%04d_%02d_%02d.nc' % (root, dates[0][0], dates[0][1], dates[0][2], dates[-1][0], dates[-1][1], dates[-1][2])
    return ncfilename
        

In [198]:
def manage_dtypes(x):
    '''Ensure that all data columns are either intergers or floats'''
    if x.values.dtype == np.dtype('int64') or x.values.dtype == np.dtype('<M8[ns]'):
        return x
    else:
        try:
            return x.astype('float64')
        except:
            return x

In [199]:
def has_header(input_dict):
    '''Update input_dict to reflect whether or not a file has a header.'''
    input_file = posixpath.join(input_dict['path'], input_dict['filename'])
    header = list(pd.read_csv(input_file, nrows=1).columns.values)
    if header[0] == 'TOA5':
        return input_dict.update({'has_header': True,
                                  'header_file': input_file})
    else:
        header_file = posixpath.join(input_dict['path'], 'header.txt')
        if os.path.isfile(header_file):
            pass
        else:
            print('cannot find header file')
            header_file = None
        return input_dict.update({'has_header': False,
                                  'header_file': header_file})

In [200]:
def get_attrs(header_file, attrs):
    '''Augment attributes by inspecting the header of the .dat file.'''
    # metadata needs to be CF-1.6 compliant
    # in the TOA5 datafile headers, some metadata are written in the 1st row
    meta_list = eval(open(header_file).readlines()[0])

    attrs.update({'format': meta_list[0],
                  'logger': meta_list[1],
                  'datafile': meta_list[7]})
    if ':' in  meta_list[5]:
        attrs.update({'program': meta_list[5].split(':')[1]})
    else:
        attrs.update({'program': meta_list[5]})
    source_info = (attrs['logger'], attrs['datafile'], attrs['program'],
                   meta_list[6])
    source = 'Flux tower sensor data %s_%s.dat, %s, %s' % source_info
    attrs.update({'source': source})

    # the local attributes are in the 2nd, 3rd, and 4th rows
    df_names = pd.read_csv(header_file, skiprows=[0], nrows=2, dtype='str')
    df_names = df_names.astype('str')
    df_names.index = ('units', 'comment')
    local_attrs = df_names.to_dict()

    return attrs, local_attrs

In [247]:
def make_MultiIndex(df, site):
    '''Replace pre-existing time index with multidim site and time index.'''  
    indices = [site, df.index]
    new_index = pd.MultiIndex.from_product(indices, names=['site', 'time'])
    df.set_index(new_index, inplace=True)
    print new_index
    return df

In [248]:
def createDS(df, input_dict, attrs, local_attrs, site, coords):
    '''Create an xray.Dataset object from dataframe and dicts of parts'''        
    try:
        df = make_MultiIndex(df, site)
        ds = xray.Dataset.from_dataframe(df)
    except:
        ds = None
        print 'contains non-identical overlapping data --> throw away'
        print 'dataset is None'
        return ds
    ds.attrs.update(attrs)
    ds.coords.update(coords)
    for name in ds.data_vars.keys():
        ds[name].attrs = local_attrs[name]
        ds[name].attrs.update(dict(content_coverage_type='physicalMeasurement'))
    return ds

In [249]:
input_dict = {'filename': files[1],
              'path': 'C:/Users/Julia/Documents/GitHub/campbellsci-tools/example_data/2007/'}

has_header(input_dict)
[df] = createDF(input_dict, attrs, local_time=False)
ncfilenames = get_ncnames([df])

In [250]:
attrs, local_attrs = get_attrs(input_dict['header_file'], attrs)

In [251]:
ds = createDS(df, input_dict, attrs, local_attrs, site, coords)

MultiIndex(levels=[[u'Broadmead'], [2007-08-01 00:00:00, 2007-08-01 00:05:00, 2007-08-01 00:10:00, 2007-08-01 00:15:00, 2007-08-01 00:20:00, 2007-08-01 00:25:00, 2007-08-01 00:30:00, 2007-08-01 00:35:00, 2007-08-01 00:40:00, 2007-08-01 00:45:00, 2007-08-01 00:50:00, 2007-08-01 00:55:00, 2007-08-01 01:00:00, 2007-08-01 01:05:00, 2007-08-01 01:10:00, 2007-08-01 01:15:00, 2007-08-01 01:20:00, 2007-08-01 01:25:00, 2007-08-01 01:30:00, 2007-08-01 01:35:00, 2007-08-01 01:40:00, 2007-08-01 01:45:00, 2007-08-01 01:50:00, 2007-08-01 01:55:00, 2007-08-01 02:00:00, 2007-08-01 02:05:00, 2007-08-01 02:10:00, 2007-08-01 02:15:00, 2007-08-01 02:20:00, 2007-08-01 02:25:00, 2007-08-01 02:30:00, 2007-08-01 02:35:00, 2007-08-01 02:40:00, 2007-08-01 02:45:00, 2007-08-01 02:50:00, 2007-08-01 02:55:00, 2007-08-01 03:00:00, 2007-08-01 03:05:00, 2007-08-01 03:10:00, 2007-08-01 03:15:00, 2007-08-01 03:20:00, 2007-08-01 03:25:00, 2007-08-01 03:30:00, 2007-08-01 03:35:00, 2007-08-01 03:40:00, 2007-08-01 03:45:00

In [275]:
df.index.levels[1]

DatetimeIndex(['2007-08-01 00:00:00', '2007-08-01 00:05:00',
               '2007-08-01 00:10:00', '2007-08-01 00:15:00',
               '2007-08-01 00:20:00', '2007-08-01 00:25:00',
               '2007-08-01 00:30:00', '2007-08-01 00:35:00',
               '2007-08-01 00:40:00', '2007-08-01 00:45:00',
               ...
               '2007-08-31 23:15:00', '2007-08-31 23:20:00',
               '2007-08-31 23:25:00', '2007-08-31 23:30:00',
               '2007-08-31 23:35:00', '2007-08-31 23:40:00',
               '2007-08-31 23:45:00', '2007-08-31 23:50:00',
               '2007-08-31 23:55:00', '2007-09-01 00:00:00'],
              dtype='datetime64[ns]', name=u'time', length=8929, freq='5T')

In [292]:
df.index.levels[0]

Index([u'Broadmead'], dtype='object', name=u'site')

In [294]:
##TODO: get the dtype of the site index to change in the dataframe. Possibly has to be done by first making the site a column. 

In [291]:
df.index.set_levels(df.index.levels[0].astype(str),level=0, inplace=True)

In [293]:
df.index.set_levels?

In [132]:
ds = xray.Dataset.from_dataframe(df)

In [276]:
def func(x):
    return x.astype('str')

In [None]:
df.index

In [240]:
ds.coords['site']

<xray.DataArray 'site' (site: 1)>
array(['Broadmead'], dtype=object)
Coordinates:
  * site       (site) object 'Broadmead'
    lat        (site) float64 40.35
    lon        (site) float64 74.64
    elevation  (site) int32 34

In [253]:
ds.coords['site'].dtype

dtype('O')

In [135]:
ds.to_netcdf(ncfilenames[0], mode='w')

In [None]:
# create figure with enough axes to hold all the data
cols = df.columns

fig = plt.figure()
n = int(np.ceil(len(cols)**.5))
for i, col in enumerate(cols):
    ax = fig.add_subplot(n, n, i+1)
    ax.plot(df[col])
    ax.set_title(col)
plt.show()

In [262]:
df.set_index?

In [267]:
df.index.levels[0].astype(str)

Index([u'Broadmead'], dtype='object', name=u'site')