In [1]:
import xarray as xr
import pandas as pd

# Moving from csv to netCDF file

In [2]:
# Load the sample CSV and build a combined 'Dates' column from 'YYYY', 'MM', 'DD'.
# The nested `parse_dates={...}` form and `keep_date_col=True` are deprecated in
# `read_csv` (pandas 2.2), so the date is assembled explicitly instead.
# The three date-part columns are read as text so they stay in the frame as strings.
date_cols = ['YYYY', 'MM', 'DD']
df = pd.read_csv('Alabama_sam_10_lines_12_col_1981.csv', dtype=dict.fromkeys(date_cols, str))
# `pd.to_datetime` assembles datetimes from year/month/day components
df['Dates'] = pd.to_datetime({'year': df['YYYY'], 'month': df['MM'], 'day': df['DD']})
# keep 'Dates' as the first column
df = df[['Dates'] + [col for col in df.columns if col != 'Dates']]
df

Unnamed: 0,Dates,Sample_nr,Sample_Code,FKEY,YYYY,MM,DD,Station_Code,Gear_Code,Gear,Salinity,Temperature,DO
0,1981-02-03,0,1,1981.02.03.0007.01,1981,2,3,7,1,Trawl (16 ft),25,11,8.2
1,1981-02-03,1,1,1981.02.03.0007.01,1981,2,3,7,1,Trawl (16 ft),25,11,8.2
2,1981-02-03,2,1,1981.02.03.0007.01,1981,2,3,7,1,Trawl (16 ft),25,11,8.2
3,1981-02-03,3,1,1981.02.03.0007.01,1981,2,3,7,1,Trawl (16 ft),25,11,8.2
4,1981-02-03,4,1,1981.02.03.0007.01,1981,2,3,7,1,Trawl (16 ft),25,11,8.2
5,1981-02-03,5,1,1981.02.03.0007.01,1981,2,3,7,1,Trawl (16 ft),25,11,8.2
6,1981-02-03,6,1,1981.02.03.0007.01,1981,2,3,7,1,Trawl (16 ft),25,11,8.2
7,1981-02-03,7,1,1981.02.03.0007.01,1981,2,3,7,1,Trawl (16 ft),25,11,8.2
8,1981-02-03,8,1,1981.02.03.0007.01,1981,2,3,7,1,Trawl (16 ft),25,11,8.2
9,1981-02-03,9,1,1981.02.03.0007.01,1981,2,3,7,1,Trawl (16 ft),25,11,8.2


In [3]:
# Check the Python type of the first 'Gear' entry in the DataFrame
type(df['Gear'][0])

str

In [4]:
# Time of day entered by hand: (hour, minute, second)
hour, minute, second = 12, 0, 0

In [5]:
# add on time info
# Start from midnight of each sample date so that re-running this cell does not
# stack another offset on top of a previous run, then add the hand-entered
# time of day from `hour`, `minute`, and `second`.
df['Dates'] = df['Dates'].dt.normalize() + pd.Timedelta(hours=hour, minutes=minute, seconds=second)

Change from pandas DataFrame to xarray Dataset

In [6]:
# Convert the DataFrame to an xarray Dataset; the later cells inspect the
# columns as variables of `ds` (e.g. `ds.Sample_nr`) along the `index` dimension.
ds = df.to_xarray()

xarray is made to deal with multi-dimensional data in a pandas-like way, and it's particularly good at netCDF files.

We can add a dimension to the dataset (for the trajectory) with the following. `expand_dims` inserts the new "trajectory" dimension in front of the existing ones, so we also have to transpose afterwards to put the dimensions in the desired order, (index, trajectory).

In [7]:
# Look at 'Sample_nr' before the trajectory dimension is added
ds['Sample_nr']

<xarray.DataArray 'Sample_nr' (index: 11)>
array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10])
Coordinates:
  * index    (index) int64 0 1 2 3 4 5 6 7 8 9 10

In [8]:
# Add a length-1 'trajectory' dimension, then order the dimensions explicitly
# as (index, trajectory) instead of relying on a bare `transpose()`, which
# simply reverses whatever dimensions happen to be present.
ds = ds.expand_dims('trajectory').transpose('index', 'trajectory')

In [9]:
# Look at 'Sample_nr' again now that `ds` has been expanded and transposed
ds['Sample_nr']

<xarray.DataArray 'Sample_nr' (index: 11, trajectory: 1)>
array([[ 0],
       [ 1],
       [ 2],
       [ 3],
       [ 4],
       [ 5],
       [ 6],
       [ 7],
       [ 8],
       [ 9],
       [10]])
Coordinates:
  * index    (index) int64 0 1 2 3 4 5 6 7 8 9 10
Dimensions without coordinates: trajectory

Checking on desired outcomes

For each variable, confirm that it is present in the xarray Dataset.

In [10]:
# All 'Sample_nr' values along `index` for the first (only) trajectory
ds['Sample_nr'][:, 0]

<xarray.DataArray 'Sample_nr' (index: 11)>
array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10])
Coordinates:
  * index    (index) int64 0 1 2 3 4 5 6 7 8 9 10

In [11]:
# All 'Sample_Code' values along `index` for the first (only) trajectory
ds.Sample_Code[:, 0]

<xarray.DataArray 'Sample_Code' (index: 11)>
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
Coordinates:
  * index    (index) int64 0 1 2 3 4 5 6 7 8 9 10

In [12]:
# Inspect the 'FKEY' variable
ds['FKEY']

<xarray.DataArray 'FKEY' (index: 11, trajectory: 1)>
array([['1981.02.03.0007.01'],
       ['1981.02.03.0007.01'],
       ['1981.02.03.0007.01'],
       ['1981.02.03.0007.01'],
       ['1981.02.03.0007.01'],
       ['1981.02.03.0007.01'],
       ['1981.02.03.0007.01'],
       ['1981.02.03.0007.01'],
       ['1981.02.03.0007.01'],
       ['1981.02.03.0007.01'],
       ['1981.02.03.0007.01']], dtype=object)
Coordinates:
  * index    (index) int64 0 1 2 3 4 5 6 7 8 9 10
Dimensions without coordinates: trajectory

In [13]:
# Inspect the retained 'YYYY' (year) column
ds['YYYY']

<xarray.DataArray 'YYYY' (index: 11, trajectory: 1)>
array([['1981'],
       ['1981'],
       ['1981'],
       ['1981'],
       ['1981'],
       ['1981'],
       ['1981'],
       ['1981'],
       ['1981'],
       ['1981'],
       ['1981']], dtype=object)
Coordinates:
  * index    (index) int64 0 1 2 3 4 5 6 7 8 9 10
Dimensions without coordinates: trajectory

In [14]:
# Inspect the retained 'MM' (month) column
ds['MM']

<xarray.DataArray 'MM' (index: 11, trajectory: 1)>
array([['2'],
       ['2'],
       ['2'],
       ['2'],
       ['2'],
       ['2'],
       ['2'],
       ['2'],
       ['2'],
       ['2'],
       ['2']], dtype=object)
Coordinates:
  * index    (index) int64 0 1 2 3 4 5 6 7 8 9 10
Dimensions without coordinates: trajectory

In [15]:
# Inspect the retained 'DD' (day) column
ds['DD']

<xarray.DataArray 'DD' (index: 11, trajectory: 1)>
array([['3'],
       ['3'],
       ['3'],
       ['3'],
       ['3'],
       ['3'],
       ['3'],
       ['3'],
       ['3'],
       ['3'],
       ['3']], dtype=object)
Coordinates:
  * index    (index) int64 0 1 2 3 4 5 6 7 8 9 10
Dimensions without coordinates: trajectory

In [16]:
# Keep a handle on the DataFrame's 'Dates' column for the time calculations below
dateobj = df['Dates']

In [17]:
# Display the 'Dates' series taken from `df`
dateobj

0    1981-02-03 12:00:00
1    1981-02-03 12:00:00
2    1981-02-03 12:00:00
3    1981-02-03 12:00:00
4    1981-02-03 12:00:00
5    1981-02-03 12:00:00
6    1981-02-03 12:00:00
7    1981-02-03 12:00:00
8    1981-02-03 12:00:00
9    1981-02-03 12:00:00
10   1981-02-03 12:00:00
Name: Dates, dtype: datetime64[ns]

In [18]:
# Offset of the first sample timestamp from 1970-01-01, expressed in hours
elapsed = dateobj[0] - pd.to_datetime('1970-01-01')
time = elapsed.total_seconds() / 3600
time

97236.0

In [19]:
# Offset of the first sample timestamp from 1970-01-01, expressed in seconds
offset = dateobj[0] - pd.to_datetime('1970-01-01')
times = offset.total_seconds()
times

350049600.0

In [20]:
# Shape of 'Station_Code' across the dataset's dimensions
ds['Station_Code'].shape

(11, 1)

In [21]:
# Inspect the 'Gear_Code' variable
ds['Gear_Code']

<xarray.DataArray 'Gear_Code' (index: 11, trajectory: 1)>
array([[1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1]])
Coordinates:
  * index    (index) int64 0 1 2 3 4 5 6 7 8 9 10
Dimensions without coordinates: trajectory

In [22]:
# Inspect the 'Gear' variable
ds['Gear']

<xarray.DataArray 'Gear' (index: 11, trajectory: 1)>
array([['Trawl (16 ft)'],
       ['Trawl (16 ft)'],
       ['Trawl (16 ft)'],
       ['Trawl (16 ft)'],
       ['Trawl (16 ft)'],
       ['Trawl (16 ft)'],
       ['Trawl (16 ft)'],
       ['Trawl (16 ft)'],
       ['Trawl (16 ft)'],
       ['Trawl (16 ft)'],
       ['Trawl (16 ft)']], dtype=object)
Coordinates:
  * index    (index) int64 0 1 2 3 4 5 6 7 8 9 10
Dimensions without coordinates: trajectory

In [23]:
# Inspect the 'Salinity' variable
ds['Salinity']

<xarray.DataArray 'Salinity' (index: 11, trajectory: 1)>
array([[25],
       [25],
       [25],
       [25],
       [25],
       [25],
       [25],
       [25],
       [25],
       [25],
       [25]])
Coordinates:
  * index    (index) int64 0 1 2 3 4 5 6 7 8 9 10
Dimensions without coordinates: trajectory

In [24]:
# Inspect the 'Temperature' variable
ds['Temperature']

<xarray.DataArray 'Temperature' (index: 11, trajectory: 1)>
array([[11],
       [11],
       [11],
       [11],
       [11],
       [11],
       [11],
       [11],
       [11],
       [11],
       [11]])
Coordinates:
  * index    (index) int64 0 1 2 3 4 5 6 7 8 9 10
Dimensions without coordinates: trajectory

In [25]:
# Inspect the 'DO' variable
ds['DO']

<xarray.DataArray 'DO' (index: 11, trajectory: 1)>
array([[8.2],
       [8.2],
       [8.2],
       [8.2],
       [8.2],
       [8.2],
       [8.2],
       [8.2],
       [8.2],
       [8.2],
       [8.2]])
Coordinates:
  * index    (index) int64 0 1 2 3 4 5 6 7 8 9 10
Dimensions without coordinates: trajectory

You can also add or change metadata on your xarray Dataset. Use the following form to set an attribute.

In [26]:
# Record a 'history' entry in the Dataset's attributes
ds.attrs.update(history='Detailed account of history')

In [27]:
# Display the Dataset summary to review it after setting the 'history' attribute
ds

<xarray.Dataset>
Dimensions:       (index: 11, trajectory: 1)
Coordinates:
  * index         (index) int64 0 1 2 3 4 5 6 7 8 9 10
Dimensions without coordinates: trajectory
Data variables:
    Dates         (index, trajectory) datetime64[ns] 1981-02-03T12:00:00 ... 1981-02-03T12:00:00
    Sample_nr     (index, trajectory) int64 0 1 2 3 4 5 6 7 8 9 10
    Sample_Code   (index, trajectory) int64 1 1 1 1 1 1 1 1 1 1 1
    FKEY          (index, trajectory) object '1981.02.03.0007.01' ... '1981.02.03.0007.01'
    YYYY          (index, trajectory) object '1981' '1981' ... '1981' '1981'
    MM            (index, trajectory) object '2' '2' '2' '2' ... '2' '2' '2' '2'
    DD            (index, trajectory) object '3' '3' '3' '3' ... '3' '3' '3' '3'
    Station_Code  (index, trajectory) int64 7 7 7 7 7 7 7 7 7 7 7
    Gear_Code     (index, trajectory) int64 1 1 1 1 1 1 1 1 1 1 1
    Gear          (index, trajectory) object 'Trawl (16 ft)' ... 'Trawl (16 ft)'
    Salinity      (index, trajectory) 

Once the data looks exactly the way you want it, you can export it with:

In [28]:
# Write the Dataset to a netCDF file named 'test.nc'
ds.to_netcdf(path='test.nc')

`to_netcdf` also accepts options that control the output details, such as the file `format`, the `engine`, and per-variable `encoding`.

You might be interested in using `dask` in the future to have multiple workers when working on large datasets.

In [29]:
# Check the Python type of the first 'Gear' entry as stored in the Dataset
type(ds['Gear'].values[0, 0])

str