# Xarray intro 
(adapted from the official intro at http://xarray.pydata.org/en/stable/examples/quick-overview.html )

In [2]:
import numpy as np
import pandas as pd
import xarray as xr
%autosave 300

Autosaving every 300 seconds


## The basics

In [3]:
# Wrap a plain NumPy array; with no dims given, the dimensions get default names (dim_0, dim_1)
xr.DataArray(np.random.randn(2, 3))

<xarray.DataArray (dim_0: 2, dim_1: 3)>
array([[ 1.00351 ,  1.830714,  0.067995],
       [-0.212296,  1.927962,  0.731023]])
Dimensions without coordinates: dim_0, dim_1

In [4]:
# Build from a pandas Series: its name becomes the array name and its index becomes the coordinate
xr.DataArray(pd.Series(range(3), index=list('abc'), name='foo'))

<xarray.DataArray 'foo' (dim_0: 3)>
array([0, 1, 2], dtype=int64)
Coordinates:
  * dim_0    (dim_0) object 'a' 'b' 'c'

In [5]:
# A 2x3 array with named dimensions, labels on x only, and free-form metadata
data = xr.DataArray(
    np.random.randn(2, 3),
    dims=('x', 'y'),
    coords={'x': ['a', 'b']},
    attrs={'SomeMetadata': 3},
)

In [6]:
# The raw data, without dimension names, coordinates or attributes
data.values

array([[-1.38467848,  2.7541946 ,  0.37203802],
       [-1.41518232, -0.0096234 ,  0.97474787]])

In [7]:
# Shape of the data: one length per dimension, in the same order as data.dims
data.shape

(2, 3)

In [9]:
# attrs holds an ordered dict of metadata/attributes (set via attrs= when the array was created)
data.attrs

OrderedDict([('SomeMetadata', 3)])

In [8]:
# Names for each axis, in axis order
data.dims

('x', 'y')

In [28]:
# Coordinates enable label-based indexing; only x was given labels, so only x is listed
data.coords

Coordinates:
  * x        (x) <U1 'a' 'b'

## Indexing

In [24]:
# Dimensions addressed by order, using normal NumPy-style positional indexing
data[0:, 0]

<xarray.DataArray (x: 2)>
array([-1.384678, -1.415182])
Coordinates:
  * x        (x) <U1 'a' 'b'
Attributes:
    SomeMetadata:  3

In [26]:
# Dimensions still addressed by order, but indexed by label like pandas .loc (the label slice includes both ends)
data.loc['a':'b', :]

<xarray.DataArray (x: 2, y: 3)>
array([[-1.384678,  2.754195,  0.372038],
       [-1.415182, -0.009623,  0.974748]])
Coordinates:
  * x        (x) <U1 'a' 'b'
Dimensions without coordinates: y
Attributes:
    SomeMetadata:  3

In [27]:
# Address dimensions by name and index them by integer position
data.isel(x=1, y=2)

<xarray.DataArray ()>
array(0.974748)
Coordinates:
    x        <U1 'b'
Attributes:
    SomeMetadata:  3

In [10]:
# Address dimensions by name and index them by label; y has no coordinate labels, so y=2 is a position
data.sel(x=slice('a','b'), y=2)
# Ranges must be spelled out with slice(...) for sel and isel

<xarray.DataArray (x: 2)>
array([0.372038, 0.974748])
Coordinates:
  * x        (x) <U1 'a' 'b'
Attributes:
    SomeMetadata:  3

## Computation

In [11]:
# Similar to normal NumPy usage: arithmetic is element-wise and astype casts the result.
# Use the builtin int here -- the np.int alias was deprecated in NumPy 1.20 and removed in 1.24.
(data*3 + 10).astype(int)

<xarray.DataArray (x: 2, y: 3)>
array([[ 5, 18, 11],
       [ 5,  9, 12]])
Coordinates:
  * x        (x) <U1 'a' 'b'
Dimensions without coordinates: y

In [12]:
# Reduce over a dimension by name, rather than by integer axis number
data.mean(dim='y')

<xarray.DataArray (x: 2)>
array([ 0.580518, -0.150019])
Coordinates:
  * x        (x) <U1 'a' 'b'

In [13]:
# Arithmetic aligns operands on dimension names and broadcasts over the remaining dimensions
a = xr.DataArray(np.random.randn(3, 4), dims=['y', 'z'])
b = xr.DataArray(100 * np.random.randn(4), dims='z')
print(a)
print(b)
a + b

<xarray.DataArray (y: 3, z: 4)>
array([[-0.01313 , -1.428312, -3.013333,  0.370227],
       [-0.922954, -1.04306 , -0.469544,  0.043928],
       [-0.836342, -0.729875, -1.786962, -0.927389]])
Dimensions without coordinates: y, z
<xarray.DataArray (z: 4)>
array([  59.330986,   64.731226, -137.152917, -107.509423])
Dimensions without coordinates: z


<xarray.DataArray (y: 3, z: 4)>
array([[  59.317856,   63.302914, -140.16625 , -107.139195],
       [  58.408031,   63.688166, -137.622461, -107.465495],
       [  58.494644,   64.001352, -138.939879, -108.436812]])
Dimensions without coordinates: y, z

In [14]:
# Dimension sizes have to match, as would be expected: here 'z' has length 3 in a but 4 in b
a = xr.DataArray(np.random.randn(4, 3), dims=['y', 'z'])
b = xr.DataArray(np.random.randn(4), dims='z')
try:
    a + b
except ValueError as err:
    # The failure is the point of this cell; show it without halting a full "Run All"
    print('ValueError:', err)

ValueError: arguments without labels along dimension 'z' cannot be aligned because they have different dimension sizes: {3, 4}

## Datasets

In [15]:
# A Dataset is a dict-like container of aligned DataArrays -- think of it as a multidimensional pandas.DataFrame
ds = xr.Dataset(
    {
        'foo': data,
        'bar': ('x', [1, 2]),
        'baz': np.pi,
    }
)
ds

<xarray.Dataset>
Dimensions:  (x: 2, y: 3)
Coordinates:
  * x        (x) <U1 'a' 'b'
Dimensions without coordinates: y
Data variables:
    foo      (x, y) float64 -1.385 2.754 0.372 -1.415 -0.009623 0.9747
    baz      float64 3.142
    bar      (x) int32 1 2

In [16]:
# A Dataset stores multiple variables, of potentially different types, across a shared coordinate system;
# indexing by variable name returns that variable as a DataArray
ds['foo']

<xarray.DataArray 'foo' (x: 2, y: 3)>
array([[-1.384678,  2.754195,  0.372038],
       [-1.415182, -0.009623,  0.974748]])
Coordinates:
  * x        (x) <U1 'a' 'b'
Dimensions without coordinates: y
Attributes:
    SomeMetadata:  3

In [22]:
# Selecting on the shared x coordinate applies to every variable that has an x dimension
ds.sel(x='a')

<xarray.Dataset>
Dimensions:  (y: 3)
Coordinates:
    x        <U1 'a'
Dimensions without coordinates: y
Data variables:
    foo      (y) float64 -1.385 2.754 0.372
    baz      float64 3.142
    bar      int32 1

### Of course there's reshaping/reorganizing and groupby operations 
### Xarray has good support for advanced indexing and broadcasting, but similar to numpy/pandas, advanced indexing/broadcasting with assignments can be a little tricky
### Go read the docs! http://xarray.pydata.org/en/stable/index.html 