In [2]:
import xarray as xr
import numpy as np

# XArray

What it's good for: we are no longer tied to integer positions and numbered axes; dimensions and coordinates can all be given names.

Pandas DataFrames have only two axes; sometimes we need more. It also helps if the axes have names instead of numbers, e.g. we don't give our variables names like "3", so why would we do that for an axis (dimension)?

A benefit of this is that binary operations between structures with differing dimensions broadcast by name: the new structure has one axis for every dimension that appears in either operand.

E.g. `a = {'temp': [28, 30, 11]}`, `b = {'time': [5, 7]}`

then $a \times b$ will have a shape of $3 \times 2$ with dimensions of `temp` and `time`.

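As a bonus, given that DataArray and Dataset are generalizations of Series and DataFrames, we can go back and forth quite easily. `to_pandas()` works for arrays with up to two dimensions (a 2-D array becomes a DataFrame); with three or more dimensions it fails (older xarray tried to build a `pandas.Panel`, which no longer exists), so use `to_series()` there, which returns a Series with a MultiIndex.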

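Datasets are cool too. They are a dict of DataArrays that share a common indexing structure. Of note, some of the DataArrays may span only a subset of the dimensions of others, but they can all be referenced with the same indices. The smaller structures are not physically inflated (each variable keeps its own dimensions), but they broadcast against the larger ones by dimension name whenever they are combined.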

# DataArray

In [10]:
# 2x3 array of uniform random values with named dimensions:
# x is labelled 2 and 5, y is labelled 3, 6 and 7.
a = xr.DataArray(
    np.random.rand(2, 3),
    dims=['x', 'y'],
    coords={'x': [2, 5], 'y': [3, 6, 7]},
)
a

<xarray.DataArray (x: 2, y: 3)>
array([[0.033867, 0.71282 , 0.745471],
       [0.817841, 0.695082, 0.069959]])
Coordinates:
  * x        (x) int64 2 5
  * y        (y) int64 3 6 7

In [58]:
# Second random 2x3 array with exactly the same dims and coordinate labels as `a`.
b = xr.DataArray(
    np.random.rand(2, 3),
    dims=['x', 'y'],
    coords={'x': [2, 5], 'y': [3, 6, 7]},
)
b

<xarray.DataArray (x: 2, y: 3)>
array([[0.538653, 0.136457, 0.221553],
       [0.753886, 0.067598, 0.474868]])
Coordinates:
  * x        (x) int64 2 5
  * y        (y) int64 3 6 7

In [59]:
# a and b have identical dims and coordinates, so the product is element-wise
# and keeps the (x: 2, y: 3) layout.
a * b

<xarray.DataArray (x: 2, y: 3)>
array([[0.018243, 0.097269, 0.165162],
       [0.616559, 0.046986, 0.033221]])
Coordinates:
  * x        (x) int64 2 5
  * y        (y) int64 3 6 7

In [61]:
# Same y labels as `a` and `b`, but the first dimension is named z instead of x.
c = xr.DataArray(
    np.random.rand(2, 3),
    dims=['z', 'y'],
    coords={'z': [2, 5], 'y': [3, 6, 7]},
)
c

<xarray.DataArray (z: 2, y: 3)>
array([[0.253674, 0.833291, 0.119795],
       [0.815215, 0.736067, 0.962305]])
Coordinates:
  * z        (z) int64 2 5
  * y        (y) int64 3 6 7

In [63]:
# A 2-D DataArray converts to a DataFrame: z becomes the row index, y the columns.
c.to_pandas()

y,3,6,7
z,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,0.253674,0.833291,0.119795
5,0.815215,0.736067,0.962305


In [64]:
# a has dims (x, y) and c has dims (z, y); multiplication broadcasts by
# dimension name, so the product has three dimensions (x, y and z).
# to_pandas() only handles up to two dimensions: the 3-D case relied on
# pandas.Panel, which no longer exists, and failed here with
# "TypeError: Panel() takes no arguments".
# to_series() works for any number of dimensions and returns a Series
# indexed by a MultiIndex over the dimensions.
(a * c).to_series()

In [17]:
# Positional (NumPy-style) indexing: first entry along both x and y.
# The 0-d result keeps the matching labels (x=2, y=3) as scalar coordinates.
a[0,0]

<xarray.DataArray ()>
array(0.033867)
Coordinates:
    x        int64 2
    y        int64 3

In [18]:
# Label-based indexing: the values are coordinate labels given in dimension
# order, i.e. x == 5 and y == 6.
a.loc[5,6]

<xarray.DataArray ()>
array(0.695082)
Coordinates:
    x        int64 5
    y        int64 6

In [22]:
# Label-based selection with the dimension names spelled out.
a.sel(x=5, y=7)

<xarray.DataArray ()>
array(0.069959)
Coordinates:
    x        int64 5
    y        int64 7

In [26]:
# 2x3 array of standard-normal samples. Only x gets coordinate labels (10, 20);
# y is left as a dimension without coordinates.
data = xr.DataArray(
    np.random.randn(2, 3),
    dims=('x', 'y'),
    coords={'x': [10, 20]},
)

In [43]:
# Positional indexing. Only x has coordinates, so the result carries a scalar
# x coordinate (10) and nothing for y.
data[0,0]

<xarray.DataArray ()>
array(-0.42923)
Coordinates:
    x        int64 10

In [44]:
# Second position along x (label 20), everything along y.
data[1,:]

<xarray.DataArray (y: 3)>
array([ 0.157318, -0.627474, -1.12968 ])
Coordinates:
    x        int64 20
Dimensions without coordinates: y

In [45]:
# x is selected by label (10). y has no coordinate, so y=2 acts as a position:
# the result is the third value of the x == 10 row.
data.sel(x=10,y=2)

<xarray.DataArray ()>
array(0.222201)
Coordinates:
    x        int64 10

In [50]:
# .loc also accepts a dict keyed by dimension name; dimensions that are not
# mentioned (here y) are kept whole.
data.loc[{'x':10}]

<xarray.DataArray (y: 3)>
array([-0.42923 ,  1.861871,  0.222201])
Coordinates:
    x        int64 10
Dimensions without coordinates: y

In [52]:
# A list selects several entries in the order given. y has no coordinate, so
# 1 and 0 act as positions: the columns come back as y=1 followed by y=0.
data.loc[{'y':[1,0]}]

<xarray.DataArray (x: 2, y: 2)>
array([[ 1.861871, -0.42923 ],
       [-0.627474,  0.157318]])
Coordinates:
  * x        (x) int64 10 20
Dimensions without coordinates: y

In [53]:
# List on y plus a single label on x: x collapses to a scalar coordinate,
# y keeps the two requested entries.
data.loc[{'y':[1,0],'x':10}]

<xarray.DataArray (y: 2)>
array([ 1.861871, -0.42923 ])
Coordinates:
    x        int64 10
Dimensions without coordinates: y

In [55]:
# Lists on both dimensions: every requested x is paired with every requested y,
# and the result follows the requested order (x comes back as 20, then 10).
data.sel(x=[20,10],y=[0,2])

<xarray.DataArray (x: 2, y: 2)>
array([[ 0.157318, -1.12968 ],
       [-0.42923 ,  0.222201]])
Coordinates:
  * x        (x) int64 20 10
Dimensions without coordinates: y

In [56]:
# Flatten to a pandas Series indexed by an (x, y) MultiIndex; the
# coordinate-less y dimension shows up as 0, 1, 2.
data.to_series()

x   y
10  0   -0.429230
    1    1.861871
    2    0.222201
20  0    0.157318
    1   -0.627474
    2   -1.129680
dtype: float64

In [57]:
# 2-D DataArray -> DataFrame with x as the row index and y (0, 1, 2) as columns.
data.to_pandas()

y,0,1,2
x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10,-0.42923,1.861871,0.222201
20,0.157318,-0.627474,-1.12968


# Dataset

In [30]:
# Bundle three variables of different rank into one Dataset:
#   foo - the 2-D `data` array (x, y)
#   bar - a 1-D variable declared along x
#   baz - a plain scalar
ds = xr.Dataset(
    {
        'foo': data,
        'bar': ('x', [1, 2]),
        'baz': np.pi,
    }
)
ds

<xarray.Dataset>
Dimensions:  (x: 2, y: 3)
Coordinates:
  * x        (x) int64 10 20
Dimensions without coordinates: y
Data variables:
    foo      (x, y) float64 -0.4292 1.862 0.2222 0.1573 -0.6275 -1.13
    bar      (x) int64 1 2
    baz      float64 3.142

In [28]:
# Data variables are reachable as attributes and come back as named DataArrays.
ds.foo

<xarray.DataArray 'foo' (x: 2, y: 3)>
array([[-0.42923 ,  1.861871,  0.222201],
       [ 0.157318, -0.627474, -1.12968 ]])
Coordinates:
  * x        (x) int64 10 20
Dimensions without coordinates: y

In [29]:
# bar was declared along x only, so it is 1-D and shares the dataset's x coordinate.
ds.bar

<xarray.DataArray 'bar' (x: 2)>
array([1, 2])
Coordinates:
  * x        (x) int64 10 20

In [32]:
# baz was given as a bare scalar, so it is a 0-d DataArray with no coordinates.
ds.baz

<xarray.DataArray 'baz' ()>
array(3.141593)

In [38]:
# Selecting on a single variable: the x == 10 row of foo.
ds.foo.sel(x=10)

<xarray.DataArray 'foo' (y: 3)>
array([-0.42923 ,  1.861871,  0.222201])
Coordinates:
    x        int64 10
Dimensions without coordinates: y

In [39]:
# Selecting on the Dataset applies to every variable that has an x dimension:
# foo drops to (y), bar becomes a scalar, and baz (which has no x) is unchanged.
ds.sel(x=10)

<xarray.Dataset>
Dimensions:  (y: 3)
Coordinates:
    x        int64 10
Dimensions without coordinates: y
Data variables:
    foo      (y) float64 -0.4292 1.862 0.2222
    bar      int64 1
    baz      float64 3.142

In [41]:
# x by label, y by position (y has no coordinate): every variable ends up scalar.
ds.sel(x=20, y=1)

<xarray.Dataset>
Dimensions:  ()
Coordinates:
    x        int64 20
Data variables:
    foo      float64 -0.6275
    bar      int64 2
    baz      float64 3.142