In [12]:
import xarray as xr
import numpy as np
import torch

# XArray

What it's good for: we are no longer tied to integer indexing and integer axis numbers, and can give a name to everything we want.

Pandas DataFrames have only two axes; sometimes we need more. And it helps if the axes have names instead of numbers. E.g. we don't give our variables names like "3", so why would we do that for an axis (dimension)?

A benefit of this is that when we apply a binary operation to structures with differing dimensions, the new structure will have an axis for each distinct dimension name.

E.g. `a = {'temp': [28, 30, 11]}, b = {'time': [5, 7]}`

then $a \times b$ will have a shape of $3 \times 2$ with dimensions of `temp` and `time`.

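As a bonus, given that DataArray and Dataset are generalizations of Series and DataFrames, we can go back and forth quite easily (I think). `to_pandas()` seems to work for structures with up to 2 dims but borks on more (could be me); `to_series()` still works there and returns a Series with a MultiIndex.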

Datasets are cool too. They are a dict of DataArrays that use a common indexing structure. Of note, some of the DataArrays may be subspaces of others, but they can all be referenced with the same indices. The smaller structures get inflated to occupy the larger space.

# DataArray

In [4]:
# 2x3 array of uniform random values with labelled axes:
# x carries the labels 2 and 5, y carries the labels 3, 6 and 7.
a = xr.DataArray(
    data=np.random.rand(2, 3),
    coords={'x': [2, 5], 'y': [3, 6, 7]},
    dims=['x', 'y'],
)
a

<xarray.DataArray (x: 2, y: 3)>
array([[0.587723, 0.504564, 0.075407],
       [0.072651, 0.366445, 0.261264]])
Coordinates:
  * x        (x) int64 2 5
  * y        (y) int64 3 6 7

In [5]:
# Second random array with exactly the same dims and coordinate labels as `a`.
b = xr.DataArray(
    np.random.rand(2, 3),
    coords={'x': [2, 5], 'y': [3, 6, 7]},
    dims=['x', 'y'],
)
b

<xarray.DataArray (x: 2, y: 3)>
array([[0.162281, 0.518604, 0.843994],
       [0.646497, 0.221922, 0.067562]])
Coordinates:
  * x        (x) int64 2 5
  * y        (y) int64 3 6 7

In [6]:
# Both operands have dims (x, y) with identical coordinates, so this is a plain element-wise product.
a * b

<xarray.DataArray (x: 2, y: 3)>
array([[0.095376, 0.261669, 0.063643],
       [0.046968, 0.081322, 0.017652]])
Coordinates:
  * x        (x) int64 2 5
  * y        (y) int64 3 6 7

In [7]:
# Same shape and labels as `a`/`b`, but the first dimension is named `z` instead of `x`.
c = xr.DataArray(
    np.random.rand(2, 3),
    coords={'z': [2, 5], 'y': [3, 6, 7]},
    dims=['z', 'y'],
)
c

<xarray.DataArray (z: 2, y: 3)>
array([[0.546055, 0.057681, 0.536134],
       [0.378684, 0.630938, 0.557293]])
Coordinates:
  * z        (z) int64 2 5
  * y        (y) int64 3 6 7

In [8]:
# 2-D DataArray -> pandas table: rows indexed by `z`, columns by `y`.
c.to_pandas()

y,3,6,7
z,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,0.546055,0.057681,0.536134
5,0.378684,0.630938,0.557293


In [10]:
# `a` is (x, y) and `c` is (z, y): operands are matched by dimension name, so the result has dims (x, y, z).
a*c

<xarray.DataArray (x: 2, y: 3, z: 2)>
array([[[0.320929, 0.222561],
        [0.029104, 0.318348],
        [0.040428, 0.042024]],

       [[0.039671, 0.027512],
        [0.021137, 0.231204],
        [0.140072, 0.145601]]])
Coordinates:
  * x        (x) int64 2 5
  * y        (y) int64 3 6 7
  * z        (z) int64 2 5

In [9]:
# to_pandas() only maps arrays with at most two dimensions onto pandas objects,
# so this 3-D product is expected to fail.  The error is caught and printed so
# the notebook still survives "Restart & Run All".  The exception type depends
# on the installed versions (TypeError from the removed pandas.Panel on older
# stacks, ValueError on newer xarray), hence both are caught.
try:
    (a * c).to_pandas()  # should die cuz we've got 3 dims
except (TypeError, ValueError) as err:
    print("{}: {}".format(type(err).__name__, err))
# For more than two dimensions use (a * c).to_series() or .to_dataframe(name=...) instead.

TypeError: Panel() takes no arguments

In [17]:
# Positional (integer) indexing: first element along x and y, i.e. the labels x=2, y=3.
a[0,0]

<xarray.DataArray ()>
array(0.033867)
Coordinates:
    x        int64 2
    y        int64 3

In [18]:
# Label-based indexing, labels given in dimension order: x label 5, y label 6.
a.loc[5,6]

<xarray.DataArray ()>
array(0.695082)
Coordinates:
    x        int64 5
    y        int64 6

In [22]:
# Label-based indexing by dimension name: x label 5, y label 7.
a.sel(x=5, y=7)

<xarray.DataArray ()>
array(0.069959)
Coordinates:
    x        int64 5
    y        int64 7

In [26]:
# 2x3 array of standard-normal values.  Only `x` gets coordinate labels
# (10, 20); `y` is deliberately left as a dimension without coordinates.
data = xr.DataArray(
    data=np.random.randn(2, 3),
    coords={'x': [10, 20]},
    dims=('x', 'y'),
)

In [43]:
# Positional indexing; only `x` carries a coordinate (10) in the result, `y` has none.
data[0,0]

<xarray.DataArray ()>
array(-0.42923)
Coordinates:
    x        int64 10

In [44]:
# Second position along x (label 20), everything along y.
data[1,:]

<xarray.DataArray (y: 3)>
array([ 0.157318, -0.627474, -1.12968 ])
Coordinates:
    x        int64 20
Dimensions without coordinates: y

In [45]:
# `x=10` is matched by label.  `y` has no coordinate labels, so `y=2` falls back
# to the integer position 2 (the same element as data[0, 2]).
data.sel(x=10,y=2)

<xarray.DataArray ()>
array(0.222201)
Coordinates:
    x        int64 10

In [50]:
# Dict form of .loc: the dimension is addressed by name instead of by axis position.
data.loc[{'x':10}]

<xarray.DataArray (y: 3)>
array([-0.42923 ,  1.861871,  0.222201])
Coordinates:
    x        int64 10
Dimensions without coordinates: y

In [52]:
# `y` has no coordinate labels, so the list is read as integer positions;
# the result keeps the requested order (position 1 first, then 0).
data.loc[{'y':[1,0]}]

<xarray.DataArray (x: 2, y: 2)>
array([[ 1.861871, -0.42923 ],
       [-0.627474,  0.157318]])
Coordinates:
  * x        (x) int64 10 20
Dimensions without coordinates: y

In [53]:
# Several dimensions in one dict: x by label (10), y by position (1 then 0).
data.loc[{'y':[1,0],'x':10}]

<xarray.DataArray (y: 2)>
array([ 1.861871, -0.42923 ])
Coordinates:
    x        int64 10
Dimensions without coordinates: y

In [55]:
# List selection keeps the requested order: x comes back as 20, 10; y takes positions 0 and 2.
data.sel(x=[20,10],y=[0,2])

<xarray.DataArray (x: 2, y: 2)>
array([[ 0.157318, -1.12968 ],
       [-0.42923 ,  0.222201]])
Coordinates:
  * x        (x) int64 20 10
Dimensions without coordinates: y

In [56]:
# Flatten to a pandas Series indexed by every (x, y) pair.
data.to_series()

x   y
10  0   -0.429230
    1    1.861871
    2    0.222201
20  0    0.157318
    1   -0.627474
    2   -1.129680
dtype: float64

In [57]:
# 2-D -> pandas table with x as the row index and y as the columns.
data.to_pandas()

y,0,1,2
x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10,-0.42923,1.861871,0.222201
20,0.157318,-0.627474,-1.12968


## DataArray with pos data

In [17]:
n = 30
steps = 32

# Random walk: draw one uniform increment in [-1, 1) per particle and step,
# then accumulate the increments along the step axis.
pos = torch.empty(n, steps).uniform_(-1, 1).cumsum(dim=1)

# Shift every walk so that it starts at position 0.
pos = pos - pos[:, :1]

# Name the axes but attach no coordinate labels.
pos_da = xr.DataArray(pos.numpy(), dims=['particles', 'steps'])
pos_da

<xarray.DataArray (particles: 30, steps: 32)>
array([[ 0.      ,  0.481762,  0.979465, ...,  2.880876,  2.447023,  1.755813],
       [ 0.      ,  0.697417,  1.028486, ..., -0.930695, -0.71317 , -0.122509],
       [ 0.      , -0.312168, -0.708338, ..., -1.358308, -0.95414 , -0.597796],
       ...,
       [ 0.      , -0.961562, -0.274019, ..., -1.467502, -1.087679, -1.762288],
       [ 0.      , -0.517152, -0.619513, ...,  0.205468,  0.241352,  0.656014],
       [ 0.      ,  0.787382,  0.903233, ...,  1.196427,  0.621885,  1.556724]],
      dtype=float32)
Dimensions without coordinates: particles, steps

In [21]:
# Neither dim has coordinate labels, so .loc slices by position with an
# exclusive stop here: particles 2..4 and steps 0..3.
pos_da.loc[2:5, 0:4]

<xarray.DataArray (particles: 3, steps: 4)>
array([[ 0.      , -0.312168, -0.708338, -0.88591 ],
       [ 0.      ,  0.375965,  0.374686,  1.074824],
       [ 0.      ,  0.490305,  1.039249,  0.980105]], dtype=float32)
Dimensions without coordinates: particles, steps

In [19]:
# Open-ended positional slices: the first 3 particles and the first 4 steps.
pos_da.loc[:3, :4]

<xarray.DataArray (particles: 3, steps: 4)>
array([[ 0.      ,  0.481762,  0.979465,  1.368937],
       [ 0.      ,  0.697417,  1.028486,  1.763603],
       [ 0.      , -0.312168, -0.708338, -0.88591 ]], dtype=float32)
Dimensions without coordinates: particles, steps

In [25]:
# Same selection via .sel: without coordinate labels the slice and the list are
# both interpreted as integer positions (slice stop exclusive).
pos_da.sel(particles=slice(0,3), steps=[0,1,2,3])

<xarray.DataArray (particles: 3, steps: 4)>
array([[ 0.      ,  0.481762,  0.979465,  1.368937],
       [ 0.      ,  0.697417,  1.028486,  1.763603],
       [ 0.      , -0.312168, -0.708338, -0.88591 ]], dtype=float32)
Dimensions without coordinates: particles, steps

In [27]:
# Chained selection: the second .sel indexes into the already-sliced array
# (position 2 of the three particles kept by the first .sel).
pos_da.sel(particles=slice(0,3), steps=[0,1,2,3]).sel(particles=2)

<xarray.DataArray (steps: 4)>
array([ 0.      , -0.312168, -0.708338, -0.88591 ], dtype=float32)
Dimensions without coordinates: steps

In [28]:

# Same data as `pos_da`, but with explicit integer coordinate labels on both
# axes.  The label ranges are sized from `pos` itself rather than hard-coded,
# so this cell keeps working if `n` or `steps` is changed where the random
# walk is built.
poscoord_da = xr.DataArray(
    pos.numpy(),
    coords=[
        ('particles', np.arange(pos.shape[0])),
        ('steps', np.arange(pos.shape[1])),
    ],
)
poscoord_da

<xarray.DataArray (particles: 30, steps: 32)>
array([[ 0.      ,  0.481762,  0.979465, ...,  2.880876,  2.447023,  1.755813],
       [ 0.      ,  0.697417,  1.028486, ..., -0.930695, -0.71317 , -0.122509],
       [ 0.      , -0.312168, -0.708338, ..., -1.358308, -0.95414 , -0.597796],
       ...,
       [ 0.      , -0.961562, -0.274019, ..., -1.467502, -1.087679, -1.762288],
       [ 0.      , -0.517152, -0.619513, ...,  0.205468,  0.241352,  0.656014],
       [ 0.      ,  0.787382,  0.903233, ...,  1.196427,  0.621885,  1.556724]],
      dtype=float32)
Coordinates:
  * particles  (particles) int64 0 1 2 3 4 5 6 7 8 ... 22 23 24 25 26 27 28 29
  * steps      (steps) int64 0 1 2 3 4 5 6 7 8 9 ... 23 24 25 26 27 28 29 30 31

In [57]:
# With coordinate labels .loc is label-based: the slice selects by particle
# label, and the second .loc[3] then picks the particle labelled 3.
poscoord_da.loc[3:5].loc[3]

<xarray.DataArray (steps: 32)>
array([ 0.      ,  0.375965,  0.374686,  1.074824,  0.26203 ,  0.889337,
        0.519092, -0.417865, -0.920426, -1.324418, -1.270773, -1.873123,
       -1.070355, -0.430565,  0.243643,  0.229672, -0.440287, -1.031463,
       -0.22437 ,  0.265713, -0.209984,  0.59695 ,  1.378502,  1.630596,
        1.894357,  2.730291,  3.101273,  3.65379 ,  3.169738,  2.391841,
        1.649172,  1.358656], dtype=float32)
Coordinates:
    particles  int64 3
  * steps      (steps) int64 0 1 2 3 4 5 6 7 8 9 ... 23 24 25 26 27 28 29 30 31

**Note:** `.isel` ignores the coordinate labels and indexes purely by integer position, counting from 0 at the start of the (possibly already sliced) array.

In [62]:
# .isel is positional: position 0 of the sliced array is the particle labelled 3.
poscoord_da.loc[3:5].isel(particles=0)

<xarray.DataArray (steps: 32)>
array([ 0.      ,  0.375965,  0.374686,  1.074824,  0.26203 ,  0.889337,
        0.519092, -0.417865, -0.920426, -1.324418, -1.270773, -1.873123,
       -1.070355, -0.430565,  0.243643,  0.229672, -0.440287, -1.031463,
       -0.22437 ,  0.265713, -0.209984,  0.59695 ,  1.378502,  1.630596,
        1.894357,  2.730291,  3.101273,  3.65379 ,  3.169738,  2.391841,
        1.649172,  1.358656], dtype=float32)
Coordinates:
    particles  int64 3
  * steps      (steps) int64 0 1 2 3 4 5 6 7 8 9 ... 23 24 25 26 27 28 29 30 31

In [41]:
# Label slices include their stop: the second slice(2, 4) is applied to the
# labels left by the first one and keeps the particles labelled 3 and 4
# (label 2 is no longer present).
poscoord_da.sel(particles=slice(3,6)).sel(particles=slice(2,4))

<xarray.DataArray (particles: 2, steps: 32)>
array([[ 0.      ,  0.375965,  0.374686,  1.074824,  0.26203 ,  0.889337,
         0.519092, -0.417865, -0.920426, -1.324418, -1.270773, -1.873123,
        -1.070355, -0.430565,  0.243643,  0.229672, -0.440287, -1.031463,
        -0.22437 ,  0.265713, -0.209984,  0.59695 ,  1.378502,  1.630596,
         1.894357,  2.730291,  3.101273,  3.65379 ,  3.169738,  2.391841,
         1.649172,  1.358656],
       [ 0.      ,  0.490305,  1.039249,  0.980105,  1.423616,  0.90635 ,
         0.830469,  1.021958,  1.598376,  0.623266,  0.674153,  1.013652,
         1.668581,  2.651549,  2.583146,  2.138184,  2.03307 ,  1.17687 ,
         1.354423,  0.897457,  0.216818, -0.056382,  0.277388,  0.055566,
        -0.587838,  0.111196, -0.855095, -0.555146, -0.072044,  0.764843,
         0.507337,  0.641741]], dtype=float32)
Coordinates:
  * particles  (particles) int64 3 4
  * steps      (steps) int64 0 1 2 3 4 5 6 7 8 9 ... 23 24 25 26 27 28 29 30 31

In [46]:
# Reduce over `particles`: for each step, is any particle at a positive position?
(poscoord_da > 0.0).any(dim="particles")

<xarray.DataArray (steps: 32)>
array([False,  True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True])
Coordinates:
  * steps    (steps) int64 0 1 2 3 4 5 6 7 8 9 ... 22 23 24 25 26 27 28 29 30 31

In [47]:
# Reduce over `steps`: for each particle, was it ever at a positive position?
(poscoord_da > 0.0).any(dim="steps")

<xarray.DataArray (particles: 30)>
array([ True,  True, False,  True,  True, False, False, False,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True, False,  True, False,  True,  True,  True,  True,  True])
Coordinates:
  * particles  (particles) int64 0 1 2 3 4 5 6 7 8 ... 22 23 24 25 26 27 28 29

### This is the one, the big deal: selecting particles with a boolean mask!!

In [50]:
# Boolean mask over `particles` (the first dim): keep only the particles that
# were at a positive position at some step.
poscoord_da.loc[(poscoord_da > 0.0).any(dim="steps")]

<xarray.DataArray (particles: 24, steps: 32)>
array([[ 0.      ,  0.481762,  0.979465, ...,  2.880876,  2.447023,  1.755813],
       [ 0.      ,  0.697417,  1.028486, ..., -0.930695, -0.71317 , -0.122509],
       [ 0.      ,  0.375965,  0.374686, ...,  2.391841,  1.649172,  1.358656],
       ...,
       [ 0.      , -0.961562, -0.274019, ..., -1.467502, -1.087679, -1.762288],
       [ 0.      , -0.517152, -0.619513, ...,  0.205468,  0.241352,  0.656014],
       [ 0.      ,  0.787382,  0.903233, ...,  1.196427,  0.621885,  1.556724]],
      dtype=float32)
Coordinates:
  * particles  (particles) int64 0 1 3 4 8 9 10 11 ... 20 21 23 25 26 27 28 29
  * steps      (steps) int64 0 1 2 3 4 5 6 7 8 9 ... 23 24 25 26 27 28 29 30 31

In [51]:
# Build the per-particle boolean mask first, then use it to index the first dim.
pred = (poscoord_da > 0.0).any(dim="steps")
poscoord_da.loc[pred]

<xarray.DataArray (particles: 24, steps: 32)>
array([[ 0.      ,  0.481762,  0.979465, ...,  2.880876,  2.447023,  1.755813],
       [ 0.      ,  0.697417,  1.028486, ..., -0.930695, -0.71317 , -0.122509],
       [ 0.      ,  0.375965,  0.374686, ...,  2.391841,  1.649172,  1.358656],
       ...,
       [ 0.      , -0.961562, -0.274019, ..., -1.467502, -1.087679, -1.762288],
       [ 0.      , -0.517152, -0.619513, ...,  0.205468,  0.241352,  0.656014],
       [ 0.      ,  0.787382,  0.903233, ...,  1.196427,  0.621885,  1.556724]],
      dtype=float32)
Coordinates:
  * particles  (particles) int64 0 1 3 4 8 9 10 11 ... 20 21 23 25 26 27 28 29
  * steps      (steps) int64 0 1 2 3 4 5 6 7 8 9 ... 23 24 25 26 27 28 29 30 31

In [67]:
# Dict form: the mask is tied to `particles` by name rather than by axis position.
pred_ba = (poscoord_da > 0.0).any(dim="steps")
poscoord_da.loc[dict(particles=pred_ba)]

<xarray.DataArray (particles: 24, steps: 32)>
array([[ 0.      ,  0.481762,  0.979465, ...,  2.880876,  2.447023,  1.755813],
       [ 0.      ,  0.697417,  1.028486, ..., -0.930695, -0.71317 , -0.122509],
       [ 0.      ,  0.375965,  0.374686, ...,  2.391841,  1.649172,  1.358656],
       ...,
       [ 0.      , -0.961562, -0.274019, ..., -1.467502, -1.087679, -1.762288],
       [ 0.      , -0.517152, -0.619513, ...,  0.205468,  0.241352,  0.656014],
       [ 0.      ,  0.787382,  0.903233, ...,  1.196427,  0.621885,  1.556724]],
      dtype=float32)
Coordinates:
  * particles  (particles) int64 0 1 3 4 8 9 10 11 ... 20 21 23 25 26 27 28 29
  * steps      (steps) int64 0 1 2 3 4 5 6 7 8 9 ... 23 24 25 26 27 28 29 30 31

In [68]:
# Keyword form via .sel: the boolean mask is applied along `particles`.
pred_ba = (poscoord_da > 0.0).any(dim="steps")
poscoord_da.sel(particles=pred_ba)

<xarray.DataArray (particles: 24, steps: 32)>
array([[ 0.      ,  0.481762,  0.979465, ...,  2.880876,  2.447023,  1.755813],
       [ 0.      ,  0.697417,  1.028486, ..., -0.930695, -0.71317 , -0.122509],
       [ 0.      ,  0.375965,  0.374686, ...,  2.391841,  1.649172,  1.358656],
       ...,
       [ 0.      , -0.961562, -0.274019, ..., -1.467502, -1.087679, -1.762288],
       [ 0.      , -0.517152, -0.619513, ...,  0.205468,  0.241352,  0.656014],
       [ 0.      ,  0.787382,  0.903233, ...,  1.196427,  0.621885,  1.556724]],
      dtype=float32)
Coordinates:
  * particles  (particles) int64 0 1 3 4 8 9 10 11 ... 20 21 23 25 26 27 28 29
  * steps      (steps) int64 0 1 2 3 4 5 6 7 8 9 ... 23 24 25 26 27 28 29 30 31


# Dataset

In [30]:
# Three variables of different rank living in one Dataset:
#   foo - the 2-D DataArray `data` (dims x, y)
#   bar - a 1-D variable along x, given as a (dim, values) tuple
#   baz - a plain scalar
ds = xr.Dataset(
    data_vars={
        'foo': data,
        'bar': ('x', [1, 2]),
        'baz': np.pi,
    }
)
ds

<xarray.Dataset>
Dimensions:  (x: 2, y: 3)
Coordinates:
  * x        (x) int64 10 20
Dimensions without coordinates: y
Data variables:
    foo      (x, y) float64 -0.4292 1.862 0.2222 0.1573 -0.6275 -1.13
    bar      (x) int64 1 2
    baz      float64 3.142

In [28]:
# Attribute access returns the variable as a DataArray named 'foo'.
ds.foo

<xarray.DataArray 'foo' (x: 2, y: 3)>
array([[-0.42923 ,  1.861871,  0.222201],
       [ 0.157318, -0.627474, -1.12968 ]])
Coordinates:
  * x        (x) int64 10 20
Dimensions without coordinates: y

In [29]:
# `bar` is 1-D along x and carries the same x coordinate (10, 20) as `foo`.
ds.bar

<xarray.DataArray 'bar' (x: 2)>
array([1, 2])
Coordinates:
  * x        (x) int64 10 20

In [32]:
# `baz` is a scalar variable: no dimensions and no coordinates.
ds.baz

<xarray.DataArray 'baz' ()>
array(3.141593)

In [38]:
# Selection on a single variable: the slice of `foo` labelled x=10.
ds.foo.sel(x=10)

<xarray.DataArray 'foo' (y: 3)>
array([-0.42923 ,  1.861871,  0.222201])
Coordinates:
    x        int64 10
Dimensions without coordinates: y

In [39]:
# Selection on the whole Dataset: every variable that has an x dim is indexed
# at x=10 (`foo`, `bar`); `baz` has no x dim and is carried over unchanged.
ds.sel(x=10)

<xarray.Dataset>
Dimensions:  (y: 3)
Coordinates:
    x        int64 10
Dimensions without coordinates: y
Data variables:
    foo      (y) float64 -0.4292 1.862 0.2222
    bar      int64 1
    baz      float64 3.142

In [41]:
# x is matched by label (20); `y` has no coordinate labels, so y=1 is the
# integer position 1.  Every variable collapses to a scalar.
ds.sel(x=20, y=1)

<xarray.Dataset>
Dimensions:  ()
Coordinates:
    x        int64 20
Data variables:
    foo      float64 -0.6275
    bar      int64 2
    baz      float64 3.142