In [5]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.datasets import fetch_olivetti_faces
from tensorpandas import TensorArray, TensorDtype

# Record the library versions this notebook was executed with.
tuple(lib.__version__ for lib in (np, pd, sklearn))

('1.19.2', '1.0.5', '0.23.2')

# Tensorpandas
Tensorpandas makes it possible to efficiently store n-dimensional data in a pandas DataFrame.
In this example, we store images with some associated metadata.

In [28]:
%%time
# Load the Olivetti faces dataset through scikit-learn; the %%time cell magic
# (which must stay on the first line of the cell) reports how long the fetch takes.
data = fetch_olivetti_faces()

CPU times: user 21.1 ms, sys: 5.25 ms, total: 26.3 ms
Wall time: 25.5 ms


In [29]:
# Shapes of the image stack and of the label vector, in that order.
tuple(arr.shape for arr in (data.images, data.target))

((400, 64, 64), (400,))

In [30]:
# Wrap the image stack in a TensorArray so it can live in a single DataFrame
# column next to the per-image target label.
image_column = TensorArray(data.images)
df = pd.DataFrame({"image": image_column, "target": data.target})
df.head()

Unnamed: 0,image,target
0,[[0.30991736 0.3677686 0.41735536 ... 0.371900...,0
1,[[0.45454547 0.47107437 0.5123967 ... 0.190082...,0
2,[[0.3181818 0.40082645 0.49173555 ... 0.400826...,0
3,[[0.1983471 0.19421488 0.19421488 ... 0.582644...,0
4,[[0.5 0.54545456 0.58264464 ... 0.223140...,0


## TensorDtype
The corresponding scalar type is itself an ndarray — in this case, a 64x64 image.

In [34]:
# The image column reports a Tensor dtype parameterised by the per-row shape,
# while the target column stays a regular integer dtype.
df.dtypes

image     Tensor[(64, 64)]
target               int64
dtype: object

In [35]:
# Per-column memory footprint in bytes. For the image column the reported
# 6,553,600 bytes equal 400 * 64 * 64 * 4, i.e. 4 bytes per stored pixel.
df.memory_usage()

Index         128
image     6553600
target       3200
dtype: int64

## Access underlying data

In [48]:
# Pull the values out of the tensor column through the `.tensor` accessor, then
# average over the last two axes (image columns, then image rows) to get one
# mean intensity per image.
pixels = df.image.tensor.values
row_means = pixels.mean(axis=-1)
df["mean"] = row_means.mean(axis=-1)
df

Unnamed: 0,image,target,mean
0,[[0.30991736 0.3677686 0.41735536 ... 0.371900...,0,0.636847
1,[[0.45454547 0.47107437 0.5123967 ... 0.190082...,0,0.589404
2,[[0.3181818 0.40082645 0.49173555 ... 0.400826...,0,0.622834
3,[[0.1983471 0.19421488 0.19421488 ... 0.582644...,0,0.580938
4,[[0.5 0.54545456 0.58264464 ... 0.223140...,0,0.609376
...,...,...,...
395,[[0.40082645 0.49586776 0.57024795 ... 0.347107...,39,0.526275
396,[[0.3677686 0.3677686 0.35123968 ... 0.694214...,39,0.606065
397,[[0.5 0.53305787 0.607438 ... 0.285123...,39,0.516712
398,[[0.21487603 0.21900827 0.21900827 ... 0.714876...,39,0.600571


## Perform indexing/reshaping operations

In [55]:
# Flag images whose mean intensity exceeds 0.5, then pivot that flag from the
# index into the columns. (mean, light) combinations that do not exist show up
# as NaN-filled entries in the result.
df["light"] = df["mean"].gt(0.5)
df.set_index(["mean", "light"]).unstack(level="light")

Unnamed: 0_level_0,image,image,target,target
light,False,True,False,True
mean,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0.382309,[[0.20661157 0.20661157 0.20661157 ... 0.152892...,[[nan nan nan ... nan nan nan]  [nan nan nan .....,21.0,
0.388200,[[0.46694216 0.47520661 0.45454547 ... 0.561983...,[[nan nan nan ... nan nan nan]  [nan nan nan .....,33.0,
0.393329,[[0.13636364 0.14049587 0.18595041 ... 0.119834...,[[nan nan nan ... nan nan nan]  [nan nan nan .....,21.0,
0.394009,[[0.15289256 0.18181819 0.25206611 ... 0.123966...,[[nan nan nan ... nan nan nan]  [nan nan nan .....,21.0,
0.394940,[[0.14876033 0.1570248 0.19008264 ... 0.111570...,[[nan nan nan ... nan nan nan]  [nan nan nan .....,38.0,
...,...,...,...,...
0.705760,[[nan nan nan ... nan nan nan]  [nan nan nan .....,[[0.33057851 0.40909091 0.54958677 ... 0.723140...,,5.0
0.712098,[[nan nan nan ... nan nan nan]  [nan nan nan .....,[[0.63636363 0.67768598 0.66942149 ... 0.698347...,,5.0
0.713822,[[nan nan nan ... nan nan nan]  [nan nan nan .....,[[0.55371898 0.60743803 0.63636363 ... 0.719008...,,5.0
0.717360,[[nan nan nan ... nan nan nan]  [nan nan nan .....,[[0.72314048 0.71074378 0.71487606 ... 0.636363...,,5.0


## Save/load

In [59]:
# Round-trip the frame (tensor column included) through a Parquet file.
parquet_path = "faces.parquet"
df.to_parquet(parquet_path)
pd.read_parquet(parquet_path).head()

Unnamed: 0,image,target,mean,light
0,[[0.30991736 0.3677686 0.41735536 ... 0.371900...,0,0.636847,True
1,[[0.45454547 0.47107437 0.5123967 ... 0.190082...,0,0.589404,True
2,[[0.3181818 0.40082645 0.49173555 ... 0.400826...,0,0.622834,True
3,[[0.1983471 0.19421488 0.19421488 ... 0.582644...,0,0.580938,True
4,[[0.5 0.54545456 0.58264464 ... 0.223140...,0,0.609376,True


In [61]:
# Round-trip the frame (tensor column included) through pickle. The file is
# named "faces.pickle", so use the pickle writer/reader instead of repeating
# the Parquet round trip under a misleading file name.
# NOTE: only unpickle files you trust; loading a pickle can execute arbitrary code.
df.to_pickle("faces.pickle")
pd.read_pickle("faces.pickle").head()

Unnamed: 0,image,target,mean,light
0,[[0.30991736 0.3677686 0.41735536 ... 0.371900...,0,0.636847,True
1,[[0.45454547 0.47107437 0.5123967 ... 0.190082...,0,0.589404,True
2,[[0.3181818 0.40082645 0.49173555 ... 0.400826...,0,0.622834,True
3,[[0.1983471 0.19421488 0.19421488 ... 0.582644...,0,0.580938,True
4,[[0.5 0.54545456 0.58264464 ... 0.223140...,0,0.609376,True
