# Classification dataset and preprocessor usage

This notebook demonstrates some of the capabilities of the classification dataset and preprocessor components.

In [1]:
import os

import numpy as np

from tsfm_public.toolkit.dataset import ClassificationDFDataset
from tsfm_public.toolkit.time_series_classification_preprocessor import TimeSeriesClassificationPreprocessor
from tsfm_public.toolkit.util import convert_tsfile_to_dataframe

## Case 1: Nested dataset
In the nested format, each cell of an input column holds an entire time series as a pandas Series.

In [2]:
# Location of the BasicMotions training split (.ts file).
# Set the BASICMOTIONS_TRAIN_PATH environment variable to point at your own copy;
# the original author's local path is kept as the fallback.
path = os.environ.get(
    "BASICMOTIONS_TRAIN_PATH",
    "/Users/wmgifford/Downloads/BasicMotions/BasicMotions_TRAIN.ts",
)

In [3]:
# Column roles: one label column and six input channels named dim_0 ... dim_5.
label_column = "class_vals"
input_columns = [f"dim_{channel}" for channel in range(6)]

# Load the .ts file into a single frame (inputs and label together).
df = convert_tsfile_to_dataframe(path, return_separate_X_and_y=False)

In [4]:
# Add a running integer identifier, one per row.
df["id"] = range(len(df))

In [5]:
# Preview the first rows: each dim_* cell holds a whole time series (nested format).
df.head()

Unnamed: 0,dim_0,dim_1,dim_2,dim_3,dim_4,dim_5,class_vals,id
0,0 0.079106 1 0.079106 2 -0.903497 3...,0 0.394032 1 0.394032 2 -3.666397 3...,0 0.551444 1 0.551444 2 -0.282844 3...,0 0.351565 1 0.351565 2 -0.095881 3...,0 0.023970 1 0.023970 2 -0.319605 3...,0 0.633883 1 0.633883 2 0.972131 3...,standing,0
1,0 0.377751 1 0.377751 2 2.952965 3...,0 -0.610850 1 -0.610850 2 0.970717 3...,0 -0.147376 1 -0.147376 2 -5.962515 3...,0 -0.103872 1 -0.103872 2 -7.593275 3...,0 -0.109198 1 -0.109198 2 -0.697804 3...,0 -0.037287 1 -0.037287 2 -2.865789 3...,standing,1
2,0 -0.813905 1 -0.813905 2 -0.424628 3...,0 0.825666 1 0.825666 2 -1.305033 3...,0 0.032712 1 0.032712 2 0.826170 3...,0 0.021307 1 0.021307 2 -0.372872 3...,0 0.122515 1 0.122515 2 -0.045277 3...,0 0.775041 1 0.775041 2 0.383526 3...,standing,2
3,0 0.289855 1 0.289855 2 -0.669185 3...,0 0.284130 1 0.284130 2 -0.210466 3...,0 0.213680 1 0.213680 2 0.252267 3...,0 -0.314278 1 -0.314278 2 0.018644 3...,0 0.074574 1 0.074574 2 0.007990 3...,0 -0.079901 1 -0.079901 2 0.237040 3...,standing,3
4,0 -0.123238 1 -0.123238 2 -0.249547 3...,0 0.379341 1 0.379341 2 0.541501 3...,0 -0.286006 1 -0.286006 2 0.208420 3...,0 -0.098545 1 -0.098545 2 -0.023970 3...,0 0.058594 1 0.058594 2 0.175783 3...,0 -0.074574 1 -0.074574 2 0.114525 3...,standing,4


In [6]:
# Preprocessor for the nested frame: scale the input channels and encode the label column.
tsp = TimeSeriesClassificationPreprocessor(
    scaling=True,
    label_column=label_column,
    input_columns=input_columns,
)

# Fit on the data; as the last expression this also displays the trained configuration.
tsp.train(df)

TimeSeriesClassificationPreprocessor {
  "_is_nested": true,
  "categorical_encoder": null,
  "context_length": 64,
  "encode_categorical": true,
  "feature_extractor_type": "TimeSeriesClassificationPreprocessor",
  "freq": null,
  "id_columns": [],
  "input_columns": [
    "dim_0",
    "dim_1",
    "dim_2",
    "dim_3",
    "dim_4",
    "dim_5"
  ],
  "label_column": "class_vals",
  "label_encoder": {
    "classes_": [
      "badminton",
      "running",
      "standing",
      "walking"
    ]
  },
  "processor_class": "TimeSeriesClassificationPreprocessor",
  "scale_categorical_columns": true,
  "scaler_dict": {
    "0": {
      "copy": true,
      "feature_names_in_": [
        "dim_0",
        "dim_1",
        "dim_2",
        "dim_3",
        "dim_4",
        "dim_5"
      ],
      "mean_": [
        2.552759629,
        -1.3039367667500004,
        -1.02658049575,
        0.01905110399999999,
        -0.023957696500000004,
        -0.055789663749999996
      ],
      "n_features_

In [7]:
# Apply the trained preprocessor to the nested frame and preview the result.
df_prep = tsp.preprocess(df)
df_prep.head(n=20)

Unnamed: 0,dim_0,dim_1,dim_2,dim_3,dim_4,dim_5,class_vals,id
0,0 -0.349766 1 -0.349766 2 -0.488703 3...,0 0.249919 1 0.249919 2 -0.347723 3...,0 0.444969 1 0.444969 2 0.209717 3...,0 0.157446 1 0.157446 2 -0.054421 3...,0 0.026323 1 0.026323 2 -0.162377 3...,0 0.196120 1 0.196120 2 0.292306 3...,2,0
1,0 -0.307539 1 -0.307539 2 0.056588 3...,0 0.102013 1 0.102013 2 0.334799 3...,0 0.247916 1 0.247916 2 -1.391826 3...,0 -0.058204 1 -0.058204 2 -3.604457 3...,0 -0.046816 1 -0.046816 2 -0.370093 3...,0 0.005262 1 0.005262 2 -0.799070 3...,2,1
2,0 -0.476035 1 -0.476035 2 -0.420992 3...,0 0.313449 1 0.313449 2 -0.000161 3...,0 0.298697 1 0.298697 2 0.522435 3...,0 0.001068 1 0.001068 2 -0.185577 3...,0 0.080446 1 0.080446 2 -0.011709 3...,0 0.236261 1 0.236261 2 0.124927 3...,2,2
3,0 -0.319967 1 -0.319967 2 -0.455572 3...,0 0.233742 1 0.233742 2 0.160944 3...,0 0.349726 1 0.349726 2 0.360607 3...,0 -0.157832 1 -0.157832 2 -0.000193 3...,0 0.054116 1 0.054116 2 0.017546 3...,0 -0.006856 1 -0.006856 2 0.083271 3...,2,3
4,0 -0.378377 1 -0.378377 2 -0.396237 3...,0 0.247756 1 0.247756 2 0.271624 3...,0 0.208826 1 0.208826 2 0.348243 3...,0 -0.055682 1 -0.055682 2 -0.020371 3...,0 0.045339 1 0.045339 2 0.109702 3...,0 -0.005342 1 -0.005342 2 0.048432 3...,2,4
5,0 -0.411473 1 -0.411473 2 -0.361666 3...,0 0.105835 1 0.105835 2 0.235348 3...,0 0.065935 1 0.065935 2 0.349722 3...,0 0.026290 1 0.026290 2 -0.083426 3...,0 0.100925 1 0.100925 2 -0.155063 3...,0 0.022681 1 0.022681 2 0.365772 3...,2,5
6,0 -0.186176 1 -0.186176 2 -0.344252 3...,0 0.108095 1 0.108095 2 0.230886 3...,0 0.722799 1 0.722799 2 0.490423 3...,0 0.059079 1 0.059079 2 -0.964946 3...,0 0.046803 1 0.046803 2 -0.048279 3...,0 0.273372 1 0.273372 2 -0.265879 3...,2,6
7,0 -0.412760 1 -0.412760 2 -0.343018 3...,0 0.240684 1 0.240684 2 0.347964 3...,0 0.058859 1 0.058859 2 0.370038 3...,0 -0.088471 1 -0.088471 2 0.071691 3...,0 -0.038039 1 -0.038039 2 -0.051204 3...,0 0.060550 1 0.060550 2 0.220356 3...,2,7
8,0 -0.409342 1 -0.409342 2 -0.403164 3...,0 0.240113 1 0.240113 2 0.114332 3...,0 0.333808 1 0.333808 2 0.359569 3...,0 0.177624 1 0.177624 2 -0.026676 3...,0 0.054116 1 0.054116 2 -0.035114 3...,0 0.005262 1 0.005262 2 0.001475 3...,2,8
9,0 -0.418559 1 -0.418559 2 -0.027940 3...,0 0.399952 1 0.399952 2 -0.386232 3...,0 0.315636 1 0.315636 2 0.229801 3...,0 -0.040548 1 -0.040548 2 -1.727920 3...,0 0.136032 1 0.136032 2 -0.001470 3...,0 0.054491 1 0.054491 2 -0.537776 3...,2,9


In [8]:
from tsfm_public.toolkit.time_series_classification_preprocessor import nest_transform, unnest_transform


# Round trip between the two layouts:
#   u -> unnested frame (one row per time step)
#   n -> nested again (one series per cell)
u = unnest_transform(
    df,
    columns=input_columns,
)
n = nest_transform(
    u,
    columns=input_columns,
)

In [9]:
# Show the original frame again after the unnest/nest calls
# (presumably to confirm the transforms left `df` untouched — TODO confirm intent).
df.head()

Unnamed: 0,dim_0,dim_1,dim_2,dim_3,dim_4,dim_5,class_vals,id
0,0 0.079106 1 0.079106 2 -0.903497 3...,0 0.394032 1 0.394032 2 -3.666397 3...,0 0.551444 1 0.551444 2 -0.282844 3...,0 0.351565 1 0.351565 2 -0.095881 3...,0 0.023970 1 0.023970 2 -0.319605 3...,0 0.633883 1 0.633883 2 0.972131 3...,standing,0
1,0 0.377751 1 0.377751 2 2.952965 3...,0 -0.610850 1 -0.610850 2 0.970717 3...,0 -0.147376 1 -0.147376 2 -5.962515 3...,0 -0.103872 1 -0.103872 2 -7.593275 3...,0 -0.109198 1 -0.109198 2 -0.697804 3...,0 -0.037287 1 -0.037287 2 -2.865789 3...,standing,1
2,0 -0.813905 1 -0.813905 2 -0.424628 3...,0 0.825666 1 0.825666 2 -1.305033 3...,0 0.032712 1 0.032712 2 0.826170 3...,0 0.021307 1 0.021307 2 -0.372872 3...,0 0.122515 1 0.122515 2 -0.045277 3...,0 0.775041 1 0.775041 2 0.383526 3...,standing,2
3,0 0.289855 1 0.289855 2 -0.669185 3...,0 0.284130 1 0.284130 2 -0.210466 3...,0 0.213680 1 0.213680 2 0.252267 3...,0 -0.314278 1 -0.314278 2 0.018644 3...,0 0.074574 1 0.074574 2 0.007990 3...,0 -0.079901 1 -0.079901 2 0.237040 3...,standing,3
4,0 -0.123238 1 -0.123238 2 -0.249547 3...,0 0.379341 1 0.379341 2 0.541501 3...,0 -0.286006 1 -0.286006 2 0.208420 3...,0 -0.098545 1 -0.098545 2 -0.023970 3...,0 0.058594 1 0.058594 2 0.175783 3...,0 -0.074574 1 -0.074574 2 0.114525 3...,standing,4


In [10]:
# Unnested view: one row per time step, with __nested_series_id identifying the source series.
u.head()

Unnamed: 0,__nested_series_id,dim_0,dim_1,dim_2,dim_3,dim_4,dim_5
0,0,0.079106,0.394032,0.551444,0.351565,0.02397,0.633883
1,0,0.079106,0.394032,0.551444,0.351565,0.02397,0.633883
2,0,-0.903497,-3.666397,-0.282844,-0.095881,-0.319605,0.972131
3,0,1.116125,-0.656101,0.333118,1.624657,-0.569962,1.209171
4,0,1.6382,1.405135,0.393875,1.187864,-0.271664,1.739182


In [12]:
# First time steps of series 2 in the unnested frame.
# The ids are compared as strings so the filter matches whether
# __nested_series_id is stored as integers or as strings.
u[u["__nested_series_id"].astype(str) == "2"].head()

Unnamed: 0,__nested_series_id,dim_0,dim_1,dim_2,dim_3,dim_4,dim_5
200,2,-0.813905,0.825666,0.032712,0.021307,0.122515,0.775041
201,2,-0.813905,0.825666,0.032712,0.021307,0.122515,0.775041
202,2,-0.424628,-1.305033,0.82617,-0.372872,-0.045277,0.383526
203,2,0.316895,-0.507693,0.218569,0.02397,-0.130505,0.588605
204,2,0.22858,0.028821,0.586313,0.066584,-0.263674,0.817655


In [13]:
# Re-nested view: the input columns again hold one series per cell.
n.head()

Unnamed: 0,dim_0,dim_1,dim_2,dim_3,dim_4,dim_5
0,0 0.079106 1 0.079106 2 -0.903497 3...,0 0.394032 1 0.394032 2 -3.666397 3...,0 0.551444 1 0.551444 2 -0.282844 3...,0 0.351565 1 0.351565 2 -0.095881 3...,0 0.023970 1 0.023970 2 -0.319605 3...,0 0.633883 1 0.633883 2 0.972131 3...
1,0 0.377751 1 0.377751 2 2.952965 3...,0 -0.610850 1 -0.610850 2 0.970717 3...,0 -0.147376 1 -0.147376 2 -5.962515 3...,0 -0.103872 1 -0.103872 2 -7.593275 3...,0 -0.109198 1 -0.109198 2 -0.697804 3...,0 -0.037287 1 -0.037287 2 -2.865789 3...
2,0 -0.813905 1 -0.813905 2 -0.424628 3...,0 0.825666 1 0.825666 2 -1.305033 3...,0 0.032712 1 0.032712 2 0.826170 3...,0 0.021307 1 0.021307 2 -0.372872 3...,0 0.122515 1 0.122515 2 -0.045277 3...,0 0.775041 1 0.775041 2 0.383526 3...
3,0 0.289855 1 0.289855 2 -0.669185 3...,0 0.284130 1 0.284130 2 -0.210466 3...,0 0.213680 1 0.213680 2 0.252267 3...,0 -0.314278 1 -0.314278 2 0.018644 3...,0 0.074574 1 0.074574 2 0.007990 3...,0 -0.079901 1 -0.079901 2 0.237040 3...
4,0 -0.123238 1 -0.123238 2 -0.249547 3...,0 0.379341 1 0.379341 2 0.541501 3...,0 -0.286006 1 -0.286006 2 0.208420 3...,0 -0.098545 1 -0.098545 2 -0.023970 3...,0 0.058594 1 0.058594 2 0.175783 3...,0 -0.074574 1 -0.074574 2 0.114525 3...


In [9]:
# Undo the scaling by hand for one series: x_scaled * scale_ + mean_ for channel dim_0 of row `i`.
i = 2
df_prep["dim_0"].iloc[i] * tsp.scaler_dict["0"].scale_[0] + tsp.scaler_dict["0"].mean_[0]

200   -0.813905
201   -0.813905
202   -0.424628
203    0.316895
204    0.228580
         ...   
295   -0.255364
296   -0.066292
297   -0.206440
298   -0.544255
299   -0.544255
Name: dim_0, Length: 100, dtype: float64

In [10]:
# Raw (unscaled) dim_0 series stored in row 2 of the original frame.
df["dim_0"].iloc[2]

0    -0.813905
1    -0.813905
2    -0.424628
3     0.316895
4     0.228580
        ...   
95   -0.255364
96   -0.066292
97   -0.206440
98   -0.544255
99   -0.544255
Length: 100, dtype: float64

In [71]:
# Raw dim_0 series of row `i` in the original frame.
df["dim_0"].iloc[i]

0    -0.813905
1    -0.813905
2    -0.424628
3     0.316895
4     0.228580
        ...   
95   -0.255364
96   -0.066292
97   -0.206440
98   -0.544255
99   -0.544255
Length: 100, dtype: float64

In [19]:
# Check that the scaling is invertible for every channel and every series:
# x_scaled * scale_ + mean_ must reproduce the raw values.
# The row counter is named `row` so it does not overwrite the notebook-level `i`.
scaler = tsp.scaler_dict["0"]
for dim, column in enumerate(input_columns):
    print(f"DIM {dim}")
    for row in range(df_prep.shape[0]):
        restored = df_prep[column].iloc[row] * scaler.scale_[dim] + scaler.mean_[dim]
        # Compare raw arrays: the inner index of the preprocessed series differs from the raw one.
        error = np.mean(np.abs(df[column].iloc[row].values - restored.values))
        assert error < 1e-6, f"{column}, row {row}: mean absolute error {error}"

DIM 0
DIM 1
DIM 2
DIM 3
DIM 4
DIM 5


In [20]:
# Map the encoded labels back to class names and confirm they match the raw labels.
df_prep_inv = tsp.inverse_transform_labels(df_prep)
all(df_prep_inv["class_vals"] == df["class_vals"])

True

In [None]:
# full_series mode is enabled in this case because the data is in nested format.
# Each series is interpolated to fit the requested context_length
# (here: series of 100 time points are resampled to context_length=512).

# TODO (Wes): consider removing the "full_series" option and auto-detecting
# whether the input is a nested series.

dset = ClassificationDFDataset(
    df_prep,
    id_columns=[],
    timestamp_column=None,
    input_columns=input_columns,
    label_column=label_column,
    context_length=512,
    static_categorical_columns=[],
    stride=1,
    enable_padding=False,
    full_series=True,
)

In [8]:
# First sample of the dataset built from the nested (preprocessed) frame.
dset[0]

{'past_values': tensor([[-3.4977e-01,  2.4992e-01,  4.4497e-01,  1.5745e-01,  2.6323e-02,
           1.9612e-01],
         [-3.4977e-01,  2.4992e-01,  4.4497e-01,  1.5745e-01,  2.6323e-02,
           1.9612e-01],
         [-4.1272e-01, -2.0888e-02,  3.3837e-01,  6.1444e-02, -5.9181e-02,
           2.3970e-01],
         [-4.2177e-01, -2.4388e-01,  2.5043e-01,  1.3652e-01, -1.9460e-01,
           3.0810e-01],
         [-2.0198e-01,  1.0009e-01,  3.8367e-01,  7.5703e-01, -2.9732e-01,
           3.6207e-01],
         [-1.4431e-01,  3.3711e-01,  3.9706e-01,  5.9545e-01, -1.6932e-01,
           4.7982e-01],
         [-1.8120e-01,  4.6812e-01,  3.4134e-01,  5.0313e-01, -6.5010e-02,
           4.7102e-01],
         [-2.6859e-01,  5.7314e-01,  2.6326e-01,  3.5357e-01,  2.8357e-02,
           3.3285e-01],
         [-3.5686e-01,  6.6537e-01,  1.5079e-01,  1.0895e-01,  8.8812e-02,
           1.0936e-01],
         [-3.5672e-01,  6.3914e-01, -1.2851e-01, -1.3243e-01,  1.3387e-02,
          -4.8619e-

## Case 2: Data is in canonical format
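We first convert the original nested data into canonical format: one row per time step, with an identifier column (`__nested_series_id`) linking each row to its source series.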

In [9]:
from tsfm_public.toolkit.time_series_classification_preprocessor import unnest_transform

In [20]:
# Canonical layout: one row per time step, keyed by __nested_series_id.
df2 = unnest_transform(df, columns=input_columns)

# One label per original series; the row position in `df` is the join key.
labels = df[label_column].reset_index()
labels["index"] = labels["index"].astype(str)

# Cast the left key to str as well, so the merge works whether the ids are stored
# as integers or as strings (pandas refuses to merge int keys with object keys).
df2["__nested_series_id"] = df2["__nested_series_id"].astype(str)
df2 = df2.merge(labels, left_on="__nested_series_id", right_on="index")

In [21]:
# Preview the canonical-format frame: one row per time step plus the merged label columns.
df2.head()

Unnamed: 0,__nested_series_id,dim_0,dim_1,dim_2,dim_3,dim_4,dim_5,index,class_vals
0,0,0.079106,0.394032,0.551444,0.351565,0.02397,0.633883,0,standing
1,0,0.079106,0.394032,0.551444,0.351565,0.02397,0.633883,0,standing
2,0,-0.903497,-3.666397,-0.282844,-0.095881,-0.319605,0.972131,0,standing
3,0,1.116125,-0.656101,0.333118,1.624657,-0.569962,1.209171,0,standing
4,0,1.6382,1.405135,0.393875,1.187864,-0.271664,1.739182,0,standing


In [22]:
# Same preprocessor configuration as for the nested case, trained on the canonical-format frame.
tsp2 = TimeSeriesClassificationPreprocessor(
    scaling=True,
    label_column=label_column,
    input_columns=input_columns,
)

# Fit on the data; as the last expression this also displays the trained configuration.
tsp2.train(df2)

TimeSeriesClassificationPreprocessor {
  "_is_nested": false,
  "categorical_encoder": null,
  "context_length": 64,
  "encode_categorical": true,
  "feature_extractor_type": "TimeSeriesClassificationPreprocessor",
  "freq": null,
  "id_columns": [],
  "input_columns": [
    "dim_0",
    "dim_1",
    "dim_2",
    "dim_3",
    "dim_4",
    "dim_5"
  ],
  "label_column": "class_vals",
  "label_encoder": {
    "classes_": [
      "badminton",
      "running",
      "standing",
      "walking"
    ]
  },
  "processor_class": "TimeSeriesClassificationPreprocessor",
  "scale_categorical_columns": true,
  "scaler_dict": {
    "0": {
      "copy": true,
      "feature_names_in_": [
        "dim_0",
        "dim_1",
        "dim_2",
        "dim_3",
        "dim_4",
        "dim_5"
      ],
      "mean_": [
        2.552759629,
        -1.3039367667500004,
        -1.02658049575,
        0.01905110399999999,
        -0.023957696500000004,
        -0.055789663749999996
      ],
      "n_features

In [24]:
# Apply the trained preprocessor to the canonical-format frame and preview the result.
df2_prep = tsp2.preprocess(df2)
df2_prep.head(n=5)

Unnamed: 0,__nested_series_id,dim_0,dim_1,dim_2,dim_3,dim_4,dim_5,index,class_vals
0,0,-0.349766,0.249919,0.444969,0.157446,0.026323,0.19612,0,2
1,0,-0.349766,0.249919,0.444969,0.157446,0.026323,0.19612,0,2
2,0,-0.488703,-0.347723,0.209717,-0.054421,-0.162377,0.292306,0,2
3,0,-0.203135,0.095353,0.383405,0.760259,-0.299879,0.359713,0,2
4,0,-0.129316,0.39874,0.400537,0.553436,-0.136046,0.51043,0,2


In [None]:
# Canonical-format input: full_series mode is not available here,
# and the data is not interpolated to fit the requested context_length.

dset2 = ClassificationDFDataset(
    df2_prep,
    input_columns=input_columns,
    label_column=label_column,
    id_columns=[],
    static_categorical_columns=[],
    timestamp_column=None,
    context_length=128,
    stride=1,
    enable_padding=False,
    full_series=False,
)

In [27]:
# First sample of the dataset built from the canonical-format frame.
dset2[0]

{'past_values': tensor([[-3.4977e-01,  2.4992e-01,  4.4497e-01,  1.5745e-01,  2.6323e-02,
           1.9612e-01],
         [-3.4977e-01,  2.4992e-01,  4.4497e-01,  1.5745e-01,  2.6323e-02,
           1.9612e-01],
         [-4.8870e-01, -3.4772e-01,  2.0972e-01, -5.4421e-02, -1.6238e-01,
           2.9231e-01],
         [-2.0314e-01,  9.5353e-02,  3.8341e-01,  7.6026e-01, -2.9988e-01,
           3.5971e-01],
         [-1.2932e-01,  3.9874e-01,  4.0054e-01,  5.5344e-01, -1.3605e-01,
           5.1043e-01],
         [-2.1907e-01,  5.1875e-01,  2.9815e-01,  4.6642e-01, -1.3172e-02,
           4.4227e-01],
         [-3.5688e-01,  6.7009e-01,  2.0106e-01,  1.5240e-01,  1.0239e-01,
           1.3780e-01],
         [-3.5670e-01,  6.3652e-01, -1.5644e-01, -1.5657e-01,  5.8441e-03,
          -6.4417e-02],
         [-3.7799e-01,  4.7998e-01, -5.5974e-03, -1.7423e-01, -6.5832e-02,
          -2.4770e-01],
         [-3.7799e-01,  4.7998e-01, -5.5974e-03, -1.7423e-01, -6.5832e-02,
          -2.4770e-

In [29]:
# Fields available in a sample from the canonical-format dataset.
dset2[0].keys()

dict_keys(['past_values', 'target_values', 'past_observed_mask', 'id'])

In [30]:
# Fields available in a sample from the nested-format (full_series) dataset, for comparison.
dset[0].keys()

dict_keys(['past_values', 'target_values', 'id'])