# Tsfresh (Time Series Feature Extractor)

This notebook shows a quick way to obtain strong baselines by using a library (tsfresh) that automatically extracts time-series features.


From time-series data like:
<img src="images/introduction_ts_exa.png" width="400" height="200">

Extracts features like:
<img src="images/introduction_ts_exa_features.png" width="400" height="200">

- Relatively simple to use
- Extracts 700+ features with a few lines of code (statistical, trends, wavelets, Fourier transformations...)
- Feature pruning (if needed) with the Benjamini-Yekutieli procedure.

#### Important but not covered here
Domain specific features, like HRV:
 - PyHRV: https://pyhrv.readthedocs.io/en/latest/
 - HRV-Analysis https://aura-healthcare.github.io/hrvanalysis/

In [1]:
import pandas as pd
import numpy as np
from glob import glob
import tsfresh
import os
from tsfresh.feature_extraction import extract_features
from tsfresh.feature_extraction import MinimalFCParameters, ComprehensiveFCParameters, EfficientFCParameters
from tsfresh.utilities.dataframe_functions import make_forecasting_frame, roll_time_series


  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,


In [2]:
# Load the sleep dataset and drop incomplete rows in one step.
# The intent is to remove missing HR readings; note that dropna() discards a
# row as soon as *any* of its columns is NaN.
df = pd.read_csv("./datasets/tutorial_sleep_training_data.csv.gz").dropna()

print("Original Dataframe has %d rows" % len(df))

Original Dataframe has 102859 rows


In [3]:
# Small subset with only participants 1 and 16, used for the quick experiments below.
df_small = df.loc[df["pid"].isin([1, 16])].dropna()
df_small

Unnamed: 0,time,act,sleep_phase,hr,pid
38399,29,1.0,0.0,66.0,1
38429,59,0.0,0.0,67.0,1
38459,89,0.0,1.0,66.0,1
38489,119,1.0,1.0,67.0,1
38519,149,1.0,1.0,64.0,1
...,...,...,...,...,...
499958,15869,4.0,1.0,95.0,16
499988,15899,22.0,0.0,61.0,16
500018,15929,15.0,0.0,73.0,16
500048,15959,16.0,1.0,81.0,16


In [4]:
# Extract features from the activity signal only. No feature settings are
# passed, so tsfresh falls back to its default feature set; the result has
# one row per pid.
df_extracted_features = tsfresh.extract_features(
    df_small[["time", "pid", "act"]],
    column_sort="time",
    column_id="pid",
    show_warnings=False,
)

Feature Extraction: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:09<00:00,  4.99s/it]
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,


In [5]:
# Display the extracted features: one row per pid, one column per feature.
df_extracted_features

Unnamed: 0,act__variance_larger_than_standard_deviation,act__has_duplicate_max,act__has_duplicate_min,act__has_duplicate,act__sum_values,act__abs_energy,act__mean_abs_change,act__mean_change,act__mean_second_derivative_central,act__median,...,act__fourier_entropy__bins_2,act__fourier_entropy__bins_3,act__fourier_entropy__bins_5,act__fourier_entropy__bins_10,act__fourier_entropy__bins_100,act__permutation_entropy__dimension_3__tau_1,act__permutation_entropy__dimension_4__tau_1,act__permutation_entropy__dimension_5__tau_1,act__permutation_entropy__dimension_6__tau_1,act__permutation_entropy__dimension_7__tau_1
1,1.0,0.0,1.0,1.0,4008.0,707198.0,6.735931,0.155844,0.055255,0.0,...,0.417974,0.779345,1.249572,1.922716,3.85408,1.351787,2.226125,3.107142,3.938264,4.690248
16,1.0,0.0,1.0,1.0,4629.0,500191.0,12.530075,0.0,-0.015066,1.0,...,0.326783,0.61795,1.123223,1.747968,3.723715,1.571942,2.676353,3.842873,4.790484,5.456311


In [6]:
# List the names of the extracted features (the columns of the feature frame).
df_extracted_features.columns

Index(['act__variance_larger_than_standard_deviation',
       'act__has_duplicate_max', 'act__has_duplicate_min',
       'act__has_duplicate', 'act__sum_values', 'act__abs_energy',
       'act__mean_abs_change', 'act__mean_change',
       'act__mean_second_derivative_central', 'act__median',
       ...
       'act__fourier_entropy__bins_2', 'act__fourier_entropy__bins_3',
       'act__fourier_entropy__bins_5', 'act__fourier_entropy__bins_10',
       'act__fourier_entropy__bins_100',
       'act__permutation_entropy__dimension_3__tau_1',
       'act__permutation_entropy__dimension_4__tau_1',
       'act__permutation_entropy__dimension_5__tau_1',
       'act__permutation_entropy__dimension_6__tau_1',
       'act__permutation_entropy__dimension_7__tau_1'],
      dtype='object', length=779)

In [7]:
# Roll each participant's series into overlapping windows, one window ending
# at every time step.
# The column subset is copied first: roll_time_series adds a "sort" column to
# the frame it receives, and doing that on an uncopied slice raises pandas'
# SettingWithCopyWarning.
df_rolled = roll_time_series(df_small[["time", "hr", "act", "pid", "sleep_phase"]].copy(),
                             column_id="pid",
                             min_timeshift=0,  # shortest window holds min_timeshift + 1 rows
                             max_timeshift=2,  # longest window holds max_timeshift + 1 rows
                             n_jobs=3, show_warnings=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["sort"] = range(df.shape[0])
Rolling: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:01<00:00, 13.80it/s]
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,


In [8]:
# Inspect the first rolled rows: the new "id" column is a (pid, window index)
# tuple, and the same original row is repeated in every window that contains it.
df_rolled.head(15)

Unnamed: 0,time,hr,act,pid,sleep_phase,sort,id
372,29,66.0,1.0,1,0.0,0,"(1, 0)"
374,29,66.0,1.0,1,0.0,0,"(1, 1)"
375,59,67.0,0.0,1,0.0,1,"(1, 1)"
378,29,66.0,1.0,1,0.0,0,"(1, 2)"
379,59,67.0,0.0,1,0.0,1,"(1, 2)"
380,89,66.0,0.0,1,1.0,2,"(1, 2)"
384,59,67.0,0.0,1,0.0,1,"(1, 3)"
385,89,66.0,0.0,1,1.0,2,"(1, 3)"
386,119,67.0,1.0,1,1.0,3,"(1, 3)"
390,89,66.0,0.0,1,1.0,2,"(1, 4)"


In [9]:
# What window size should we use? 11 samples = 5 minutes and 30 seconds
# (consecutive rows are 30 units apart in the "time" column).
# max_timeshift=10 therefore gives windows of at most 11 rows.
# The column subset is copied first: roll_time_series adds a "sort" column to
# the frame it receives, and doing that on an uncopied slice raises pandas'
# SettingWithCopyWarning.
df_rolled = roll_time_series(df_small[["time", "hr", "act", "pid", "sleep_phase"]].copy(),
                             column_id="pid",
                             min_timeshift=0,
                             max_timeshift=10,
                             n_jobs=3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["sort"] = range(df.shape[0])
Rolling: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:01<00:00, 13.78it/s]
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,


In [10]:
# Inspect the last rolled rows (the final windows of the last participant).
df_rolled.tail(15)

Unnamed: 0,time,hr,act,pid,sleep_phase,sort,id
10226,15869,95.0,4.0,16,1.0,1453,"(16, 531)"
10227,15899,61.0,22.0,16,0.0,1454,"(16, 531)"
10228,15929,73.0,15.0,16,0.0,1455,"(16, 531)"
10229,15959,81.0,16.0,16,1.0,1456,"(16, 531)"
10241,15689,50.0,18.0,16,2.0,1447,"(16, 532)"
10242,15719,76.0,0.0,16,1.0,1448,"(16, 532)"
10243,15749,80.0,6.0,16,1.0,1449,"(16, 532)"
10244,15779,55.0,10.0,16,1.0,1450,"(16, 532)"
10245,15809,54.0,8.0,16,0.0,1451,"(16, 532)"
10246,15839,74.0,0.0,16,1.0,1452,"(16, 532)"


In [11]:
# Compute features for every rolled window (one window per "id").
# To keep this quick, only the minimal parameter set is used
# (min, max, mean, std, ...).
df_extracted_features = tsfresh.extract_features(
    df_rolled[["id", "time", "act", "hr"]],
    default_fc_parameters=MinimalFCParameters(),
    column_sort="time",
    column_id="id",
    n_jobs=3,
)

Feature Extraction: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:00<00:00, 15.87it/s]
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,


In [12]:
# Last few windows: the index has two levels, the pid and the window index.
df_extracted_features.tail()

Unnamed: 0,Unnamed: 1,act__sum_values,act__median,act__mean,act__length,act__standard_deviation,act__variance,act__maximum,act__minimum,hr__sum_values,hr__median,hr__mean,hr__length,hr__standard_deviation,hr__variance,hr__maximum,hr__minimum
16,528,131.0,8.0,11.909091,11.0,18.461038,340.809917,68.0,0.0,754.0,74.0,68.545455,11.0,15.26948,233.157025,95.0,50.0
16,529,144.0,8.0,13.090909,11.0,18.6521,347.900826,68.0,0.0,764.0,74.0,69.454545,11.0,14.474829,209.520661,95.0,50.0
16,530,159.0,8.0,14.454545,11.0,18.187726,330.793388,68.0,0.0,755.0,73.0,68.636364,11.0,13.988779,195.68595,95.0,50.0
16,531,107.0,8.0,9.727273,11.0,6.929396,48.016529,22.0,0.0,753.0,73.0,68.454545,11.0,13.812798,190.793388,95.0,50.0
16,532,99.0,8.0,9.0,11.0,7.471157,55.818182,22.0,0.0,767.0,73.0,69.727273,11.0,13.046009,170.198347,95.0,50.0


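Note that the index of `df_extracted_features` has two levels: the "pid" and the position of the window within that participant's series. Below, this second level is renamed to "time"; it is a 0-based row counter (0, 1, 2, ...), not the raw `time` column of the original data.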

We can get this index back to our dataframe with `df.reset_index()`

In [13]:
# Move the two index levels into regular columns and give them readable names.
df_features = (
    df_extracted_features
    .reset_index()
    .rename(columns={"level_0": "pid", "level_1": "time"})
)
df_features

Unnamed: 0,pid,time,act__sum_values,act__median,act__mean,act__length,act__standard_deviation,act__variance,act__maximum,act__minimum,hr__sum_values,hr__median,hr__mean,hr__length,hr__standard_deviation,hr__variance,hr__maximum,hr__minimum
0,1,0,1.0,1.0,1.000000,1.0,0.000000,0.000000,1.0,1.0,66.0,66.0,66.000000,1.0,0.000000,0.000000,66.0,66.0
1,1,1,1.0,0.5,0.500000,2.0,0.500000,0.250000,1.0,0.0,133.0,66.5,66.500000,2.0,0.500000,0.250000,67.0,66.0
2,1,2,1.0,0.0,0.333333,3.0,0.471405,0.222222,1.0,0.0,199.0,66.0,66.333333,3.0,0.471405,0.222222,67.0,66.0
3,1,3,2.0,0.5,0.500000,4.0,0.500000,0.250000,1.0,0.0,266.0,66.5,66.500000,4.0,0.500000,0.250000,67.0,66.0
4,1,4,3.0,1.0,0.600000,5.0,0.489898,0.240000,1.0,0.0,330.0,66.0,66.000000,5.0,1.095445,1.200000,67.0,64.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1453,16,528,131.0,8.0,11.909091,11.0,18.461038,340.809917,68.0,0.0,754.0,74.0,68.545455,11.0,15.269480,233.157025,95.0,50.0
1454,16,529,144.0,8.0,13.090909,11.0,18.652100,347.900826,68.0,0.0,764.0,74.0,69.454545,11.0,14.474829,209.520661,95.0,50.0
1455,16,530,159.0,8.0,14.454545,11.0,18.187726,330.793388,68.0,0.0,755.0,73.0,68.636364,11.0,13.988779,195.685950,95.0,50.0
1456,16,531,107.0,8.0,9.727273,11.0,6.929396,48.016529,22.0,0.0,753.0,73.0,68.454545,11.0,13.812798,190.793388,95.0,50.0


In [13]:
# Feature rows that belong to participant 16.
df_features.loc[df_features["pid"] == 16]

Unnamed: 0,pid,time,act__sum_values,act__median,act__mean,act__length,act__standard_deviation,act__variance,act__maximum,act__minimum,hr__sum_values,hr__median,hr__mean,hr__length,hr__standard_deviation,hr__variance,hr__maximum,hr__minimum
925,16,0,0.0,0.0,0.000000,1.0,0.000000,0.000000,0.0,0.0,48.0,48.0,48.000000,1.0,0.000000,0.000000,48.0,48.0
926,16,1,0.0,0.0,0.000000,2.0,0.000000,0.000000,0.0,0.0,100.0,50.0,50.000000,2.0,2.000000,4.000000,52.0,48.0
927,16,2,0.0,0.0,0.000000,3.0,0.000000,0.000000,0.0,0.0,156.0,52.0,52.000000,3.0,3.265986,10.666667,56.0,48.0
928,16,3,1.0,0.0,0.250000,4.0,0.433013,0.187500,1.0,0.0,206.0,51.0,51.500000,4.0,2.958040,8.750000,56.0,48.0
929,16,4,1.0,0.0,0.200000,5.0,0.400000,0.160000,1.0,0.0,255.0,50.0,51.000000,5.0,2.828427,8.000000,56.0,48.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1453,16,528,131.0,8.0,11.909091,11.0,18.461038,340.809917,68.0,0.0,754.0,74.0,68.545455,11.0,15.269480,233.157025,95.0,50.0
1454,16,529,144.0,8.0,13.090909,11.0,18.652100,347.900826,68.0,0.0,764.0,74.0,69.454545,11.0,14.474829,209.520661,95.0,50.0
1455,16,530,159.0,8.0,14.454545,11.0,18.187726,330.793388,68.0,0.0,755.0,73.0,68.636364,11.0,13.988779,195.685950,95.0,50.0
1456,16,531,107.0,8.0,9.727273,11.0,6.929396,48.016529,22.0,0.0,753.0,73.0,68.454545,11.0,13.812798,190.793388,95.0,50.0


In [14]:
# Do we have the same number of rows? Compare with the labels of participant 16.
df_small.loc[df_small["pid"] == 16, ["sleep_phase"]]

Unnamed: 0,sleep_phase
484118,2.0
484148,2.0
484178,2.0
484208,2.0
484238,2.0
...,...
499958,1.0
499988,0.0
500018,0.0
500048,1.0


# Let's do it for the whole dataset

Here I am dropping NAs and simplifying the use of HR

In [15]:
# Make sure the full dataset has no missing values before rolling it.
df = df.dropna()
# Window length, in samples, used by the rolling cells below.
WINSIZE = 11

In [18]:
# Roll the whole dataset before reporting its size. Without this step
# `df_rolled` would still hold the windows of the two-participant subset built
# earlier, so on a fresh "Restart & Run All" the message below would not
# describe the full dataset.
# The column subset is copied because roll_time_series adds a "sort" column to
# the frame it receives.
df_rolled = roll_time_series(df[["time", "hr", "act", "pid", "sleep_phase"]].copy(),
                             column_id="pid",
                             min_timeshift=0,
                             max_timeshift=WINSIZE-1,
                             n_jobs=5)

print(f"Before, our dataframe had {df.shape[0]}. Now the new dataframe has {df_rolled.shape[0]} rows.")
df_rolled.head()

Before, our dataframe had 102859. Now the new dataframe has 1125949 rows.


Unnamed: 0,time,hr,act,pid,sleep_phase,sort,id
0,29,71.0,2.0,0,0.0,0,"(0, 0)"
100,29,71.0,2.0,0,0.0,0,"(0, 1)"
101,59,76.0,0.0,0,0.0,1,"(0, 1)"
300,29,71.0,2.0,0,0.0,0,"(0, 2)"
301,59,76.0,0.0,0,0.0,1,"(0, 2)"


<hr>

# The first feature set - TSFresh Features 

Now let's get more advanced features using TSFresh

In [19]:
%%time 
# Let's start experimenting with a few subjects, before moving to the whole dataset:

# df_small has only 3 subjects
df_small = df[df["pid"].isin(df["pid"].unique()[:3])].dropna()



# Roll the dataset
df_rolled = roll_time_series(df_small[["time", "hr", "act", "pid", "sleep_phase"]].copy(), 
                             column_id="pid", 
                             min_timeshift=0,
                             max_timeshift=WINSIZE-1,
                             n_jobs=5)


# Get the "subset" of features that are "efficient" to calculate
settings = EfficientFCParameters()
del settings["friedrich_coefficients"]
del settings["max_langevin_fixed_point"]

df_extracted_features = tsfresh.extract_features(df_rolled[["id", "sort", "act", "hr"]], 
                                                 column_id="id",
                                                 column_sort="sort",
                                                 default_fc_parameters=settings,
                                                 n_jobs=5)


Rolling: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:01<00:00, 21.55it/s]
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
Feature Extraction: 100%|████████████████████████████████████████████████████████████

CPU times: user 4.3 s, sys: 962 ms, total: 5.26 s
Wall time: 20.3 s


In [20]:
# Last few windows of the three-subject experiment (act__* and hr__* features).
df_extracted_features.tail()

Unnamed: 0,Unnamed: 1,act__variance_larger_than_standard_deviation,act__has_duplicate_max,act__has_duplicate_min,act__has_duplicate,act__sum_values,act__abs_energy,act__mean_abs_change,act__mean_change,act__mean_second_derivative_central,act__median,...,hr__fourier_entropy__bins_2,hr__fourier_entropy__bins_3,hr__fourier_entropy__bins_5,hr__fourier_entropy__bins_10,hr__fourier_entropy__bins_100,hr__permutation_entropy__dimension_3__tau_1,hr__permutation_entropy__dimension_4__tau_1,hr__permutation_entropy__dimension_5__tau_1,hr__permutation_entropy__dimension_6__tau_1,hr__permutation_entropy__dimension_7__tau_1
2,1070,1.0,0.0,1.0,1.0,255.0,60567.0,49.4,0.0,0.0,0.0,...,0.450561,0.867563,0.867563,1.242453,1.791759,1.676988,1.906155,1.94591,1.791759,1.609438
2,1071,1.0,0.0,1.0,1.0,255.0,60567.0,49.4,0.0,0.0,0.0,...,0.636514,0.867563,0.867563,0.867563,1.791759,1.676988,1.906155,1.94591,1.791759,1.609438
2,1072,1.0,0.0,1.0,1.0,255.0,60567.0,49.4,0.0,-0.388889,0.0,...,0.636514,0.636514,0.636514,0.636514,1.329661,1.464816,1.906155,1.94591,1.791759,1.609438
2,1073,1.0,0.0,1.0,1.0,256.0,60568.0,48.8,-0.6,-13.222222,0.0,...,0.450561,0.867563,0.867563,1.242453,1.791759,1.676988,2.079442,1.94591,1.791759,1.609438
2,1074,1.0,0.0,1.0,1.0,251.0,60523.0,25.0,-24.4,13.722222,0.0,...,0.450561,0.867563,1.011404,1.56071,1.791759,1.735126,2.079442,1.94591,1.791759,1.609438


In [21]:
%%time

# WARNING: This cell might take very long to run. 
# Last time, it took me 20 minutes (but it can take much longer if you dont break the df into pieces)

# If you dony want to run this cell, you can get the generated dataset from my dropbox:
# https://www.dropbox.com/scl/fo/5llpuwwtcuo22p9jnfxuo/h?dl=0&rlkey=nm0kqrfbk3z9s8qns8hjh4437


# Roll the dataset
df_rolled = roll_time_series(df[["time", "hr", "act", "pid", "sleep_phase"]], 
                             column_id="pid", 
                             min_timeshift=0,
                             max_timeshift=WINSIZE-1,
                             n_jobs=7)


settings = EfficientFCParameters()
del settings["friedrich_coefficients"]
del settings["max_langevin_fixed_point"]

start = 0
steps = 10

for start in range(0, df["pid"].unique().shape[0] + 1, steps):
    idxs = range(start, start+steps)
    
    
    df_rolled_small = df_rolled[df_rolled["pid"].isin(idxs)]
    
    df_extracted_features = tsfresh.extract_features(df_rolled_small[["id", "sort", "act", "hr"]], 
                                                     column_id="id",
                                                     column_sort="sort",
                                                     default_fc_parameters=settings,
                                                     n_jobs=7)
    
    df_features = df_extracted_features.reset_index().rename(columns={"level_0":"pid", "level_1": "time"})
    df_features.to_csv(f"datasets/df_tsfresh_features_{start}.tar.gz", index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["sort"] = range(df.shape[0])
Rolling: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 35/35 [00:06<00:00,  5.43it/s]
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period

Feature Extraction: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 35/35 [00:38<00:00,  1.10s/it]
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas i

Feature Extraction: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 35/35 [00:42<00:00,  1.20s/it]
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas i

Feature Extraction: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 35/35 [00:37<00:00,  1.07s/it]
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas i

  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
Feature Extraction: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 35/35 [00:38<00:00,  1.10s/it]
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas i

Feature Extraction: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 35/35 [00:38<00:00,  1.10s/it]
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas i

ZeroDivisionError: division by zero

In [22]:
# Merge all data: read every per-chunk feature file and stack them row-wise
dfs = [pd.read_csv(path) for path in glob("datasets/df_tsfresh_features_*.tar.gz")]

df_features = pd.concat(dfs)

In [23]:
# Order the rows by participant and window position, then renumber the index.
df_features = df_features.sort_values(["pid", "time"]).reset_index(drop=True)
df_features.head()

Unnamed: 0,pid,time,act__variance_larger_than_standard_deviation,act__has_duplicate_max,act__has_duplicate_min,act__has_duplicate,act__sum_values,act__abs_energy,act__mean_abs_change,act__mean_change,...,hr__fourier_entropy__bins_2,hr__fourier_entropy__bins_3,hr__fourier_entropy__bins_5,hr__fourier_entropy__bins_10,hr__fourier_entropy__bins_100,hr__permutation_entropy__dimension_3__tau_1,hr__permutation_entropy__dimension_4__tau_1,hr__permutation_entropy__dimension_5__tau_1,hr__permutation_entropy__dimension_6__tau_1,hr__permutation_entropy__dimension_7__tau_1
0,0,0,0.0,0.0,0.0,0.0,2.0,4.0,,,...,,,,,,,,,,
1,0,1,0.0,0.0,0.0,0.0,2.0,4.0,2.0,-2.0,...,-0.0,-0.0,-0.0,-0.0,-0.0,,,,,
2,0,2,0.0,0.0,0.0,0.0,3.0,5.0,1.5,-0.5,...,0.693147,0.693147,0.693147,0.693147,0.693147,-0.0,,,,
3,0,3,0.0,1.0,0.0,1.0,5.0,9.0,1.333333,0.0,...,0.636514,0.636514,0.636514,0.636514,0.636514,0.693147,-0.0,,,
4,0,4,1.0,0.0,0.0,1.0,92.0,7578.0,22.25,21.25,...,0.636514,0.636514,0.636514,0.636514,1.098612,1.098612,0.693147,-0.0,,


In [24]:
# Reset time col: replace it with the 0-based position of each row within its
# participant. The feature rows are numbered the same way (the first window of
# every pid has time == 0), so the counter must start at 0 as well. A 1-based
# counter would pair every row with the window ending one sample later and
# drop one row per participant in the merge.
df_nonan = df.dropna().copy()
df_nonan["time"] = df_nonan.groupby("pid").cumcount()


# Join on the explicit (pid, time) key so that no other shared column name can
# silently become part of the join.
df_merged = pd.merge(df_nonan, df_features, on=["pid", "time"])
df_merged.to_csv("datasets/df_tsfresh_features.tar.gz", index=False)

In [25]:
# Clean dataset folder by removing the temporary per-chunk files
# (anything matching the pattern that is not a regular file is left alone)
for file in filter(os.path.isfile, glob("datasets/df_tsfresh_features_*.tar.gz")):
    print("Removing...", file)
    os.remove(file)


Removing... datasets/df_tsfresh_features_60.tar.gz
Removing... datasets/df_tsfresh_features_70.tar.gz
Removing... datasets/df_tsfresh_features_50.tar.gz
Removing... datasets/df_tsfresh_features_40.tar.gz
Removing... datasets/df_tsfresh_features_10.tar.gz
Removing... datasets/df_tsfresh_features_80.tar.gz
Removing... datasets/df_tsfresh_features_90.tar.gz
Removing... datasets/df_tsfresh_features_0.tar.gz
Removing... datasets/df_tsfresh_features_20.tar.gz
Removing... datasets/df_tsfresh_features_30.tar.gz


## The second feature set from rolling windows: raw signals

In [26]:
# Roll the whole dataset into windows of at most WINSIZE rows.
# The column subset is copied first: roll_time_series adds a "sort" column to
# the frame it receives, and doing that on an uncopied slice raises pandas'
# SettingWithCopyWarning.
df_rolled = roll_time_series(df[["time", "hr", "act", "pid", "sleep_phase"]].copy(),
                             column_id="pid",
                             min_timeshift=0,
                             max_timeshift=WINSIZE-1,
                             n_jobs=5)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["sort"] = range(df.shape[0])
Rolling: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:08<00:00,  2.88it/s]
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period

In [27]:
def make_cols(l, size):
    """Left-pad a sequence with zeros so that it has exactly `size` entries.

    Parameters
    ----------
    l : sequence of numbers
        Values of one window, oldest first.
    size : int
        Target length; must not be smaller than ``len(l)``.

    Returns
    -------
    numpy.ndarray
        Array of length `size` whose last ``len(l)`` entries are the input
        values and whose leading entries are zeros.
    """
    values = np.array(l)
    missing = size - len(values)
    # Zeros go in front, so the newest value always sits at the last position.
    return np.pad(values, pad_width=(missing, 0), mode="constant")

def get_raw_win_features(df, winsize=WINSIZE):
    """Turn every rolled window into one row of raw, zero-padded signal values.

    Parameters
    ----------
    df : pandas.DataFrame
        Rolled frame with the columns "pid", "id", "time", "sort",
        "sleep_phase", "act" and "hr"; "id" identifies the window.
    winsize : int
        Number of columns created per signal; shorter windows are padded
        with zeros at the front.

    Returns
    -------
    pandas.DataFrame
        One row per (pid, time) pair holding "pid", "time", "id", "sort",
        "sleep_phase" plus the columns act_0..act_{winsize-1} and
        hr_0..hr_{winsize-1}.
    """

    def _spread(signal):
        # One padded array per window id ...
        padded = df[["pid", "id", signal]].groupby("id")[signal].apply(
            lambda window: make_cols(window, size=winsize)
        )
        # ... expanded into one column per position, named "<signal>_<position>".
        names = {pos: "%s_%d" % (signal, pos) for pos in range(winsize)}
        return padded.apply(pd.Series).rename(columns=names)

    df_new = pd.concat((_spread("act"), _spread("hr")), axis=1).reset_index()

    # For every (pid, time) pair keep the first rolled row in which it appears;
    # its "id" decides which window's raw values get attached to that row.
    keys = df[["pid", "id", "time", "sort", "sleep_phase"]]
    df_uniqueids_sorted = keys.groupby(["pid", "time"]).first().reset_index()

    return pd.merge(df_uniqueids_sorted, df_new, on="id")


In [28]:
%%time
df_features_raw_win = get_raw_win_features(df_rolled)

print(f"The number of rows in the df now ({df_features_raw_win.shape[0]}) and in the original df ({df.shape[0]}) "
      "should be the same!")

The number of rows in the df now (102859) and in the original df (102859) should be the same!
CPU times: user 11.1 s, sys: 172 ms, total: 11.3 s
Wall time: 11.2 s


In [29]:
# Result: one row per (pid, time) with the zero-padded raw act_* / hr_* columns
df_features_raw_win.head()

Unnamed: 0,pid,time,id,sort,sleep_phase,act_0,act_1,act_2,act_3,act_4,...,hr_1,hr_2,hr_3,hr_4,hr_5,hr_6,hr_7,hr_8,hr_9,hr_10
0,0,29,"(0, 0)",0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,71.0
1,0,59,"(0, 1)",1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,71.0,76.0
2,0,89,"(0, 2)",2,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,71.0,76.0,78.0
3,0,119,"(0, 3)",3,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,71.0,76.0,78.0,73.0
4,0,149,"(0, 4)",4,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,71.0,76.0,78.0,73.0,80.0


In [30]:
# The most recent signal is at the last index (i.e., act_10, hr_10),
# because shorter windows are padded with zeros at the front.
df_features_raw_win[f"act_{WINSIZE-1}"].head(20)

0      2.0
1      0.0
2      1.0
3      2.0
4     87.0
5      0.0
6      1.0
7      0.0
8      0.0
9      0.0
10     0.0
11     1.0
12     1.0
13    66.0
14     0.0
15     0.0
16     3.0
17     0.0
18     5.0
19    34.0
Name: act_10, dtype: float64

In [31]:
# Original rows, to compare their "act" values with the act_10 column shown above.
df.head(20)

Unnamed: 0,time,act,sleep_phase,hr,pid
29,29,2.0,0.0,71.0,0
59,59,0.0,0.0,76.0,0
89,89,1.0,0.0,78.0,0
119,119,2.0,0.0,73.0,0
149,149,87.0,0.0,80.0,0
179,179,0.0,0.0,75.0,0
209,209,1.0,0.0,77.0,0
239,239,0.0,0.0,78.0,0
269,269,0.0,0.0,78.0,0
299,299,0.0,0.0,78.0,0


In [32]:
# Persist the raw-window feature set as CSV (without the index column).
# NOTE(review): the ".tar.gz" suffix suggests a tar archive; how pandas maps this
# suffix to a compression scheme may depend on the pandas version — TODO confirm,
# and consider a ".csv.gz" name if a plain gzip-compressed CSV is intended.
df_features_raw_win.to_csv("./datasets/df_raw_features.tar.gz", index=False)

<b> Done with feature sets creation. Now we have two sets ready to be used by ML models </b>