# tsfresh

First prepare data to be in a format compatibile with tsfresh library.
Two approaches will be tested:
- Extracting features from raw signal (minute interval)
- Extracting features from aggregated signal (hour interval)
Result of this extraction will be a set of features for each patient.

In [1]:
import tsfresh as tsf

import os
from typing import Callable, Tuple, Union

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp

Q:\ProgramData\Miniconda3\lib\site-packages\numpy\.libs\libopenblas.NOIJJG62EMASZI6NYURL6JBKM4EVBGM7.gfortran-win_amd64.dll
Q:\ProgramData\Miniconda3\lib\site-packages\numpy\.libs\libopenblas.PYQHXLVVQ7VESDPUVUADXEVJOBGHJPAY.gfortran-win_amd64.dll


In [2]:
# Load data
data_dir = "./depresjon_data"
condition_dir = os.path.join(data_dir, "condition")
control_dir = os.path.join(data_dir, "control")

conditions = [pd.read_csv(os.path.join(condition_dir, filename)) for filename in os.listdir(condition_dir)]
controls = [pd.read_csv(os.path.join(control_dir, filename)) for filename in os.listdir(control_dir)]

In [3]:
# 1st part - raw signal (minute interval)

def get_raw_dataframe():
    f_frames = []

    for c_i, c in enumerate(conditions, start=1):
        c_copy = c.copy()
        c_copy['number'] = f'condition_{c_i}'
        f_frames.append(c_copy)

    for c_i, c in enumerate(controls, start=1):
        c_copy = c.copy()
        c_copy['number'] = f'control_{c_i}'
        f_frames.append(c_copy)

    f_df = pd.concat(f_frames)
    f_df.reset_index(inplace=True, drop=True)
    f_df.drop(inplace=True, columns='date')
    return f_df

In [4]:
raw_df = get_raw_dataframe()
raw_df

Unnamed: 0,timestamp,activity,number
0,2003-05-07 12:00:00,0,condition_1
1,2003-05-07 12:01:00,143,condition_1
2,2003-05-07 12:02:00,0,condition_1
3,2003-05-07 12:03:00,20,condition_1
4,2003-05-07 12:04:00,166,condition_1
...,...,...,...
1571701,2003-12-01 12:53:00,7,control_32
1571702,2003-12-01 12:54:00,7,control_32
1571703,2003-12-01 12:55:00,5,control_32
1571704,2003-12-01 12:56:00,5,control_32


In [5]:
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1571706 entries, 0 to 1571705
Data columns (total 3 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   timestamp  1571706 non-null  object
 1   activity   1571706 non-null  int64 
 2   number     1571706 non-null  object
dtypes: int64(1), object(2)
memory usage: 36.0+ MB


In [6]:
raw_df['timestamp']= pd.to_datetime(raw_df['timestamp'])
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1571706 entries, 0 to 1571705
Data columns (total 3 columns):
 #   Column     Non-Null Count    Dtype         
---  ------     --------------    -----         
 0   timestamp  1571706 non-null  datetime64[ns]
 1   activity   1571706 non-null  int64         
 2   number     1571706 non-null  object        
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 36.0+ MB


In [7]:
raw_df_grouped = list(raw_df.groupby('number'))

In [8]:
raw_df_grouped[0][1]

Unnamed: 0,timestamp,activity,number
0,2003-05-07 12:00:00,0,condition_1
1,2003-05-07 12:01:00,143,condition_1
2,2003-05-07 12:02:00,0,condition_1
3,2003-05-07 12:03:00,20,condition_1
4,2003-05-07 12:04:00,166,condition_1
...,...,...,...
23239,2003-05-23 15:19:00,0,condition_1
23240,2003-05-23 15:20:00,0,condition_1
23241,2003-05-23 15:21:00,0,condition_1
23242,2003-05-23 15:22:00,0,condition_1


In [14]:
from tsfresh.feature_extraction.settings import MinimalFCParameters, EfficientFCParameters
from tsfresh.utilities.dataframe_functions import impute

fc_parameters = EfficientFCParameters()

raw_df_features = []

for name, df in raw_df_grouped:
    df_features = tsf.extract_features(df, 
                                       column_id="number",
                                       column_sort="timestamp",
                                       column_value='activity',
                                       default_fc_parameters=fc_parameters)
    df_features['number'] = name
    raw_df_features.append(df_features)
    
raw_df_features = pd.concat(raw_df_features)
raw_df_features.reset_index(inplace=True, drop=True)

Feature Extraction: 100%|███████████████████████████████████████████████████████████████████████████| 1/1 [00:21<00:00, 21.75s/it]
Feature Extraction: 100%|███████████████████████████████████████████████████████████████████████████| 1/1 [00:17<00:00, 17.20s/it]
Feature Extraction: 100%|███████████████████████████████████████████████████████████████████████████| 1/1 [00:16<00:00, 16.76s/it]
Feature Extraction: 100%|███████████████████████████████████████████████████████████████████████████| 1/1 [01:00<00:00, 60.73s/it]
Feature Extraction: 100%|███████████████████████████████████████████████████████████████████████████| 1/1 [00:20<00:00, 20.87s/it]
Feature Extraction: 100%|███████████████████████████████████████████████████████████████████████████| 1/1 [00:15<00:00, 15.97s/it]
Feature Extraction: 100%|███████████████████████████████████████████████████████████████████████████| 1/1 [00:21<00:00, 21.96s/it]
Feature Extraction: 100%|██████████████████████████████████████████████████████████

In [32]:
raw_df_features['number'] = raw_df_features['number'].astype("string")
raw_df_features.sort_values(by='number', inplace=True)
print(raw_df_features.info())
raw_df_features

<class 'pandas.core.frame.DataFrame'>
Int64Index: 55 entries, 0 to 54
Columns: 782 entries, activity__variance_larger_than_standard_deviation to number
dtypes: float64(781), string(1)
memory usage: 336.4 KB
None


Unnamed: 0,activity__variance_larger_than_standard_deviation,activity__has_duplicate_max,activity__has_duplicate_min,activity__has_duplicate,activity__sum_values,activity__abs_energy,activity__mean_abs_change,activity__mean_change,activity__mean_second_derivative_central,activity__median,...,activity__permutation_entropy__dimension_6__tau_1,activity__permutation_entropy__dimension_7__tau_1,activity__query_similarity_count__query_None__threshold_0.0,"activity__matrix_profile__feature_""min""__threshold_0.98","activity__matrix_profile__feature_""max""__threshold_0.98","activity__matrix_profile__feature_""mean""__threshold_0.98","activity__matrix_profile__feature_""median""__threshold_0.98","activity__matrix_profile__feature_""25""__threshold_0.98","activity__matrix_profile__feature_""75""__threshold_0.98",number
0,1.0,0.0,1.0,1.0,3415660.0,2519061000.0,92.156133,0.022932,0.00839,9.0,...,4.790671,5.926574,,10.717293,35.042831,24.557072,24.854357,22.338944,27.377075,condition_1
1,1.0,1.0,1.0,1.0,1517859.0,503540400.0,62.87192,0.015272,0.007636,3.0,...,5.036623,6.248507,,2.646326,18.547237,12.3805,12.587729,11.473409,13.519137,condition_10
2,1.0,0.0,1.0,1.0,3338367.0,2273304000.0,99.031512,-0.011729,0.004239,19.0,...,5.104396,6.332625,,2.872591,18.547237,11.886663,11.989251,10.80174,13.02943,condition_11
3,1.0,0.0,1.0,1.0,5981554.0,5178708000.0,82.280899,-0.037714,0.005935,9.0,...,4.437789,5.457963,,7.255049,35.584583,27.131746,27.614313,25.032306,30.14096,condition_12
4,1.0,0.0,1.0,1.0,1413779.0,644884900.0,40.059197,0.013697,0.000967,0.0,...,3.277779,3.974343,,8.831307,33.585711,24.640861,24.306735,21.811059,26.709031,condition_13
5,1.0,0.0,1.0,1.0,1628308.0,1076168000.0,56.492434,0.0,-0.008518,0.0,...,3.635424,4.4134,,2.789047,14.603056,10.305244,10.549469,9.206,11.65427,condition_14
6,1.0,0.0,1.0,1.0,3521753.0,1968854000.0,87.974186,0.0,0.003583,21.0,...,4.816745,5.942217,,2.564762,12.186397,8.741441,9.03611,7.549527,10.094201,condition_15
7,1.0,0.0,1.0,1.0,6379462.0,5313604000.0,104.934538,-0.005114,0.001286,0.0,...,4.134169,5.085019,,9.138115,40.543257,32.444698,34.470584,31.66141,36.17259,condition_16
8,1.0,0.0,1.0,1.0,5743208.0,4985132000.0,123.785929,0.0,-2.3e-05,50.0,...,5.378909,6.704499,,6.865422,28.142495,18.965327,19.404571,17.446783,20.854436,condition_17
9,1.0,0.0,1.0,1.0,5925033.0,5917171000.0,150.378381,-0.02366,-0.002946,61.0,...,5.143449,6.384837,,7.652022,35.700874,29.149489,29.668191,27.177986,31.535803,condition_18


In [16]:
raw_df_features.to_csv('pre_impute_efficient.csv')

In [31]:
raw_df_features

Unnamed: 0,activity__variance_larger_than_standard_deviation,activity__has_duplicate_max,activity__has_duplicate_min,activity__has_duplicate,activity__sum_values,activity__abs_energy,activity__mean_abs_change,activity__mean_change,activity__mean_second_derivative_central,activity__median,...,activity__permutation_entropy__dimension_6__tau_1,activity__permutation_entropy__dimension_7__tau_1,activity__query_similarity_count__query_None__threshold_0.0,"activity__matrix_profile__feature_""min""__threshold_0.98","activity__matrix_profile__feature_""max""__threshold_0.98","activity__matrix_profile__feature_""mean""__threshold_0.98","activity__matrix_profile__feature_""median""__threshold_0.98","activity__matrix_profile__feature_""25""__threshold_0.98","activity__matrix_profile__feature_""75""__threshold_0.98",number
0,1.0,0.0,1.0,1.0,3415660.0,2519061000.0,92.156133,0.022932,0.00839,9.0,...,4.790671,5.926574,,10.717293,35.042831,24.557072,24.854357,22.338944,27.377075,condition_1
1,1.0,1.0,1.0,1.0,1517859.0,503540400.0,62.87192,0.015272,0.007636,3.0,...,5.036623,6.248507,,2.646326,18.547237,12.3805,12.587729,11.473409,13.519137,condition_10
2,1.0,0.0,1.0,1.0,3338367.0,2273304000.0,99.031512,-0.011729,0.004239,19.0,...,5.104396,6.332625,,2.872591,18.547237,11.886663,11.989251,10.80174,13.02943,condition_11
3,1.0,0.0,1.0,1.0,5981554.0,5178708000.0,82.280899,-0.037714,0.005935,9.0,...,4.437789,5.457963,,7.255049,35.584583,27.131746,27.614313,25.032306,30.14096,condition_12
4,1.0,0.0,1.0,1.0,1413779.0,644884900.0,40.059197,0.013697,0.000967,0.0,...,3.277779,3.974343,,8.831307,33.585711,24.640861,24.306735,21.811059,26.709031,condition_13
5,1.0,0.0,1.0,1.0,1628308.0,1076168000.0,56.492434,0.0,-0.008518,0.0,...,3.635424,4.4134,,2.789047,14.603056,10.305244,10.549469,9.206,11.65427,condition_14
6,1.0,0.0,1.0,1.0,3521753.0,1968854000.0,87.974186,0.0,0.003583,21.0,...,4.816745,5.942217,,2.564762,12.186397,8.741441,9.03611,7.549527,10.094201,condition_15
7,1.0,0.0,1.0,1.0,6379462.0,5313604000.0,104.934538,-0.005114,0.001286,0.0,...,4.134169,5.085019,,9.138115,40.543257,32.444698,34.470584,31.66141,36.17259,condition_16
8,1.0,0.0,1.0,1.0,5743208.0,4985132000.0,123.785929,0.0,-2.3e-05,50.0,...,5.378909,6.704499,,6.865422,28.142495,18.965327,19.404571,17.446783,20.854436,condition_17
9,1.0,0.0,1.0,1.0,5925033.0,5917171000.0,150.378381,-0.02366,-0.002946,61.0,...,5.143449,6.384837,,7.652022,35.700874,29.149489,29.668191,27.177986,31.535803,condition_18


In [36]:
# Get ids as separate DataFrame
raw_df_features_ids = pd.DataFrame(raw_df_features['number'].copy())
raw_df_features_ids.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 55 entries, 0 to 54
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   number  55 non-null     string
dtypes: string(1)
memory usage: 880.0 bytes


In [39]:
# Drop ids from origin DataFrame before imputing NaNs
raw_df_features.drop(columns='number', inplace=True)
raw_df_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 55 entries, 0 to 54
Columns: 781 entries, activity__variance_larger_than_standard_deviation to activity__matrix_profile__feature_"75"__threshold_0.98
dtypes: float64(781)
memory usage: 336.0 KB


In [40]:
# Replace all NaNs values using impute
impute(raw_df_features)
raw_df_features

 'activity__friedrich_coefficients__coeff_1__m_3__r_30'
 'activity__friedrich_coefficients__coeff_2__m_3__r_30'
 'activity__friedrich_coefficients__coeff_3__m_3__r_30'
 'activity__max_langevin_fixed_point__m_3__r_30'
 'activity__query_similarity_count__query_None__threshold_0.0'] did not have any finite values. Filling with zeros.


Unnamed: 0,activity__variance_larger_than_standard_deviation,activity__has_duplicate_max,activity__has_duplicate_min,activity__has_duplicate,activity__sum_values,activity__abs_energy,activity__mean_abs_change,activity__mean_change,activity__mean_second_derivative_central,activity__median,...,activity__permutation_entropy__dimension_5__tau_1,activity__permutation_entropy__dimension_6__tau_1,activity__permutation_entropy__dimension_7__tau_1,activity__query_similarity_count__query_None__threshold_0.0,"activity__matrix_profile__feature_""min""__threshold_0.98","activity__matrix_profile__feature_""max""__threshold_0.98","activity__matrix_profile__feature_""mean""__threshold_0.98","activity__matrix_profile__feature_""median""__threshold_0.98","activity__matrix_profile__feature_""25""__threshold_0.98","activity__matrix_profile__feature_""75""__threshold_0.98"
0,1.0,0.0,1.0,1.0,3415660.0,2519061000.0,92.156133,0.022932,0.00839,9.0,...,3.62059,4.790671,5.926574,0.0,10.717293,35.042831,24.557072,24.854357,22.338944,27.377075
1,1.0,1.0,1.0,1.0,1517859.0,503540400.0,62.87192,0.015272,0.007636,3.0,...,3.791181,5.036623,6.248507,0.0,2.646326,18.547237,12.3805,12.587729,11.473409,13.519137
2,1.0,0.0,1.0,1.0,3338367.0,2273304000.0,99.031512,-0.011729,0.004239,19.0,...,3.839281,5.104396,6.332625,0.0,2.872591,18.547237,11.886663,11.989251,10.80174,13.02943
3,1.0,0.0,1.0,1.0,5981554.0,5178708000.0,82.280899,-0.037714,0.005935,9.0,...,3.402544,4.437789,5.457963,0.0,7.255049,35.584583,27.131746,27.614313,25.032306,30.14096
4,1.0,0.0,1.0,1.0,1413779.0,644884900.0,40.059197,0.013697,0.000967,0.0,...,2.532537,3.277779,3.974343,0.0,8.831307,33.585711,24.640861,24.306735,21.811059,26.709031
5,1.0,0.0,1.0,1.0,1628308.0,1076168000.0,56.492434,0.0,-0.008518,0.0,...,2.791675,3.635424,4.4134,0.0,2.789047,14.603056,10.305244,10.549469,9.206,11.65427
6,1.0,0.0,1.0,1.0,3521753.0,1968854000.0,87.974186,0.0,0.003583,21.0,...,3.639686,4.816745,5.942217,0.0,2.564762,12.186397,8.741441,9.03611,7.549527,10.094201
7,1.0,0.0,1.0,1.0,6379462.0,5313604000.0,104.934538,-0.005114,0.001286,0.0,...,3.155291,4.134169,5.085019,0.0,9.138115,40.543257,32.444698,34.470584,31.66141,36.17259
8,1.0,0.0,1.0,1.0,5743208.0,4985132000.0,123.785929,0.0,-2.3e-05,50.0,...,4.027684,5.378909,6.704499,0.0,6.865422,28.142495,18.965327,19.404571,17.446783,20.854436
9,1.0,0.0,1.0,1.0,5925033.0,5917171000.0,150.378381,-0.02366,-0.002946,61.0,...,3.871373,5.143449,6.384837,0.0,7.652022,35.700874,29.149489,29.668191,27.177986,31.535803


In [41]:
raw_df_features.to_csv('post_impute_efficient.csv')
raw_df_features_ids.to_csv('number_ids.csv')

In [58]:
raw_df_features_ids.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 55 entries, 0 to 54
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   number  55 non-null     string
dtypes: string(1)
memory usage: 880.0 bytes


In [60]:
raw_df_features_ids['ill'] = np.where(raw_df_features_ids['number'].str.startswith('control'), 0, 1)
raw_df_features_ids

Unnamed: 0,number,ill
0,condition_1,1
1,condition_10,1
2,condition_11,1
3,condition_12,1
4,condition_13,1
5,condition_14,1
6,condition_15,1
7,condition_16,1
8,condition_17,1
9,condition_18,1


In [61]:
raw_df_features_ids.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 55 entries, 0 to 54
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   number  55 non-null     string
 1   ill     55 non-null     int32 
dtypes: int32(1), string(1)
memory usage: 1.1 KB


In [62]:
raw_df_features_relevant = tsf.select_features(raw_df_features, raw_df_features_ids['ill'])
raw_df_features_relevant

Unnamed: 0,activity__lempel_ziv_complexity__bins_2,activity__lempel_ziv_complexity__bins_3,activity__count_above_mean,"activity__agg_linear_trend__attr_""intercept""__chunk_len_50__f_agg_""var""","activity__linear_trend__attr_""intercept""","activity__agg_linear_trend__attr_""intercept""__chunk_len_5__f_agg_""max""","activity__agg_linear_trend__attr_""intercept""__chunk_len_5__f_agg_""mean""","activity__agg_linear_trend__attr_""intercept""__chunk_len_50__f_agg_""mean""","activity__agg_linear_trend__attr_""intercept""__chunk_len_10__f_agg_""mean""","activity__agg_linear_trend__attr_""intercept""__chunk_len_10__f_agg_""max""",activity__energy_ratio_by_chunks__num_segments_10__segment_focus_1,"activity__agg_linear_trend__attr_""intercept""__chunk_len_50__f_agg_""max""","activity__agg_linear_trend__attr_""intercept""__chunk_len_10__f_agg_""var""","activity__agg_linear_trend__attr_""intercept""__chunk_len_5__f_agg_""min"""
0,0.061521,0.064017,6328.0,37334.846538,171.032719,328.414831,171.028136,171.024387,171.01832,434.740982,0.059432,794.9752,29838.955311,58.475659
1,0.081604,0.086007,6148.0,14324.740705,75.974977,187.277463,75.946643,75.941083,75.978374,270.287119,0.091301,527.303147,11813.63749,12.630048
2,0.066412,0.066412,5761.0,59661.371852,157.247675,313.718814,157.306488,157.526885,157.381583,434.807369,0.033183,916.733687,39074.423243,54.105278
3,0.007193,0.009402,9271.0,84905.970772,273.996475,491.536353,273.990744,273.864968,273.972781,636.305201,0.096503,1089.960926,53657.078556,120.185709
4,0.053933,0.055093,4850.0,28300.660399,94.55737,200.62464,94.502275,94.487249,94.532415,282.827388,0.119696,640.35873,16439.891341,27.472213
5,0.073315,0.073754,3842.0,22520.259416,65.651816,132.332307,65.681956,65.719695,65.664485,192.314745,0.057323,481.69603,13029.474278,24.320273
6,0.05801,0.059296,7007.0,28484.454677,159.476385,265.614673,159.506574,159.901615,159.556657,348.97031,0.075507,640.954653,17450.851642,82.825897
7,0.048499,0.048849,9322.0,95654.109063,308.51855,533.24204,308.505643,308.234005,308.51581,688.772268,0.064114,1254.319543,59680.279305,143.768922
8,0.032474,0.035846,6872.0,60848.463481,205.044401,338.356997,205.113531,205.255393,205.110299,432.646079,0.047123,815.29723,29676.838134,106.262105
9,0.056226,0.056829,6576.0,54088.927202,196.748169,388.789398,196.893988,198.365849,196.909091,512.191208,0.01823,956.392781,33081.930287,71.821251


In [63]:
raw_df_features_relevant.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 55 entries, 0 to 54
Data columns (total 14 columns):
 #   Column                                                                    Non-Null Count  Dtype  
---  ------                                                                    --------------  -----  
 0   activity__lempel_ziv_complexity__bins_2                                   55 non-null     float64
 1   activity__lempel_ziv_complexity__bins_3                                   55 non-null     float64
 2   activity__count_above_mean                                                55 non-null     float64
 3   activity__agg_linear_trend__attr_"intercept"__chunk_len_50__f_agg_"var"   55 non-null     float64
 4   activity__linear_trend__attr_"intercept"                                  55 non-null     float64
 5   activity__agg_linear_trend__attr_"intercept"__chunk_len_5__f_agg_"max"    55 non-null     float64
 6   activity__agg_linear_trend__attr_"intercept"__chunk_len_5__f_agg_"me

In [64]:
raw_df_features_relevant.to_csv('raw_df_features_relevant')