# tsfresh

First, prepare the data in a format compatible with the tsfresh library.
Two approaches will be tested:
- Extracting features from raw signal (minute interval)
- Extracting features from aggregated signal (hour interval)

Result of this extraction will be a set of features for each patient.

In [1]:
import tsfresh as tsf

import os
from typing import Callable, Tuple, Union

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp

Load data from the csv directory

In [6]:
data_dir = "./depresjon_data"
condition_dir = os.path.join(data_dir, "condition")
control_dir = os.path.join(data_dir, "control")

# os.listdir() returns entries in arbitrary, platform-dependent order. The
# position of each frame in these lists is later turned into its
# condition_<i> / control_<i> label, so the file names are sorted to make that
# labelling reproducible between runs and machines.
# NOTE: the order is lexicographic, so position <i> does not necessarily equal
# the number embedded in the file name.
conditions = [pd.read_csv(os.path.join(condition_dir, filename)) for filename in sorted(os.listdir(condition_dir))]
controls = [pd.read_csv(os.path.join(control_dir, filename)) for filename in sorted(os.listdir(control_dir))]

### Feature selection on raw signal with minute interval 

Transform loaded data into single DataFrame with columns in proper format.

In [7]:
def get_raw_dataframe():
    """
    Combine all loaded recordings into one long-format DataFrame.

    Every frame from the module-level `conditions` and `controls` lists gets a
    'number' column holding 'condition_<i>' or 'control_<i>' (1-based position
    in its list). The frames are stacked with a fresh 0..n-1 index, the 'date'
    column is dropped, 'timestamp' is parsed to datetime and 'number' is cast
    to pandas' 'string' dtype.
    """
    labelled = []
    for prefix, frames in (('condition', conditions), ('control', controls)):
        for position, frame in enumerate(frames, start=1):
            # assign() works on a copy, so the loaded frames stay untouched.
            labelled.append(frame.assign(number=f'{prefix}_{position}'))

    combined = pd.concat(labelled, ignore_index=True).drop(columns='date')
    combined['timestamp'] = pd.to_datetime(combined['timestamp'])
    combined['number'] = combined['number'].astype('string')
    return combined

In [4]:
# Build the combined long-format frame once; the bare name displays it in the notebook.
raw_df = get_raw_dataframe()
raw_df

Unnamed: 0,timestamp,activity,number
0,2003-05-07 12:00:00,0,condition_1
1,2003-05-07 12:01:00,143,condition_1
2,2003-05-07 12:02:00,0,condition_1
3,2003-05-07 12:03:00,20,condition_1
4,2003-05-07 12:04:00,166,condition_1
...,...,...,...
1571701,2003-12-01 12:53:00,7,control_32
1571702,2003-12-01 12:54:00,7,control_32
1571703,2003-12-01 12:55:00,5,control_32
1571704,2003-12-01 12:56:00,5,control_32


In [5]:
# Print column dtypes, non-null counts and memory usage of the combined frame.
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1571706 entries, 0 to 1571705
Data columns (total 3 columns):
 #   Column     Non-Null Count    Dtype         
---  ------     --------------    -----         
 0   timestamp  1571706 non-null  datetime64[ns]
 1   activity   1571706 non-null  int64         
 2   number     1571706 non-null  string        
dtypes: datetime64[ns](1), int64(1), string(1)
memory usage: 36.0 MB


Now create separate DataFrames for the following datasets:
* day data
* night data
* all data

In [8]:
def get_night_day_division(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Split rows by the hour of their 'timestamp' into night and day subsets.

    Night covers hours 21-23 and 0-7, day covers hours 8-20.
    Returns the pair in the order (night_df, day_df).
    """
    hour = df["timestamp"].dt.hour
    is_night = hour.ge(21) | hour.lt(8)
    is_day = hour.ge(8) & hour.lt(21)
    return df.loc[is_night], df.loc[is_day]

In [9]:
# get_night_day_division returns (night, day) in that order.
raw_df_night, raw_df_day = get_night_day_division(raw_df)
# Row counts of each subset; day + night should add up to the full dataset.
print('day data: %d' % len(raw_df_day))
print('night data: %d' % len(raw_df_night))
print('all data: %d' % len(raw_df))

day data: 853679
night data: 718027
all data: 1571706


In [10]:
def group_by_number(df: pd.DataFrame) -> list:
    """Return a list of (number, sub-frame) pairs, one per distinct 'number' value."""
    return [*df.groupby(by='number')]

In [11]:
# Each list element is a (number, sub-frame) pair; show the sub-frame of the first group.
raw_df_grouped = group_by_number(raw_df)
raw_df_grouped[0][1]

Unnamed: 0,timestamp,activity,number
0,2003-05-07 12:00:00,0,condition_1
1,2003-05-07 12:01:00,143,condition_1
2,2003-05-07 12:02:00,0,condition_1
3,2003-05-07 12:03:00,20,condition_1
4,2003-05-07 12:04:00,166,condition_1
...,...,...,...
23239,2003-05-23 15:19:00,0,condition_1
23240,2003-05-23 15:20:00,0,condition_1
23241,2003-05-23 15:21:00,0,condition_1
23242,2003-05-23 15:22:00,0,condition_1


In [12]:
from tsfresh.feature_extraction.settings import MinimalFCParameters, EfficientFCParameters
# Feature-calculator settings shared by every tsfresh extraction call in this notebook.
fc_parameters = EfficientFCParameters()

In [13]:
def extract_features_grouped(df: pd.DataFrame) -> pd.DataFrame:
    """
    Extract tsfresh features separately for each recording in `df`.

    `df` must be in long format with 'number' (recording id), 'timestamp'
    (sort key) and 'activity' (value) columns. tsfresh is invoked once per
    distinct 'number', so only one recording is processed at a time.

    Returns a DataFrame with one row per recording, sorted by 'number', with a
    fresh 0..n-1 index; the feature columns are followed by a 'number' column
    holding the recording id.
    """
    per_group_features = []

    # The loop names are kept distinct from the `df` parameter so the input
    # frame is not shadowed while iterating over its groups.
    for name, group_df in group_by_number(df):
        group_features = tsf.extract_features(group_df,
                                              column_id='number',
                                              column_sort='timestamp',
                                              column_value='activity',
                                              default_fc_parameters=fc_parameters)
        group_features['number'] = name
        per_group_features.append(group_features)

    features = pd.concat(per_group_features)
    # Sort first and renumber afterwards, so the final index is always 0..n-1
    # in row order.
    features.sort_values(by='number', inplace=True)
    features.reset_index(inplace=True, drop=True)
    return features

Use **tsfresh** to extract as many features as possible from every dataset. 

In [14]:
# features_df_day = extract_features_grouped(raw_df_day)
# features_df_night = extract_features_grouped(raw_df_night)
# features_df_all = extract_features_grouped(raw_df)

# features_dfs = {
#     'day': features_df_day,
#     'night': features_df_night,
#     'all': features_df_all
# }

In [15]:
# for name, df in features_dfs.items():
#     df['number'] = df['number'].astype('string')
#     print(df.info())
#     df.to_csv(f'csv/features_{name}.csv')

In [16]:
# features_dfs['day']

## Dask
The same procedure as above, but using the Dask library, which is less likely to run into memory problems and appears to be better optimized for large datasets.

In [17]:
import dask.dataframe as dd
from tsfresh.convenience.bindings import dask_feature_extraction_on_chunk

def extract_features_grouped_dask(df: pd.DataFrame, npartitions: int = 1) -> pd.DataFrame:
    """
    Extract tsfresh features per recording through tsfresh's Dask binding.

    `df` must be in long format with 'number', 'timestamp' and 'activity'
    columns. `npartitions` is the number of Dask partitions the frame is split
    into before grouping.

    Returns a wide DataFrame with one row per 'number': a 'number' column
    followed by one column per extracted feature. Feature columns come out of
    the pivot in alphabetical order, which may differ from the column order
    produced by extract_features_grouped.
    """
    # dask_feature_extraction_on_chunk() has a mandatory column_kind argument
    # and expects the data grouped by (id, kind). There is only one measured
    # series here, so every row is tagged with the constant kind 'activity'.
    df_with_kind = df.assign(kind='activity')

    # Older Dask releases refuse from_pandas() without an explicit partitioning.
    df_dask = dd.from_pandas(df_with_kind, npartitions=npartitions)
    df_dask_grouped = df_dask.groupby(['number', 'kind'])
    df_features = dask_feature_extraction_on_chunk(df_dask_grouped,
                                                   column_id='number',
                                                   column_kind='kind',
                                                   column_sort='timestamp',
                                                   column_value='activity',
                                                   default_fc_parameters=fc_parameters)

    # The binding yields long-format rows (number, variable, value); pivot them
    # into one row per recording with one column per feature.
    long_features = df_features.compute().reset_index(drop=True)
    wide_features = long_features.pivot(index='number', columns='variable', values='value')
    wide_features.columns.name = None
    return wide_features.reset_index()

In [18]:
# NOTE(review): despite the `_dask` suffix this calls the pandas-based
# extract_features_grouped, not extract_features_grouped_dask.
features_df_day_dask = extract_features_grouped(raw_df_day)
features_df_day_dask

Feature Extraction: 100%|███████████████████████████████████████████████████████████████████████████| 1/1 [00:09<00:00,  9.60s/it]
Feature Extraction: 100%|███████████████████████████████████████████████████████████████████████████| 1/1 [00:08<00:00,  8.35s/it]
Feature Extraction: 100%|███████████████████████████████████████████████████████████████████████████| 1/1 [00:08<00:00,  8.18s/it]
Feature Extraction: 100%|███████████████████████████████████████████████████████████████████████████| 1/1 [00:15<00:00, 15.20s/it]
Feature Extraction: 100%|███████████████████████████████████████████████████████████████████████████| 1/1 [00:11<00:00, 11.65s/it]
Feature Extraction: 100%|███████████████████████████████████████████████████████████████████████████| 1/1 [00:08<00:00,  8.51s/it]
Feature Extraction: 100%|███████████████████████████████████████████████████████████████████████████| 1/1 [00:08<00:00,  8.98s/it]
Feature Extraction: 100%|██████████████████████████████████████████████████████████

Unnamed: 0,activity__variance_larger_than_standard_deviation,activity__has_duplicate_max,activity__has_duplicate_min,activity__has_duplicate,activity__sum_values,activity__abs_energy,activity__mean_abs_change,activity__mean_change,activity__mean_second_derivative_central,activity__median,...,activity__permutation_entropy__dimension_6__tau_1,activity__permutation_entropy__dimension_7__tau_1,activity__query_similarity_count__query_None__threshold_0.0,"activity__matrix_profile__feature_""min""__threshold_0.98","activity__matrix_profile__feature_""max""__threshold_0.98","activity__matrix_profile__feature_""mean""__threshold_0.98","activity__matrix_profile__feature_""median""__threshold_0.98","activity__matrix_profile__feature_""25""__threshold_0.98","activity__matrix_profile__feature_""75""__threshold_0.98",number
0,1.0,0.0,1.0,1.0,3086166.0,2377478000.0,140.669637,0.042025,0.015376,107.0,...,5.825517,7.241086,,6.223065,27.276363,20.440264,20.903214,19.774763,21.764174,condition_1
1,1.0,0.0,1.0,1.0,830233.0,300944400.0,60.460423,0.028482,0.014242,2.0,...,4.904311,5.980739,,2.553003,14.437611,11.136572,11.411675,10.062228,12.293606,condition_10
2,1.0,0.0,1.0,1.0,2754639.0,1930651000.0,139.912357,-0.021977,0.007944,100.0,...,5.940682,7.382456,,1.94218,12.328828,6.931623,6.930218,6.169701,7.687378,condition_11
3,1.0,0.0,1.0,1.0,4659422.0,4313744000.0,112.539398,-0.069557,0.010946,12.0,...,4.82888,5.929028,,21.896525,45.264529,38.09418,38.281138,36.159086,40.200026,condition_12
4,1.0,0.0,1.0,1.0,1137805.0,494259800.0,57.728197,0.025347,0.00179,0.0,...,4.387996,5.337893,,17.094772,29.051678,22.95423,22.128888,21.038694,23.300264,condition_13
5,1.0,0.0,1.0,1.0,1428931.0,979421900.0,82.339676,0.0,-0.015518,0.0,...,4.360083,5.282457,,3.209742,12.689444,9.147689,9.290126,8.150593,10.222997,condition_14
6,1.0,0.0,1.0,1.0,2864774.0,1591982000.0,119.194845,0.0,0.006571,142.0,...,5.861688,7.274962,,2.066396,10.307194,7.869323,8.018337,7.320879,8.643916,condition_15
7,1.0,0.0,1.0,1.0,4754567.0,4032666000.0,137.587656,-0.009491,0.002388,83.0,...,4.83802,5.947693,,4.923813,29.026766,23.264079,24.904404,23.011859,26.112419,condition_16
8,1.0,0.0,1.0,1.0,4461814.0,4148790000.0,165.070231,0.0,-4.3e-05,208.0,...,5.803294,7.194772,,3.973526,17.888544,10.931357,11.146926,9.866287,12.097632,condition_17
9,1.0,0.0,1.0,1.0,4328594.0,4561466000.0,193.391334,-0.043758,-0.005449,158.0,...,5.526885,6.816672,,4.133425,20.880613,14.096047,14.526995,12.955182,15.695653,condition_18


As we can see, 781 features have been extracted (782 columns including the `number` identifier):

In [19]:
# Summarise shape and dtypes of the extracted day-time features.
features_df_day_dask.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 55 entries, 0 to 54
Columns: 782 entries, activity__variance_larger_than_standard_deviation to number
dtypes: float64(781), object(1)
memory usage: 336.4+ KB


Run feature extraction for the rest of the datasets:

In [20]:
# The day subset was already extracted in an earlier cell, so it is not recomputed here.
# features_df_day_dask = extract_features_grouped(raw_df_day)
features_df_night_dask = extract_features_grouped(raw_df_night)
features_df_all_dask = extract_features_grouped(raw_df)

# Collect the three feature tables under the dataset name used for file names below.
features_dfs_dask = {
    'day': features_df_day_dask,
    'night': features_df_night_dask,
    'all': features_df_all_dask
}

Feature Extraction: 100%|███████████████████████████████████████████████████████████████████████████| 1/1 [00:09<00:00,  9.33s/it]
Feature Extraction: 100%|███████████████████████████████████████████████████████████████████████████| 1/1 [00:08<00:00,  8.41s/it]
Feature Extraction: 100%|███████████████████████████████████████████████████████████████████████████| 1/1 [00:09<00:00,  9.19s/it]
Feature Extraction: 100%|███████████████████████████████████████████████████████████████████████████| 1/1 [00:13<00:00, 13.55s/it]
Feature Extraction: 100%|███████████████████████████████████████████████████████████████████████████| 1/1 [00:08<00:00,  8.87s/it]
Feature Extraction: 100%|███████████████████████████████████████████████████████████████████████████| 1/1 [00:07<00:00,  7.64s/it]
Feature Extraction: 100%|███████████████████████████████████████████████████████████████████████████| 1/1 [00:08<00:00,  8.22s/it]
Feature Extraction: 100%|██████████████████████████████████████████████████████████

Save results:

In [44]:
# to_csv() does not create missing directories, so make sure the relative
# 'csv' output directory exists before writing into it.
os.makedirs('csv', exist_ok=True)

for name, df in features_dfs_dask.items():
    # info() prints its summary itself and returns None; wrapping it in
    # print() would only add a stray "None" line to the output.
    df.info()
    df.to_csv(f'csv/features_dask_{name}.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 55 entries, 0 to 54
Columns: 782 entries, activity__variance_larger_than_standard_deviation to number
dtypes: float64(781), object(1)
memory usage: 336.4+ KB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 55 entries, 0 to 54
Columns: 782 entries, activity__variance_larger_than_standard_deviation to number
dtypes: float64(781), object(1)
memory usage: 336.4+ KB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 55 entries, 0 to 54
Columns: 782 entries, activity__variance_larger_than_standard_deviation to number
dtypes: float64(781), object(1)
memory usage: 336.4+ KB
None


Some final processing includes extracting IDs from 'number' column to separate DataFrame to define column 'ill' determining class of the sample.

In [22]:
# Plain alias (same dict object) so the following cells can use a shorter name.
features_dfs = features_dfs_dask

In [23]:
def extract_ids(df: pd.DataFrame):
    """
    Separate the identifier column from the feature columns.

    Returns a pair (ids, without_ids):
    - ids: a new frame with the 'number' column plus an 'ill' column that is
      0 when 'number' starts with 'control' and 1 otherwise,
    - without_ids: `df` without the 'number' column.
    The input frame is not modified.
    """
    is_control = df['number'].str.startswith('control')
    ids = df[['number']].copy()
    ids['ill'] = np.where(is_control, 0, 1)
    return ids, df.drop(columns='number')

In [24]:
# Per dataset name: the id/label frame and the pure feature frame.
features_id_dfs = {}
features_no_id_dfs = {}

# Both dicts are filled in the same loop, so they share keys and key order.
for name, df in features_dfs.items():
    ids, without_ids = extract_ids(df)
    features_id_dfs[name] = ids
    features_no_id_dfs[name] = without_ids

In [25]:
# info() prints its summary itself and returns None, so it is called directly
# instead of being wrapped in print() (which would emit an extra "None" line).
features_no_id_dfs['all'].info()
features_id_dfs['all'].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 55 entries, 0 to 54
Columns: 781 entries, activity__variance_larger_than_standard_deviation to activity__matrix_profile__feature_"75"__threshold_0.98
dtypes: float64(781)
memory usage: 336.0 KB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 55 entries, 0 to 54
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   number  55 non-null     object
 1   ill     55 non-null     int32 
dtypes: int32(1), object(1)
memory usage: 1.1+ KB
None


It is also required to replace NaNs in the DataFrame before selecting most relevant features.

In [31]:
from tsfresh.utilities.dataframe_functions import impute

def impute_nans(df: pd.DataFrame) -> None:
    """
    Impute missing values of `df` with tsfresh's `impute`.

    The result of `impute` is discarded, so this relies on `impute` modifying
    `df` in place; nothing is returned.
    NOTE(review): per the tsfresh docs, `impute` replaces NaN with the column
    median and +/-inf with the column max/min — confirm for the installed version.
    """
    impute(df)
    
# Impute every raw-signal feature table (day, night, all) in place.
for df in features_no_id_dfs.values():
    impute_nans(df)

In [27]:
# Display the imputed feature table of the full dataset.
features_no_id_dfs['all']

Unnamed: 0,activity__variance_larger_than_standard_deviation,activity__has_duplicate_max,activity__has_duplicate_min,activity__has_duplicate,activity__sum_values,activity__abs_energy,activity__mean_abs_change,activity__mean_change,activity__mean_second_derivative_central,activity__median,...,activity__permutation_entropy__dimension_5__tau_1,activity__permutation_entropy__dimension_6__tau_1,activity__permutation_entropy__dimension_7__tau_1,activity__query_similarity_count__query_None__threshold_0.0,"activity__matrix_profile__feature_""min""__threshold_0.98","activity__matrix_profile__feature_""max""__threshold_0.98","activity__matrix_profile__feature_""mean""__threshold_0.98","activity__matrix_profile__feature_""median""__threshold_0.98","activity__matrix_profile__feature_""25""__threshold_0.98","activity__matrix_profile__feature_""75""__threshold_0.98"
0,1.0,0.0,1.0,1.0,3415660.0,2519061000.0,92.156133,0.022932,0.00839,9.0,...,3.62059,4.790671,5.926574,0.0,10.717293,35.042831,24.557072,24.854357,22.338944,27.377075
1,1.0,1.0,1.0,1.0,1517859.0,503540400.0,62.87192,0.015272,0.007636,3.0,...,3.791181,5.036623,6.248507,0.0,2.646326,18.547237,12.3805,12.587729,11.473409,13.519137
2,1.0,0.0,1.0,1.0,3338367.0,2273304000.0,99.031512,-0.011729,0.004239,19.0,...,3.839281,5.104396,6.332625,0.0,2.872591,18.547237,11.886663,11.989251,10.80174,13.02943
3,1.0,0.0,1.0,1.0,5981554.0,5178708000.0,82.280899,-0.037714,0.005935,9.0,...,3.402544,4.437789,5.457963,0.0,7.255049,35.584583,27.131746,27.614313,25.032306,30.14096
4,1.0,0.0,1.0,1.0,1413779.0,644884900.0,40.059197,0.013697,0.000967,0.0,...,2.532537,3.277779,3.974343,0.0,8.831307,33.585711,24.640861,24.306735,21.811059,26.709031
5,1.0,0.0,1.0,1.0,1628308.0,1076168000.0,56.492434,0.0,-0.008518,0.0,...,2.791675,3.635424,4.4134,0.0,2.789047,14.603056,10.305244,10.549469,9.206,11.65427
6,1.0,0.0,1.0,1.0,3521753.0,1968854000.0,87.974186,0.0,0.003583,21.0,...,3.639686,4.816745,5.942217,0.0,2.564762,12.186397,8.741441,9.03611,7.549527,10.094201
7,1.0,0.0,1.0,1.0,6379462.0,5313604000.0,104.934538,-0.005114,0.001286,0.0,...,3.155291,4.134169,5.085019,0.0,9.138115,40.543257,32.444698,34.470584,31.66141,36.17259
8,1.0,0.0,1.0,1.0,5743208.0,4985132000.0,123.785929,0.0,-2.3e-05,50.0,...,4.027684,5.378909,6.704499,0.0,6.865422,28.142495,18.965327,19.404571,17.446783,20.854436
9,1.0,0.0,1.0,1.0,5925033.0,5917171000.0,150.378381,-0.02366,-0.002946,61.0,...,3.871373,5.143449,6.384837,0.0,7.652022,35.700874,29.149489,29.668191,27.177986,31.535803


Finally, the tsfresh library can be used to select the most relevant features among all extracted features.

In [45]:
def select_relevant_features(data_df_name, data_df, ids_df, csv=True):
    """
    Run tsfresh feature selection on `data_df` against the 'ill' label.

    data_df_name: dataset name, used only to build the output file name.
    data_df:      feature table passed to tsfresh.
    ids_df:       frame whose 'ill' column is the target.
    csv:          when true, the selected features are also written to
                  'csv/selected_<data_df_name>.csv' without the index.

    Returns the DataFrame produced by tsfresh's select_features.
    """
    target = ids_df['ill']
    relevant = tsf.select_features(data_df, target)
    if not csv:
        return relevant
    relevant.to_csv(f'csv/selected_{data_df_name}.csv', index=False)
    return relevant

In [46]:
features_selected = {}

# Look the labels up by dataset name instead of zipping the two dicts, so a
# feature table can never be paired with another dataset's labels merely
# because of dict insertion order.
for name, data_df in features_no_id_dfs.items():
    ids_df = features_id_dfs[name]
    features_selected[name] = select_relevant_features(name, data_df, ids_df)

In [30]:
for name, df in features_selected.items():
    print(f'Dataset: {name}')
    # info() prints its summary itself and returns None; print(df.info())
    # would add a stray "None" line after every summary.
    df.info()

Dataset: day
<class 'pandas.core.frame.DataFrame'>
Int64Index: 55 entries, 0 to 54
Data columns (total 2 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   activity__lempel_ziv_complexity__bins_2  55 non-null     float64
 1   activity__lempel_ziv_complexity__bins_3  55 non-null     float64
dtypes: float64(2)
memory usage: 1.3 KB
None
Dataset: night
<class 'pandas.core.frame.DataFrame'>
Int64Index: 55 entries, 0 to 54
Data columns (total 47 columns):
 #   Column                                                                    Non-Null Count  Dtype  
---  ------                                                                    --------------  -----  
 0   activity__fft_coefficient__attr_"abs"__coeff_40                           55 non-null     float64
 1   activity__fft_coefficient__attr_"abs"__coeff_85                           55 non-null     float64
 2   activity__fft_coefficient

### Selected features
* Among selected features, there is a lot of linear trends and FFT (Fast Fourier Transform) coefficients, variously parametrized.
* Day part of the dataset has only 2 relevant features selected, night part has 47 relevant features and on the whole dataset 14 features were relevant enough to be selected.

### Feature selection on aggregated signal with hour interval 

We also try an approach where features are extracted from an aggregated signal in which each point is the mean activity over one hour.

In [32]:
def group_by_hour(df: pd.DataFrame) -> pd.DataFrame:
    """
    Aggregate the signal to hourly means per recording.

    df: DataFrame with columns "number", "timestamp" (datetime) and "activity".

    Rows are grouped by "number" and by the hour their "timestamp" falls into;
    the remaining columns are averaged within each group. Returns a DataFrame
    with "number" and "timestamp" (start of the hour) as regular columns and a
    fresh 0..n-1 index.
    """
    # "h" is the hourly offset alias; the upper-case "H" spelling is deprecated
    # in recent pandas releases.
    grouped = df.groupby([pd.Grouper(key="number"), pd.Grouper(key="timestamp", freq="h")]).mean()
    grouped.reset_index(inplace=True)
    return grouped

In [33]:
# Hourly mean activity per recording; the bare name displays it in the notebook.
raw_df_hourly = group_by_hour(raw_df)
raw_df_hourly

Unnamed: 0,number,timestamp,activity
0,condition_1,2003-05-07 12:00:00,346.550000
1,condition_1,2003-05-07 13:00:00,284.566667
2,condition_1,2003-05-07 14:00:00,279.183333
3,condition_1,2003-05-07 15:00:00,218.783333
4,condition_1,2003-05-07 16:00:00,238.550000
...,...,...,...
26225,control_9,2003-12-29 06:00:00,0.000000
26226,control_9,2003-12-29 07:00:00,0.000000
26227,control_9,2003-12-29 08:00:00,30.100000
26228,control_9,2003-12-29 09:00:00,25.850000


In [34]:
# get_night_day_division returns (night, day) in that order.
raw_df_hourly_night, raw_df_hourly_day = get_night_day_division(raw_df_hourly)
# Row counts of each hourly subset.
print('day data: %d' % len(raw_df_hourly_day))
print('night data: %d' % len(raw_df_hourly_night))
print('all data: %d' % len(raw_df_hourly))

day data: 14262
night data: 11968
all data: 26230


In [35]:
def extract_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Extract tsfresh features for all recordings of `df` in a single call.

    `df` is a long-format frame with 'number', 'timestamp' and 'activity'
    columns. The result has a fresh 0..n-1 index; the feature columns are
    followed by a 'number' column (pandas 'string' dtype) taken from the index
    of the tsfresh result.
    """
    extracted = tsf.extract_features(
        df,
        column_id='number',
        column_sort='timestamp',
        column_value='activity',
        default_fc_parameters=fc_parameters,
    )
    # reset_index() moves the tsfresh result index into an 'index' column.
    extracted = extracted.reset_index()
    extracted['number'] = extracted['index'].astype('string')
    return extracted.drop(columns='index')

In [36]:
# Extract features for each hourly subset, keyed by dataset name
# (evaluated in the order day, night, all).
hourly_features_dfs = {
    part: extract_features(hourly_frame)
    for part, hourly_frame in (
        ('day', raw_df_hourly_day),
        ('night', raw_df_hourly_night),
        ('all', raw_df_hourly),
    )
}

Feature Extraction: 100%|█████████████████████████████████████████████████████████████████████████| 28/28 [00:07<00:00,  3.90it/s]
Feature Extraction: 100%|█████████████████████████████████████████████████████████████████████████| 28/28 [00:07<00:00,  3.99it/s]
Feature Extraction: 100%|█████████████████████████████████████████████████████████████████████████| 28/28 [00:07<00:00,  3.67it/s]


In [37]:
# Summarise shape and dtypes of the features extracted from the full hourly signal.
hourly_features_dfs['all'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55 entries, 0 to 54
Columns: 782 entries, activity__variance_larger_than_standard_deviation to number
dtypes: float64(781), string(1)
memory usage: 336.1 KB


In [38]:
# Per dataset name: the id/label frame and the pure feature frame (hourly signal).
hourly_features_id_dfs = {}
hourly_features_no_id_dfs = {}

# Both dicts are filled in the same loop, so they share keys and key order.
for name, df in hourly_features_dfs.items():
    ids, without_ids = extract_ids(df)
    hourly_features_id_dfs[name] = ids
    hourly_features_no_id_dfs[name] = without_ids

In [39]:
# Impute every hourly feature table before feature selection;
# impute_nans returns nothing, so the tables are expected to change in place.
for df in hourly_features_no_id_dfs.values():
    impute_nans(df)



Default FDR is set to 5% but with this threshold **no features were selected as relevant enough.**

Thus, we raise the FDR level to 10% to obtain a non-empty feature set.

In [47]:
hourly_features_relevant = {}

# False discovery rate level handed to tsfresh's feature selection.
FDR = .1

# Look the labels up by dataset name instead of zipping the two dicts, so a
# feature table can never be paired with another dataset's labels merely
# because of dict insertion order.
for name, data_df in hourly_features_no_id_dfs.items():
    ids_df = hourly_features_id_dfs[name]
    hourly_features_relevant[name] = tsf.select_features(data_df, ids_df['ill'], fdr_level=FDR)

In [48]:
# Write each selected hourly feature table to csv/selected_hourly_<name>.csv without the index.
for name, df in hourly_features_relevant.items():
    df.to_csv(f'csv/selected_hourly_{name}.csv', index=False)

In [42]:
for name, df in hourly_features_relevant.items():
    print(f'Dataset: {name}')
    # info() prints its summary itself and returns None; print(df.info())
    # would add a stray "None" line after every summary.
    df.info()

Dataset: day
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55 entries, 0 to 54
Data columns (total 8 columns):
 #   Column                                                                   Non-Null Count  Dtype  
---  ------                                                                   --------------  -----  
 0   activity__partial_autocorrelation__lag_2                                 55 non-null     float64
 1   activity__energy_ratio_by_chunks__num_segments_10__segment_focus_1       55 non-null     float64
 2   activity__autocorrelation__lag_9                                         55 non-null     float64
 3   activity__agg_linear_trend__attr_"intercept"__chunk_len_10__f_agg_"min"  55 non-null     float64
 4   activity__benford_correlation                                            55 non-null     float64
 5   activity__fft_aggregated__aggtype_"variance"                             55 non-null     float64
 6   activity__agg_linear_trend__attr_"intercept"__chunk_len_5__f_agg

#### Selected features
* Among selected features on hourly-aggregated signal, there are some Benford correlation and autocorrelation features marked as relevant, but mostly there are FFT coefficients and linear trends.
* Selected relevant feature sets cardinality: day part - 8, night part - 61, all - 24.


In [49]:
# Persist the id/label frames ('number' and 'ill' columns) of the full datasets, without the index.
hourly_features_id_dfs['all'].to_csv('csv/hourly_ids.csv', index=False)
features_id_dfs['all'].to_csv('csv/ids.csv', index=False)