# Feature extraction settings

In [2]:
from tsfresh.feature_extraction import extract_features
from tsfresh import extract_relevant_features
from tsfresh.feature_extraction.settings import ComprehensiveFCParameters, MinimalFCParameters, EfficientFCParameters
from tsfresh.feature_extraction.settings import from_columns

import numpy as np
import pandas as pd

In [3]:
# Toy input: two series ids ("a" and "b") with two rows each, and two value
# columns ("temperature", "pressure") from which features are extracted.
df = pd.DataFrame(
    {
        "id": ["a", "a", "b", "b"],
        "temperature": [1, 2, 3, 1],
        "pressure": [-1, 2, -1, 7],
    }
)
df

Unnamed: 0,id,temperature,pressure
0,a,1,-1
1,a,2,2
2,b,3,-1
3,b,1,7


# Minimal FC Parameters

`MinimalFCParameters` is a child class of the `ComprehensiveFCParameters` class and has the same functionality as its base class. The only difference is that most of the feature calculators are disabled, so only a small subset of calculators is computed.

These are the calculated features in minimal:

In [4]:
# Instantiate the minimal settings; displaying the object lists the enabled
# feature calculators (name -> parameters, None meaning "no parameters").
settings_minimal = MinimalFCParameters()
settings_minimal

{'sum_values': None,
 'median': None,
 'mean': None,
 'length': None,
 'standard_deviation': None,
 'variance': None,
 'maximum': None,
 'minimum': None}

In [5]:
# Apply the calculators in settings_minimal to every value column,
# grouping the rows by the "id" column.
X_tsfresh = extract_features(
    df,
    column_id="id",
    default_fc_parameters=settings_minimal,
)
X_tsfresh.head()

Feature Extraction: 100%|██████████| 4/4 [00:00<00:00, 1773.87it/s]


variable,pressure__length,pressure__maximum,pressure__mean,pressure__median,pressure__minimum,pressure__standard_deviation,pressure__sum_values,pressure__variance,temperature__length,temperature__maximum,temperature__mean,temperature__median,temperature__minimum,temperature__standard_deviation,temperature__sum_values,temperature__variance
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
a,2.0,2.0,0.5,0.5,-1.0,1.5,1.0,2.25,2.0,2.0,1.5,1.5,1.0,0.5,3.0,0.25
b,2.0,7.0,3.0,3.0,-1.0,4.0,6.0,16.0,2.0,3.0,2.0,2.0,1.0,1.0,4.0,1.0


## Removing a feature
* Let's say we want to remove the `length` feature.

In [6]:
# Drop the "length" calculator from the minimal settings.
# pop() with a default is used instead of `del` so the cell is idempotent:
# re-running it after "length" has already been removed no longer raises KeyError.
settings_minimal.pop("length", None)
settings_minimal

{'sum_values': None,
 'median': None,
 'mean': None,
 'standard_deviation': None,
 'variance': None,
 'maximum': None,
 'minimum': None}

In [7]:
# Re-run the extraction with the current contents of settings_minimal.
X_tsfresh = extract_features(
    df,
    column_id="id",
    default_fc_parameters=settings_minimal,
)
X_tsfresh.head()

Feature Extraction: 100%|██████████| 4/4 [00:00<00:00, 2123.97it/s]


variable,pressure__maximum,pressure__mean,pressure__median,pressure__minimum,pressure__standard_deviation,pressure__sum_values,pressure__variance,temperature__maximum,temperature__mean,temperature__median,temperature__minimum,temperature__standard_deviation,temperature__sum_values,temperature__variance
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
a,2.0,0.5,0.5,-1.0,1.5,1.0,2.25,2.0,1.5,1.5,1.0,0.5,3.0,0.25
b,7.0,3.0,3.0,-1.0,4.0,6.0,16.0,3.0,2.0,2.0,1.0,1.0,4.0,1.0


# Different features for different columns
Let's say we want a different set of features calculated for each column.

In [8]:
# Calculators wanted for each value column; a value of None means the
# calculator takes no parameters.
pressure_params = dict.fromkeys(("length", "sum_values"))
temp_params = dict.fromkeys(("maximum", "minimum"))

# Map each value column ("kind") to its own calculator settings.
kind_to_fc_parameters = dict(
    temperature=temp_params,
    pressure=pressure_params,
)

In [9]:
# Per-column settings: each value column gets only the calculators listed
# for it in kind_to_fc_parameters.
X_tsfresh = extract_features(
    df,
    column_id="id",
    kind_to_fc_parameters=kind_to_fc_parameters,
)
X_tsfresh

Feature Extraction: 100%|██████████| 4/4 [00:00<00:00, 1120.57it/s]


variable,pressure__length,pressure__sum_values,temperature__maximum,temperature__minimum
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
a,2.0,1.0,2.0,1.0
b,2.0,6.0,3.0,1.0


## Getting the settings dict from columns

In [10]:
# Rebuild a per-column settings dict from the names of the extracted feature columns.
recovered_settings = from_columns(X_tsfresh.columns)

In [11]:
# Show the recovered settings (one entry per original value column).
recovered_settings

{'pressure': {'length': None, 'sum_values': None},
 'temperature': {'maximum': None, 'minimum': None}}

# Other complex settings dictionaries

## If you do not specify default_fc_parameters
tsfresh will generate 794 features per column

In [31]:
# Neither default_fc_parameters nor kind_to_fc_parameters is passed, so tsfresh
# uses its default settings (presumably ComprehensiveFCParameters - confirm in the tsfresh docs).
extract_features(df, column_id="id")

Feature Extraction: 100%|██████████| 4/4 [00:00<00:00, 20.60it/s]


variable,pressure__abs_energy,pressure__absolute_sum_of_changes,"pressure__agg_autocorrelation__f_agg_""mean""__maxlag_40","pressure__agg_autocorrelation__f_agg_""median""__maxlag_40","pressure__agg_autocorrelation__f_agg_""var""__maxlag_40","pressure__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""intercept""","pressure__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""rvalue""","pressure__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""slope""","pressure__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""stderr""","pressure__agg_linear_trend__f_agg_""max""__chunk_len_50__attr_""intercept""",...,temperature__symmetry_looking__r_0.9,temperature__symmetry_looking__r_0.9500000000000001,temperature__time_reversal_asymmetry_statistic__lag_1,temperature__time_reversal_asymmetry_statistic__lag_2,temperature__time_reversal_asymmetry_statistic__lag_3,temperature__value_count__value_-1,temperature__value_count__value_0,temperature__value_count__value_1,temperature__variance,temperature__variance_larger_than_standard_deviation
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
a,5.0,3.0,-1.0,-1.0,0.0,,,,,,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.25,0.0
b,50.0,8.0,-1.0,-1.0,0.0,,,,,,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


## Efficient FC Params

`EfficientFCParameters` contains the features and parameters that can be calculated quickly:

In [12]:
# Instantiate the efficient settings.
settings_efficient = EfficientFCParameters()

In [16]:
# Report how many feature calculators the efficient settings enable, then list them.
# len() and list() already operate on a mapping's keys, so the extra
# .keys() / list() round-trips were redundant; the printed output is unchanged.
print(f"There are {len(settings_efficient)} features")
print(list(settings_efficient))

There are 61 features
['variance_larger_than_standard_deviation', 'has_duplicate_max', 'has_duplicate_min', 'has_duplicate', 'sum_values', 'abs_energy', 'mean_abs_change', 'mean_change', 'mean_second_derivative_central', 'median', 'mean', 'length', 'standard_deviation', 'variance', 'skewness', 'kurtosis', 'absolute_sum_of_changes', 'longest_strike_below_mean', 'longest_strike_above_mean', 'count_above_mean', 'count_below_mean', 'last_location_of_maximum', 'first_location_of_maximum', 'last_location_of_minimum', 'first_location_of_minimum', 'percentage_of_reoccurring_datapoints_to_all_datapoints', 'percentage_of_reoccurring_values_to_all_values', 'sum_of_reoccurring_values', 'sum_of_reoccurring_data_points', 'ratio_value_number_to_time_series_length', 'maximum', 'minimum', 'time_reversal_asymmetry_statistic', 'c3', 'cid_ce', 'symmetry_looking', 'large_standard_deviation', 'quantile', 'autocorrelation', 'agg_autocorrelation', 'partial_autocorrelation', 'number_cwt_peaks', 'number_peaks',

In [19]:
# Display the efficient settings mapping (calculator name -> parameters).
settings_efficient

{'variance_larger_than_standard_deviation': None,
 'has_duplicate_max': None,
 'has_duplicate_min': None,
 'has_duplicate': None,
 'sum_values': None,
 'abs_energy': None,
 'mean_abs_change': None,
 'mean_change': None,
 'mean_second_derivative_central': None,
 'median': None,
 'mean': None,
 'length': None,
 'standard_deviation': None,
 'variance': None,
 'skewness': None,
 'kurtosis': None,
 'absolute_sum_of_changes': None,
 'longest_strike_below_mean': None,
 'longest_strike_above_mean': None,
 'count_above_mean': None,
 'count_below_mean': None,
 'last_location_of_maximum': None,
 'first_location_of_maximum': None,
 'last_location_of_minimum': None,
 'first_location_of_minimum': None,
 'percentage_of_reoccurring_datapoints_to_all_datapoints': None,
 'percentage_of_reoccurring_values_to_all_values': None,
 'sum_of_reoccurring_values': None,
 'sum_of_reoccurring_data_points': None,
 'ratio_value_number_to_time_series_length': None,
 'maximum': None,
 'minimum': None,
 'time_reversal_

## Comprehensive FC Params

Do you see the parameters stored as values in the fc_parameters dictionary? Those are the parameters of the feature extraction methods.

In detail, a value in an fc_parameters dictionary can contain a list of dictionaries. **Every dictionary in that list is one feature.** This is the main difference between the comprehensive settings and the minimal settings shown earlier, whose values are all `None`.

In [17]:
# Instantiate the comprehensive settings.
settings_comprehensive = ComprehensiveFCParameters()

In [18]:
# Report how many feature calculators the comprehensive settings enable, then list them.
# len() and list() already operate on a mapping's keys, so the extra
# .keys() / list() round-trips were redundant; the printed output is unchanged.
print(f"There are {len(settings_comprehensive)} features")
print(list(settings_comprehensive))

There are 63 features
['variance_larger_than_standard_deviation', 'has_duplicate_max', 'has_duplicate_min', 'has_duplicate', 'sum_values', 'abs_energy', 'mean_abs_change', 'mean_change', 'mean_second_derivative_central', 'median', 'mean', 'length', 'standard_deviation', 'variance', 'skewness', 'kurtosis', 'absolute_sum_of_changes', 'longest_strike_below_mean', 'longest_strike_above_mean', 'count_above_mean', 'count_below_mean', 'last_location_of_maximum', 'first_location_of_maximum', 'last_location_of_minimum', 'first_location_of_minimum', 'percentage_of_reoccurring_datapoints_to_all_datapoints', 'percentage_of_reoccurring_values_to_all_values', 'sum_of_reoccurring_values', 'sum_of_reoccurring_data_points', 'ratio_value_number_to_time_series_length', 'sample_entropy', 'maximum', 'minimum', 'time_reversal_asymmetry_statistic', 'c3', 'cid_ce', 'symmetry_looking', 'large_standard_deviation', 'quantile', 'autocorrelation', 'agg_autocorrelation', 'partial_autocorrelation', 'number_cwt_peaks

In [20]:
# Display the comprehensive settings mapping (calculator name -> parameters).
settings_comprehensive

{'variance_larger_than_standard_deviation': None,
 'has_duplicate_max': None,
 'has_duplicate_min': None,
 'has_duplicate': None,
 'sum_values': None,
 'abs_energy': None,
 'mean_abs_change': None,
 'mean_change': None,
 'mean_second_derivative_central': None,
 'median': None,
 'mean': None,
 'length': None,
 'standard_deviation': None,
 'variance': None,
 'skewness': None,
 'kurtosis': None,
 'absolute_sum_of_changes': None,
 'longest_strike_below_mean': None,
 'longest_strike_above_mean': None,
 'count_above_mean': None,
 'count_below_mean': None,
 'last_location_of_maximum': None,
 'first_location_of_maximum': None,
 'last_location_of_minimum': None,
 'first_location_of_minimum': None,
 'percentage_of_reoccurring_datapoints_to_all_datapoints': None,
 'percentage_of_reoccurring_values_to_all_values': None,
 'sum_of_reoccurring_values': None,
 'sum_of_reoccurring_data_points': None,
 'ratio_value_number_to_time_series_length': None,
 'sample_entropy': None,
 'maximum': None,
 'minimum

In [21]:
# Parameter sets configured for the large_standard_deviation calculator:
# a list with one dict per value of r.
settings_comprehensive['large_standard_deviation']

[{'r': 0.05},
 {'r': 0.1},
 {'r': 0.15000000000000002},
 {'r': 0.2},
 {'r': 0.25},
 {'r': 0.30000000000000004},
 {'r': 0.35000000000000003},
 {'r': 0.4},
 {'r': 0.45},
 {'r': 0.5},
 {'r': 0.55},
 {'r': 0.6000000000000001},
 {'r': 0.65},
 {'r': 0.7000000000000001},
 {'r': 0.75},
 {'r': 0.8},
 {'r': 0.8500000000000001},
 {'r': 0.9},
 {'r': 0.9500000000000001}]

The above indicates that 19 features will be calculated for `large_standard_deviation` for each column.

In [23]:
# Settings that enable only the large_standard_deviation calculator, reusing the
# parameter list from the comprehensive settings (the same list object, not a copy).
# NOTE(review): despite the variable name, nothing here relates to value_count;
# the name is kept because later cells reference it.
settings_value_count = {
    "large_standard_deviation": settings_comprehensive["large_standard_deviation"],
}

In [24]:
# Display the single-calculator settings dict.
settings_value_count

{'large_standard_deviation': [{'r': 0.05},
  {'r': 0.1},
  {'r': 0.15000000000000002},
  {'r': 0.2},
  {'r': 0.25},
  {'r': 0.30000000000000004},
  {'r': 0.35000000000000003},
  {'r': 0.4},
  {'r': 0.45},
  {'r': 0.5},
  {'r': 0.55},
  {'r': 0.6000000000000001},
  {'r': 0.65},
  {'r': 0.7000000000000001},
  {'r': 0.75},
  {'r': 0.8},
  {'r': 0.8500000000000001},
  {'r': 0.9},
  {'r': 0.9500000000000001}]}

In [25]:
# Extract one feature per parameter set in settings_value_count, for every value column.
X_tsfresh = extract_features(
    df,
    column_id="id",
    default_fc_parameters=settings_value_count,
)
X_tsfresh.head()


Feature Extraction: 100%|██████████| 4/4 [00:00<00:00, 1862.69it/s]


variable,pressure__large_standard_deviation__r_0.05,pressure__large_standard_deviation__r_0.1,pressure__large_standard_deviation__r_0.15000000000000002,pressure__large_standard_deviation__r_0.2,pressure__large_standard_deviation__r_0.25,pressure__large_standard_deviation__r_0.30000000000000004,pressure__large_standard_deviation__r_0.35000000000000003,pressure__large_standard_deviation__r_0.4,pressure__large_standard_deviation__r_0.45,pressure__large_standard_deviation__r_0.5,...,temperature__large_standard_deviation__r_0.5,temperature__large_standard_deviation__r_0.55,temperature__large_standard_deviation__r_0.6000000000000001,temperature__large_standard_deviation__r_0.65,temperature__large_standard_deviation__r_0.7000000000000001,temperature__large_standard_deviation__r_0.75,temperature__large_standard_deviation__r_0.8,temperature__large_standard_deviation__r_0.8500000000000001,temperature__large_standard_deviation__r_0.9,temperature__large_standard_deviation__r_0.9500000000000001
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
a,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
b,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Reconstructing how the features were calculated

In [26]:
# Reconstruct, from the feature column names alone, which calculators and
# parameters produced each column.
from_columns(X_tsfresh.columns)

{'pressure': {'large_standard_deviation': [{'r': 0.05},
   {'r': 0.1},
   {'r': 0.15000000000000002},
   {'r': 0.2},
   {'r': 0.25},
   {'r': 0.30000000000000004},
   {'r': 0.35000000000000003},
   {'r': 0.4},
   {'r': 0.45},
   {'r': 0.5},
   {'r': 0.55},
   {'r': 0.6000000000000001},
   {'r': 0.65},
   {'r': 0.7000000000000001},
   {'r': 0.75},
   {'r': 0.8},
   {'r': 0.8500000000000001},
   {'r': 0.9},
   {'r': 0.9500000000000001}]},
 'temperature': {'large_standard_deviation': [{'r': 0.05},
   {'r': 0.1},
   {'r': 0.15000000000000002},
   {'r': 0.2},
   {'r': 0.25},
   {'r': 0.30000000000000004},
   {'r': 0.35000000000000003},
   {'r': 0.4},
   {'r': 0.45},
   {'r': 0.5},
   {'r': 0.55},
   {'r': 0.6000000000000001},
   {'r': 0.65},
   {'r': 0.7000000000000001},
   {'r': 0.75},
   {'r': 0.8},
   {'r': 0.8500000000000001},
   {'r': 0.9},
   {'r': 0.9500000000000001}]}}

**This means that you should never change a column name. Otherwise, the information about how it was calculated can be lost.**