In [None]:
import os

import pandas as pd
from tsfresh import extract_features, extract_relevant_features

# Relative locations of the project root and of the CSV folder beneath it.
ROOT = "../"
DATA_DIR = ROOT + "data_28/"  # holds dataset_{00..27}.csv

def get_i_datasets(numSets, data_dir=None):
    """
    Load the first ``numSets`` datasets and stack them into one DataFrame.

    Files ``dataset_00.csv`` .. ``dataset_{numSets-1:02d}.csv`` are read from
    ``data_dir``. For each file the column pandas names 'Unnamed: 0' (a
    header-less first column) is renamed to 'index', and an 'id' column holding
    the dataset number is inserted as the first column. Values are otherwise
    left untouched (missing 'rtt' entries stay NaN).

    :param numSets: how many datasets to load, from 1 to 28 inclusive
    :type numSets: int
    :param data_dir: directory containing the CSV files; when None the
        module-level DATA_DIR is used
    :type data_dir: str or None
    :return: all loaded rows in dataset order, with a fresh 0..n-1 row index
    :return type: pandas.DataFrame
    :raises AssertionError: if numSets is not in 1..28
    """
    assert numSets in range(1, 29), "numSets must be an integer from 1 to 28"

    if data_dir is None:
        data_dir = DATA_DIR

    # Collect the per-file frames and concatenate once: DataFrame.append was
    # removed in pandas 2.0, and growing a frame inside the loop re-copies all
    # previously loaded rows on every iteration.
    frames = []
    for i in range(numSets):
        path = os.path.join(data_dir, f"dataset_{i:02d}.csv")
        frame = pd.read_csv(path).rename(columns={'Unnamed: 0': 'index'})
        frame.insert(0, 'id', i)  # dataset number becomes the first column
        frames.append(frame)

    return pd.concat(frames, ignore_index=True)

In [None]:
# DataFrames to extract features from.
# Every CSV is read exactly once into `master`; the smaller working sets are
# then sliced out of it by dataset id rather than re-reading the same files.
# reset_index gives each subset its own 0..n-1 row index, as a direct load would.
master = get_i_datasets(28)
half = master[master['id'] < 14].reset_index(drop=True)    # datasets 00..13
quarter = master[master['id'] < 7].reset_index(drop=True)  # datasets 00..06
tiny = master[master['id'] < 3].reset_index(drop=True)     # datasets 00..02

# master.describe()

In [None]:
# Working DataFrame for the cells below: leave exactly one assignment active.

# df = tiny    # 3
df = quarter   # 7
# df = half    # 14
# df = master  # 28

# 'id' value stored in the final row; the plotting cell loops over range(num + 1).
num = df['id'].iloc[-1]

In [None]:
# Print RTT diagrams of currently selected DataFrame
import matplotlib.pyplot as plt
%matplotlib inline

FIGURE_DIR = "../data_28/figures/" # root/data_28/figures

# Draw one RTT figure per dataset id, 0..num inclusive.
for i in range(num + 1):
    # Single .loc lookup instead of chained df[...][...] indexing.
    rtt_series = df.loc[df['id'] == i, 'rtt']

    fig, ax = plt.subplots(figsize=(16, 4))
    # use_index=False plots against 0..n-1 within this dataset. The Series
    # index is the row label of the concatenated frame, so using it would make
    # every dataset after the first start at an offset on the x axis.
    rtt_series.plot(ax=ax, use_index=False)

    ax.set_title(f"Dataset {i:02d} RTT", loc='left')
    ax.grid(axis='y', linestyle='-', linewidth=.4)  # horizontal grid lines only
    ax.set_xlabel('Measurement Index')
    ax.set_ylabel('Round Trip Time (ms)')
    # ax.set_yscale('log') # useful for datasets with extreme variations in RTTs

    # Uncomment to enable writing figures to FIGURE_DIR
    # fig.savefig(FIGURE_DIR + f'dataset{i:02d}.png', bbox_inches='tight')

    plt.show()
    plt.close(fig)  # release the figure so a long loop does not accumulate open figures

In [None]:
# Comprehensive Feature Extraction
# note: we don't need to include column_sort='datetime' since data is already sorted in ascending order

# features = extract_features(df, column_id="id", column_value="rtt")

In [None]:
# Forwarded as `disable_progressbar` to the extract_features calls in later cells.
disable_progress_bar = False

In [None]:
# Custom Feature Extraction
# https://tsfresh.readthedocs.io/en/latest/text/feature_extraction_settings.html

from tsfresh.feature_extraction import ComprehensiveFCParameters

# tsfresh's comprehensive settings object (extracts all default features).
# It is not passed to the extraction below, which uses `custom` instead.
default = ComprehensiveFCParameters()

# Calculator reference:
# https://tsfresh.readthedocs.io/en/latest/api/tsfresh.feature_extraction.html#tsfresh.feature_extraction.feature_calculators.mean_change

# Settings mapping: calculator name -> None (no parameters) or a list of
# parameter dicts, one feature per dict.
custom = dict(
    # number of entries in each time series
    length=None,
    # q quantile of the series: the latency value x for which q% of measurements are <= x
    quantile=[dict(q=0.9)],
    # approximate entropy; https://en.wikipedia.org/wiki/Approximate_entropy
    approximate_entropy=[dict(m=1, r=1)],
    # standard deviation
    standard_deviation=None,
    mean=None,
)
# Calculators tried earlier and currently switched off:
#   "large_standard_deviation": [{"r": 0.05}, {"r": 0.1}]
#   "mean_change": None                     (change in mean between subsequent time series)
#   "ratio_beyond_r_sigma": [{"r": 1.0}]    (ratio of measurements beyond r*std. dev. from the mean)
#   "variance": None
#   "variance_larger_than_standard_deviation": None
#   "variation_coefficient": None           (standard error / mean)
#   "autocorrelation": [{'lag': 1}]         (https://en.wikipedia.org/wiki/Autocorrelation#Estimation)
#   "binned_entropy": [{"max_bins": 100}]

# Run the extraction with the hand-picked settings: one series per 'id',
# ordered by 'index', values taken from 'rtt'.
features = extract_features(
    df,
    default_fc_parameters=custom,
    column_id="id",
    column_sort="index",
    column_value="rtt",
    disable_progressbar=disable_progress_bar,
)

print(features)

In [None]:
# Custom Features
# https://tsfresh.readthedocs.io/en/latest/text/how_to_add_custom_feature.html

from tsfresh.feature_extraction.feature_calculators import set_property
from tsfresh.feature_extraction import feature_calculators
import numpy as np

@set_property("fctype", "simple")
def num_outages(x):
    """
    Count how many entries of the time series x compare equal to 0.

    :param x: the time series to calculate the feature of
    :type x: numpy.ndarray
    :return: number of entries equal to zero
    :return type: int
    """
    zero_mask = x == 0  # element-wise flag: True where the reading is 0
    return np.count_nonzero(zero_mask)

@set_property("fctype", "simple")
def count_nonzero(x):
    """
    Count the entries of the time series x that are not zero.

    :param x: the time series to calculate the feature of
    :type x: numpy.ndarray
    :return: number of nonzero entries
    :return type: int
    """
    nonzero_total = np.count_nonzero(x)
    return nonzero_total

@set_property("fctype", "simple")
def noise_threshold(x):
    """
    Noise-threshold heuristic for a time series: 1.5 times its 75th percentile.

    Note that the upper quartile itself is scaled, not the interquartile range.

    :param x: the time series to calculate the feature of
    :type x: numpy.ndarray
    :return: 75th percentile of x multiplied by 1.5
    """
    upper_quartile = np.percentile(x, 75)
    return upper_quartile * 1.5


# Expose the custom calculators in the feature_calculators module namespace
# under the names used as keys in the settings dict.
vars(feature_calculators).update(
    num_outages=num_outages,
    count_nonzero=count_nonzero,
    noise_threshold=noise_threshold,
)

In [None]:
# Custom Feature Extraction
# https://tsfresh.readthedocs.io/en/latest/text/feature_extraction_settings.html

# Settings mapping for this run: the built-in `length` plus the three
# calculators registered in the previous cell.
custom = dict(
    length=None,           # number of entries in each time series
    num_outages=None,
    count_nonzero=None,
    noise_threshold=None,
)
# Built-in calculators currently switched off for this run:
#   "quantile": [{"q": 0.9}]                    (latency value x for which q% of measurements are <= x)
#   "approximate_entropy": [{'m': 1, 'r': 1}]   (https://en.wikipedia.org/wiki/Approximate_entropy)
#   "standard_deviation": None
#   "mean": None
#   "variance": None
#   "variation_coefficient": None               (standard error / mean)
#   "binned_entropy": [{"max_bins": 100}]

disable_progress_bar = True

# Run the extraction with the settings above: one series per 'id',
# ordered by 'index', values taken from 'rtt'.
features = extract_features(
    df,
    default_fc_parameters=custom,
    column_id="id",
    column_sort="index",
    column_value="rtt",
    disable_progressbar=disable_progress_bar,
)

print(features)