In [1]:
%load_ext lab_black

In [38]:
import zipfile
from pathlib import Path
from typing import Callable, Dict, Iterator, List, NamedTuple, Union
import numpy as np
import pandas as pd


def extract_dataframes(
    file_train, file_test, file_rul, subset="FD001", validation=0.00
):
    """Extract train, validation and test dataframe from source file.

    Parameters
    ----------
    file_train : str
        Training samples file.
    file_test : str
        Test samples file.
    subset: str, optional
        Subset. Either 'FD001' or 'FD002' or 'FD003' or 'FD004'.
    validation : float, optional
        Ratio of training samples to hold out for validation.

    Returns
    -------
    (DataFrame, DataFrame, DataFrame)
        Train dataframe, validation dataframe, test dataframe.
    """
    assert subset in ["FD001", "FD002", "FD003", "FD004"], (
        "'subset' must be either 'FD001' or 'FD002' or 'FD003' or 'FD004', got '"
        + subset
        + "'."
    )

    assert 0 <= validation <= 1, (
        "'validation' must be a value within [0, 1], got %.2f" % validation + "."
    )

    df = _load_data_from_file(file_train, subset=subset)

    # group by trajectory
    grouped = df.groupby("trajectory_id")

    df_train = []
    df_validation = []
    for _, traj in grouped:
        traj = traj.assign(rul=traj.index[-1] - traj.index)
        # randomize train/validation splitting
        if np.random.rand() <= (validation + 0.1) and len(df_validation) < round(
            len(grouped) * validation
        ):
            df_validation.append(traj)
        else:
            df_train.append(traj)

    # print info
    print("Number of training trajectories = " + str(len(df_train)))
    print("Number of validation trajectories = " + str(len(df_validation)))

    df_train = pd.concat(df_train)

    if len(df_validation) > 0:
        df_validation = pd.concat(df_validation)

    df_test = _load_data_from_file(file_test, subset=subset)
    rul = np.asarray(file_rul.readlines(), dtype=np.int32)
    cumul = []
    for traj_id, traj in df_test.groupby("trajectory_id"):
        cumul.append(traj.assign(rul=rul[traj_id - 1] - (traj.index - traj.index[0])))
    df_test = pd.concat(cumul)

    print("Done.")
    return df_train, df_validation, df_test


def _load_data_from_file(file, subset="FD001"):
    """Load data from source file into a dataframe.

    Parameters
    ----------
    file : str
        Source file.
    subset: str, optional
        Subset. Either 'FD001' or 'FD002' or 'FD003' or 'FD004'.

    Returns
    -------
    DataFrame
        Data organized into a dataframe.
    """
    assert subset in ["FD001", "FD002", "FD003", "FD004"], (
        "'subset' must be either 'FD001' or 'FD002' or 'FD003' or 'FD004', got '"
        + subset
        + "'."
    )

    n_operational_settings = 3
    n_sensors = 21

    # read csv
    df = pd.read_csv(file, sep=" ", header=None, index_col=False).fillna(method="bfill")
    df = df.dropna(axis="columns", how="all")

    assert (
        df.shape[1] == n_operational_settings + n_sensors + 2
    ), "Expected %d columns, got %d." % (
        n_operational_settings + n_sensors + 2,
        df.shape[1],
    )

    df.columns = (
        ["trajectory_id", "t"]
        + ["setting_" + str(i + 1) for i in range(n_operational_settings)]
        + ["sensor_" + str(i + 1) for i in range(n_sensors)]
    )

    # drop t
    df = df.drop(["t"], axis=1)

    if subset in ["FD001", "FD003"]:
        # drop operating_modes
        df = df.drop(
            ["setting_" + str(i + 1) for i in range(n_operational_settings)], axis=1
        )

        # drop sensors which are useless according to the literature
        to_drop = [1, 5, 6, 10, 16, 18, 19]
        df = df.drop(["sensor_" + str(d) for d in to_drop], axis=1)

    return df


def generate_parquet(args):
    for subset, window in [("FD001", 30), ("FD002", 20), ("FD003", 30), ("FD004", 15)]:
        print("**** %s ****" % subset)
        print("normalization = " + args.normalization)
        print("window = " + str(window))
        print("validation = " + str(args.validation))

        # read .zip file into memory
        with zipfile.ZipFile("data/cmapss/CMAPSSData.zip") as zip_file:
            file_train = zip_file.open("train_" + subset + ".txt")
            file_test = zip_file.open("test_" + subset + ".txt")
            file_rul = zip_file.open("RUL_" + subset + ".txt")

        print("Extracting dataframes...")
        df_train, df_val, df_test = extract_dataframes(
            file_train=file_train,
            file_test=file_test,
            file_rul=file_rul,
            subset=subset,
            validation=args.validation,
        )

        print("Generating parquet files...")
        path = Path(args.out_path, "parquet")
        if not path.exists():
            os.makedirs(path)
        for df, prefix in zip([df_train, df_val, df_test], ["train", "val", "test"]):
            if isinstance(df, pd.DataFrame):
                df.to_parquet(f"{path}/{prefix}_{subset}.parquet")


from types import SimpleNamespace

args = SimpleNamespace(out_path="data/cmapss/", normalization="min-max", validation=0.2)
generate_parquet(args)

**** FD001 ****
normalization = min-max
window = 30
validation = 0.2
Extracting dataframes...
Number of training trajectories = 80
Number of validation trajectories = 20
Done.
Generating parquet files...
**** FD002 ****
normalization = min-max
window = 20
validation = 0.2
Extracting dataframes...
Number of training trajectories = 208
Number of validation trajectories = 52
Done.
Generating parquet files...
**** FD003 ****
normalization = min-max
window = 30
validation = 0.2
Extracting dataframes...
Number of training trajectories = 80
Number of validation trajectories = 20
Done.
Generating parquet files...
**** FD004 ****
normalization = min-max
window = 15
validation = 0.2
Extracting dataframes...
Number of training trajectories = 199
Number of validation trajectories = 50
Done.
Generating parquet files...


In [12]:
from pathlib import Path

out_path = "data/cmapss/"
list(Path(f"{out_path}/parquet").glob("*.parquet"))

[PosixPath('data/cmapss/parquet/val_FD002.parquet'),
 PosixPath('data/cmapss/parquet/train_FD002.parquet'),
 PosixPath('data/cmapss/parquet/val_FD004.parquet'),
 PosixPath('data/cmapss/parquet/test_FD001.parquet'),
 PosixPath('data/cmapss/parquet/test_FD002.parquet'),
 PosixPath('data/cmapss/parquet/train_FD003.parquet'),
 PosixPath('data/cmapss/parquet/val_FD003.parquet'),
 PosixPath('data/cmapss/parquet/train_FD004.parquet'),
 PosixPath('data/cmapss/parquet/val_FD001.parquet'),
 PosixPath('data/cmapss/parquet/train_FD001.parquet'),
 PosixPath('data/cmapss/parquet/test_FD003.parquet'),
 PosixPath('data/cmapss/parquet/test_FD004.parquet')]

In [46]:
df_train = pd.read_parquet(f"{args.out_path}/parquet/train_FD001.parquet")
print(df_train.groupby("trajectory_id").ngroups)
df_train.query("trajectory_id==14")

80


Unnamed: 0,trajectory_id,sensor_2,sensor_3,sensor_4,sensor_7,sensor_8,sensor_9,sensor_11,sensor_12,sensor_13,sensor_14,sensor_15,sensor_17,sensor_20,sensor_21,rul
2709,14,642.88,1587.54,1403.23,552.82,2388.09,9058.57,47.46,521.60,2388.09,8137.36,8.4549,392,38.80,23.2687,179
2710,14,642.88,1590.81,1411.67,553.85,2388.11,9055.07,47.43,521.44,2388.07,8141.56,8.4449,393,38.91,23.3892,178
2711,14,642.57,1589.04,1407.39,553.50,2388.10,9062.75,47.61,521.85,2388.04,8140.02,8.4161,393,38.90,23.1882,177
2712,14,643.18,1593.24,1408.20,553.41,2388.10,9056.91,47.54,521.17,2388.12,8138.78,8.4396,393,38.87,23.2598,176
2713,14,642.38,1590.53,1407.83,553.57,2388.12,9057.83,47.54,521.05,2388.09,8142.30,8.4802,393,39.06,23.3189,175
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2884,14,643.49,1599.40,1427.06,551.48,2388.18,9125.20,48.08,520.14,2388.20,8191.75,8.5309,396,38.46,23.1078,4
2885,14,644.17,1599.20,1424.67,551.93,2388.21,9128.96,48.04,519.92,2388.18,8193.49,8.5166,397,38.46,23.0187,3
2886,14,643.31,1603.10,1432.46,551.16,2388.24,9132.68,48.26,519.98,2388.20,8194.69,8.4919,397,38.44,23.0785,2
2887,14,643.70,1609.93,1425.93,551.96,2388.24,9132.35,48.30,519.89,2388.18,8189.64,8.5305,396,38.34,23.0647,1


In [47]:
df_validation = pd.read_parquet(f"{args.out_path}/parquet/val_FD001.parquet")
(df_validation.groupby("trajectory_id").ngroups)
df_validation.query("trajectory_id==13")

Unnamed: 0,trajectory_id,sensor_2,sensor_3,sensor_4,sensor_7,sensor_8,sensor_9,sensor_11,sensor_12,sensor_13,sensor_14,sensor_15,sensor_17,sensor_20,sensor_21,rul
2546,13,641.94,1594.32,1405.50,554.14,2388.07,9051.55,47.46,520.87,2388.11,8131.99,8.4393,392,38.76,23.3793,162
2547,13,642.61,1592.31,1406.78,553.08,2388.11,9053.39,47.50,521.45,2388.08,8135.24,8.4325,393,38.97,23.2731,161
2548,13,642.23,1589.66,1410.36,553.29,2388.10,9061.71,47.43,521.66,2388.10,8137.34,8.4642,393,38.67,23.2775,160
2549,13,642.67,1593.12,1408.66,553.71,2388.12,9047.99,47.65,521.44,2388.16,8133.29,8.4115,391,38.80,23.3566,159
2550,13,642.77,1589.93,1408.31,553.21,2388.08,9057.89,47.56,521.32,2388.14,8133.30,8.3731,393,39.08,23.4748,158
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2704,13,643.53,1612.88,1418.86,551.15,2388.15,9146.79,48.11,520.50,2388.20,8212.93,8.5540,396,38.28,23.1357,4
2705,13,643.77,1597.42,1429.73,551.20,2388.17,9140.98,48.04,519.80,2388.13,8210.94,8.5109,397,38.49,23.2090,3
2706,13,643.50,1602.21,1432.17,551.85,2388.11,9144.17,48.34,520.26,2388.19,8210.25,8.5269,396,38.43,23.0185,2
2707,13,643.87,1605.83,1431.55,551.48,2388.16,9149.29,48.28,520.08,2388.15,8218.44,8.5015,397,38.35,23.1104,1


In [30]:
df_test = pd.read_parquet(f"{args.out_path}/parquet/test_FD001.parquet")
print(df_test.groupby("trajectory_id").ngroups)
df_test.query("trajectory_id==1")

100


Unnamed: 0,trajectory_id,sensor_2,sensor_3,sensor_4,sensor_7,sensor_8,sensor_9,sensor_11,sensor_12,sensor_13,sensor_14,sensor_15,sensor_17,sensor_20,sensor_21,rul
0,1,643.02,1585.29,1398.21,553.9,2388.04,9050.17,47.2,521.72,2388.03,8125.55,8.4052,392,38.86,23.3735,112
1,1,641.71,1588.45,1395.42,554.85,2388.01,9054.42,47.5,522.16,2388.06,8139.62,8.3803,393,39.02,23.3916,111
2,1,642.46,1586.94,1401.34,554.11,2388.05,9056.96,47.5,521.97,2388.03,8130.1,8.4441,393,39.08,23.4166,110
3,1,642.44,1584.12,1406.42,554.07,2388.03,9045.29,47.28,521.38,2388.05,8132.9,8.3917,391,39.0,23.3737,109
4,1,642.51,1587.19,1401.92,554.16,2388.01,9044.55,47.31,522.15,2388.03,8129.54,8.4031,390,38.99,23.413,108
5,1,642.11,1579.12,1395.13,554.22,2388.0,9050.96,47.26,521.92,2388.08,8127.46,8.4238,392,38.91,23.3467,107
6,1,642.11,1583.34,1404.84,553.89,2388.05,9051.39,47.31,522.01,2388.06,8134.97,8.3914,391,38.85,23.3952,106
7,1,642.54,1580.89,1400.89,553.59,2388.05,9052.86,47.21,522.09,2388.06,8125.93,8.4213,393,39.05,23.3224,105
8,1,641.88,1593.29,1412.28,554.49,2388.06,9048.55,47.37,522.03,2388.05,8134.15,8.4353,391,39.1,23.4521,104
9,1,642.07,1585.25,1398.64,554.28,2388.04,9051.95,47.14,522.0,2388.06,8134.08,8.4093,391,38.87,23.382,103


In [20]:
with zipfile.ZipFile(f"{args.out_path}/CMAPSSData.zip") as zip_file:
    file_rul = zip_file.open("RUL_FD001.txt")
np.asarray(file_rul.readlines(), dtype=np.int32)

array([112,  98,  69,  82,  91,  93,  91,  95, 111,  96,  97, 124,  95,
       107,  83,  84,  50,  28,  87,  16,  57, 111, 113,  20, 145, 119,
        66,  97,  90, 115,   8,  48, 106,   7,  11,  19,  21,  50, 142,
        28,  18,  10,  59, 109, 114,  47, 135,  92,  21,  79, 114,  29,
        26,  97, 137,  15, 103,  37, 114, 100,  21,  54,  72,  28, 128,
        14,  77,   8, 121,  94, 118,  50, 131, 126, 113,  10,  34, 107,
        63,  90,   8,   9, 137,  58, 118,  89, 116, 115, 136,  28,  38,
        20,  85,  55, 128, 137,  82,  59, 117,  20], dtype=int32)