:parameters
ENGINE_ID: Engine ID column

In [40]:
# Packages module Code
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.metrics import confusion_matrix, accuracy_score  #Scikit Learn
import os
import random
import argparse
import requests
from tqdm import tqdm  #tqdm is used for progress bar

In [41]:
# Helper for persisting a DataFrame as a csv file
def save_csv(s_data,file_name):
    """Write ``s_data`` to ``<file_name>.csv``.

    The column names are written as a header row and the index is left out.

    :param s_data: The DataFrame to store
    :param file_name: Target file name without the ``.csv`` extension
    """
    target_path = '{}.csv'.format(file_name)
    s_data.to_csv(target_path, index=False, header=True)


In [42]:
# Command-line options for running in docker.
# Each option falls back to an environment variable and, if that is not set,
# to a hard-coded default; a value given on the command line wins over both.
parser = argparse.ArgumentParser(description="Run Data Preprocessor.")

# Share of the train engines reserved for the initial model
parser.add_argument(
    "--engine_percentage_initial",
    default=os.environ.get("ENGINE_PERCENTAGE_INITIAL", 10),
    help="Percentage of train engines used for initial model training.",
    type=int,
)

# Share of the test engines used for cross validation
parser.add_argument(
    "--engine_percentage_val",
    default=os.environ.get("ENGINE_PERCENTAGE_VAL", 50),
    help="Percentage of test engines used for cross validation.",
    type=int,
)

# How many worker/child nodes receive a data subset
parser.add_argument(
    "--worker_count",
    default=os.environ.get("WORKER_COUNT", 6),
    help="Number of workers/Child nodes used.",
    type=int,
)

_StoreAction(option_strings=['--worker_count'], dest='worker_count', nargs=None, const=None, default=6, type=<class 'int'>, choices=None, required=False, help='Number of workers/Child nodes used.', metavar=None)

In [43]:
#IMPORTING and assigning TRAIN,TEST and RUL DATA
def import_data(dataset_id):
    """Read the turbofan training data, test data and test RUL values for one dataset.

    The files ``train_FD<id>.txt``, ``test_FD<id>.txt`` and ``RUL_FD<id>.txt`` are
    looked up relative to the current working directory and parsed as
    single-space separated tables without a header row, so the returned frames
    carry integer column labels (0, 1, 2, ...).

    :param dataset_id: The dataset from turbofan to import (inserted into the file names)
    :return: A tuple ``(train, test, rul)`` holding three pandas DataFrames
    """
    file_templates = ('train_FD{}.txt', 'test_FD{}.txt', 'RUL_FD{}.txt')
    # All three files are parsed with identical settings. Any trailing
    # separators on a line are parsed as extra, empty (NaN) columns.
    train_frame, test_frame, rul_frame = (
        pd.read_csv(template.format(dataset_id), sep=' ', header=None)
        for template in file_templates
    )
    return train_frame, test_frame, rul_frame
# Read the dataset id from stdin and load the three matching files
train_initial_data, test_initial_data, RUL_values = import_data(str(input()))
#test_initial_data.insert(26, column = "conditions", value = "6")
test_initial_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,20,21,22,23,24,25,26,27
0,1,1,0.0023,0.0003,100.0,518.67,643.02,1585.29,1398.21,14.62,...,8125.55,8.4052,0.03,392,2388,100.0,38.86,23.3735,,
1,1,2,-0.0027,-0.0003,100.0,518.67,641.71,1588.45,1395.42,14.62,...,8139.62,8.3803,0.03,393,2388,100.0,39.02,23.3916,,
2,1,3,0.0003,0.0001,100.0,518.67,642.46,1586.94,1401.34,14.62,...,8130.10,8.4441,0.03,393,2388,100.0,39.08,23.4166,,
3,1,4,0.0042,0.0000,100.0,518.67,642.44,1584.12,1406.42,14.62,...,8132.90,8.3917,0.03,391,2388,100.0,39.00,23.3737,,
4,1,5,0.0014,0.0000,100.0,518.67,642.51,1587.19,1401.92,14.62,...,8129.54,8.4031,0.03,390,2388,100.0,38.99,23.4130,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13091,100,194,0.0049,0.0000,100.0,518.67,643.24,1599.45,1415.79,14.62,...,8213.28,8.4715,0.03,394,2388,100.0,38.65,23.1974,,
13092,100,195,-0.0011,-0.0001,100.0,518.67,643.22,1595.69,1422.05,14.62,...,8210.85,8.4512,0.03,395,2388,100.0,38.57,23.2771,,
13093,100,196,-0.0006,-0.0003,100.0,518.67,643.44,1593.15,1406.82,14.62,...,8217.24,8.4569,0.03,395,2388,100.0,38.62,23.2051,,
13094,100,197,-0.0038,0.0001,100.0,518.67,643.26,1594.99,1419.36,14.62,...,8220.48,8.4711,0.03,395,2388,100.0,38.66,23.2699,,


1. Column 1: Corresponds to engine number (This column is indexed 0 above because of Python's numbering convention)
2. Column 2: Corresponds to cycle number. If engine 1 fails after 192 cycles, the entries of second column for engine 1 will go from 1 to 192. Similarly for other engines.
3. Columns 3,4,5: 3 operational settings
4. Columns 6 to 26: 21 sensor measurements

In [44]:
# Remove the columns that only carry NaN values: labels 26 and 27 in the
# train/test frames (see the table above) and label 1 in the RUL frame.
train_initial_data = train_initial_data.drop(columns=[26, 27])
test_initial_data = test_initial_data.drop(columns=[26, 27])
RUL_values = RUL_values.drop(columns=[1])
RUL_values

Unnamed: 0,0
0,112
1,98
2,69
3,82
4,91
...,...
95,137
96,82
97,59
98,117


Column 1: ENGINE_ID
Column 2: CYCLE_TIME
Column 3,4,5: operational settings
Column 6 to 26: 21 sensor measurements

In [45]:
# Train and test frames share one layout: engine id, cycle counter,
# 3 operational settings and 21 sensor measurements (26 columns in total).
column_names = [
    'ENGINE_ID', 'Cycle_Time',
    'OpSet1', 'OpSet2', 'OpSet3',
    'SensorMeasure1', 'SensorMeasure2', 'SensorMeasure3',
    'SensorMeasure4', 'SensorMeasure5', 'SensorMeasure6',
    'SensorMeasure7', 'SensorMeasure8', 'SensorMeasure9',
    'SensorMeasure10', 'SensorMeasure11', 'SensorMeasure12',
    'SensorMeasure13', 'SensorMeasure14', 'SensorMeasure15',
    'SensorMeasure16', 'SensorMeasure17', 'SensorMeasure18',
    'SensorMeasure19', 'SensorMeasure20', 'SensorMeasure21',
]
train_initial_data.columns = column_names
test_initial_data.columns = column_names
test_initial_data

Unnamed: 0,ENGINE_ID,Cycle_Time,OpSet1,OpSet2,OpSet3,SensorMeasure1,SensorMeasure2,SensorMeasure3,SensorMeasure4,SensorMeasure5,...,SensorMeasure12,SensorMeasure13,SensorMeasure14,SensorMeasure15,SensorMeasure16,SensorMeasure17,SensorMeasure18,SensorMeasure19,SensorMeasure20,SensorMeasure21
0,1,1,0.0023,0.0003,100.0,518.67,643.02,1585.29,1398.21,14.62,...,521.72,2388.03,8125.55,8.4052,0.03,392,2388,100.0,38.86,23.3735
1,1,2,-0.0027,-0.0003,100.0,518.67,641.71,1588.45,1395.42,14.62,...,522.16,2388.06,8139.62,8.3803,0.03,393,2388,100.0,39.02,23.3916
2,1,3,0.0003,0.0001,100.0,518.67,642.46,1586.94,1401.34,14.62,...,521.97,2388.03,8130.10,8.4441,0.03,393,2388,100.0,39.08,23.4166
3,1,4,0.0042,0.0000,100.0,518.67,642.44,1584.12,1406.42,14.62,...,521.38,2388.05,8132.90,8.3917,0.03,391,2388,100.0,39.00,23.3737
4,1,5,0.0014,0.0000,100.0,518.67,642.51,1587.19,1401.92,14.62,...,522.15,2388.03,8129.54,8.4031,0.03,390,2388,100.0,38.99,23.4130
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13091,100,194,0.0049,0.0000,100.0,518.67,643.24,1599.45,1415.79,14.62,...,520.69,2388.00,8213.28,8.4715,0.03,394,2388,100.0,38.65,23.1974
13092,100,195,-0.0011,-0.0001,100.0,518.67,643.22,1595.69,1422.05,14.62,...,521.05,2388.09,8210.85,8.4512,0.03,395,2388,100.0,38.57,23.2771
13093,100,196,-0.0006,-0.0003,100.0,518.67,643.44,1593.15,1406.82,14.62,...,521.18,2388.04,8217.24,8.4569,0.03,395,2388,100.0,38.62,23.2051
13094,100,197,-0.0038,0.0001,100.0,518.67,643.26,1594.99,1419.36,14.62,...,521.33,2388.08,8220.48,8.4711,0.03,395,2388,100.0,38.66,23.2699


In [46]:
#Adding RUL to test data
def add_rul_to_data(test_data,data_rul):
    """Return the test data enhanced with a per-cycle ``RUL`` and a ``CONDITION_ID`` column.

    ``data_rul`` holds the final RUL of every test engine, i.e. the remaining
    useful life at the engine's last recorded cycle. The RUL of an earlier
    cycle is therefore that final value plus the number of cycles still to be
    recorded: ``RUL = final_rul + (max Cycle_Time of the engine - Cycle_Time)``.

    :param test_data: The test data to enhance; needs the columns ``ENGINE_ID``
        and ``Cycle_Time``. It is not modified.
    :param data_rul: The final RUL values for the engines in the test data, one
        row per engine, where row ``i`` belongs to engine ``i + 1``. NOTE: this
        frame IS modified in place: an ``ENGINE_ID`` column is added and the
        columns are renamed to ``final_rul`` / ``ENGINE_ID``.
    :return: A new DataFrame with ``CONDITION_ID`` (constant 1) inserted as the
        second column and ``RUL`` appended as the last column
    """
    # Tag every final RUL value with its (1-based) engine id
    data_rul['ENGINE_ID'] = data_rul.index + 1
    data_rul.columns = ['final_rul', 'ENGINE_ID']

    # Last recorded cycle of every engine in the test data
    last_cycle = (
        test_data.groupby('ENGINE_ID')['Cycle_Time'].max().rename('max').reset_index()
    )

    enriched = (
        test_data
        .merge(data_rul, on='ENGINE_ID', how='left')
        .merge(last_cycle, on='ENGINE_ID', how='left')
    )

    # Current RUL for every cycle. The final RUL must be included, otherwise
    # every engine would appear to fail at its last recorded cycle.
    enriched['RUL'] = enriched['final_rul'] + (enriched['max'] - enriched['Cycle_Time'])
    enriched = enriched.drop(columns=['max', 'final_rul'])

    #Feature ID for various conditions
    enriched.insert(1, 'CONDITION_ID', value=1)

    return enriched
# Attach the RUL and CONDITION_ID columns to the test frame
# (this call also adds engine ids to RUL_values in place)
test_initial_data = add_rul_to_data(test_data=test_initial_data, data_rul=RUL_values)

In [47]:
# Last recorded cycle of every training engine, as columns ENGINE_ID / max
train_rul = (
    train_initial_data.groupby('ENGINE_ID')['Cycle_Time'].max().rename('max').reset_index()
)
# Bring the per-engine maximum next to every cycle row
train_initial_data = train_initial_data.merge(train_rul, how='left', on='ENGINE_ID')
# Current RUL of a cycle = cycles left until the engine's last recorded cycle
train_initial_data['RUL'] = train_initial_data['max'] - train_initial_data['Cycle_Time']
# The helper column is no longer needed once RUL is computed
train_initial_data = train_initial_data.drop(columns='max')

EOL = end-of-life cycle (the last recorded cycle of a training engine)
RUL = remaining useful life
RUL = EOL - Cycle_Time

In [48]:
# To inspect a single engine, filter on its id, e.g.
#test_initial_data[test_initial_data['ENGINE_ID'] == 1]

# Persist both prepared frames as csv files (test first, then train)
for _csv_name, _frame in (
    ('test_initial_data', test_initial_data),
    ('train_initial_data', train_initial_data),
):
    save_csv(_frame, _csv_name)

In [49]:
#SPLIT TRAINING DATA
#Divides the training data into the set for initial training at the central node and one set per worker
def split_train_data_by_engines(train_data, engine_percentage_initial, worker_count, seed=None):
    """ Groups the train data by engines and splits it into subsets for initial training and for each worker.

    Engines are shuffled and then partitioned: the first
    ``int(n_engines * engine_percentage_initial / 100)`` engines form the initial
    training set, and all remaining engines are spread over the workers. When
    the remaining engines do not divide evenly, the first workers receive one
    engine more than the others, so no engine is dropped.

    :param train_data: The full training data set; needs an ``ENGINE_ID`` column
    :param engine_percentage_initial: The percentage of engines to take for initial training at central node
    :param worker_count: The number of workers/child node engines to prepare data sets for
    :param seed: Optional seed for a reproducible shuffle. With the default
        ``None`` the module-level ``random`` state is used.
    :return: A tuple ``(train_data_initial, train_data_worker)``: the DataFrame
        for the central node and a list with one DataFrame per worker
    :raises ValueError: If ``worker_count`` is smaller than 1, or if the initial
        set or any worker would end up without a single engine
    """
    if worker_count < 1:
        raise ValueError('worker_count must be at least 1, got {}'.format(worker_count))

    # One frame per engine, shuffled so the assignment to nodes is random
    engine_frames = [frame for _, frame in train_data.groupby('ENGINE_ID')]
    shuffler = random if seed is None else random.Random(seed)
    shuffler.shuffle(engine_frames)

    # split into data for initial training and data for the worker nodes
    engine_count_initial = int(len(engine_frames) * engine_percentage_initial / 100)
    if engine_count_initial < 1:
        raise ValueError(
            'engine_percentage_initial={} selects no engine out of {}'.format(
                engine_percentage_initial, len(engine_frames)))
    train_data_initial = pd.concat(engine_frames[:engine_count_initial])
    worker_frames = engine_frames[engine_count_initial:]

    # base engines for every worker, plus one extra for the first `extra` workers
    engine_count_worker, extra = divmod(len(worker_frames), worker_count)
    if engine_count_worker < 1:
        raise ValueError(
            'only {} engines left for {} workers'.format(len(worker_frames), worker_count))

    # split worker data into the data sets for every single worker
    train_data_worker = []
    start = 0
    for worker_index in range(worker_count):
        end = start + engine_count_worker + (1 if worker_index < extra else 0)
        train_data_worker.append(pd.concat(worker_frames[start:end]))
        start = end

    return train_data_initial, train_data_worker