## Extract FL Data

In [89]:
import numpy as np
import pandas as pd

In [90]:
# The four fd003 CSVs are all space-separated and are read with identical
# options, so they are loaded in a single pass and unpacked in order.
train_df, test_df, train_labels_df, test_labels_df = (
    pd.read_csv(csv_path, sep=' ')
    for csv_path in (
        '../../data_analysis/fd003/fd003-raw_train.csv',
        '../../data_analysis/fd003/fd003-raw_test.csv',
        '../../data_analysis/fd003/fd003-training_labels.csv',
        '../../data_analysis/fd003/fd003-testing_labels.csv',
    )
)

# This file is read without a header row, so its columns are named
# positionally until they are relabelled further down.
test_labels_at_break_df = pd.read_csv(
    '../../TED/CMAPSSData/RUL_FD003.txt',
    sep=' ',
    header=None,
)

In [91]:
columns = train_df.columns

# Names of the sensor columns that are kept: sensors 1..21 minus the
# excluded sensor numbers listed in the filter below.
ms_used = [
    'SensorMeasure' + str(sensor_no)
    for sensor_no in range(1, 22)
    if sensor_no not in (1, 5, 6, 9, 10, 14, 16, 18, 19)
]

In [92]:
# --- Test side -------------------------------------------------------------
# Collapse the test readings to one row per ID using GroupBy.last(), then
# restore 'ID' as an ordinary column.
test_at_break_df = test_df.groupby(['ID']).last().reset_index()

# The label file was read without a header: name its two columns and discard
# the second one (presumably a trailing-separator artifact - confirm).
test_labels_at_break_df.columns = ['RUL', 'NaN']
del test_labels_at_break_df['NaN']

# Attach the selected sensor columns and the ID to the label frame in place.
test_labels_at_break_df[ms_used + ['ID']] = test_at_break_df[ms_used + ['ID']]

# --- Train side ------------------------------------------------------------
# Same layout for training: RUL first, then the selected sensors, then ID.
train_labels_df[ms_used + ['ID']] = train_df[ms_used + ['ID']]

# From here on train_df / test_df refer to the combined label+sensor frames.
train_df = train_labels_df.copy()
test_df = test_labels_at_break_df.copy()

# Cap the training target at 125; the test target is left as loaded.
train_df['RUL'] = train_df['RUL'].clip(upper=125)

In [93]:
# Display the prepared training frame (RUL, selected sensor columns, ID).
train_df

Unnamed: 0,RUL,SensorMeasure2,SensorMeasure3,SensorMeasure4,SensorMeasure7,SensorMeasure8,SensorMeasure11,SensorMeasure12,SensorMeasure13,SensorMeasure15,SensorMeasure17,SensorMeasure20,SensorMeasure21,ID
0,125,642.36,1583.23,1396.84,553.97,2387.96,47.30,522.31,2388.01,8.4246,391,39.11,23.3537,1
1,125,642.50,1584.69,1396.89,554.55,2388.00,47.23,522.42,2388.03,8.4403,392,38.99,23.4491,1
2,125,642.18,1582.35,1405.61,554.43,2388.03,47.22,522.03,2388.00,8.3901,391,38.85,23.3669,1
3,125,642.92,1585.61,1392.27,555.21,2388.00,47.24,522.49,2388.08,8.3878,392,38.96,23.2951,1
4,125,641.68,1588.63,1397.65,554.74,2388.04,47.15,522.58,2388.03,8.3869,392,39.14,23.4583,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24715,4,643.78,1596.01,1424.11,551.86,2388.25,48.27,519.66,2388.30,8.5036,394,38.44,22.9631,100
24716,3,643.29,1596.38,1429.14,551.86,2388.23,48.13,519.91,2388.28,8.5174,395,38.50,22.9746,100
24717,2,643.84,1604.53,1431.41,551.30,2388.25,48.18,519.44,2388.24,8.5223,396,38.39,23.0682,100
24718,1,643.94,1597.56,1426.57,550.69,2388.26,48.05,520.01,2388.26,8.5148,395,38.31,23.0753,100


In [94]:
# Group both frames by ID; sort=False keeps the groups in the order in which
# each ID first appears. split_data() reads these two module-level objects.
train_gb, test_gb = (
    frame.groupby(['ID'], sort=False)
    for frame in (train_df, test_df)
)

In [95]:
# Args: List of pcts -> ex: [0.75, 0.2, 0.05]
def split_data(pcts, mode = 'train'):
    """Randomly split the ID groups among workers and write one CSV per worker.

    Reads the module-level ``train_gb`` / ``test_gb`` groupby objects and the
    module-level ``ms_used`` list.

    Args:
        pcts: Sequence of fractions, one per worker (e.g. ``[0.75, 0.2, 0.05]``).
            Worker ``k`` receives ``floor(pcts[k] * number_of_groups)`` IDs,
            drawn without replacement from the IDs not yet assigned.
        mode: ``'train'`` selects ``train_gb``; any other value selects
            ``test_gb``. The value is also used as the output file prefix.

    Raises:
        ValueError: If a fraction is too small to yield at least one ID for
            its worker (nothing is written in that case).

    Output:
        For worker ``k`` a file
        ``./decision-trees/fd003/raw/<n> workers/<p0>-<p1>-.../<mode>_partition_<k>.csv``
        where ``<n>`` is ``len(pcts)`` and ``<pK>`` is the fraction as a whole
        percentage. The first column of the grouped frame is written as ``y``,
        the following ``len(ms_used)`` columns as ``x0, x1, ...``; the last
        column (``ID``) is dropped. The frame index is written under the
        label ``id``.

    Notes:
        * Group keys are assumed to be the integers ``1..len(gb)``.
        * The grouped frame is assumed to have exactly
          ``1 + len(ms_used) + 1`` columns in the order target, sensors, ID.
        * Sampling uses NumPy's global random state; no seed is set here.
    """
    import os  # local import: only needed to create the output folder

    gb = train_gb if mode == 'train' else test_gb
    n_groups = len(gb)
    n_workers = len(pcts)

    # A float product such as 0.57 * 100 evaluates to 56.99999999999999, which
    # a bare int() would truncate to 56. Nudge by a tiny tolerance first so
    # the intended whole number is obtained while still rounding down.
    eps = 1e-9
    nums = [int(pct * n_groups + eps) for pct in pcts]

    # Fail before sampling or writing anything, instead of crashing with an
    # IndexError halfway through after some partitions are already on disk.
    empty_workers = [k for k, num in enumerate(nums) if num == 0]
    if empty_workers:
        raise ValueError(
            'pcts ' + str(list(pcts)) + ' leave worker(s) ' + str(empty_workers)
            + ' without any of the ' + str(n_groups) + ' groups'
        )

    # Draw each worker's IDs from what is left, so partitions never overlap.
    idx_remaining = list(range(1, n_groups + 1))
    indices_concat = []
    for num in nums:
        idx_worker = np.sort(np.random.choice(idx_remaining, num, replace = False))
        indices_concat.append(idx_worker)
        idx_remaining = np.setdiff1d(idx_remaining, idx_worker)

    # Everything below is identical for all workers, so build it once.
    folder_name = '-'.join(str(int(pct * 100 + eps)) for pct in pcts)
    out_dir = './decision-trees/fd003/raw/' + str(n_workers) + ' workers/' + folder_name
    os.makedirs(out_dir, exist_ok=True)
    new_cols = ['y'] + ['x' + str(num) for num in range(len(ms_used))]

    for i, id_list in enumerate(indices_concat):
        # One concat over all groups; growing the frame pair by pair copies
        # the accumulated rows again on every step.
        df = pd.concat([gb.get_group(group_id) for group_id in id_list])
        df.columns = new_cols + ['ID']
        df[new_cols].to_csv(out_dir + '/' + mode + '_partition_' + str(i) + '.csv', sep=',', index_label='id')

# Every worker-share configuration to generate, grouped by worker count.
# Each inner list holds one fraction per worker.
pct_list_total = [
    # 2 workers
    [0.5, 0.5],
    [0.6, 0.4],
    [0.7, 0.3],
    [0.8, 0.2],
    [0.9, 0.1],
    # 3 workers
    [0.4, 0.3, 0.3],
    [0.4, 0.4, 0.2],
    [0.5, 0.4, 0.1],
    [0.6, 0.3, 0.1],
    [0.7, 0.2, 0.1],
    [0.8, 0.1, 0.1],
    # 4 workers
    [0.3, 0.3, 0.2, 0.2],
    [0.3, 0.3, 0.3, 0.1],
    [0.4, 0.3, 0.2, 0.1],
    [0.4, 0.4, 0.1, 0.1],
    [0.5, 0.3, 0.1, 0.1],
    [0.6, 0.2, 0.1, 0.1],
    [0.7, 0.1, 0.1, 0.1],
    # 5 workers
    [0.2, 0.2, 0.2, 0.2, 0.2],
    [0.3, 0.2, 0.2, 0.2, 0.1],
    [0.3, 0.3, 0.2, 0.1, 0.1],
    [0.4, 0.3, 0.1, 0.1, 0.1],
    [0.5, 0.2, 0.1, 0.1, 0.1],
    [0.6, 0.1, 0.1, 0.1, 0.1],
]

# Write the train partitions and then the test partitions for each
# configuration, echoing the configuration as a progress marker.
for pct_list in pct_list_total:
    print(pct_list)
    split_data(pct_list, 'train')
    split_data(pct_list, 'test')

[0.5, 0.5]
[0.6, 0.4]
[0.7, 0.3]
[0.8, 0.2]
[0.9, 0.1]
[0.4, 0.3, 0.3]
[0.4, 0.4, 0.2]
[0.5, 0.4, 0.1]
[0.6, 0.3, 0.1]
[0.7, 0.2, 0.1]
[0.8, 0.1, 0.1]
[0.3, 0.3, 0.2, 0.2]
[0.3, 0.3, 0.3, 0.1]
[0.4, 0.3, 0.2, 0.1]
[0.4, 0.4, 0.1, 0.1]
[0.5, 0.3, 0.1, 0.1]
[0.6, 0.2, 0.1, 0.1]
[0.7, 0.1, 0.1, 0.1]
[0.2, 0.2, 0.2, 0.2, 0.2]
[0.3, 0.2, 0.2, 0.2, 0.1]
[0.3, 0.3, 0.2, 0.1, 0.1]
[0.4, 0.3, 0.1, 0.1, 0.1]
[0.5, 0.2, 0.1, 0.1, 0.1]
[0.6, 0.1, 0.1, 0.1, 0.1]


### Test data - convert format

In [96]:
# Display train_df again; note this shows the training frame, not a test frame.
train_df

Unnamed: 0,RUL,SensorMeasure2,SensorMeasure3,SensorMeasure4,SensorMeasure7,SensorMeasure8,SensorMeasure11,SensorMeasure12,SensorMeasure13,SensorMeasure15,SensorMeasure17,SensorMeasure20,SensorMeasure21,ID
0,125,642.36,1583.23,1396.84,553.97,2387.96,47.30,522.31,2388.01,8.4246,391,39.11,23.3537,1
1,125,642.50,1584.69,1396.89,554.55,2388.00,47.23,522.42,2388.03,8.4403,392,38.99,23.4491,1
2,125,642.18,1582.35,1405.61,554.43,2388.03,47.22,522.03,2388.00,8.3901,391,38.85,23.3669,1
3,125,642.92,1585.61,1392.27,555.21,2388.00,47.24,522.49,2388.08,8.3878,392,38.96,23.2951,1
4,125,641.68,1588.63,1397.65,554.74,2388.04,47.15,522.58,2388.03,8.3869,392,39.14,23.4583,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24715,4,643.78,1596.01,1424.11,551.86,2388.25,48.27,519.66,2388.30,8.5036,394,38.44,22.9631,100
24716,3,643.29,1596.38,1429.14,551.86,2388.23,48.13,519.91,2388.28,8.5174,395,38.50,22.9746,100
24717,2,643.84,1604.53,1431.41,551.30,2388.25,48.18,519.44,2388.24,8.5223,396,38.39,23.0682,100
24718,1,643.94,1597.56,1426.57,550.69,2388.26,48.05,520.01,2388.26,8.5148,395,38.31,23.0753,100
