# Predictive Maintenance of Turbofan Engines

## Process Data

The next step is to prepare our training and test datasets for training our model.



### Training Data

We import our data as before, except that we'll collect the dataframes in a single list.

We then normalize the data so that every setting and sensor column is on the same scale.

In [None]:
import pandas as pd

# Column layout used when reading the data files: two index columns,
# three setting columns, then sensor columns s1 .. s21.
index_names = ['id', 'cycle']
setting_names = ['setting%d' % n for n in (1, 2, 3)]
sensor_names = ['s%d' % n for n in range(1, 22)]
columns = [*index_names, *setting_names, *sensor_names]


In [None]:
def normalize(df, feature_columns=None):
    """Min-max scale feature columns of ``df`` in place and return it.

    Each selected column ``x`` is replaced by
    ``(x - min + eps) / (max - min + eps)``, using that column's own minimum
    and maximum within ``df``. The small ``eps`` keeps the denominator
    non-zero, so a constant column becomes all ones instead of NaN.

    Args:
        df: DataFrame to scale. It is modified in place.
        feature_columns: Names of the columns to scale. Defaults to every
            entry of the module-level ``columns`` list after the first two.

    Returns:
        The same ``df`` object that was passed in.
    """
    eps = 0.000001
    if feature_columns is None:
        feature_columns = columns[2:]
    feature_columns = list(feature_columns)
    if not feature_columns:
        return df

    # Select the features and compute their extrema once instead of
    # re-slicing the frame for every term of the formula.
    features = df[feature_columns]
    lowest = features.min()
    highest = features.max()
    df[feature_columns] = (features - lowest + eps) / (highest - lowest + eps)
    return df

def calculate_rul(df):
    """Return a copy of ``df`` with an added ``RUL`` column.

    For every row, ``RUL`` is the largest ``cycle`` value seen for that
    row's ``id`` minus the row's own ``cycle``. The input frame is left
    untouched; the result comes from a merge and therefore carries a fresh
    0..n-1 index.
    """
    # One row per id holding the highest cycle recorded for it.
    last_cycle = df.groupby('id')['cycle'].max().rename('max_cycle').reset_index()

    with_max = df.merge(last_cycle, on=['id'], how='left')
    with_max['RUL'] = with_max['max_cycle'] - with_max['cycle']

    # The helper column is only needed for the subtraction above.
    return with_max.drop('max_cycle', axis=1)


In [None]:
# Build the list of processed training frames: read each of the four
# training files, scale its setting/sensor columns, then add the RUL column.
train_df = []

for i in range(1, 5):
    # The delimiter is a regular expression, so it is written as a raw
    # string; '\s' inside a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    df = pd.read_csv('data/train_FD{:03d}.txt'.format(i), delimiter=r'\s+', header=None, names=columns)
    # normalize() scales in place; calculate_rul() returns a new frame.
    df = calculate_rul(normalize(df))
    train_df.append(df)

### Test data

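Next we'll read in the test data and combine it with the provided actual RUL values. The provided value is the RUL at the last recorded cycle of each engine, so the RUL of every earlier row is that value plus the number of cycles remaining until the last recorded one.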

In [None]:
# Build the list of processed test frames: read each test file, attach the
# provided end-of-record RUL per id, derive a per-row RUL, then scale.
test_df = []

for i in range(1, 5):
    # Load test data. The delimiter is a regular expression, so it is a raw
    # string ('\s' in a normal string literal is an invalid escape sequence).
    df = pd.read_csv('data/test_FD{:03d}.txt'.format(i), delimiter=r'\s+', header=None, names=columns)

    # Load the RUL values: one value per row of the file. Shifting the index
    # by one makes row k line up with id k in the merge below.
    df_rul = pd.read_csv('data/RUL_FD{:03d}.txt'.format(i), delimiter=r'\s+', header=None)
    df_rul.columns = ['RUL_end']
    df_rul.index += 1

    # Merge RUL values with the test data
    df = df.merge(df_rul, left_on='id', right_index=True, how='left')

    # calculate_rul() yields (last cycle - cycle); adding the provided
    # end-of-record value gives last cycle + RUL_end - cycle for every row.
    df = calculate_rul(df)
    df['RUL'] = df['RUL'] + df['RUL_end']
    df.drop('RUL_end', axis=1, inplace=True)

    # Normalize
    # NOTE(review): each test set is scaled with its own min/max rather than
    # the statistics of the matching training set - confirm this is intended.
    normalize(df)
    test_df.append(df)

## Save training data to S3

**Set the two variables, `bucket` and `prefix` below before continuing.**

We'll upload all of our data to our S3 bucket so that the SageMaker training instance can access the training data and the test data from that location.


In [None]:
import boto3
import os

bucket = ''
prefix = ''
data_folder = 'data'

s3_bucket_resource = boto3.resource('s3').Bucket(bucket)

# Upload raw data files to S3, keeping their local relative path under `prefix`.
# NOTE: the processed CSVs written below also land in `data_folder`, so a
# re-run of this cell uploads them here as well.
for subdir, dirs, files in os.walk(data_folder):
    for file in files:
        full_path = os.path.join(subdir, file)
        s3_path = os.path.join(prefix, full_path)
        s3_bucket_resource.Object(s3_path).upload_file(full_path)

# Upload processed test data for inference
for i, processed in enumerate(test_df):
    local_test_file = 'data/test-{}.csv'.format(i)
    processed.to_csv(local_test_file)
    s3_test_file = os.path.join(prefix, 'data', 'test-{}.csv'.format(i))
    s3_bucket_resource.Object(s3_test_file).upload_file(local_test_file)

# Upload processed data for training
train_prefix = os.path.join(prefix, 'train')
for i, processed in enumerate(train_df):
    local_train_file = 'data/train-{}.csv'.format(i)
    processed.to_csv(local_train_file)
    s3_train_file = os.path.join(train_prefix, 'train-{}.csv'.format(i))
    s3_bucket_resource.Object(s3_train_file).upload_file(local_train_file)

# Build the URI from the same joined key prefix the uploads used, so it still
# points at the uploaded objects when `prefix` is empty or ends with '/'.
s3_train_data = 's3://{}/{}'.format(bucket, train_prefix)
print('uploaded training data location: {}'.format(s3_train_data))

The code below will save some of our variables to a file so we can use them in later notebooks.

In [None]:
import pickle

# Persist the S3 settings as one pickled list so later notebooks can reload
# them in the same order: bucket, prefix, training-data location.
shared_values = [bucket, prefix, s3_train_data]
with open('shared_vars', 'wb') as out_file:
    pickle.dump(shared_values, out_file)