In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from matplotlib import gridspec

In [2]:
# Import and assign the TRAIN, TEST and RUL data
def import_data(dataset_id, data_dir='.'):
    """Import the turbofan training and test data and the test RUL values from the data files.

    The three files are named ``train_FD{id}.txt``, ``test_FD{id}.txt`` and
    ``RUL_FD{id}.txt``. They are parsed with a single-space separator and no
    header row, so the resulting columns are labelled by integer position.

    :param dataset_id: The dataset from turbofan to import; substituted into
        the file names above.
    :param data_dir: Directory that contains the three files. Defaults to the
        current working directory.
    :return: A tuple ``(train, test, rul)`` of pandas DataFrames holding the
        training dataset, the test dataset and the test RUL data.
    """
    import os  # local import keeps this cell self-contained

    def _read(prefix):
        # Read one space-separated text file into a DataFrame.
        file_name = '{}_FD{}.txt'.format(prefix, dataset_id)
        return pd.read_csv(os.path.join(data_dir, file_name), sep=' ', header=None)

    train_initial_data = _read('train')
    test_initial_data = _read('test')
    RUL_initial_data = _read('RUL')
    return train_initial_data, test_initial_data, RUL_initial_data
# Ask which dataset to load; the id is substituted into the data file names
# by import_data. A prompt is shown so the cell does not appear to hang, and
# surrounding whitespace is stripped so it cannot end up in the file name.
dataset_id = input('Dataset id (e.g. 001): ').strip()
train_initial_data, test_initial_data, RUL_values = import_data(dataset_id)


In [3]:
# Columns 26 and 27 contain NaN values; discard them from both frames.
nan_columns = [26, 27]
train_initial_data = train_initial_data.drop(columns=nan_columns)
test_initial_data = test_initial_data.drop(columns=nan_columns)

train_initial_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.70,1400.60,14.62,...,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.4190
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.00,23.4236
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.20,14.62,...,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442
3,1,4,0.0007,0.0000,100.0,518.67,642.35,1582.79,1401.87,14.62,...,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,522.19,2388.04,8133.80,8.4294,0.03,393,2388,100.0,38.90,23.4044
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20626,100,196,-0.0004,-0.0003,100.0,518.67,643.49,1597.98,1428.63,14.62,...,519.49,2388.26,8137.60,8.4956,0.03,397,2388,100.0,38.49,22.9735
20627,100,197,-0.0016,-0.0005,100.0,518.67,643.54,1604.50,1433.58,14.62,...,519.68,2388.22,8136.50,8.5139,0.03,395,2388,100.0,38.30,23.1594
20628,100,198,0.0004,0.0000,100.0,518.67,643.42,1602.46,1428.18,14.62,...,520.01,2388.24,8141.05,8.5646,0.03,398,2388,100.0,38.44,22.9333
20629,100,199,-0.0011,0.0003,100.0,518.67,643.23,1605.26,1426.53,14.62,...,519.67,2388.23,8139.29,8.5389,0.03,395,2388,100.0,38.29,23.0640


In [4]:
# Name the 26 remaining columns: engine id, cycle counter, three operating
# settings and twenty-one sensor measurements. The list is built once and
# a separate copy is assigned to each frame.
column_names = (
    ['ENGINE_ID', 'Cycle_Time']
    + ['OpSet{}'.format(i) for i in range(1, 4)]
    + ['SensorMeasure{}'.format(i) for i in range(1, 22)]
)
train_initial_data.columns = list(column_names)
test_initial_data.columns = list(column_names)
train_initial_data

Unnamed: 0,ENGINE_ID,Cycle_Time,OpSet1,OpSet2,OpSet3,SensorMeasure1,SensorMeasure2,SensorMeasure3,SensorMeasure4,SensorMeasure5,...,SensorMeasure12,SensorMeasure13,SensorMeasure14,SensorMeasure15,SensorMeasure16,SensorMeasure17,SensorMeasure18,SensorMeasure19,SensorMeasure20,SensorMeasure21
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.70,1400.60,14.62,...,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.4190
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.00,23.4236
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.20,14.62,...,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442
3,1,4,0.0007,0.0000,100.0,518.67,642.35,1582.79,1401.87,14.62,...,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,522.19,2388.04,8133.80,8.4294,0.03,393,2388,100.0,38.90,23.4044
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20626,100,196,-0.0004,-0.0003,100.0,518.67,643.49,1597.98,1428.63,14.62,...,519.49,2388.26,8137.60,8.4956,0.03,397,2388,100.0,38.49,22.9735
20627,100,197,-0.0016,-0.0005,100.0,518.67,643.54,1604.50,1433.58,14.62,...,519.68,2388.22,8136.50,8.5139,0.03,395,2388,100.0,38.30,23.1594
20628,100,198,0.0004,0.0000,100.0,518.67,643.42,1602.46,1428.18,14.62,...,520.01,2388.24,8141.05,8.5646,0.03,398,2388,100.0,38.44,22.9333
20629,100,199,-0.0011,0.0003,100.0,518.67,643.23,1605.26,1426.53,14.62,...,519.67,2388.23,8139.29,8.5389,0.03,395,2388,100.0,38.29,23.0640


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column
# of the training frame.
train_initial_data.describe()

Unnamed: 0,ENGINE_ID,Cycle_Time,OpSet1,OpSet2,OpSet3,SensorMeasure1,SensorMeasure2,SensorMeasure3,SensorMeasure4,SensorMeasure5,...,SensorMeasure12,SensorMeasure13,SensorMeasure14,SensorMeasure15,SensorMeasure16,SensorMeasure17,SensorMeasure18,SensorMeasure19,SensorMeasure20,SensorMeasure21
count,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,...,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0
mean,51.506568,108.807862,-9e-06,2e-06,100.0,518.67,642.680934,1590.523119,1408.933782,14.62,...,521.41347,2388.096152,8143.752722,8.442146,0.03,393.210654,2388.0,100.0,38.816271,23.289705
std,29.227633,68.88099,0.002187,0.000293,0.0,0.0,0.500053,6.13115,9.000605,1.7764e-15,...,0.737553,0.071919,19.076176,0.037505,1.3878120000000003e-17,1.548763,0.0,0.0,0.180746,0.108251
min,1.0,1.0,-0.0087,-0.0006,100.0,518.67,641.21,1571.04,1382.25,14.62,...,518.69,2387.88,8099.94,8.3249,0.03,388.0,2388.0,100.0,38.14,22.8942
25%,26.0,52.0,-0.0015,-0.0002,100.0,518.67,642.325,1586.26,1402.36,14.62,...,520.96,2388.04,8133.245,8.4149,0.03,392.0,2388.0,100.0,38.7,23.2218
50%,52.0,104.0,0.0,0.0,100.0,518.67,642.64,1590.1,1408.04,14.62,...,521.48,2388.09,8140.54,8.4389,0.03,393.0,2388.0,100.0,38.83,23.2979
75%,77.0,156.0,0.0015,0.0003,100.0,518.67,643.0,1594.38,1414.555,14.62,...,521.95,2388.14,8148.31,8.4656,0.03,394.0,2388.0,100.0,38.95,23.3668
max,100.0,362.0,0.0087,0.0006,100.0,518.67,644.53,1616.91,1441.49,14.62,...,523.38,2388.56,8293.72,8.5848,0.03,400.0,2388.0,100.0,39.43,23.6184


In [6]:
# Keep only the operating settings and sensor measurements: the engine id and
# cycle counter are removed from both frames.
id_columns = ['ENGINE_ID', 'Cycle_Time']
train_initial_data = train_initial_data.drop(columns=id_columns)
test_initial_data = test_initial_data.drop(columns=id_columns)
train_initial_data

Unnamed: 0,OpSet1,OpSet2,OpSet3,SensorMeasure1,SensorMeasure2,SensorMeasure3,SensorMeasure4,SensorMeasure5,SensorMeasure6,SensorMeasure7,...,SensorMeasure12,SensorMeasure13,SensorMeasure14,SensorMeasure15,SensorMeasure16,SensorMeasure17,SensorMeasure18,SensorMeasure19,SensorMeasure20,SensorMeasure21
0,-0.0007,-0.0004,100.0,518.67,641.82,1589.70,1400.60,14.62,21.61,554.36,...,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.4190
1,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,21.61,553.75,...,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.00,23.4236
2,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.20,14.62,21.61,554.26,...,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442
3,0.0007,0.0000,100.0,518.67,642.35,1582.79,1401.87,14.62,21.61,554.45,...,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739
4,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,21.61,554.00,...,522.19,2388.04,8133.80,8.4294,0.03,393,2388,100.0,38.90,23.4044
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20626,-0.0004,-0.0003,100.0,518.67,643.49,1597.98,1428.63,14.62,21.61,551.43,...,519.49,2388.26,8137.60,8.4956,0.03,397,2388,100.0,38.49,22.9735
20627,-0.0016,-0.0005,100.0,518.67,643.54,1604.50,1433.58,14.62,21.61,550.86,...,519.68,2388.22,8136.50,8.5139,0.03,395,2388,100.0,38.30,23.1594
20628,0.0004,0.0000,100.0,518.67,643.42,1602.46,1428.18,14.62,21.61,550.94,...,520.01,2388.24,8141.05,8.5646,0.03,398,2388,100.0,38.44,22.9333
20629,-0.0011,0.0003,100.0,518.67,643.23,1605.26,1426.53,14.62,21.61,550.68,...,519.67,2388.23,8139.29,8.5389,0.03,395,2388,100.0,38.29,23.0640


In [7]:
from sklearn import preprocessing

# Fit a min-max scaler on the training frame so that each column's training
# range is mapped onto 0..1, then rebuild a DataFrame with the original
# row index and column labels.
scaler = preprocessing.MinMaxScaler().fit(train_initial_data)
train_data_scaled = pd.DataFrame(
    scaler.transform(train_initial_data),
    index=train_initial_data.index,
    columns=train_initial_data.columns,
)
train_data_scaled.describe()

Unnamed: 0,OpSet1,OpSet2,OpSet3,SensorMeasure1,SensorMeasure2,SensorMeasure3,SensorMeasure4,SensorMeasure5,SensorMeasure6,SensorMeasure7,...,SensorMeasure12,SensorMeasure13,SensorMeasure14,SensorMeasure15,SensorMeasure16,SensorMeasure17,SensorMeasure18,SensorMeasure19,SensorMeasure20,SensorMeasure21
count,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,...,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0
mean,0.49949,0.501959,0.0,0.0,0.443052,0.424746,0.450435,0.0,0.980321,0.566459,...,0.580697,0.317871,0.226095,0.451118,0.0,0.434221,0.0,0.0,0.524241,0.546127
std,0.125708,0.244218,0.0,0.0,0.150618,0.133664,0.151935,0.0,0.138898,0.142527,...,0.157261,0.105763,0.098442,0.144306,0.0,0.129064,0.0,0.0,0.140114,0.149476
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.413793,0.333333,0.0,0.0,0.335843,0.331807,0.339467,0.0,1.0,0.476651,...,0.484009,0.235294,0.17187,0.346287,0.0,0.333333,0.0,0.0,0.434109,0.452361
50%,0.5,0.5,0.0,0.0,0.430723,0.415522,0.435348,0.0,1.0,0.5781,...,0.594883,0.308824,0.209516,0.43863,0.0,0.416667,0.0,0.0,0.534884,0.557443
75%,0.586207,0.75,0.0,0.0,0.539157,0.508829,0.545324,0.0,1.0,0.669887,...,0.695096,0.382353,0.249613,0.541362,0.0,0.5,0.0,0.0,0.627907,0.652582
max,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,...,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0


In [8]:
# Display the scaled training frame.
train_data_scaled

Unnamed: 0,OpSet1,OpSet2,OpSet3,SensorMeasure1,SensorMeasure2,SensorMeasure3,SensorMeasure4,SensorMeasure5,SensorMeasure6,SensorMeasure7,...,SensorMeasure12,SensorMeasure13,SensorMeasure14,SensorMeasure15,SensorMeasure16,SensorMeasure17,SensorMeasure18,SensorMeasure19,SensorMeasure20,SensorMeasure21
0,0.459770,0.166667,0.0,0.0,0.183735,0.406802,0.309757,0.0,1.0,0.726248,...,0.633262,0.205882,0.199608,0.363986,0.0,0.333333,0.0,0.0,0.713178,0.724662
1,0.609195,0.250000,0.0,0.0,0.283133,0.453019,0.352633,0.0,1.0,0.628019,...,0.765458,0.279412,0.162813,0.411312,0.0,0.333333,0.0,0.0,0.666667,0.731014
2,0.252874,0.750000,0.0,0.0,0.343373,0.369523,0.370527,0.0,1.0,0.710145,...,0.795309,0.220588,0.171793,0.357445,0.0,0.166667,0.0,0.0,0.627907,0.621375
3,0.540230,0.500000,0.0,0.0,0.343373,0.256159,0.331195,0.0,1.0,0.740741,...,0.889126,0.294118,0.174889,0.166603,0.0,0.333333,0.0,0.0,0.573643,0.662386
4,0.390805,0.333333,0.0,0.0,0.349398,0.257467,0.404625,0.0,1.0,0.668277,...,0.746269,0.235294,0.174734,0.402078,0.0,0.416667,0.0,0.0,0.589147,0.704502
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20626,0.477011,0.250000,0.0,0.0,0.686747,0.587312,0.782917,0.0,1.0,0.254428,...,0.170576,0.558824,0.194344,0.656791,0.0,0.750000,0.0,0.0,0.271318,0.109500
20627,0.408046,0.083333,0.0,0.0,0.701807,0.729453,0.866475,0.0,1.0,0.162641,...,0.211087,0.500000,0.188668,0.727203,0.0,0.583333,0.0,0.0,0.124031,0.366197
20628,0.522989,0.500000,0.0,0.0,0.665663,0.684979,0.775321,0.0,1.0,0.175523,...,0.281450,0.529412,0.212148,0.922278,0.0,0.833333,0.0,0.0,0.232558,0.053991
20629,0.436782,0.750000,0.0,0.0,0.608434,0.746021,0.747468,0.0,1.0,0.133655,...,0.208955,0.514706,0.203065,0.823394,0.0,0.583333,0.0,0.0,0.116279,0.234466


In [9]:
# Apply the scaler that was fitted on the training frame to the test frame
# (no refit), so test values may fall slightly outside 0..1.
test_scaled_values = scaler.transform(test_initial_data)
test_data_scaled = pd.DataFrame(
    test_scaled_values,
    index=test_initial_data.index,
    columns=test_initial_data.columns,
)
test_data_scaled.describe()

Unnamed: 0,OpSet1,OpSet2,OpSet3,SensorMeasure1,SensorMeasure2,SensorMeasure3,SensorMeasure4,SensorMeasure5,SensorMeasure6,SensorMeasure7,...,SensorMeasure12,SensorMeasure13,SensorMeasure14,SensorMeasure15,SensorMeasure16,SensorMeasure17,SensorMeasure18,SensorMeasure19,SensorMeasure20,SensorMeasure21
count,13096.0,13096.0,13096.0,13096.0,13096.0,13096.0,13096.0,13096.0,13096.0,13096.0,...,13096.0,13096.0,13096.0,13096.0,13096.0,13096.0,13096.0,13096.0,13096.0,13096.0
mean,0.499358,0.503532,0.0,0.0,0.381051,0.371903,0.379564,0.0,0.970067,0.629231,...,0.651967,0.280919,0.201299,0.388395,0.0,0.380969,0.0,0.0,0.583335,0.609697
std,0.126591,0.245025,0.0,0.0,0.120753,0.109075,0.112902,0.0,0.170408,0.109708,...,0.119323,0.083727,0.052578,0.111617,0.0,0.102798,0.0,0.0,0.10983,0.116156
min,0.028736,0.0,0.0,0.0,-0.024096,-0.043601,0.036124,0.0,0.0,0.165862,...,0.147122,0.014706,0.044174,0.030396,0.0,0.083333,0.0,0.0,0.131783,0.05689
25%,0.413793,0.333333,0.0,0.0,0.29744,0.295618,0.298785,0.0,1.0,0.557166,...,0.573561,0.220588,0.167045,0.310504,0.0,0.333333,0.0,0.0,0.511628,0.534935
50%,0.5,0.5,0.0,0.0,0.376506,0.369523,0.374578,0.0,1.0,0.636071,...,0.658849,0.279412,0.198421,0.384763,0.0,0.416667,0.0,0.0,0.589147,0.614471
75%,0.586207,0.75,0.0,0.0,0.460843,0.443046,0.452397,0.0,1.0,0.706924,...,0.73774,0.338235,0.229229,0.459407,0.0,0.416667,0.0,0.0,0.658915,0.689589
max,0.948276,1.083333,0.0,0.0,0.930723,0.795945,0.862762,0.0,1.0,0.964573,...,1.081023,0.647059,0.622046,0.833013,0.0,0.75,0.0,0.0,0.984496,1.03245


In [10]:
from sklearn.decomposition import PCA

# A fractional n_components asks PCA to keep as many leading components as are
# needed to explain that share (here 95%) of the variance. The alternative is
# an explicit count, e.g. PCA(n_components=24, svd_solver='full').
pca = PCA(n_components=0.95)
train_data_PCA = pca.fit_transform(train_data_scaled)
train_data_PCA.shape

(20631, 11)

In [11]:
# Share of the total variance explained by each retained principal component.
pca.explained_variance_ratio_

array([0.50978998, 0.17019244, 0.06148257, 0.05277026, 0.04511109,
       0.02296592, 0.02089122, 0.01896348, 0.01777373, 0.01674084,
       0.01497876])

In [12]:
# Wrap the projected training scores in a DataFrame that shares the row index
# of the scaled training frame.
train_data_PCA = pd.DataFrame(train_data_PCA, index=train_data_scaled.index)
train_data_PCA

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,-0.410290,0.329588,-0.062926,-0.034272,0.039837,0.150101,-0.061206,-0.044378,-0.039456,0.066469,0.060335
1,-0.334079,0.245318,-0.083213,-0.020121,-0.109669,0.088208,-0.113706,-0.072674,-0.013043,0.068331,0.007763
2,-0.415501,-0.251669,-0.054831,-0.033593,0.246061,-0.010257,-0.056753,0.078662,0.145056,0.057986,0.003087
3,-0.517311,-0.005695,-0.087794,-0.027715,-0.042761,-0.058995,0.027378,0.043045,0.011939,-0.166043,-0.041628
4,-0.345767,0.164130,-0.043195,-0.036834,0.104798,-0.030646,0.082129,-0.092327,-0.030043,0.006404,-0.026205
...,...,...,...,...,...,...,...,...,...,...,...
20626,0.982341,0.263587,-0.082210,0.061408,0.022039,0.032126,0.106864,0.125273,-0.088176,-0.068342,0.017894
20627,0.998340,0.430238,-0.094181,0.062383,0.095656,0.053460,-0.070816,-0.122285,0.134580,-0.059662,0.052078
20628,1.127928,0.014627,-0.039194,0.060573,-0.021783,0.121446,0.100898,0.076342,-0.110534,0.092261,-0.168622
20629,1.110000,-0.234838,-0.066442,0.066932,0.070271,0.171857,-0.007462,-0.038330,0.150667,0.013316,-0.006851


In [13]:
# Project the scaled test frame onto the components fitted on the training
# data and keep the test frame's row index.
test_data_PCA = pd.DataFrame(
    pca.transform(test_data_scaled),
    index=test_data_scaled.index,
)
test_data_PCA

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,-0.330564,-0.253382,-0.078112,-0.013062,-0.131955,-0.198630,-0.038849,0.006940,0.035415,-0.066983,-0.067194
1,-0.513213,0.246559,-0.014673,-0.054873,0.153201,0.153750,-0.026192,0.019489,-0.104102,-0.089667,-0.003584
2,-0.317142,-0.086241,-0.040033,-0.030177,-0.017871,-0.034501,-0.026490,-0.076152,-0.103111,0.122481,-0.006456
3,-0.375733,-0.005388,-0.104104,-0.009910,-0.242595,-0.052442,-0.008140,0.036093,0.044614,-0.020611,0.144782
4,-0.467751,-0.005154,-0.085730,-0.023812,-0.082489,-0.073847,-0.099955,0.017419,0.118853,0.026453,0.055897
...,...,...,...,...,...,...,...,...,...,...,...
13091,0.329205,0.009542,0.529528,-0.131755,-0.289280,-0.033650,-0.070567,0.021678,0.075252,0.006064,0.010290
13092,0.322065,0.094206,0.477185,-0.128587,0.056295,-0.061627,-0.000807,-0.099048,0.032829,-0.112314,0.051920
13093,0.268365,0.261060,0.502975,-0.139369,0.023417,-0.145682,0.039743,0.016285,0.003701,-0.070207,-0.085453
13094,0.297296,-0.071000,0.536374,-0.148543,0.210750,-0.087135,0.013142,-0.083938,0.005360,-0.011733,0.014824


In [14]:


# Print the per-component explained variance ratio of the fitted PCA.
print(pca.explained_variance_ratio_)

[0.50978998 0.17019244 0.06148257 0.05277026 0.04511109 0.02296592
 0.02089122 0.01896348 0.01777373 0.01674084 0.01497876]


In [15]:
# Compute the PCA components (95% explained variance) FOR THE scaled TRAINING SET.
# The scaled frame must be used here: fitting on train_initial_data would bypass
# the min-max scaling and refit the shared `pca` object on raw sensor values,
# letting the largest-magnitude columns dominate the components.
train_FD001_PCA = pca.fit_transform(train_data_scaled)
train_FD001_PCA = pd.DataFrame(train_FD001_PCA)
train_FD001_PCA.index = train_data_scaled.index

# Project the scaled TEST SET onto the PCA space
test_FD001_PCA = pca.transform(test_data_scaled)
test_FD001_PCA = pd.DataFrame(test_FD001_PCA)
test_FD001_PCA.index = test_data_scaled.index