In [None]:
import numpy as np
import pandas as pd
import regex as re
import pickle
import matplotlib.pyplot as plt
import os


In [None]:
# Directory holding the raw and pickled data: "<parent of the working directory>/Data".
# `__file__` is not defined inside a notebook, so the quoted placeholder is resolved
# against the current working directory; the two dirname() calls strip the placeholder
# and then the working directory itself. os.path.join picks the separator of the
# running OS instead of a hard-coded Windows backslash.
data_location = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath('__file__'))), "Data")

## Load data


In [None]:
def file_to_df(path):
    """Read a whitespace-separated sensor log into a DataFrame.

    Every non-blank line of the file becomes one row. The first 26
    whitespace-separated tokens of a line are converted to float and labelled,
    in order, as unit, cycle, the three operational settings (OS1-OS3) and the
    21 sensor readings. Tokens beyond the 26th are ignored.

    Parameters
    ----------
    path : str or path-like
        Location of the text file to read.

    Returns
    -------
    pandas.DataFrame
        One row per non-blank line with the 26 named float columns.

    Raises
    ------
    ValueError
        If one of the first 26 tokens of a line cannot be parsed as a float.
    """
    columns = ["unit", "cycle", "OS1", "OS2", "OS3"] + ["Total temperature at fan inlet", "Total temperature at LPC outlet", "Total temperature at HPC outlet", "Total temperature at LPT outlet", "Pressure at fan inlet", "Total pressure in bypass-duct", "Total pressure at HPC outlet", "Physical fan speed", "Physical core speed", "Engine pressure ratio", "Static pressure at HPC outlet", "Ratio of fuel flow to Ps30", "Corrected fan speed", "Corrected core speed", "Bypass Ratio", "Burner fuel-air ratio", "Bleed Enthalpy", "Demanded fan speed", "Demanded corrected fan speed", "HPT coolant bleed", "LPT coolant bleed"]
    n_columns = len(columns)

    with open(path) as file:
        # str.split() without arguments collapses runs of whitespace and drops the
        # trailing newline, so repeated or trailing spaces never yield empty tokens;
        # blank lines are skipped instead of failing in float('').
        data = [
            [float(token) for token in line.split()[:n_columns]]
            for line in file
            if line.strip()
        ]
    return pd.DataFrame(data, columns = columns)


In [None]:
# Load the FD001 training and test logs from the data directory.
# os.path.join inserts the separator between directory and file name; plain string
# concatenation without it fuses the two into a path that does not exist.
train_df = file_to_df(os.path.join(data_location, "train_FD001.txt"))
test_df = file_to_df(os.path.join(data_location, "test_FD001.txt"))

In [None]:
# Summary statistics of every column of the training frame.
train_df.describe()

In [None]:
# Summary statistics of every column of the test frame.
test_df.describe()

In [None]:
# Preview the first rows of the training frame.
train_df.head()

## Calculate train RUL

In [None]:
def add_RUL(df, remaining_RUL = None):
    """Add a remaining-useful-life column ``RUL`` to ``df`` and return ``df``.

    For every row, RUL is the highest ``cycle`` recorded for the row's ``unit``
    minus the row's own ``cycle``, so the last recorded cycle of a unit gets 0.
    When ``remaining_RUL`` is given, its c-th entry is added to every row of the
    c-th unit, counting units in order of first appearance in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns ``unit`` and ``cycle``. It is modified
        in place: the ``RUL`` column is created or overwritten.
    remaining_RUL : sequence of numbers, optional
        Per-unit offset added to the computed RUL. Entries beyond the number of
        units are ignored.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, now carrying the ``RUL`` column.

    Raises
    ------
    ValueError
        If ``remaining_RUL`` has fewer entries than ``df`` has distinct units.
    """
    # One grouped pass replaces recomputing the per-unit maximum for every row.
    rul = df.groupby("unit")["cycle"].transform("max") - df["cycle"]

    # `is not None` (rather than `== None`) also works when an array is passed.
    if remaining_RUL is not None:
        # factorize numbers the units in order of first appearance, the same
        # order in which df["unit"].unique() lists them.
        unit_codes, unit_labels = pd.factorize(df["unit"])
        offsets = np.asarray(remaining_RUL)
        if len(offsets) < len(unit_labels):
            raise ValueError(
                "remaining_RUL has {} entries but df contains {} units".format(
                    len(offsets), len(unit_labels)
                )
            )
        rul = rul + offsets[unit_codes]

    df["RUL"] = rul
    return df

In [None]:
# Label every training row with its RUL; no per-unit offset is passed (remaining_RUL stays None).
train_df = add_RUL(train_df)

In [None]:
# Preview the first values of the new RUL column.
train_df["RUL"].head()

In [None]:
# Persist the labelled training frame in the data directory, the location the
# "Load train and test" cell reads it back from. The context manager closes the
# file handle once the pickle is written.
with open(data_location + '/train_df', "wb") as file:
    pickle.dump(train_df, file)

## Calculate test RUL

In [None]:
# Read RUL_FD001.txt as one integer per line. int() ignores surrounding
# whitespace (including the newline); blank lines are skipped.
with open(data_location + "/RUL_FD001.txt", "r") as file:
    remaining_RUL = [int(line) for line in file if line.strip()]

In [None]:
# remaining_RUL is read from RUL_FD001.txt in the previous cell.
# Assign the result explicitly instead of relying on add_RUL's in-place mutation.
test_df = add_RUL(test_df, remaining_RUL)

test_df.head(10)

In [None]:
# Persist the labelled test frame; the context manager closes the file handle.
with open(data_location + '/test_df', "wb") as file:
    pickle.dump(test_df, file)

## Load train and test

In [None]:
# Reload the pickled train and test frames from the data directory.
# NOTE: pickle.load can execute arbitrary code; only load pickles written by this notebook.
with open(data_location + '/train_df', 'rb') as file:
    train_df = pickle.load(file)
with open(data_location + '/test_df', 'rb') as file:
    test_df = pickle.load(file)

## Removing features

In [None]:
# Render floats with two decimals in DataFrame output from here on.
pd.set_option('display.float_format', '{:.2f}'.format)

In [None]:
# Pairwise correlation of all columns. NaN entries appear for columns whose
# standard deviation is 0, because the correlation is undefined for a constant column.
train_df.corr()

In [None]:
# LaTeX table with the correlations of every column against a selected subset of columns.
selected_columns = [
    "Pressure at fan inlet",
    "Engine pressure ratio",
    "Burner fuel-air ratio",
    "Demanded fan speed",
    "Demanded corrected fan speed",
    "OS3",
    "Total temperature at fan inlet",
]
correlation_matrix = train_df.corr()
print(correlation_matrix[selected_columns].to_latex())


In [None]:
# Scatter the corrected fan speed against the physical fan speed.
fig, ax = plt.subplots()
ax.scatter(train_df["Corrected fan speed"], train_df["Physical fan speed"])
ax.set_xlabel("Corrected fan speed")
ax.set_ylabel("Physical fan speed")
plt.show()

# Correlation coefficient between the two speeds, and of each speed with the RUL.
fan_speed_pairs = [
    ("Correlation 'Corrected fan speed' and 'Physical fan speed': ", "Corrected fan speed", "Physical fan speed"),
    ("Correlation 'RUL' and 'Physical fan speed': ", "RUL", "Physical fan speed"),
    ("Correlation 'Corrected fan speed' and 'RUL': ", "Corrected fan speed", "RUL"),
]
for message, first_column, second_column in fan_speed_pairs:
    print(message, np.corrcoef((train_df[first_column], train_df[second_column]))[0, 1])

In [None]:
# Scatter the corrected core speed against the physical core speed.
fig, ax = plt.subplots()
ax.scatter(train_df["Corrected core speed"], train_df["Physical core speed"])
ax.set_xlabel("Corrected core speed")
ax.set_ylabel("Physical core speed")
plt.show()

# Correlation coefficient between the two speeds, and of each speed with the RUL.
core_speed_pairs = [
    ("Correlation 'Corrected core speed' and 'Physical core speed': ", "Corrected core speed", "Physical core speed"),
    ("Correlation 'RUL' and 'Physical core speed': ", "RUL", "Physical core speed"),
    ("Correlation 'Corrected core speed' and 'RUL': ", "Corrected core speed", "RUL"),
]
for message, first_column, second_column in core_speed_pairs:
    print(message, np.corrcoef((train_df[first_column], train_df[second_column]))[0, 1])

In [None]:
def remove_columns(df):
    """Return ``df`` without the columns judged redundant or uninformative.

    The input frame is left untouched; a new frame is returned. A ``KeyError``
    is raised by pandas if any of the listed columns is missing from ``df``.
    """
    # perfectly correlated with "Total temperature at fan inlet"
    perfectly_correlated = ["Pressure at fan inlet", "Engine pressure ratio", "Burner fuel-air ratio"]
    # standard deviation of 0 or zero correlations
    constant_or_uncorrelated = ["Demanded fan speed", "Demanded corrected fan speed","OS1", "OS2", "OS3", "Total temperature at fan inlet"]
    # almost perfectly correlated with "Physical core speed"
    nearly_duplicated = ["Corrected core speed"]

    unwanted = perfectly_correlated + constant_or_uncorrelated + nearly_duplicated
    return df.drop(columns = unwanted)

In [None]:
# Drop the redundant columns from the training frame (rebinds train_df to the reduced frame).
train_df = remove_columns(train_df)

In [None]:
# Drop the same columns from the test frame so both frames keep identical columns.
test_df = remove_columns(test_df)

In [None]:
# Overwrite the pickled frames with their reduced versions; each context manager
# closes its file handle once the pickle is written.
with open(data_location + '/train_df', "wb") as file:
    pickle.dump(train_df, file)
with open(data_location + '/test_df', "wb") as file:
    pickle.dump(test_df, file)

## EDA

In [None]:
# Reload the pickled train and test frames from the data directory.
# NOTE: pickle.load can execute arbitrary code; only load pickles written by this notebook.
with open(data_location + '/train_df', 'rb') as file:
    train_df = pickle.load(file)
with open(data_location + '/test_df', 'rb') as file:
    test_df = pickle.load(file)

There appears to be no relationship between the motor unit number and its maximum RUL; the values look randomly scattered.

In [None]:
# Maximum RUL per unit, plotted against the unit id. The x values come from the
# grouped result's own index, so every point pairs a unit with its own maximum
# even if the units do not appear in sorted order in the frame.
max_rul_per_unit = train_df.groupby("unit")["RUL"].max()
fig, ax = plt.subplots()
ax.scatter(max_rul_per_unit.index, max_rul_per_unit)
ax.set_xlabel("Unit")
ax.set_ylabel("Max RUL")
plt.show()

In [None]:
# Box plot of the per-unit maximum RUL.
plt.boxplot(train_df.groupby(["unit"])["RUL"].max())

In [None]:
# Per-unit column maxima, computed only over the rows whose RUL exceeds 300.
train_df[train_df["RUL"]>300].groupby(["unit"]).max()

In [None]:
# One figure per column, each holding the box plot of that column.
for column in train_df.columns:
    plt.figure()
    train_df.boxplot(column = [column])

In [None]:
# One scatter plot per column: per-unit mean of the column against the per-unit maximum RUL.
train_by_unit = train_df.groupby(["unit"])
train_max_rul = train_by_unit["RUL"].max()
for column in train_df:
    plt.figure()
    plt.xlabel(column)
    column_mean = train_by_unit[column].mean()
    plt.scatter(column_mean, train_max_rul)

In [None]:
# Same plots for the test frame: per-unit mean of each column against the per-unit maximum RUL.
test_by_unit = test_df.groupby(["unit"])
test_max_rul = test_by_unit["RUL"].max()
for column in test_df:
    plt.figure()
    plt.xlabel(column)
    column_mean = test_by_unit[column].mean()
    plt.scatter(column_mean, test_max_rul)

In [None]:
from scipy.stats import ttest_ind

# Two-sample t-test without the equal-variance assumption, run per column, comparing
# the training values with the test values. A column's means are reported as equal
# when the p-value is above 0.05.
t_values = []
equal = []

for column in train_df:
    statistic, p_value = ttest_ind(train_df[column], test_df[column], equal_var = False)
    means_equal = p_value > 0.05
    print("Means of {} are equal: {}".format(column, means_equal))
    equal.append(means_equal)
    t_values.append(statistic)
    

In [None]:
# Collect the per-column t statistics and equality flags in a frame indexed by
# column name, then print it as a LaTeX table.
t_test_df = pd.DataFrame(
{"t-value": t_values , "Equal with p > 0.05" : equal},
index = train_df.columns)
print(t_test_df.to_latex())

In [None]:
# Minimum RUL per unit for the test and the training frame. The x values come
# from each grouped result's own index, so every point pairs a unit with its own
# minimum even if the units do not appear in sorted order in the frame.
test_min_rul = test_df.groupby("unit")["RUL"].min()
train_min_rul = train_df.groupby("unit")["RUL"].min()

fig, ax = plt.subplots()
ax.scatter(test_min_rul.index, test_min_rul, label = "Test")
ax.scatter(train_min_rul.index, train_min_rul, label = "Train")
ax.legend()
ax.set_xlabel("Unit")
ax.set_ylabel("Min RUL")
plt.show()

### Partition a few engines for demonstration

In [None]:
# Hold out units 96-100 as demonstration engines and keep the rest for training.
# The membership mask is computed once and negated with ~ for the complement.
is_demo_unit = train_df.unit.isin(range(96, 101))
demo_engines = train_df[is_demo_unit]
train_df = train_df[~is_demo_unit]

# Overwrite the pickled training frame and store the held-out engines separately;
# each context manager closes its file handle once the pickle is written.
with open(data_location + '/train_df', "wb") as file:
    pickle.dump(train_df, file)
with open(data_location + '/demo_df', "wb") as file:
    pickle.dump(demo_engines, file)