In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Folder that holds both CSV files. Change this single line when the data
# lives somewhere else; the default assumes Google Drive is already mounted
# at /content/drive.
DATA_DIR = '/content/drive/MyDrive'

test = pd.read_csv(f'{DATA_DIR}/test_turbo_with_rul.csv')
train = pd.read_csv(f'{DATA_DIR}/train_turbo_with_rul.csv')

In [None]:
# (rows, columns) of the training frame as loaded.
train.shape

In [None]:
# (rows, columns) of the test frame as loaded.
test.shape

In [None]:
# Column labels of the training frame before any column is dropped.
train.columns

In [None]:
# Column labels of the test frame, to compare against the training frame.
test.columns

In [None]:
# Print the DataFrame.info() summary of the training frame.
train.info()

In [None]:
# Remove the 'index' and 'source' columns from train.
# errors='ignore' keeps this cell safe to re-run: once the columns are gone,
# a second execution is a no-op instead of raising KeyError.
train = train.drop(columns=['index', 'source'], errors='ignore')

In [None]:
# Column labels of train after the 'index' and 'source' columns were dropped.
train.columns

In [None]:
# Summary statistics of train, transposed so each column becomes one row.
train.describe().T

In [None]:
# Summary statistics of test, transposed so each column becomes one row.
test.describe().T

In [None]:
# Highest 'cycles_completed' value recorded for every engine, indexed by engine_no.
max_time_cycles = train.groupby('engine_no')[['cycles_completed']].max()

# One horizontal bar per engine; the figure is tall so every bar stays readable.
fig, ax = plt.subplots(figsize=(20, 50))
max_time_cycles['cycles_completed'].plot(
    kind='barh',
    width=0.8,
    stacked=True,
    align='center',
    ax=ax,
)
ax.set_title('Turbofan Engines LifeTime', fontweight='bold', size=30)
ax.set_xlabel('Time cycle', fontweight='bold', size=20)
ax.set_ylabel('unit', fontweight='bold', size=20)
ax.tick_params(axis='both', labelsize=15)
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Histogram (20 bins) with a KDE overlay of each engine's maximum cycle count.
sns.displot(
    data=max_time_cycles,
    x='cycles_completed',
    kde=True,
    bins=20,
    height=6,
    aspect=2,
)
plt.xlabel('max time cycle')

In [None]:
# sns.heatmap(train.corr(),annot=True,cmap='RdYlGn')
# fig=plt.gcf()
# fig.set_size_inches(20,20)
# plt.show()

# Pairwise correlation between the columns of train, printed as plain text.
matrix = train.corr()
print(matrix)

In [None]:
# Correlation heatmap of the train columns, lower triangle only.
corr = train.corr()

# np.triu marks the upper triangle (diagonal included); those cells are hidden
# because the matrix is symmetric and they only repeat the lower triangle.
mask = np.triu(np.ones_like(corr, dtype=bool))

f, ax = plt.subplots(figsize=(10, 10))
cmap = sns.diverging_palette(230, 10, as_cmap=True)

# The colour scale spans the full correlation range [-1, 1]. The previous
# vmax=.3 saturated the palette, so every correlation above 0.3 was drawn in
# the same colour and strong relationships could not be told apart.
sns.heatmap(
    corr,
    mask=mask,
    cmap=cmap,
    vmin=-1,
    vmax=1,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
    ax=ax,
)
ax.set_title('Correlation between train columns (lower triangle)')
plt.show()

In [None]:
# Column labels of train, shown again for reference before the sensor plots.
train.columns

In [None]:
# Hand-written list of column labels: identifiers, operating settings,
# sensor readings and the RUL target.
# NOTE(review): nothing in the visible notebook reads this list; presumably it
# mirrors train.columns after the drop above - confirm or remove.
column_names = ['engine_no', 'cycles_completed', 'altitude', 'mach_no',
                'throttle_angle', 'fan_inlet_temp', 'lpc_outlet_temp',
                'hpc_outlet_temp', 'lpt_outlet_temp', 'fan_inlet_pressure',
                'bypass_duct_pressure', 'hpc_outlet_pressure', 'fan_speed',
                'core_speed', 'engine_pressure_ratio', 'hpc_outlet_static_pressure',
                'fuel_ps30_ratio', 'corrected_fan_speed', 'corrected_core_speed',
                'bypass_ratio', 'burner_fuel_air_ratio', 'bleed_enthalpy',
                'demanded_fan_speed', 'demanded_corrected_fan_speed',
                'hpt_coolant_bleed', 'lpt_coolant_bleed', 'RUL']

# Maps each sensor column name to the human-readable text (description and,
# where given, unit) that plot_signal uses as the y-axis label.
# The keys also define which sensors the plotting loop iterates over.
Sensor_dictionary = {
    'fan_inlet_temp': "(Fan inlet temperature) (◦R)",
    'lpc_outlet_temp': "(LPC outlet temperature) (◦R)",
    'hpc_outlet_temp': "(HPC outlet temperature) (◦R)",
    'lpt_outlet_temp': "(LPT outlet temperature) (◦R)",
    'fan_inlet_pressure': "(Fan inlet Pressure) (psia)",
    'bypass_duct_pressure': "(bypass-duct pressure) (psia)",
    'hpc_outlet_pressure': "(HPC outlet pressure) (psia)",
    'fan_speed': "(Physical fan speed) (rpm)",
    'core_speed': "(Physical core speed) (rpm)",
    'engine_pressure_ratio': "(Engine pressure ratio(P50/P2))",
    'hpc_outlet_static_pressure': "(HPC outlet Static pressure) (psia)",
    'fuel_ps30_ratio': "(Ratio of fuel flow to Ps30) (pps/psia)",
    'corrected_fan_speed': "(Corrected fan speed) (rpm)",
    'corrected_core_speed': "(Corrected core speed) (rpm)",
    'bypass_ratio': "(Bypass Ratio)",
    'burner_fuel_air_ratio': "(Burner fuel-air ratio)",
    'bleed_enthalpy': "(Bleed Enthalpy)",
    'demanded_fan_speed': "(Required fan speed)",
    'demanded_corrected_fan_speed': "(Required fan conversion speed)",
    'hpt_coolant_bleed': "(High-pressure turbines Cool air flow)",
    'lpt_coolant_bleed': "(Low-pressure turbines Cool air flow)"
}

def plot_signal(df, Sensor_dic, signal_name, window=10, engine_step=10):
    """Plot a smoothed sensor signal against remaining useful life.

    One line is drawn for every engine whose ``engine_no`` is a multiple of
    ``engine_step``. Only the sensor column is smoothed; the x-axis uses the
    actual ``RUL`` values. (Previously the rolling mean was applied to the
    whole frame, which also averaged ``RUL`` and shifted each curve along
    the x-axis.)

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'engine_no'``, ``'RUL'`` and ``signal_name``.
    Sensor_dic : dict
        Maps sensor column names to the text used as the y-axis label.
    signal_name : str
        Name of the sensor column to plot.
    window : int, default 10
        Size of the rolling-mean window applied to the sensor values. The
        first ``window - 1`` points of every engine are NaN and not drawn.
    engine_step : int, default 10
        Plot only engines whose number is divisible by this value, to keep
        the figure readable.

    Raises
    ------
    KeyError
        If ``signal_name`` is missing from ``Sensor_dic`` or a required
        column is missing from ``df``. Raised before any figure is created,
        so a failed call does not leave an empty figure behind.
    """
    y_label = Sensor_dic[signal_name]
    missing = [col for col in ('engine_no', 'RUL', signal_name) if col not in df.columns]
    if missing:
        raise KeyError(f"columns missing from df: {missing}")

    fig, ax = plt.subplots(figsize=(13, 5))
    for engine_id in df['engine_no'].unique():
        if engine_id % engine_step != 0:
            continue
        engine_df = df[df['engine_no'] == engine_id]
        smoothed = engine_df[signal_name].rolling(window).mean()
        ax.plot(engine_df['RUL'], smoothed)

    ax.set_xlim(250, 0)  # Reverse the x-axis so RUL counts down to zero
    ax.set_xticks(np.arange(0, 300, 25))
    ax.set_ylabel(y_label)
    ax.set_xlabel('Remaining Useful Life')
    plt.show()

# Draw one figure per sensor listed in Sensor_dictionary, using the train frame.
# A sensor that cannot be plotted is reported and skipped so the rest still render.
for signal in Sensor_dictionary:
    try:
        plot_signal(train, Sensor_dictionary, signal)
    except Exception as err:
        print(f"Could not plot signal {signal}: {err}")

In [None]:
# Sensor columns of train to inspect, in plotting order.
sensor_names = [
    'fan_inlet_temp',
    'lpc_outlet_temp',
    'hpc_outlet_temp',
    'lpt_outlet_temp',
    'fan_inlet_pressure',
    'bypass_duct_pressure',
    'hpc_outlet_pressure',
    'fan_speed',
    'core_speed',
    'engine_pressure_ratio',
    'hpc_outlet_static_pressure',
    'fuel_ps30_ratio',
    'corrected_fan_speed',
    'corrected_core_speed',
    'bypass_ratio',
    'burner_fuel_air_ratio',
    'bleed_enthalpy',
    'demanded_fan_speed',
    'demanded_corrected_fan_speed',
    'hpt_coolant_bleed',
    'lpt_coolant_bleed',
]

# One horizontal boxplot per sensor, each in its own figure.
for sensor in sensor_names:
    fig, ax = plt.subplots(figsize=(15, 8))
    readings = train[sensor].dropna()  # missing values are excluded from the plot
    ax.boxplot(readings, vert=False)
    ax.set_title(f'Boxplot of {sensor}')
    ax.set_xlabel(sensor)
    plt.show()

# # Plot boxplots for each sensor
# for sensor in sensor_names:
#     plt.figure(figsize=(15, 8))
#     plt.boxplot(test[sensor].dropna(), vert=False)  # Use dropna() to handle any missing values
#     plt.title(f'Boxplot of {sensor}')
#     plt.xlabel(sensor)
#     plt.show()

In [None]:
# Number of rows per engine as a two-column frame ('engine', 'count'),
# in the order produced by value_counts().
engine_counts = (
    train['engine_no']
    .value_counts()
    .rename_axis('engine')
    .reset_index(name='count')
)

# # Drawing bars with seaborn
# plt.figure(figsize=(22, 12))
# sns.barplot(x='count', y='engine', data=engine_counts)
# plt.title('Count of Each Engine')
# plt.xlabel('Count')
# plt.ylabel('Engine Number')
# plt.xticks(rotation=90)
# plt.show()

# Row count per engine, with the bars ordered as in engine_counts.
fig, ax = plt.subplots(figsize=(22, 20))
sns.countplot(data=train, y='engine_no', order=engine_counts['engine'], ax=ax)
ax.set_title('Count of Each Engine')
ax.set_xlabel('Count')
ax.set_ylabel('Engine Number')
plt.show()

In [None]:
# Largest 'cycles_completed' value per engine, as a frame with the columns
# 'engine_no' and 'max_cycle', sorted by engine number.
max_cycle_per_engine = (
    train.groupby('engine_no')['cycles_completed']
    .max()
    .rename('max_cycle')
    .reset_index()
    .sort_values('engine_no')
)

# # Drawing bar graphs
# plt.figure(figsize=(20, 15))
# plt.bar(max_cycle_per_engine['engine_no'].astype(str), max_cycle_per_engine['max_cycle'])
# plt.title('Maximum Cycle per Engine')
# plt.xlabel('Engine Number')
# plt.ylabel('Maximum Cycle')
# plt.xticks(rotation=90)
# plt.show()

import pandas as pd

# Width of each cycle range, in cycles.
bin_width = 50

# The ranges always cover at least 0-400 (the original fixed layout), but the
# top edge grows in steps of bin_width when an engine ran longer than that.
# With fixed bins, pd.cut returned NaN for such engines and they silently
# disappeared from the counts.
longest_run = max_cycle_per_engine['max_cycle'].max()
top_edge = 400
if pd.notna(longest_run) and longest_run > top_edge:
    top_edge = int(np.ceil(longest_run / bin_width)) * bin_width

# Bin edges 0, 50, 100, ... and matching labels '0-50', '51-100', '101-150', ...
bins = list(range(0, top_edge + bin_width, bin_width))
labels = [
    f'{low + 1 if low else 0}-{high}'
    for low, high in zip(bins[:-1], bins[1:])
]

# Create a new column 'cycle_bin' to categorize 'max_cycle' into bins
max_cycle_per_engine['cycle_bin'] = pd.cut(max_cycle_per_engine['max_cycle'], bins=bins, labels=labels)

# Count the number of engines in each bin, kept in bin order (empty bins included)
bin_counts = max_cycle_per_engine['cycle_bin'].value_counts().sort_index()

import matplotlib.pyplot as plt

# Bar chart of how many engines fall into each cycle range.
fig, ax = plt.subplots(figsize=(12, 8))
ax.bar(bin_counts.index, bin_counts.values)
ax.set_title('Number of Engines in Each Cycle Range')
ax.set_xlabel('Cycle Range')
ax.set_ylabel('Number of Engines')
ax.tick_params(axis='x', labelrotation=45)
plt.show()

# ML

In [None]:
# Column labels of train, shown again before the model inputs are selected.
train.columns

In [None]:
# Columns that are dropped from the frames before modelling and before the
# z-score outlier screening (see the cells that use index_names).
index_names = ['engine_no', 'cycles_completed']

In [None]:
from sklearn.model_selection import train_test_split
# Columns left out of the model inputs.
drop_labels = index_names
model_frame = train.drop(columns=drop_labels).copy()

# 70/30 row-wise split with a fixed seed. The 'RUL' column is still present in
# X_train / X_test at this point; it is removed in the scaling cell.
# NOTE(review): rows of the same engine can land on both sides of this split -
# confirm that is acceptable for the hold-out estimate.
X_train, X_test, y_train, y_test = train_test_split(
    model_frame,
    model_frame['RUL'],
    test_size=0.3,
    random_state=42,
)

In [None]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

# Remove the target from the feature frames. Done without inplace=True and
# with errors='ignore' so that re-running this cell does not raise KeyError
# once 'RUL' is already gone.
X_train = X_train.drop(columns=['RUL'], errors='ignore')
X_test = X_test.drop(columns=['RUL'], errors='ignore')

# Fit the scaler on the training features ONLY, then reuse those min/max
# values for the hold-out split. Calling fit_transform on X_test (as before)
# re-fitted the scaler on the hold-out data: the hold-out features were scaled
# differently from the data the model was trained on, and the validation
# transform below used the hold-out statistics as well.
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

# Prepare validation data from the separate test file: drop the columns that
# are not model inputs and scale with the training-fitted scaler.
drop_labels_valid = index_names + ['remaining_useful_life', 'index_y', 'source', 'index_x']
X_valid = test.drop(columns=drop_labels_valid)
X_valid_s = scaler.transform(X_valid)
y_valid = test['remaining_useful_life']


In [None]:
# Display the (unscaled) validation feature frame built from test.
X_valid

In [None]:
# Display the validation target, i.e. test['remaining_useful_life'].
y_valid

In [None]:
# Print the shapes of the scaled validation features and of the validation
# target; the first dimension of both should match.
print(X_valid_s.shape)
print(y_valid.shape)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Fit an ordinary linear regression on the scaled training features.
lr = LinearRegression()
lr.fit(X_train_s, y_train)

# Score the hold-out split: MSE, its square root (RMSE) and R-squared.
y_pred_test = lr.predict(X_test_s)
mse_test = mean_squared_error(y_test, y_pred_test)
rmse_test = np.sqrt(mse_test)
r2_test = r2_score(y_test, y_pred_test)

print(f"Test RMSE: {rmse_test}")
print(f"Test Mean Squared Error: {mse_test}")
print()
print(f"Test R-squared: {r2_test}")

In [None]:
# Score the same fitted model on the validation set taken from the test file.
y_pred_valid = lr.predict(X_valid_s)

mse_valid = mean_squared_error(y_valid, y_pred_valid)
rmse_valid = np.sqrt(mse_valid)
r2_valid = r2_score(y_valid, y_pred_valid)

print(f"Validation RMSE: {rmse_valid}")
print(f"Validation Mean Squared Error: {mse_valid}")
print(f"Validation R-squared: {r2_valid}")

# REMOVING OUTLIERS

In [None]:
import numpy as np
import pandas as pd
from scipy import stats

# Screen every column of train except the identifier columns.
# NOTE(review): this includes the 'RUL' target, so rows with an extreme RUL
# are removed as well - confirm that this is intended.
screened = train.drop(columns=index_names)

# Absolute z-score of every value, computed column by column.
# nan_policy='omit' stops a single missing value from turning the z-scores of
# its whole column into NaN.
z_scores = np.abs(np.asarray(
    stats.zscore(screened.to_numpy(dtype=float), nan_policy='omit')
))

# Set a threshold for Z-scores
threshold = 3

# (row positions, column positions) of every value flagged as an outlier
outliers = np.where(z_scores > threshold)

# Keep the rows that contain no flagged value. This is written as
# "no z-score above the threshold" instead of "all z-scores below the
# threshold" on purpose: a NaN z-score (constant column, missing value)
# compares False either way, and with the old form one such column discarded
# EVERY row. It also makes the mask use the same test as `outliers`
# (a z-score exactly equal to the threshold is kept).
mask = ~(z_scores > threshold).any(axis=1)

# Filter out outliers
train_clean = train[mask]

print("Shape of dataset before dropping outliers:", train.shape)
print("Shape of dataset after dropping outliers:", train_clean.shape)

# Now, use 'train_clean' for further processing


In [None]:
# Rebuild the feature / target split from the outlier-filtered frame.
clean_features = train_clean.drop(columns=['RUL'] + index_names)
clean_target = train_clean['RUL']
X_train_clean, X_test_clean, y_train_clean, y_test_clean = train_test_split(
    clean_features,
    clean_target,
    test_size=0.3,
    random_state=42,
)

# A fresh scaler, fitted on the cleaned training features only and then
# applied unchanged to the hold-out split (and to the validation data below).
scaler = MinMaxScaler()
scaler.fit(X_train_clean)
X_train_s_clean = scaler.transform(X_train_clean)
X_test_s_clean = scaler.transform(X_test_clean)

# Fit a new linear regression on the cleaned data and score the hold-out split.
lr_clean = LinearRegression()
lr_clean.fit(X_train_s_clean, y_train_clean)
y_pred_test_clean = lr_clean.predict(X_test_s_clean)

mse_test_clean = mean_squared_error(y_test_clean, y_pred_test_clean)
rmse_test_clean = np.sqrt(mse_test_clean)
r2_test_clean = r2_score(y_test_clean, y_pred_test_clean)

print(f"Test RMSE after cleaning: {rmse_test_clean}")
print(f"Test R-squared after cleaning: {r2_test_clean}")

# Validation uses the original (unfiltered) validation features, scaled with
# the scaler fitted on the cleaned training data.
X_valid_s_clean = scaler.transform(X_valid)
y_pred_valid_clean = lr_clean.predict(X_valid_s_clean)

mse_valid_clean = mean_squared_error(y_valid, y_pred_valid_clean)
rmse_valid_clean = np.sqrt(mse_valid_clean)
r2_valid_clean = r2_score(y_valid, y_pred_valid_clean)

print(f"Validation RMSE after cleaning: {rmse_valid_clean}")
print(f"Validation R-squared after cleaning: {r2_valid_clean}")
