## I. Setup & Data Loading

In [None]:
# Install required packages into the environment of the running kernel (%pip targets the kernel).
# NOTE(review): versions are not pinned, so a later run may resolve different releases — consider pinning.
%pip install lightgbm shap scikit-learn matplotlib seaborn

In [None]:
# Import libraries
import json
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

warnings.filterwarnings('ignore')

# Machine learning
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import (
    mean_squared_error, mean_absolute_error, r2_score,
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report
)
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
import shap

# Plotting settings
# The matplotlib style sheet is applied first and the seaborn palette afterwards;
# NOTE(review): the order looks deliberate (palette set after the style) — confirm before reordering.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')
# Render figures inside the notebook output cells.
%matplotlib inline

print("‚úì Libraries imported successfully")

In [None]:
# Read the engineered compressor feature table from disk
DATA_DIR = Path('../data/features')
FEATURES_FILE = DATA_DIR.joinpath('compressor_features.csv')

print(f"Loading data from: {FEATURES_FILE}")
df = pd.read_csv(FEATURES_FILE)

# The CSV stores timestamps as text; turn them into real datetimes
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Short load report: size, covered period and the compressors present
print(f"\n‚úì Data loaded successfully")
print(f"  Shape: {df.shape}")
first_seen, last_seen = df['timestamp'].min(), df['timestamp'].max()
print(f"  Date range: {first_seen} to {last_seen}")
print(f"  Compressors: {df['equipment_id'].unique()}")
print(f"\nFirst few rows:")
df.head()

## II. Exploratory Data Analysis

In [None]:
# High-level summary of the loaded table: size, dtypes, gaps and headline averages
overview_rule = "=" * 70
print(overview_rule)
print("COMPRESSOR DATASET OVERVIEW")
print(overview_rule)

total_records = len(df)
print(f"\nTotal records: {total_records:,}")
print(f"Unique compressors: {df['equipment_id'].nunique()}")
print(f"Features: {df.shape[1]}")

print(f"\nData types:")
print(df.dtypes.value_counts())

# Only list the columns that actually contain missing values
print(f"\nMissing values:")
missing = df.isnull().sum()
print(missing[missing > 0] if missing.sum() > 0 else "No missing values")

print(f"\nKey statistics:")
print(f"  Mean health index: {df['health_index'].mean():.3f}")
print(f"  Mean efficiency: {df['efficiency_normalized'].mean():.3f}")
print(f"  Mean RUL: {df['rul_days'].mean():.1f} days")
anomaly_total = df['is_anomaly'].sum()
print(f"  Anomaly rate: {anomaly_total / total_records * 100:.2f}%")

In [None]:
# Same headline numbers as the overview, broken down by compressor
print("\n" + "="*70)
print("PER-COMPRESSOR STATISTICS")
print("="*70)

for comp_id in df['equipment_id'].unique():
    comp_df = df.loc[df['equipment_id'] == comp_id]
    n_records = len(comp_df)
    n_anomalies = comp_df['is_anomaly'].sum()

    print(f"\n{comp_id}:")
    print(f"  Records: {n_records:,}")
    print(f"  Health: {comp_df['health_index'].mean():.3f}")
    print(f"  Efficiency: {comp_df['efficiency_normalized'].mean():.3f}")
    print(f"  RUL: {comp_df['rul_days'].mean():.1f} days")
    print(f"  Anomalies: {n_anomalies} ({n_anomalies/n_records*100:.1f}%)")

In [None]:
# Visualize key metrics in a 2x3 grid and save the figure to the evaluation folder.

# The evaluation folder is otherwise only created further down the notebook, so on a
# fresh checkout plt.savefig below would fail. Create it here before writing.
eda_output_dir = Path('../models/compressor_evaluation')
eda_output_dir.mkdir(parents=True, exist_ok=True)

fig, axes = plt.subplots(2, 3, figsize=(18, 10))

# Health index distribution
axes[0, 0].hist(df['health_index'], bins=50, edgecolor='black')
axes[0, 0].set_title('Health Index Distribution')
axes[0, 0].set_xlabel('Health Index')
axes[0, 0].set_ylabel('Frequency')

# Efficiency distribution
axes[0, 1].hist(df['efficiency_normalized'], bins=50, edgecolor='black', color='orange')
axes[0, 1].set_title('Efficiency Distribution')
axes[0, 1].set_xlabel('Efficiency (normalized)')
axes[0, 1].set_ylabel('Frequency')

# RUL distribution
axes[0, 2].hist(df['rul_days'], bins=50, edgecolor='black', color='green')
axes[0, 2].set_title('RUL Distribution')
axes[0, 2].set_xlabel('RUL (days)')
axes[0, 2].set_ylabel('Frequency')

# Health trend over time (first 5000 rows of COMP_001 only, to keep the line readable)
sample_df = df[df['equipment_id'] == 'COMP_001'].head(5000)
axes[1, 0].plot(sample_df['timestamp'], sample_df['health_index'])
axes[1, 0].set_title('Health Index Trend (COMP_001 sample)')
axes[1, 0].set_xlabel('Time')
axes[1, 0].set_ylabel('Health Index')
axes[1, 0].tick_params(axis='x', rotation=45)

# Efficiency vs Health (low alpha because every row is drawn)
axes[1, 1].scatter(df['efficiency_normalized'], df['health_index'], alpha=0.1)
axes[1, 1].set_title('Efficiency vs Health Index')
axes[1, 1].set_xlabel('Efficiency (normalized)')
axes[1, 1].set_ylabel('Health Index')

# Anomaly rate by compressor, as a percentage of that compressor's rows
anomaly_rate = df.groupby('equipment_id')['is_anomaly'].mean() * 100
axes[1, 2].bar(anomaly_rate.index, anomaly_rate.values, color=['red', 'orange', 'yellow'])
axes[1, 2].set_title('Anomaly Rate by Compressor')
axes[1, 2].set_xlabel('Compressor')
axes[1, 2].set_ylabel('Anomaly Rate (%)')

plt.tight_layout()
plt.savefig(eda_output_dir / 'eda_overview.png', dpi=300, bbox_inches='tight')
plt.show()

print("‚úì EDA plots saved")

## III. Feature Selection

In [None]:
# Columns that are identifiers, timestamps or targets and must never be model inputs
exclude_cols = [
    'equipment_id', 'timestamp', 'location', 'manufacturer', 'model',
    'health_index', 'rul_days', 'is_anomaly', 'efficiency_normalized'
]

# Model inputs, grouped by the kind of signal they carry (group order = column order)
feature_groups = {
    'Operational': [
        'motor_speed_rpm', 'flow_rate_m3h', 'discharge_pressure_bar',
        'suction_pressure_bar', 'motor_power_kw', 'temperature_c',
        'pressure_ratio', 'specific_power', 'efficiency_proxy', 'load_factor'
    ],
    'Vibration': [
        'vibration_rms_mms', 'vibration_peak_mms', 'vibration_severity',
        'bearing_health_indicator', 'vibration_trend_slope'
    ],
    'Rolling stats': [
        'motor_speed_rolling_mean', 'motor_speed_rolling_std',
        'temperature_rolling_mean', 'temperature_rolling_std',
        'vibration_rms_rolling_mean', 'vibration_rms_rolling_std',
        'efficiency_rolling_mean', 'efficiency_rolling_std'
    ],
    'Condition scores': ['bearing_condition_score', 'seal_condition_score'],
}

operational_features = feature_groups['Operational']
vibration_features = feature_groups['Vibration']
rolling_features = feature_groups['Rolling stats']
condition_features = feature_groups['Condition scores']

# Flat list used by every model below
all_features = [name for group in feature_groups.values() for name in group]

print(f"Total features for modeling: {len(all_features)}")
print(f"\nFeature categories:")
for group_label, group_columns in feature_groups.items():
    print(f"  {group_label}: {len(group_columns)}")

# Report any requested feature that the loaded table does not contain
missing_features = [name for name in all_features if name not in df.columns]
if not missing_features:
    print(f"\n‚úì All features present in dataset")
else:
    print(f"\n‚ö† Warning: Missing features: {missing_features}")

## IV. Model 1 - Efficiency Degradation Prediction

In [None]:
# Prepare data for efficiency prediction
print("="*70)
print("MODEL 1: EFFICIENCY DEGRADATION PREDICTION")
print("="*70)

# Drop rows where any model input or the efficiency target is NaN
# (rolling-window features are typically undefined for the first rows of a series).
df_clean = df.dropna(subset=all_features + ['efficiency_normalized'])

X_eff = df_clean[all_features]
y_eff = df_clean['efficiency_normalized']

# Train/test split (80/20, stratified by compressor).
# The stratify argument was previously missing even though the split was described as
# stratified; passing equipment_id keeps each compressor's share equal in both parts.
# NOTE(review): rows are still shuffled across time, so neighbouring timestamps can land on
# both sides of the split — consider a time-based split if leakage is a concern.
X_train_eff, X_test_eff, y_train_eff, y_test_eff = train_test_split(
    X_eff, y_eff,
    test_size=0.2,
    random_state=42,
    stratify=df_clean['equipment_id'],
)

print(f"\nData split:")
print(f"  Train: {len(X_train_eff):,} samples")
print(f"  Test: {len(X_test_eff):,} samples")
print(f"  Features: {X_train_eff.shape[1]}")

In [None]:
# Hyper-parameters of the LightGBM regressor that predicts normalized efficiency
params_eff = dict(
    objective='regression',
    metric='rmse',
    boosting_type='gbdt',
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=-1,
    random_state=42,
)

print("\nTraining LightGBM model...")
train_data_eff = lgb.Dataset(X_train_eff, label=y_train_eff)
test_data_eff = lgb.Dataset(X_test_eff, label=y_test_eff, reference=train_data_eff)

# Stop once the monitored metric has not improved for 50 rounds; log every 100 rounds.
# NOTE(review): the held-out test set doubles as the early-stopping set, so the test
# metrics reported later are not fully independent of training.
eff_callbacks = [lgb.early_stopping(stopping_rounds=50), lgb.log_evaluation(100)]

model_eff = lgb.train(
    params_eff,
    train_data_eff,
    num_boost_round=1000,
    valid_sets=[train_data_eff, test_data_eff],
    valid_names=['train', 'test'],
    callbacks=eff_callbacks,
)

print("\n‚úì Model training complete")

In [None]:
# Evaluate efficiency model on both splits, print the scores and persist them as JSON.
# `json` comes from the notebook's import cell rather than being imported mid-notebook.

# Predict with the iteration chosen by early stopping
y_pred_eff_train = model_eff.predict(X_train_eff, num_iteration=model_eff.best_iteration)
y_pred_eff_test = model_eff.predict(X_test_eff, num_iteration=model_eff.best_iteration)

# Calculate metrics (RMSE is the square root of sklearn's mean squared error)
train_rmse_eff = np.sqrt(mean_squared_error(y_train_eff, y_pred_eff_train))
test_rmse_eff = np.sqrt(mean_squared_error(y_test_eff, y_pred_eff_test))
train_mae_eff = mean_absolute_error(y_train_eff, y_pred_eff_train)
test_mae_eff = mean_absolute_error(y_test_eff, y_pred_eff_test)
train_r2_eff = r2_score(y_train_eff, y_pred_eff_train)
test_r2_eff = r2_score(y_test_eff, y_pred_eff_test)

print("\n" + "="*70)
print("EFFICIENCY MODEL EVALUATION")
print("="*70)
print(f"\nTrain Set:")
print(f"  RMSE: {train_rmse_eff:.4f}")
print(f"  MAE:  {train_mae_eff:.4f}")
print(f"  R¬≤:   {train_r2_eff:.4f}")
print(f"\nTest Set:")
print(f"  RMSE: {test_rmse_eff:.4f}")
print(f"  MAE:  {test_mae_eff:.4f}")
print(f"  R¬≤:   {test_r2_eff:.4f}")

# Save metrics (cast to plain floats so json can serialize them)
metrics_eff = {
    'model': 'Efficiency Degradation',
    'train_rmse': float(train_rmse_eff),
    'test_rmse': float(test_rmse_eff),
    'train_mae': float(train_mae_eff),
    'test_mae': float(test_mae_eff),
    'train_r2': float(train_r2_eff),
    'test_r2': float(test_r2_eff)
}

# Make sure the evaluation folder exists before writing into it
metrics_dir_eff = Path('../models/compressor_evaluation')
metrics_dir_eff.mkdir(parents=True, exist_ok=True)
with open(metrics_dir_eff / 'metrics_efficiency.json', 'w') as f:
    json.dump(metrics_eff, f, indent=2)

print("\n‚úì Metrics saved")

In [None]:
# Two diagnostic panels for the efficiency model: predicted-vs-actual and residuals
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
fit_panel, resid_panel = axes

# Left: test-set predictions against the truth, with the y = x reference line
eff_low, eff_high = y_test_eff.min(), y_test_eff.max()
fit_panel.scatter(y_test_eff, y_pred_eff_test, alpha=0.3)
fit_panel.plot([eff_low, eff_high], [eff_low, eff_high], 'r--', lw=2)
fit_panel.set_xlabel('Actual Efficiency')
fit_panel.set_ylabel('Predicted Efficiency')
fit_panel.set_title(f'Efficiency Model - Test Set\nR¬≤ = {test_r2_eff:.3f}, RMSE = {test_rmse_eff:.4f}')

# Right: residuals (actual minus predicted) against the prediction
residuals_eff = y_test_eff - y_pred_eff_test
resid_panel.scatter(y_pred_eff_test, residuals_eff, alpha=0.3)
resid_panel.axhline(y=0, color='r', linestyle='--', lw=2)
resid_panel.set_xlabel('Predicted Efficiency')
resid_panel.set_ylabel('Residuals')
resid_panel.set_title('Residual Plot')

for panel in (fit_panel, resid_panel):
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('../models/compressor_evaluation/efficiency_predictions.png', dpi=300, bbox_inches='tight')
plt.show()

print("‚úì Prediction plots saved")

## V. Model 2 - RUL Prediction

In [None]:
# Prepare data for RUL prediction
print("="*70)
print("MODEL 2: RUL PREDICTION")
print("="*70)

# df_clean was filtered on the model inputs and the efficiency target only, so it does not
# guarantee a usable RUL label. Drop rows without one; this is a no-op when rul_days is complete.
rul_rows = df_clean.dropna(subset=['rul_days'])

X_rul = rul_rows[all_features]
y_rul = rul_rows['rul_days']

# Train/test split (80/20, shuffled with a fixed seed)
X_train_rul, X_test_rul, y_train_rul, y_test_rul = train_test_split(
    X_rul, y_rul, test_size=0.2, random_state=42
)

print(f"\nData split:")
print(f"  Train: {len(X_train_rul):,} samples")
print(f"  Test: {len(X_test_rul):,} samples")
print(f"  RUL range: {y_rul.min():.0f} - {y_rul.max():.0f} days")

In [None]:
# Hyper-parameters of the LightGBM regressor that predicts remaining useful life (days)
params_rul = dict(
    objective='regression',
    metric='rmse',
    boosting_type='gbdt',
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=-1,
    random_state=42,
)

print("\nTraining LightGBM model for RUL...")
train_data_rul = lgb.Dataset(X_train_rul, label=y_train_rul)
test_data_rul = lgb.Dataset(X_test_rul, label=y_test_rul, reference=train_data_rul)

# Early stopping after 50 rounds without improvement; progress is logged every 100 rounds
rul_callbacks = [lgb.early_stopping(stopping_rounds=50), lgb.log_evaluation(100)]

model_rul = lgb.train(
    params_rul,
    train_data_rul,
    num_boost_round=1000,
    valid_sets=[train_data_rul, test_data_rul],
    valid_names=['train', 'test'],
    callbacks=rul_callbacks,
)

print("\n‚úì RUL model training complete")

In [None]:
# Score the RUL regressor on both splits, print the results and store them as JSON
best_round_rul = model_rul.best_iteration
y_pred_rul_train = model_rul.predict(X_train_rul, num_iteration=best_round_rul)
y_pred_rul_test = model_rul.predict(X_test_rul, num_iteration=best_round_rul)

# Error metrics, one split at a time
train_rmse_rul = np.sqrt(mean_squared_error(y_train_rul, y_pred_rul_train))
train_mae_rul = mean_absolute_error(y_train_rul, y_pred_rul_train)
train_r2_rul = r2_score(y_train_rul, y_pred_rul_train)

test_rmse_rul = np.sqrt(mean_squared_error(y_test_rul, y_pred_rul_test))
test_mae_rul = mean_absolute_error(y_test_rul, y_pred_rul_test)
test_r2_rul = r2_score(y_test_rul, y_pred_rul_test)

print("\n" + "="*70)
print("RUL MODEL EVALUATION")
print("="*70)
rul_report_rows = (
    ("Train", train_rmse_rul, train_mae_rul, train_r2_rul),
    ("Test", test_rmse_rul, test_mae_rul, test_r2_rul),
)
for split_label, split_rmse, split_mae, split_r2 in rul_report_rows:
    print(f"\n{split_label} Set:")
    print(f"  RMSE: {split_rmse:.1f} days")
    print(f"  MAE:  {split_mae:.1f} days")
    print(f"  R¬≤:   {split_r2:.4f}")

# Plain floats so the values serialize cleanly
metrics_rul = dict(
    model='RUL Prediction',
    train_rmse=float(train_rmse_rul),
    test_rmse=float(test_rmse_rul),
    train_mae=float(train_mae_rul),
    test_mae=float(test_mae_rul),
    train_r2=float(train_r2_rul),
    test_r2=float(test_r2_rul),
)

with open('../models/compressor_evaluation/metrics_rul.json', 'w') as f:
    json.dump(metrics_rul, f, indent=2)

print("\n‚úì Metrics saved")

In [None]:
# Two diagnostic panels for the RUL model: predicted-vs-actual and residuals
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
fit_panel, resid_panel = axes

# Left: test-set predictions against the truth, with the y = x reference line
rul_low, rul_high = y_test_rul.min(), y_test_rul.max()
fit_panel.scatter(y_test_rul, y_pred_rul_test, alpha=0.3)
fit_panel.plot([rul_low, rul_high], [rul_low, rul_high], 'r--', lw=2)
fit_panel.set_xlabel('Actual RUL (days)')
fit_panel.set_ylabel('Predicted RUL (days)')
fit_panel.set_title(f'RUL Model - Test Set\nR¬≤ = {test_r2_rul:.3f}, RMSE = {test_rmse_rul:.1f} days')

# Right: residuals (actual minus predicted) against the prediction
residuals_rul = y_test_rul - y_pred_rul_test
resid_panel.scatter(y_pred_rul_test, residuals_rul, alpha=0.3)
resid_panel.axhline(y=0, color='r', linestyle='--', lw=2)
resid_panel.set_xlabel('Predicted RUL (days)')
resid_panel.set_ylabel('Residuals (days)')
resid_panel.set_title('Residual Plot')

for panel in (fit_panel, resid_panel):
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('../models/compressor_evaluation/rul_predictions.png', dpi=300, bbox_inches='tight')
plt.show()

print("‚úì RUL prediction plots saved")

## VI. Model 3 - Anomaly Classification

In [None]:
# Build the inputs and the 0/1 label for the anomaly classifier
print("="*70)
print("MODEL 3: ANOMALY CLASSIFICATION")
print("="*70)

X_anom = df_clean[all_features]
y_anom = df_clean['is_anomaly'].astype(int)

# 80/20 split that keeps the anomaly ratio identical in both parts
X_train_anom, X_test_anom, y_train_anom, y_test_anom = train_test_split(
    X_anom,
    y_anom,
    test_size=0.2,
    random_state=42,
    stratify=y_anom,
)

print(f"\nData split:")
print(f"  Train: {len(X_train_anom):,} samples")
print(f"  Test: {len(X_test_anom):,} samples")

# Class balance of the training labels
train_label_count = len(y_train_anom)
normal_count = (y_train_anom == 0).sum()
anomaly_count = (y_train_anom == 1).sum()
print(f"\nClass distribution (train):")
print(f"  Normal: {normal_count:,} ({normal_count/train_label_count*100:.1f}%)")
print(f"  Anomaly: {anomaly_count:,} ({anomaly_count/train_label_count*100:.1f}%)")

In [None]:
# Hyper-parameters of the LightGBM binary classifier that flags anomalies
params_anom = dict(
    objective='binary',
    metric='binary_logloss',
    boosting_type='gbdt',
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    is_unbalance=True,  # Handle class imbalance
    verbose=-1,
    random_state=42,
)

print("\nTraining LightGBM classifier...")
train_data_anom = lgb.Dataset(X_train_anom, label=y_train_anom)
test_data_anom = lgb.Dataset(X_test_anom, label=y_test_anom, reference=train_data_anom)

# Early stopping after 50 rounds without improvement; progress is logged every 100 rounds
anom_callbacks = [lgb.early_stopping(stopping_rounds=50), lgb.log_evaluation(100)]

model_anom = lgb.train(
    params_anom,
    train_data_anom,
    num_boost_round=1000,
    valid_sets=[train_data_anom, test_data_anom],
    valid_names=['train', 'test'],
    callbacks=anom_callbacks,
)

print("\n‚úì Anomaly model training complete")

In [None]:
# Score the anomaly classifier on the test split and store the headline metrics
y_pred_anom_prob_test = model_anom.predict(X_test_anom, num_iteration=model_anom.best_iteration)
# Probabilities above 0.5 count as a predicted anomaly
y_pred_anom_test = (y_pred_anom_prob_test > 0.5).astype(int)

# Accuracy, precision, recall and F1 in that order
acc, prec, rec, f1 = (
    scorer(y_test_anom, y_pred_anom_test)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

print("\n" + "="*70)
print("ANOMALY MODEL EVALUATION")
print("="*70)
print(f"\nTest Set Metrics:")
print(f"  Accuracy:  {acc:.4f}")
print(f"  Precision: {prec:.4f}")
print(f"  Recall:    {rec:.4f}")
print(f"  F1-Score:  {f1:.4f}")

print(f"\nClassification Report:")
class_names = ['Normal', 'Anomaly']
print(classification_report(y_test_anom, y_pred_anom_test, target_names=class_names))

# Confusion matrix: rows are the actual class, columns the predicted class
cm = confusion_matrix(y_test_anom, y_pred_anom_test)
print(f"\nConfusion Matrix:")
print(f"                Predicted")
print(f"Actual    Normal  Anomaly")
print(f"Normal    {cm[0,0]:6d}  {cm[0,1]:6d}")
print(f"Anomaly   {cm[1,0]:6d}  {cm[1,1]:6d}")

# Plain floats so the values serialize cleanly
metrics_anom = dict(
    model='Anomaly Classification',
    accuracy=float(acc),
    precision=float(prec),
    recall=float(rec),
    f1_score=float(f1),
)

with open('../models/compressor_evaluation/metrics_anomaly.json', 'w') as f:
    json.dump(metrics_anom, f, indent=2)

print("\n‚úì Metrics saved")

In [None]:
# Heatmap of the anomaly classifier's confusion matrix (actual on rows, predicted on columns)
plt.figure(figsize=(8, 6))
cm_tick_labels = ['Normal', 'Anomaly']
cm_axis = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=cm_tick_labels,
    yticklabels=cm_tick_labels,
)
cm_axis.set_title(f'Confusion Matrix - Anomaly Classification\nAccuracy: {acc:.3f}, F1: {f1:.3f}')
cm_axis.set_ylabel('Actual')
cm_axis.set_xlabel('Predicted')
plt.tight_layout()
plt.savefig('../models/compressor_evaluation/confusion_matrix.png', dpi=300, bbox_inches='tight')
plt.show()

print("‚úì Confusion matrix saved")

## VII. SHAP Feature Importance Analysis

In [None]:
# Explain the efficiency model with SHAP on the first 1000 test rows (a sample, for speed)
print("Calculating SHAP values for Efficiency model...")
shap_rows_eff = X_test_eff.head(1000)
explainer_eff = shap.TreeExplainer(model_eff)
shap_values_eff = explainer_eff.shap_values(shap_rows_eff)

# Summary plot; show=False keeps the figure open so it can be titled and saved
plt.figure(figsize=(12, 8))
shap.summary_plot(shap_values_eff, shap_rows_eff, show=False)
plt.title('SHAP Feature Importance - Efficiency Model', fontsize=14, pad=20)
plt.tight_layout()
plt.savefig('../models/compressor_evaluation/shap_efficiency.png', dpi=300, bbox_inches='tight')
plt.show()

print("‚úì SHAP analysis complete for Efficiency model")

In [None]:
# Explain the RUL model with SHAP on the first 1000 test rows
print("Calculating SHAP values for RUL model...")
shap_rows_rul = X_test_rul.head(1000)
explainer_rul = shap.TreeExplainer(model_rul)
shap_values_rul = explainer_rul.shap_values(shap_rows_rul)

# Summary plot; show=False keeps the figure open so it can be titled and saved
plt.figure(figsize=(12, 8))
shap.summary_plot(shap_values_rul, shap_rows_rul, show=False)
plt.title('SHAP Feature Importance - RUL Model', fontsize=14, pad=20)
plt.tight_layout()
plt.savefig('../models/compressor_evaluation/shap_rul.png', dpi=300, bbox_inches='tight')
plt.show()

print("‚úì SHAP analysis complete for RUL model")

In [None]:
# Side-by-side bar charts of the 15 highest gain-importance features of each regressor

# Efficiency model: pair every input name with its gain and keep the top 15
importance_eff = (
    pd.DataFrame({
        'feature': all_features,
        'importance': model_eff.feature_importance(importance_type='gain')
    })
    .sort_values('importance', ascending=False)
    .head(15)
)

# RUL model: same ranking
importance_rul = (
    pd.DataFrame({
        'feature': all_features,
        'importance': model_rul.feature_importance(importance_type='gain')
    })
    .sort_values('importance', ascending=False)
    .head(15)
)

fig, axes = plt.subplots(1, 2, figsize=(18, 8))

importance_panels = (
    (axes[0], importance_eff, 'Top 15 Features - Efficiency Model'),
    (axes[1], importance_rul, 'Top 15 Features - RUL Model'),
)
for panel, ranked, panel_title in importance_panels:
    bar_positions = range(len(ranked))
    panel.barh(bar_positions, ranked['importance'])
    panel.set_yticks(bar_positions)
    panel.set_yticklabels(ranked['feature'])
    panel.set_xlabel('Importance (Gain)')
    panel.set_title(panel_title)
    # Put the most important feature at the top of the chart
    panel.invert_yaxis()

plt.tight_layout()
plt.savefig('../models/compressor_evaluation/feature_importance_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

print("‚úì Feature importance comparison saved")

## VIII. Predictions & Maintenance Scheduling

In [None]:
# Generate predictions for all compressors from each unit's most recent clean record
print("="*70)
print("GENERATING PREDICTIONS FOR MAINTENANCE SCHEDULING")
print("="*70)

# Get latest data point per compressor.
# groupby(...).last() is not used here: it returns the last *non-null value of each column*
# in file order, so it ignores the timestamp and can stitch one "row" together from several
# records. Sorting by timestamp and taking the final row per group yields one real record.
# The stable sort keeps file order among equal timestamps.
latest_data = (
    df_clean
    .sort_values('timestamp', kind='stable')
    .groupby('equipment_id')
    .tail(1)
    .sort_values('equipment_id')
    .reset_index(drop=True)
)

X_latest = latest_data[all_features]

# Predictions from the three trained models, each at its early-stopped iteration
pred_efficiency = model_eff.predict(X_latest, num_iteration=model_eff.best_iteration)
pred_rul = model_rul.predict(X_latest, num_iteration=model_rul.best_iteration)
pred_anomaly_prob = model_anom.predict(X_latest, num_iteration=model_anom.best_iteration)
# Probabilities above 0.5 count as a predicted anomaly
pred_anomaly = (pred_anomaly_prob > 0.5).astype(int)

# Create summary DataFrame: recorded values next to model outputs, one row per compressor
summary = pd.DataFrame({
    'equipment_id': latest_data['equipment_id'],
    'current_efficiency': latest_data['efficiency_normalized'].values,
    'predicted_efficiency': pred_efficiency,
    'efficiency_change': pred_efficiency - latest_data['efficiency_normalized'].values,
    'current_rul_days': latest_data['rul_days'].values,
    'predicted_rul_days': pred_rul,
    'rul_change': pred_rul - latest_data['rul_days'].values,
    'anomaly_probability': pred_anomaly_prob,
    'is_anomaly_predicted': pred_anomaly,
    'current_health': latest_data['health_index'].values
})

print("\nPrediction Summary:")
print(summary.to_string(index=False))

# Save predictions
summary.to_csv('../models/compressor_evaluation/predictions_summary.csv', index=False)
print("\n‚úì Predictions saved to CSV")

In [None]:
# Maintenance priority ranking
print("\n" + "="*70)
print("MAINTENANCE PRIORITY RANKING")
print("="*70)

# Calculate priority score (lower = more urgent).
# Every term must shrink as the unit gets worse, otherwise the ascending sort below ranks
# unhealthy compressors last. The efficiency and anomaly terms used to be written as
# (1 - efficiency) and anomaly_probability, which grow with urgency; they are flipped here.
summary['priority_score'] = (
    summary['predicted_rul_days'] * 0.40 +  # Lower RUL = lower score = higher priority
    summary['predicted_efficiency'] * 1000 * 0.30 +  # Lower efficiency = lower score = higher priority
    (1 - summary['anomaly_probability']) * 500 * 0.30  # Higher anomaly prob = lower score = higher priority
)

# Most urgent compressor first
summary_sorted = summary.sort_values('priority_score')

# Assign priority levels
def assign_priority(row):
    """Map one prediction row to a maintenance priority label.

    Reads 'predicted_rul_days', 'anomaly_probability' and 'predicted_efficiency'
    from ``row`` (any mapping-like object, e.g. a DataFrame row) and returns the
    first tier whose condition matches, checked from most to least urgent.
    """
    rul_days = row['predicted_rul_days']

    # Under a year of life left, or a very likely anomaly
    if rul_days < 365 or row['anomaly_probability'] > 0.8:
        return 'P1 - Immediate'

    # Under two years of life left, or very low predicted efficiency
    if rul_days < 730 or row['predicted_efficiency'] < 0.3:
        return 'P2 - Urgent'

    # Under five years of life left
    if rul_days < 1825:
        return 'P3 - Scheduled'

    return 'P4 - Normal'

# Label every compressor with its priority tier (one call per row)
summary_sorted['priority_level'] = summary_sorted.apply(assign_priority, axis=1)

# Columns shown in the printed schedule
schedule_columns = [
    'equipment_id', 'priority_level', 'predicted_rul_days',
    'predicted_efficiency', 'anomaly_probability'
]
schedule_view = summary_sorted[schedule_columns]

print("\nMaintenance Schedule (by priority):")
print(schedule_view.to_string(index=False))

# The saved CSV keeps every column, not just the printed ones
summary_sorted.to_csv('../models/compressor_evaluation/maintenance_schedule.csv', index=False)
print("\n‚úì Maintenance schedule saved")

## IX. Model Export & Summary

In [None]:
# Write the three trained boosters to disk in LightGBM's text format
print("="*70)
print("SAVING MODELS")
print("="*70)

# (booster, destination file, confirmation message)
model_exports = (
    (model_eff, '../models/compressor_efficiency_model.txt',
     "‚úì Efficiency model saved: compressor_efficiency_model.txt"),
    (model_rul, '../models/compressor_rul_model.txt',
     "‚úì RUL model saved: compressor_rul_model.txt"),
    (model_anom, '../models/compressor_anomaly_model.txt',
     "‚úì Anomaly model saved: compressor_anomaly_model.txt"),
)
for booster, model_path, saved_message in model_exports:
    booster.save_model(model_path)
    print(saved_message)

In [None]:
# Closing report: dataset size, headline metrics per model, schedule and written files
summary_rule = "=" * 70
print("\n" + summary_rule)
print("COMPRESSOR MODELING - FINAL SUMMARY")
print(summary_rule)

print("\nüìä Dataset:")
print(f"  Total records: {len(df):,}")
print(f"  Compressors: {df['equipment_id'].nunique()}")
print(f"  Features: {len(all_features)}")

# The importance frames are sorted descending, so row 0 is the top feature
top_feature_eff = importance_eff.iloc[0]['feature']
print("\nüéØ Model 1 - Efficiency Degradation:")
print(f"  Test RMSE: {test_rmse_eff:.4f}")
print(f"  Test R¬≤: {test_r2_eff:.4f}")
print(f"  Top feature: {top_feature_eff}")

top_feature_rul = importance_rul.iloc[0]['feature']
print("\nüéØ Model 2 - RUL Prediction:")
print(f"  Test RMSE: {test_rmse_rul:.1f} days")
print(f"  Test R¬≤: {test_r2_rul:.4f}")
print(f"  Top feature: {top_feature_rul}")

print("\nüéØ Model 3 - Anomaly Classification:")
print(f"  Accuracy: {acc:.4f}")
print(f"  F1-Score: {f1:.4f}")
print(f"  Precision: {prec:.4f}")
print(f"  Recall: {rec:.4f}")

# List the compressors in each tier, skipping tiers that are empty
print("\nüîß Maintenance Recommendations:")
for priority in ['P1 - Immediate', 'P2 - Urgent', 'P3 - Scheduled', 'P4 - Normal']:
    in_tier = summary_sorted['priority_level'] == priority
    count = in_tier.sum()
    if count > 0:
        equipment = summary_sorted.loc[in_tier, 'equipment_id'].tolist()
        print(f"  {priority}: {count} compressor(s) - {', '.join(equipment)}")

print("\nüìÅ Saved Files:")
saved_file_lines = (
    "  Models:",
    "    - compressor_efficiency_model.txt",
    "    - compressor_rul_model.txt",
    "    - compressor_anomaly_model.txt",
    "\n  Evaluation:",
    "    - metrics_efficiency.json",
    "    - metrics_rul.json",
    "    - metrics_anomaly.json",
    "    - efficiency_predictions.png",
    "    - rul_predictions.png",
    "    - confusion_matrix.png",
    "    - shap_efficiency.png",
    "    - shap_rul.png",
    "    - feature_importance_comparison.png",
    "\n  Predictions:",
    "    - predictions_summary.csv",
    "    - maintenance_schedule.csv",
)
for saved_line in saved_file_lines:
    print(saved_line)

print("\n" + summary_rule)
print("‚úì COMPRESSOR MODELING COMPLETE")
print(summary_rule)