# %% Cell 1: Imports
import sys
sys.path.append('..')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from src.config import get_config
from src.features.feature_engineering import FeatureEngineer
from src.models.isolation_forest_model import IsolationForestModel
from src.models.model_trainer import ModelTrainer

# Global plotting defaults for every figure below. The matplotlib style is
# applied before the seaborn palette; keep this order, since both calls change
# process-wide plotting settings.
# NOTE(review): the 'seaborn-v0_8-*' style names are only available in newer
# matplotlib releases — confirm against the pinned version.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Reached only if every import above resolved.
print("✓ Imports successful")

# %% Cell 2: Load Features
# Build the project configuration and the feature-engineering helper, then
# load the feature table from 'machine_001_features.csv'.
config = get_config()
engineer = FeatureEngineer(config)

print("Loading features...")
features_df = engineer.load_features('machine_001_features.csv')

# Short sanity summary: table shape, labelled anomaly count/share, and the
# span covered by the window_start / window_end columns.
print("\nDataset Info:")
print(f"  Shape: {features_df.shape}")

anomaly_flags = features_df['is_anomaly']
print(f"  Anomalies: {anomaly_flags.sum()} ({anomaly_flags.mean()*100:.2f}%)")

earliest_start = features_df['window_start'].min()
latest_end = features_df['window_end'].max()
print(f"  Time range: {earliest_start} to {latest_end}")

# %% Cell 3: Data Preparation
trainer = ModelTrainer(config)

print("Preparing data...")
# prepare_data hands back six objects: train/val/test features followed by
# train/val/test labels.
# NOTE(review): test_size / val_size are presumably fractions of the full
# table — confirm against ModelTrainer.prepare_data.
split_options = dict(test_size=0.2, val_size=0.1)
splits = trainer.prepare_data(features_df, **split_options)
X_train, X_val, X_test, y_train, y_val, y_test = splits

# Report the size of each split and the width of the feature matrix.
n_train, n_val, n_test = len(X_train), len(X_val), len(X_test)
n_features = X_train.shape[1]

print("\n✓ Data prepared:")
print(f"  Train: {n_train} samples")
print(f"  Val:   {n_val} samples")
print(f"  Test:  {n_test} samples")
print(f"  Features: {n_features}")

# %% Cell 4: Train Model
rule = "="*80
print("\n" + rule)
print("TRAINING ISOLATION FOREST")
print(rule)

# Instantiate the project's Isolation Forest wrapper and fit it on the
# training split.
# NOTE(review): the labels are passed alongside the features; how the wrapper
# uses them is decided inside IsolationForestModel.train — confirm there.
model = IsolationForestModel(config)
model.train(X_train, y_train)

print("\n✓ Training complete!")

# %% Cell 5: Evaluate Model
rule = "="*80
print("\n" + rule)
print("MODEL EVALUATION")
print(rule)

# Score the held-out test split; `metrics` is reused by the business-impact
# and summary cells further down.
metrics = model.evaluate(X_test, y_test)
model.print_metrics()

# Hard predictions first, then continuous scores (same call order as before);
# both feed the plots in the following cells.
y_pred, y_scores = model.predict(X_test), model.predict_scores(X_test)

# %% Cell 6: Confusion Matrix
from sklearn.metrics import confusion_matrix

# Pin the label order to [0, 1] (Normal, Anomaly). Without it, a test split or
# prediction vector containing a single class yields a 1x1 matrix and every
# cm[i, j] lookup below raises IndexError. With it the matrix is always 2x2
# and rows/columns always line up with the tick labels.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
total_samples = cm.sum()

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax,
           xticklabels=['Normal', 'Anomaly'],
           yticklabels=['Normal', 'Anomaly'],
           cbar_kws={'label': 'Count'})

ax.set_xlabel('Predicted', fontsize=12, fontweight='bold')
ax.set_ylabel('Actual', fontsize=12, fontweight='bold')
ax.set_title('Confusion Matrix - Isolation Forest', fontsize=14, fontweight='bold')

# Add percentages just below each cell's count annotation. Heatmap cell
# (row i, col j) is centred at (j + 0.5, i + 0.5) in data coordinates.
for (i, j), count in np.ndenumerate(cm):
    # Guard against an empty matrix so the share is 0 instead of NaN.
    percentage = count / total_samples * 100 if total_samples else 0.0
    ax.text(j+0.5, i+0.7, f'({percentage:.1f}%)',
           ha='center', va='center', fontsize=10, color='gray')

plt.tight_layout()
plt.savefig('../results/figures/confusion_matrix_notebook.png', dpi=300, bbox_inches='tight')
plt.show()

# Rows are actual classes, columns are predicted classes.
print(f"\nConfusion Matrix:")
print(f"  True Negatives:  {cm[0,0]:,} (Correctly identified normal)")
print(f"  False Positives: {cm[0,1]:,} (False alarms)")
print(f"  False Negatives: {cm[1,0]:,} (Missed anomalies)")
print(f"  True Positives:  {cm[1,1]:,} (Correctly detected anomalies)")

# %% Cell 7: ROC Curve
from sklearn.metrics import roc_curve, auc

# ROC operating points computed from the continuous scores, and the area
# under that curve.
# NOTE(review): assumes larger y_scores mean "more anomalous" — confirm
# against IsolationForestModel.predict_scores.
fpr, tpr, thresholds = roc_curve(y_test, y_scores)
roc_auc = auc(fpr, tpr)

axis_label_style = dict(fontsize=12, fontweight='bold')

fig, ax = plt.subplots(figsize=(8, 6))

ax.plot(fpr, tpr, linewidth=2, label=f'Isolation Forest (AUC = {roc_auc:.3f})')
# Diagonal reference line for a chance-level classifier.
ax.plot([0, 1], [0, 1], 'k--', linewidth=1, label='Random Classifier')

ax.set_xlabel('False Positive Rate', **axis_label_style)
ax.set_ylabel('True Positive Rate', **axis_label_style)
ax.set_title('ROC Curve', fontsize=14, fontweight='bold')
ax.legend(fontsize=11)
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('../results/figures/roc_curve_notebook.png', dpi=300, bbox_inches='tight')
plt.show()

# Coarse verbal rating; anything at or below 0.8 is reported as 'Fair'.
if roc_auc > 0.9:
    auc_verdict = 'Excellent'
elif roc_auc > 0.8:
    auc_verdict = 'Good'
else:
    auc_verdict = 'Fair'

print(f"\nROC-AUC Score: {roc_auc:.4f}")
print(f"Interpretation: {auc_verdict}")

# %% Cell 8: Precision-Recall Curve
from sklearn.metrics import precision_recall_curve, average_precision_score

# precision / recall / thresholds_pr are reused by the threshold-optimization
# cell that follows.
precision, recall, thresholds_pr = precision_recall_curve(y_test, y_scores)
ap = average_precision_score(y_test, y_scores)

# Share of positive labels in the test split, drawn as the horizontal
# baseline.
positive_rate = y_test.mean()
axis_label_style = dict(fontsize=12, fontweight='bold')

fig, ax = plt.subplots(figsize=(8, 6))

ax.plot(recall, precision, linewidth=2, label=f'Isolation Forest (AP = {ap:.3f})')
ax.axhline(y=positive_rate, color='r', linestyle='--', linewidth=1,
          label=f'Baseline (Random): {positive_rate:.3f}')

ax.set_xlabel('Recall', **axis_label_style)
ax.set_ylabel('Precision', **axis_label_style)
ax.set_title('Precision-Recall Curve', fontsize=14, fontweight='bold')
ax.legend(fontsize=11)
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('../results/figures/pr_curve_notebook.png', dpi=300, bbox_inches='tight')
plt.show()

print(f"\nAverage Precision: {ap:.4f}")

# %% Cell 9: Threshold Optimization
# Picks the score threshold that maximises F1 along the precision-recall curve
# computed in the previous cell, then plots precision / recall / F1 against
# the threshold.
# NOTE(review): the threshold is selected on the test split, so the reported
# precision/recall/F1 are optimistic; the validation split produced during
# data preparation is not used anywhere in this script.
print("\n" + "="*80)
print("THRESHOLD OPTIMIZATION")
print("="*80)

# Calculate F1 scores for different thresholds. The small epsilon keeps the
# ratio defined where precision and recall are both zero.
f1_scores = 2 * (precision * recall) / (precision + recall + 1e-10)

# Find optimal threshold.
# precision/recall have one more entry than thresholds_pr: the final point
# (precision=1, recall=0) has no associated threshold. Restrict the search to
# the entries that do have one, so best_idx is always a valid index into
# thresholds_pr (the plot below applies the same [:-1] alignment).
best_idx = int(np.argmax(f1_scores[:-1]))
optimal_threshold = thresholds_pr[best_idx]
best_precision = precision[best_idx]
best_recall = recall[best_idx]
best_f1 = f1_scores[best_idx]

print(f"\nOptimal Threshold: {optimal_threshold:.4f}")
print(f"  Precision: {best_precision:.4f}")
print(f"  Recall:    {best_recall:.4f}")
print(f"  F1-Score:  {best_f1:.4f}")

# Plot threshold vs metrics (trailing threshold-less point dropped).
fig, ax = plt.subplots(figsize=(10, 6))

ax.plot(thresholds_pr, precision[:-1], label='Precision', linewidth=2)
ax.plot(thresholds_pr, recall[:-1], label='Recall', linewidth=2)
ax.plot(thresholds_pr, f1_scores[:-1], label='F1-Score', linewidth=2)
ax.axvline(optimal_threshold, color='red', linestyle='--', linewidth=2,
          label=f'Optimal: {optimal_threshold:.4f}')

ax.set_xlabel('Threshold', fontsize=12, fontweight='bold')
ax.set_ylabel('Score', fontsize=12, fontweight='bold')
ax.set_title('Threshold Optimization', fontsize=14, fontweight='bold')
ax.legend(fontsize=11)
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('../results/figures/threshold_optimization.png', dpi=300, bbox_inches='tight')
plt.show()

# %% Cell 10: Anomaly Score Distribution
# Split the continuous scores by true label once and reuse the two groups for
# the histogram, the box plot and the printed statistics.
normal_scores = y_scores[y_test == 0]
anomaly_scores = y_scores[y_test == 1]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
hist_ax, box_ax = axes

# Histogram: overlaid, density-normalised so the two classes are comparable
# despite different sample counts.
hist_style = dict(bins=50, alpha=0.6, density=True, edgecolor='black')
hist_ax.hist(normal_scores, label='Normal', color='green', **hist_style)
hist_ax.hist(anomaly_scores, label='Anomaly', color='red', **hist_style)
hist_ax.set_xlabel('Anomaly Score', fontsize=12)
hist_ax.set_ylabel('Density', fontsize=12)
hist_ax.set_title('Anomaly Score Distribution', fontsize=14, fontweight='bold')
hist_ax.legend(fontsize=11)
hist_ax.grid(True, alpha=0.3)

# Box plot: one box per class, coloured to match the histogram.
data_to_plot = [normal_scores, anomaly_scores]
bp = box_ax.boxplot(data_to_plot, labels=['Normal', 'Anomaly'], patch_artist=True)
for box, face_color in zip(bp['boxes'], ('green', 'red')):
    box.set_facecolor(face_color)
    box.set_alpha(0.6)
box_ax.set_ylabel('Anomaly Score', fontsize=12)
box_ax.set_title('Anomaly Score by Class', fontsize=14, fontweight='bold')
box_ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.savefig('../results/figures/anomaly_score_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

# Per-class mean/std and the absolute gap between the two class means.
print("\nAnomaly Score Statistics:")
print(f"  Normal   - Mean: {normal_scores.mean():.4f}, Std: {normal_scores.std():.4f}")
print(f"  Anomaly  - Mean: {anomaly_scores.mean():.4f}, Std: {anomaly_scores.std():.4f}")
print(f"  Separation: {abs(anomaly_scores.mean() - normal_scores.mean()):.4f}")

# %% Cell 11: Prediction Examples
# Prints a small random sample of test rows with their true label, predicted
# label, anomaly score and whether the prediction was correct.
print("\n" + "="*80)
print("PREDICTION EXAMPLES")
print("="*80)

# Show some examples. Sampling without replacement raises ValueError when more
# rows are requested than exist, so cap the request at the test-set size.
num_examples = 10
n_shown = min(num_examples, len(X_test))
example_indices = np.random.choice(len(X_test), n_shown, replace=False)

# Index everything positionally through NumPy so labels, predictions and
# scores are selected the same way whether they are Series or arrays.
actual_labels = np.asarray(y_test)[example_indices]
predicted_labels = np.asarray(y_pred)[example_indices]
sampled_scores = np.asarray(y_scores)[example_indices]

examples_df = pd.DataFrame({
    'Actual': actual_labels,
    'Predicted': predicted_labels,
    'Anomaly_Score': sampled_scores,
    'Correct': actual_labels == predicted_labels,
})

# Replace the 0/1 class codes with readable names for display.
class_names = {0: 'Normal', 1: 'Anomaly'}
examples_df['Actual'] = examples_df['Actual'].map(class_names)
examples_df['Predicted'] = examples_df['Predicted'].map(class_names)

print("\nSample Predictions:")
print(examples_df.to_string(index=False))

# %% Cell 12: Feature Importance
rule = "="*80
print("\n" + rule)
print("FEATURE IMPORTANCE ANALYSIS")
print(rule)

# Only the first 100 test rows are passed in.
# NOTE(review): presumably to keep the importance computation cheap — confirm
# against IsolationForestModel.get_feature_importance.
print("\nCalculating feature importance (using subset of test data)...")
importance_sample = X_test.iloc[:100]
importance = model.get_feature_importance(importance_sample, n_top=20)

print("\nTop 20 Most Important Features:")
print(importance.to_string(index=False))

# Visualize: horizontal bars, one per row of the importance table.
fig, ax = plt.subplots(figsize=(10, 8))

y_pos = np.arange(len(importance))
# Abbreviate the common prefix and cap labels at 40 characters so they fit
# on the axis.
short_names = [
    name.replace('vibration_rms_', 'vib_')[:40]
    for name in importance['feature']
]

ax.barh(y_pos, importance['importance'], color='skyblue', edgecolor='black')
ax.set_yticks(y_pos)
ax.set_yticklabels(short_names, fontsize=9)
ax.set_xlabel('Importance Score', fontsize=12, fontweight='bold')
ax.set_title('Top 20 Most Important Features', fontsize=14, fontweight='bold')
ax.grid(True, alpha=0.3, axis='x')

plt.tight_layout()
plt.savefig('../results/figures/feature_importance.png', dpi=300, bbox_inches='tight')
plt.show()

# %% Cell 13: Business Impact Analysis
rule = "="*80
print("\n" + rule)
print("BUSINESS IMPACT ANALYSIS")
print(rule)

# Get business costs from config (the second argument is the fallback used
# when the key is absent).
cost_downtime = config.get('business.cost_unplanned_downtime', 5000)  # per hour
cost_maintenance = config.get('business.cost_planned_maintenance', 500)
cost_false_alarm = config.get('business.cost_false_alarm', 100)

# Confusion counts as reported by model.evaluate. `tn` is read for
# completeness; it does not enter any formula below.
tp = metrics['true_positive']
fp = metrics['false_positive']
fn = metrics['false_negative']
tn = metrics['true_negative']

# Assuming each anomaly if undetected leads to 4 hours downtime
downtime_prevented = tp * 4 * cost_downtime
missed_failures_cost = fn * 4 * cost_downtime
false_alarm_cost = fp * cost_false_alarm
maintenance_cost = tp * cost_maintenance

# Net benefit = prevented downtime minus the cost of acting on alerts.
# NOTE(review): missed_failures_cost is printed but is not subtracted here —
# confirm that this is the intended definition.
intervention_cost = false_alarm_cost + maintenance_cost
net_benefit = downtime_prevented - false_alarm_cost - maintenance_cost

# ROI in percent relative to the intervention cost; 0 when nothing was spent.
if intervention_cost > 0:
    roi = (net_benefit / intervention_cost) * 100
else:
    roi = 0

print("\nBusiness Metrics:")
print(f"  Downtime Prevented:    ${downtime_prevented:,.2f}")
print(f"  False Alarm Costs:     ${false_alarm_cost:,.2f}")
print(f"  Missed Failure Costs:  ${missed_failures_cost:,.2f}")
print(f"  Maintenance Costs:     ${maintenance_cost:,.2f}")
print(f"  Net Benefit:           ${net_benefit:,.2f}")
print(f"  ROI:                   {roi:.1f}%")

# %% Cell 14: Save Model
rule = "="*80
print("\n" + rule)
print("SAVING MODEL")
print(rule)

# Target directory comes from config (falling back to 'models/saved_models');
# the file name is fixed.
model_dir = config.get('paths.models', 'models/saved_models')
model_path = model_dir + '/isolation_forest.pkl'

# Persist the trained wrapper through its own save method.
model.save(model_path)
print(f"✓ Model saved to: {model_path}")

# %% Cell 15: Summary
# Recap of the run: data sizes, the test-set metrics returned by
# model.evaluate, and the business figures from the business-impact cell.
rule = "="*80
print("\n" + rule)
print("TRAINING SUMMARY")
print(rule)

train_count, test_count = len(X_train), len(X_test)
feature_count = X_train.shape[1]

print("\n✓ Model: Isolation Forest")
print(f"✓ Training set: {train_count:,} samples")
print(f"✓ Test set: {test_count:,} samples")
print(f"✓ Features: {feature_count}")

# The first four metrics are shown as percentages, ROC-AUC as a raw value.
print("\n✓ Performance:")
print(f"  Accuracy:  {metrics['accuracy']:.2%}")
print(f"  Precision: {metrics['precision']:.2%}")
print(f"  Recall:    {metrics['recall']:.2%}")
print(f"  F1-Score:  {metrics['f1_score']:.2%}")
print(f"  ROC-AUC:   {metrics['roc_auc']:.4f}")

print("\n✓ Business Impact:")
print(f"  Net Benefit: ${net_benefit:,.2f}")
print(f"  ROI: {roi:.1f}%")

print("\n✓ Model Status: Ready for deployment!")

print("\n" + rule)
print("✓ Training complete!")
print(rule)


