# Model Trainer Notebook
## Steel Industry Load Type Prediction

This notebook handles the model training phase of our ML pipeline.

**Objectives:**
- Load the transformed data
- Split data into training and testing sets
- Train multiple machine learning models
- Evaluate and compare model performance
- Perform hyperparameter tuning
- Save the best performing model
- Generate comprehensive evaluation reports

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import joblib
import warnings
warnings.filterwarnings('ignore')

# Machine Learning libraries
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, roc_curve
from sklearn.metrics import precision_recall_curve, average_precision_score

# Models
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

# Additional ML libraries
try:
    import xgboost as xgb
    XGBOOST_AVAILABLE = True
except ImportError:
    XGBOOST_AVAILABLE = False

try:
    import lightgbm as lgb
    LIGHTGBM_AVAILABLE = True
except ImportError:
    LIGHTGBM_AVAILABLE = False

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

# Set plotting style
plt.style.use('default')
sns.set_palette("husl")

print("Model Training Phase Started")
print(f"Timestamp: {datetime.now()}")
print(f"XGBoost available: {XGBOOST_AVAILABLE}")
print(f"LightGBM available: {LIGHTGBM_AVAILABLE}")

Model Training Phase Started
Timestamp: 2025-10-13 10:03:31.905605
XGBoost available: False
LightGBM available: False


## 1. Load Transformed Data

In [2]:
# Load the transformed data
input_filename = 'transformed_data.csv'

try:
    df = pd.read_csv(input_filename)
    print(f"✓ Data loaded successfully from {input_filename}")
    print(f"Dataset shape: {df.shape}")
    print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
except FileNotFoundError:
    print(f"❌ Error: {input_filename} not found. Please run 03_data_transformation.ipynb first.")
    raise
except Exception as e:
    print(f"❌ Error loading data: {e}")
    raise

✓ Data loaded successfully from transformed_data.csv
Dataset shape: (35040, 38)
Memory usage: 9.78 MB


In [3]:
# Load additional artifacts
try:
    # Load encoders and metadata
    encoders = joblib.load('encoders.pkl')
    feature_names = joblib.load('feature_names.pkl')
    transformation_metadata = joblib.load('transformation_metadata.pkl')

    print("✓ Additional artifacts loaded successfully")
    print(f"Feature names: {len(feature_names)} features")
    print(f"Target classes: {transformation_metadata['target_classes']}")

except Exception as e:
    print(f"⚠️ Warning: Could not load additional artifacts: {e}")
    feature_names = [col for col in df.columns if col != 'target']

✓ Additional artifacts loaded successfully
Feature names: 37 features
Target classes: {'Light_Load': np.int64(0), 'Maximum_Load': np.int64(1), 'Medium_Load': np.int64(2)}


In [4]:
# Quick overview of loaded data
print("=== LOADED DATA OVERVIEW ===")
print(f"Shape: {df.shape}")
print(f"Features: {len(feature_names)}")
print(f"\nTarget distribution:")
target_distribution = df['target'].value_counts().sort_index()
print(target_distribution)
print(f"\nClass balance:")
for class_val, count in target_distribution.items():
    print(f"  Class {class_val}: {count:,} samples ({count/len(df)*100:.1f}%)")

# Check for data quality
print(f"\nData quality check:")
print(f"Missing values: {df.isnull().sum().sum()}")
print(f"Infinite values: {np.isinf(df.select_dtypes(include=[np.number])).sum().sum()}")

=== LOADED DATA OVERVIEW ===
Shape: (35040, 38)
Features: 37

Target distribution:
target
0    18072
1     7272
2     9696
Name: count, dtype: int64

Class balance:
  Class 0: 18,072 samples (51.6%)
  Class 1: 7,272 samples (20.8%)
  Class 2: 9,696 samples (27.7%)

Data quality check:
Missing values: 0
Infinite values: 0


## 2. Prepare Data for Modeling

In [5]:
# Prepare features and target
print("=== PREPARING DATA FOR MODELING ===")

# Separate features and target
X = df[feature_names].copy()
y = df['target'].copy()

print(f"Features (X): {X.shape}")
print(f"Target (y): {y.shape}")
print(f"Target classes: {sorted(y.unique())}")

# Check for any remaining data quality issues
print(f"\nFinal data quality check:")
print(f"Features - Missing values: {X.isnull().sum().sum()}")
# print(f"Features - Infinite values: {np.isinf(X).sum().sum()}")
print(f"Target - Missing values: {y.isnull().sum()}")

# Handle any remaining issues
if X.isnull().sum().sum() > 0:
    print("⚠️ Filling remaining missing values in features...")
    X = X.fillna(X.median())

# if np.isinf(X).sum().sum() > 0:
#     print("⚠️ Replacing infinite values in features...")
#     X = X.replace([np.inf, -np.inf], [X.max().max(), X.min().min()])

print("✅ Data is ready for modeling!")

=== PREPARING DATA FOR MODELING ===
Features (X): (35040, 37)
Target (y): (35040,)
Target classes: [np.int64(0), np.int64(1), np.int64(2)]

Final data quality check:
Features - Missing values: 0
Target - Missing values: 0
✅ Data is ready for modeling!


## 3. Data Splitting

In [6]:
# Split the data into training and testing sets
print("=== DATA SPLITTING ===")

# Split with stratification to maintain class balance
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print(f"Training set: {X_train.shape[0]:,} samples")
print(f"Testing set: {X_test.shape[0]:,} samples")
print(f"Features: {X_train.shape[1]}")

# Check class balance in splits
print(f"\nClass distribution in training set:")
train_dist = y_train.value_counts().sort_index()
for class_val, count in train_dist.items():
    print(f"  Class {class_val}: {count:,} samples ({count/len(y_train)*100:.1f}%)")

print(f"\nClass distribution in testing set:")
test_dist = y_test.value_counts().sort_index()
for class_val, count in test_dist.items():
    print(f"  Class {class_val}: {count:,} samples ({count/len(y_test)*100:.1f}%)")

# Further split training data for validation
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2,
    random_state=42,
    stratify=y_train
)

print(f"\nAfter validation split:")
print(f"Training set: {X_train_split.shape[0]:,} samples")
print(f"Validation set: {X_val.shape[0]:,} samples")
print(f"Test set: {X_test.shape[0]:,} samples")

=== DATA SPLITTING ===
Training set: 28,032 samples
Testing set: 7,008 samples
Features: 37

Class distribution in training set:
  Class 0: 14,457 samples (51.6%)
  Class 1: 5,818 samples (20.8%)
  Class 2: 7,757 samples (27.7%)

Class distribution in testing set:
  Class 0: 3,615 samples (51.6%)
  Class 1: 1,454 samples (20.7%)
  Class 2: 1,939 samples (27.7%)

After validation split:
Training set: 22,425 samples
Validation set: 5,607 samples
Test set: 7,008 samples


## 4. Model Definition and Initial Training

In [7]:
# Define models to train
print("=== MODEL DEFINITION AND INITIAL TRAINING ===")

# Initialize models with default parameters
models = {
    'Random Forest': RandomForestClassifier(random_state=42, n_jobs=-1),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'Extra Trees': ExtraTreesClassifier(random_state=42, n_jobs=-1),
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'SVM': SVC(random_state=42, probability=True),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Naive Bayes': GaussianNB(),
    'Neural Network': MLPClassifier(random_state=42, max_iter=500),
    'AdaBoost': AdaBoostClassifier(random_state=42)
}

# Add XGBoost and LightGBM if available
if XGBOOST_AVAILABLE:
    models['XGBoost'] = xgb.XGBClassifier(random_state=42, eval_metric='mlogloss')
    print("✓ XGBoost added to model list")

if LIGHTGBM_AVAILABLE:
    models['LightGBM'] = lgb.LGBMClassifier(random_state=42, verbose=-1)
    print("✓ LightGBM added to model list")

print(f"\nTotal models to train: {len(models)}")
print(f"Models: {list(models.keys())}")

# Initialize results storage
model_results = []
trained_models = {}

print("\n=== TRAINING MODELS WITH DEFAULT PARAMETERS ===")

=== MODEL DEFINITION AND INITIAL TRAINING ===

Total models to train: 10
Models: ['Random Forest', 'Gradient Boosting', 'Extra Trees', 'Logistic Regression', 'Decision Tree', 'SVM', 'K-Nearest Neighbors', 'Naive Bayes', 'Neural Network', 'AdaBoost']

=== TRAINING MODELS WITH DEFAULT PARAMETERS ===


In [8]:
# Train all models and collect initial results
for name, model in models.items():
    print(f"\nTraining {name}...")
    start_time = datetime.now()

    try:
        # Train the model
        model.fit(X_train_split, y_train_split)

        # Make predictions on validation set
        y_val_pred = model.predict(X_val)
        y_val_pred_proba = model.predict_proba(X_val) if hasattr(model, 'predict_proba') else None

        # Calculate metrics
        accuracy = accuracy_score(y_val, y_val_pred)
        precision = precision_score(y_val, y_val_pred, average='weighted')
        recall = recall_score(y_val, y_val_pred, average='weighted')
        f1 = f1_score(y_val, y_val_pred, average='weighted')

        # Cross-validation score
        cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
        cv_mean = cv_scores.mean()
        cv_std = cv_scores.std()

        training_time = (datetime.now() - start_time).total_seconds()

        # Store results
        result = {
            'Model': name,
            'Accuracy': accuracy,
            'Precision': precision,
            'Recall': recall,
            'F1_Score': f1,
            'CV_Mean': cv_mean,
            'CV_Std': cv_std,
            'Training_Time': training_time
        }

        model_results.append(result)
        trained_models[name] = model

        print(f"  ✓ Accuracy: {accuracy:.4f}")
        print(f"  ✓ F1-Score: {f1:.4f}")
        print(f"  ✓ CV Score: {cv_mean:.4f} (±{cv_std:.4f})")
        print(f"  ✓ Training time: {training_time:.2f}s")

    except Exception as e:
        print(f"  ❌ Error training {name}: {e}")
        continue

print(f"\n✅ Successfully trained {len(trained_models)} models")


Training Random Forest...
  ❌ Error training Random Forest: could not convert string to float: 'Maximum_Load'

Training Gradient Boosting...
  ❌ Error training Gradient Boosting: could not convert string to float: 'Maximum_Load'

Training Extra Trees...
  ❌ Error training Extra Trees: could not convert string to float: 'Maximum_Load'

Training Logistic Regression...
  ❌ Error training Logistic Regression: could not convert string to float: 'Maximum_Load'

Training Decision Tree...
  ❌ Error training Decision Tree: could not convert string to float: 'Maximum_Load'

Training SVM...
  ❌ Error training SVM: could not convert string to float: 'Maximum_Load'

Training K-Nearest Neighbors...
  ❌ Error training K-Nearest Neighbors: could not convert string to float: 'Maximum_Load'

Training Naive Bayes...
  ❌ Error training Naive Bayes: could not convert string to float: 'Maximum_Load'

Training Neural Network...
  ❌ Error training Neural Network: could not convert string to float: 'Maximum_L

## 5. Model Comparison and Selection

In [9]:
# Create comprehensive results comparison
print("=== MODEL COMPARISON AND SELECTION ===")

# Convert results to DataFrame for better visualization
results_df = pd.DataFrame(model_results)
results_df = results_df.sort_values('Accuracy', ascending=False)

print("\nModel Performance Comparison (sorted by Accuracy):")
print(results_df.round(4))

# Visualize model comparison
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# Accuracy comparison
axes[0, 0].barh(results_df['Model'], results_df['Accuracy'], color='lightblue')
axes[0, 0].set_xlabel('Accuracy')
axes[0, 0].set_title('Model Accuracy Comparison')
axes[0, 0].set_xlim(0, 1)

# F1-Score comparison
axes[0, 1].barh(results_df['Model'], results_df['F1_Score'], color='lightcoral')
axes[0, 1].set_xlabel('F1-Score')
axes[0, 1].set_title('Model F1-Score Comparison')
axes[0, 1].set_xlim(0, 1)

# Cross-validation scores
axes[1, 0].barh(results_df['Model'], results_df['CV_Mean'], color='lightgreen')
axes[1, 0].set_xlabel('CV Mean Accuracy')
axes[1, 0].set_title('Cross-Validation Score Comparison')
axes[1, 0].set_xlim(0, 1)

# Training time
axes[1, 1].barh(results_df['Model'], results_df['Training_Time'], color='lightyellow')
axes[1, 1].set_xlabel('Training Time (seconds)')
axes[1, 1].set_title('Training Time Comparison')

plt.tight_layout()
plt.show()

# Select top performing models
top_n = min(3, len(results_df))
top_models = results_df.head(top_n)['Model'].tolist()

print(f"\n🏆 Top {top_n} performing models:")
for i, model_name in enumerate(top_models, 1):
    model_row = results_df[results_df['Model'] == model_name].iloc[0]
    print(f"{i}. {model_name}:")
    print(f"   • Accuracy: {model_row['Accuracy']:.4f}")
    print(f"   • F1-Score: {model_row['F1_Score']:.4f}")
    print(f"   • CV Score: {model_row['CV_Mean']:.4f} (±{model_row['CV_Std']:.4f})")

best_model_name = top_models[0]
best_model = trained_models[best_model_name]
print(f"\n🥇 Best performing model: {best_model_name}")

=== MODEL COMPARISON AND SELECTION ===


KeyError: 'Accuracy'

## 6. Hyperparameter Tuning for Best Models

In [None]:
# Hyperparameter tuning for top performing models
print("=== HYPERPARAMETER TUNING ===")

# Define parameter grids for top models
param_grids = {
    'Random Forest': {
        'n_estimators': [100, 200],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2]
    },
    'Gradient Boosting': {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 1.0]
    },
    'Extra Trees': {
        'n_estimators': [100, 200],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2]
    },
    'Logistic Regression': {
        'C': [0.1, 1.0, 10.0],
        'solver': ['liblinear', 'lbfgs'],
        'penalty': ['l1', 'l2']
    },
    'SVM': {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf'],
        'gamma': ['scale', 'auto']
    }
}

if XGBOOST_AVAILABLE:
    param_grids['XGBoost'] = {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 1.0]
    }

if LIGHTGBM_AVAILABLE:
    param_grids['LightGBM'] = {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 1.0]
    }

# Perform hyperparameter tuning for top models
tuned_models = {}
tuning_results = []

for model_name in top_models[:2]:  # Tune top 2 models to save time
    if model_name in param_grids:
        print(f"\nTuning hyperparameters for {model_name}...")
        start_time = datetime.now()

        try:
            # Get base model
            base_model = trained_models[model_name]

            # Setup GridSearch with cross-validation
            grid_search = GridSearchCV(
                base_model,
                param_grids[model_name],
                cv=3,  # Use 3-fold CV to save time
                scoring='accuracy',
                n_jobs=-1,
                verbose=0
            )

            # Fit grid search
            grid_search.fit(X_train, y_train)

            # Get best model
            best_tuned_model = grid_search.best_estimator_
            tuned_models[model_name] = best_tuned_model

            # Evaluate on validation set
            y_val_pred_tuned = best_tuned_model.predict(X_val)
            tuned_accuracy = accuracy_score(y_val, y_val_pred_tuned)
            tuned_f1 = f1_score(y_val, y_val_pred_tuned, average='weighted')

            tuning_time = (datetime.now() - start_time).total_seconds()

            # Compare with original
            original_accuracy = results_df[results_df['Model'] == model_name]['Accuracy'].iloc[0]
            improvement = tuned_accuracy - original_accuracy

            result = {
                'Model': model_name,
                'Original_Accuracy': original_accuracy,
                'Tuned_Accuracy': tuned_accuracy,
                'Improvement': improvement,
                'Tuned_F1': tuned_f1,
                'Best_Params': grid_search.best_params_,
                'Tuning_Time': tuning_time
            }

            tuning_results.append(result)

            print(f"  ✓ Original accuracy: {original_accuracy:.4f}")
            print(f"  ✓ Tuned accuracy: {tuned_accuracy:.4f}")
            print(f"  ✓ Improvement: {improvement:.4f}")
            print(f"  ✓ Best parameters: {grid_search.best_params_}")
            print(f"  ✓ Tuning time: {tuning_time:.2f}s")

        except Exception as e:
            print(f"  ❌ Error tuning {model_name}: {e}")
            continue
    else:
        print(f"\nSkipping hyperparameter tuning for {model_name} (no parameter grid defined)")
        tuned_models[model_name] = trained_models[model_name]

print(f"\n✅ Hyperparameter tuning completed for {len(tuning_results)} models")

## 7. Final Model Evaluation

In [None]:
# Select the best model (tuned if available, otherwise original)
print("=== FINAL MODEL EVALUATION ===")

# Determine the final best model
if tuning_results:
    # Find best tuned model
    best_tuned = max(tuning_results, key=lambda x: x['Tuned_Accuracy'])
    final_best_model_name = best_tuned['Model']
    final_best_model = tuned_models[final_best_model_name]
    print(f"Selected tuned model: {final_best_model_name}")
    print(f"Tuned accuracy: {best_tuned['Tuned_Accuracy']:.4f}")
else:
    # Use original best model
    final_best_model_name = best_model_name
    final_best_model = best_model
    print(f"Selected original model: {final_best_model_name}")

# Retrain on full training data and evaluate on test set
print(f"\nRetraining {final_best_model_name} on full training data...")
final_best_model.fit(X_train, y_train)

# Make predictions on test set
y_test_pred = final_best_model.predict(X_test)
y_test_pred_proba = final_best_model.predict_proba(X_test)

# Calculate final metrics
test_accuracy = accuracy_score(y_test, y_test_pred)
test_precision = precision_score(y_test, y_test_pred, average='weighted')
test_recall = recall_score(y_test, y_test_pred, average='weighted')
test_f1 = f1_score(y_test, y_test_pred, average='weighted')

print(f"\n🎯 FINAL MODEL PERFORMANCE ON TEST SET:")
print(f"Model: {final_best_model_name}")
print(f"Accuracy: {test_accuracy:.4f}")
print(f"Precision: {test_precision:.4f}")
print(f"Recall: {test_recall:.4f}")
print(f"F1-Score: {test_f1:.4f}")

# Detailed classification report
print(f"\n📊 DETAILED CLASSIFICATION REPORT:")
class_names = ['Light_Load', 'Medium_Load', 'Maximum_Load']  # Adjust based on your classes
try:
    target_classes = transformation_metadata['target_classes']
    class_names = [k for k, v in sorted(target_classes.items(), key=lambda x: x[1])]
except:
    pass

report = classification_report(y_test, y_test_pred, target_names=class_names)
print(report)

In [None]:
# Confusion Matrix Visualization
print("\n=== CONFUSION MATRIX ===")

cm = confusion_matrix(y_test, y_test_pred)

plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names)
plt.title(f'Confusion Matrix - {final_best_model_name}')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Calculate per-class metrics
print("\nPer-class performance:")
for i, class_name in enumerate(class_names):
    class_mask = (y_test == i)
    class_predictions = (y_test_pred == i)

    tp = np.sum(class_mask & class_predictions)
    fp = np.sum(~class_mask & class_predictions)
    fn = np.sum(class_mask & ~class_predictions)
    tn = np.sum(~class_mask & ~class_predictions)

    class_precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    class_recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    class_f1 = 2 * (class_precision * class_recall) / (class_precision + class_recall) if (class_precision + class_recall) > 0 else 0

    print(f"\n{class_name} (Class {i}):")
    print(f"  Precision: {class_precision:.4f}")
    print(f"  Recall: {class_recall:.4f}")
    print(f"  F1-Score: {class_f1:.4f}")
    print(f"  Support: {np.sum(class_mask)}")

## 8. Feature Importance Analysis

In [None]:
# Feature importance analysis for the final model
print("=== FEATURE IMPORTANCE ANALYSIS ===")

if hasattr(final_best_model, 'feature_importances_'):
    # Get feature importances
    feature_importance = final_best_model.feature_importances_

    # Create DataFrame for better handling
    feature_importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Importance': feature_importance
    }).sort_values('Importance', ascending=False)

    print(f"\nTop 15 most important features for {final_best_model_name}:")
    print(feature_importance_df.head(15))

    # Visualize feature importance
    plt.figure(figsize=(12, 8))
    top_features = feature_importance_df.head(20)

    plt.barh(range(len(top_features)), top_features['Importance'])
    plt.yticks(range(len(top_features)), top_features['Feature'])
    plt.xlabel('Feature Importance')
    plt.title(f'Top 20 Feature Importances - {final_best_model_name}')
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()

    # Save feature importance
    feature_importance_df.to_csv('feature_importance.csv', index=False)
    print("\n✓ Feature importance saved to feature_importance.csv")

elif hasattr(final_best_model, 'coef_'):
    # For linear models
    feature_coefficients = np.abs(final_best_model.coef_).mean(axis=0)

    feature_importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Coefficient_Magnitude': feature_coefficients
    }).sort_values('Coefficient_Magnitude', ascending=False)

    print(f"\nTop 15 features by coefficient magnitude for {final_best_model_name}:")
    print(feature_importance_df.head(15))

else:
    print(f"\n⚠️ {final_best_model_name} does not provide feature importance information")
    feature_importance_df = None

## 9. Model Prediction Examples

In [None]:
# Show some prediction examples
print("=== MODEL PREDICTION EXAMPLES ===")

# Select a few test samples for demonstration
sample_indices = np.random.choice(X_test.index, size=min(10, len(X_test)), replace=False)
sample_X = X_test.loc[sample_indices]
sample_y_true = y_test.loc[sample_indices]
sample_y_pred = final_best_model.predict(sample_X)
sample_y_pred_proba = final_best_model.predict_proba(sample_X)

print(f"\nPrediction examples from test set:")
print("" + "="*80 + "")

for i, idx in enumerate(sample_indices):
    true_class = sample_y_true.loc[idx]
    pred_class = sample_y_pred[i]
    probabilities = sample_y_pred_proba[i]

    true_class_name = class_names[true_class] if true_class < len(class_names) else f"Class_{true_class}"
    pred_class_name = class_names[pred_class] if pred_class < len(class_names) else f"Class_{pred_class}"

    print(f"\nSample {i+1} (Index: {idx}):")
    print(f"  True Class: {true_class_name}")
    print(f"  Predicted Class: {pred_class_name}")
    print(f"  Prediction Confidence: {probabilities[pred_class]:.4f}")
    print(f"  All Probabilities: {[f'{class_names[j]}: {prob:.4f}' for j, prob in enumerate(probabilities)]}")

    if true_class == pred_class:
        print(f"  ✅ Correct Prediction")
    else:
        print(f"  ❌ Incorrect Prediction")

print("" + "="*80 + "")

# Summary statistics
correct_predictions = np.sum(sample_y_true == sample_y_pred)
print(f"\nSample accuracy: {correct_predictions}/{len(sample_indices)} ({correct_predictions/len(sample_indices)*100:.1f}%)")

## 10. Save Final Model and Results

In [None]:
# Save the final trained model and results
print("=== SAVING FINAL MODEL AND RESULTS ===")

try:
    # Save the final model
    model_filename = f'best_model_{final_best_model_name.replace(" ", "_").lower()}.pkl'
    joblib.dump(final_best_model, model_filename)
    print(f"✓ Final model saved as: {model_filename}")

    # Save model metadata
    model_metadata = {
        'model_name': final_best_model_name,
        'model_type': type(final_best_model).__name__,
        'training_date': datetime.now().isoformat(),
        'test_accuracy': test_accuracy,
        'test_precision': test_precision,
        'test_recall': test_recall,
        'test_f1_score': test_f1,
        'feature_count': len(feature_names),
        'training_samples': len(X_train),
        'test_samples': len(X_test),
        'class_names': class_names,
        'target_classes': transformation_metadata.get('target_classes', {}),
        'model_parameters': final_best_model.get_params() if hasattr(final_best_model, 'get_params') else {}
    }

    joblib.dump(model_metadata, 'model_metadata.pkl')
    print("✓ Model metadata saved as: model_metadata.pkl")

    # Save comprehensive results
    comprehensive_results = {
        'all_model_results': results_df,
        'tuning_results': tuning_results,
        'final_model_performance': {
            'accuracy': test_accuracy,
            'precision': test_precision,
            'recall': test_recall,
            'f1_score': test_f1
        },
        'confusion_matrix': cm.tolist(),
        'classification_report': classification_report(y_test, y_test_pred,
                                                     target_names=class_names,
                                                     output_dict=True),
        'feature_importance': feature_importance_df.to_dict() if feature_importance_df is not None else None
    }

    joblib.dump(comprehensive_results, 'comprehensive_results.pkl')
    print("✓ Comprehensive results saved as: comprehensive_results.pkl")

    # Save predictions for analysis
    predictions_df = pd.DataFrame({
        'y_true': y_test,
        'y_pred': y_test_pred,
        'correct': (y_test == y_test_pred)
    })

    # Add probability columns
    for i, class_name in enumerate(class_names):
        predictions_df[f'prob_{class_name}'] = y_test_pred_proba[:, i]

    predictions_df.to_csv('test_predictions.csv', index=False)
    print("✓ Test predictions saved as: test_predictions.csv")

except Exception as e:
    print(f"❌ Error saving model and results: {e}")

print("\n📁 Saved files:")
saved_files = [
    model_filename,
    'model_metadata.pkl',
    'comprehensive_results.pkl',
    'test_predictions.csv'
]

if feature_importance_df is not None:
    saved_files.append('feature_importance.csv')

for file in saved_files:
    print(f"  • {file}")

## 11. Model Training Summary

In [None]:
# Generate comprehensive model training summary
print("" + "="*60 + "")
print("            MODEL TRAINING SUMMARY")
print("" + "="*60 + "")

print(f"📊 Dataset Information:")
print(f"   • Total samples: {len(df):,}")
print(f"   • Features: {len(feature_names)}")
print(f"   • Classes: {len(class_names)} ({', '.join(class_names)})")
print(f"   • Training samples: {len(X_train):,} ({len(X_train)/len(df)*100:.1f}%)")
print(f"   • Test samples: {len(X_test):,} ({len(X_test)/len(df)*100:.1f}%)")

print(f"\n🤖 Models Evaluated:")
print(f"   • Total models trained: {len(results_df)}")
print(f"   • Models with hyperparameter tuning: {len(tuning_results)}")
print(f"   • Best performing model: {final_best_model_name}")

print(f"\n📈 Model Performance Ranking:")
for i, (_, row) in enumerate(results_df.head(5).iterrows(), 1):
    print(f"   {i}. {row['Model']}: {row['Accuracy']:.4f} accuracy, {row['F1_Score']:.4f} F1")

print(f"\n🎯 Final Model Performance:")
print(f"   • Model: {final_best_model_name}")
print(f"   • Test Accuracy: {test_accuracy:.4f} ({test_accuracy*100:.2f}%)")
print(f"   • Test Precision: {test_precision:.4f}")
print(f"   • Test Recall: {test_recall:.4f}")
print(f"   • Test F1-Score: {test_f1:.4f}")

print(f"\n📊 Per-Class Performance:")
class_report_dict = classification_report(y_test, y_test_pred, target_names=class_names, output_dict=True)
for class_name in class_names:
    if class_name in class_report_dict:
        metrics = class_report_dict[class_name]
        print(f"   • {class_name}:")
        print(f"     - Precision: {metrics['precision']:.4f}")
        print(f"     - Recall: {metrics['recall']:.4f}")
        print(f"     - F1-Score: {metrics['f1-score']:.4f}")
        print(f"     - Support: {metrics['support']}")

if feature_importance_df is not None:
    print(f"\n🔝 Top 5 Most Important Features:")
    for i, (_, row) in enumerate(feature_importance_df.head(5).iterrows(), 1):
        importance_col = 'Importance' if 'Importance' in row else 'Coefficient_Magnitude'
        print(f"   {i}. {row['Feature']}: {row[importance_col]:.4f}")

print(f"\n💾 Saved Artifacts:")
for file in saved_files:
    print(f"   • {file}")

print(f"\n⏱️ Training Summary:")
total_training_time = sum([r['Training_Time'] for r in model_results])
if tuning_results:
    total_tuning_time = sum([r['Tuning_Time'] for r in tuning_results])
    print(f"   • Total training time: {total_training_time:.2f}s")
    print(f"   • Total tuning time: {total_tuning_time:.2f}s")
    print(f"   • Total time: {total_training_time + total_tuning_time:.2f}s")
else:
    print(f"   • Total training time: {total_training_time:.2f}s")

print(f"\n🎯 Model Readiness:")
if test_accuracy >= 0.9:
    readiness = "🌟 Excellent - Ready for Production"
elif test_accuracy >= 0.8:
    readiness = "✅ Good - Ready for Deployment"
elif test_accuracy >= 0.7:
    readiness = "⚠️ Acceptable - Consider Further Tuning"
else:
    readiness = "❌ Poor - Needs Significant Improvement"

print(f"   • Status: {readiness}")
print(f"   • Model file: {model_filename}")
print(f"   • Ready for inference: ✅")

print("" + "="*60 + "")
print("Model Training Phase Completed Successfully!")
print(f"Timestamp: {datetime.now()}")
print(f"🏆 Best Model: {final_best_model_name} with {test_accuracy:.4f} accuracy")
print("" + "="*60 + "")

print("\n🚀 Next Steps:")
print("   1. Load the saved model for inference")
print("   2. Monitor model performance in production")
print("   3. Retrain periodically with new data")
print("   4. Consider ensemble methods for improved performance")