In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import (classification_report, confusion_matrix, accuracy_score, 
                           precision_recall_fscore_support)
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.utils.class_weight import compute_class_weight
from scipy.sparse import hstack, csr_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import os
import time
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

In [5]:
# Create output folder (exist_ok avoids a separate existence check)
OUTPUT_DIR = "model_training_output"
os.makedirs(OUTPUT_DIR, exist_ok=True)

print("="*80)
print("VMS MODEL TRAINING")
print("="*80)

# Load data
print("\n1. LOADING DATA")
print("-" * 40)
DATA_PATH = 'data/Cleaned_ServiceRequest.xlsx'
try:
    df = pd.read_excel(DATA_PATH)
except FileNotFoundError as err:
    # Retrying the same path cannot succeed, and exit() is an interactive
    # helper; raise so the cell stops with a clear error instead.
    print("✗ Error: Could not find data file")
    raise FileNotFoundError(f"Could not find data file: {DATA_PATH}") from err
print(f"✓ Loaded data with shape: {df.shape}")

VMS MODEL TRAINING

1. LOADING DATA
----------------------------------------
✓ Loaded data with shape: (511645, 37)


In [7]:
# Candidate model inputs, grouped by origin.
core_features = [
    'Priority', 'service_count',
    'Building_encoded', 'Vehicle_encoded', 'Status_encoded', 'MrType_encoded'
]
time_features = ['request_day_of_week', 'request_month', 'request_hour']
high_impact_features = ['response_days', 'Odometer']

# Keep only the candidates present in the loaded frame, preserving the
# core -> time -> high-impact ordering.
candidate_features = core_features + time_features + high_impact_features
features = [name for name in candidate_features if name in df.columns]

# Optional free-text column and the label column.
if 'Description' in df.columns:
    text_feature = 'Description'
else:
    text_feature = None
target = 'maintenance_category'

print(f"Using features: {features}")
print(f"Text feature: {text_feature}")

Using features: ['Priority', 'service_count', 'Building_encoded', 'Vehicle_encoded', 'Status_encoded', 'MrType_encoded', 'request_day_of_week', 'request_month', 'request_hour', 'response_days', 'Odometer']
Text feature: Description


In [8]:
# Check data: the label column must exist before anything else runs.
if target not in df.columns:
    print(f"✗ Error: Target variable '{target}' not found")
    # Raise instead of exit(): exit() is an interactive helper and is not a
    # dependable way to stop a notebook run.
    raise KeyError(f"Target variable '{target}' not found")

# Rows without a label are dropped; .copy() detaches the result from df.
df_clean = df.dropna(subset=[target]).copy()
print(f"Dataset shape after cleaning: {df_clean.shape}")

Dataset shape after cleaning: (511645, 37)


In [9]:
# Stage 2 banner: marks the start of feature engineering in the cell output
print("\n2. FEATURE ENGINEERING")
print("-" * 40)

def create_features(df, service_threshold=None):
    """Return a copy of ``df`` with derived 0/1 indicator columns added.

    Each indicator is added only when its source column is present:

    - ``is_weekend``: 1 when ``request_day_of_week`` >= 5.
    - ``is_business_hours``: 1 when 8 <= ``request_hour`` <= 17.
    - ``high_maintenance_vehicle``: 1 when ``service_count`` >= the cut-off.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    service_threshold : float, optional
        Cut-off for ``high_maintenance_vehicle``. When omitted, the 75th
        percentile of ``df['service_count']`` is used. Pass a previously
        computed value to apply the same cut-off to a different frame.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the applicable indicator columns appended.
    """
    df_enhanced = df.copy()

    # Weekend
    if 'request_day_of_week' in df.columns:
        df_enhanced['is_weekend'] = (df['request_day_of_week'] >= 5).astype(int)

    # Business hours (both bounds inclusive)
    if 'request_hour' in df.columns:
        hour = df['request_hour']
        df_enhanced['is_business_hours'] = ((hour >= 8) & (hour <= 17)).astype(int)

    # High maintenance vehicle
    if 'service_count' in df.columns:
        if service_threshold is None:
            # Default: derive the cut-off from the frame being transformed.
            service_threshold = df['service_count'].quantile(0.75)
        df_enhanced['high_maintenance_vehicle'] = (
            df['service_count'] >= service_threshold
        ).astype(int)

    return df_enhanced

# Build the engineered frame from the label-complete rows in df_clean
df_enhanced = create_features(df_clean)


2. FEATURE ENGINEERING
----------------------------------------


In [10]:
# Add new features to list
new_features = ['is_weekend', 'is_business_hours', 'high_maintenance_vehicle']
new_features = [f for f in new_features if f in df_enhanced.columns]
# Skip names already present so re-running this cell does not append
# duplicate columns to the shared `features` list.
features.extend(f for f in new_features if f not in features)

print(f"✓ Added {len(new_features)} new features")
print(f"Total features: {len(features)}")

print("\n3. DATA PREPROCESSING")
print("-" * 40)

✓ Added 3 new features
Total features: 14

3. DATA PREPROCESSING
----------------------------------------


In [11]:
# Split features by type
numerical_features = []
categorical_features = []

for f in features:
    column = df_enhanced[f]
    # Accept any numeric width (e.g. int32, float32), not only int64/float64.
    # Boolean columns stay on the categorical side, as they did before.
    is_numeric = (pd.api.types.is_numeric_dtype(column)
                  and not pd.api.types.is_bool_dtype(column))
    if is_numeric:
        numerical_features.append(f)
    else:
        categorical_features.append(f)

print(f"Numerical: {len(numerical_features)}, Categorical: {len(categorical_features)}")

Numerical: 14, Categorical: 0


In [12]:
# Prepare data: one container per feature family (empty when the family is unused)
if numerical_features:
    X_numerical = df_enhanced[numerical_features]
else:
    X_numerical = pd.DataFrame()

if categorical_features:
    X_categorical = df_enhanced[categorical_features]
else:
    X_categorical = pd.DataFrame()

if text_feature:
    X_text = df_enhanced[text_feature]
else:
    X_text = pd.Series('', index=df_enhanced.index)

y = df_enhanced[target]

# Encode target labels as integers
le_target = LabelEncoder()
y_encoded = le_target.fit_transform(y)

# Split row positions (not the data itself) so every feature family can be
# sliced with the same train/test indices; stratified on the encoded label.
row_positions = range(len(df_enhanced))
X_train_idx, X_test_idx = train_test_split(
    row_positions, test_size=0.2, random_state=42, stratify=y_encoded
)

n_train, n_test = len(X_train_idx), len(X_test_idx)
print(f"Training: {n_train:,}, Test: {n_test:,}")

Training: 409,316, Test: 102,329


In [13]:
# Accumulators: one (kind, train_matrix, test_matrix) entry per feature
# family, plus the column names in the same order.
processed_features = []
feature_names = []

# Numbers: median imputation followed by standardisation
if not X_numerical.empty:
    print("Processing numerical features...")
    numerical_imputer = SimpleImputer(strategy='median')
    numerical_scaler = StandardScaler()

    num_train_raw = X_numerical.iloc[X_train_idx]
    num_test_raw = X_numerical.iloc[X_test_idx]

    # Both transformers are fitted on training rows only, then applied to test rows.
    X_num_train = numerical_scaler.fit_transform(
        numerical_imputer.fit_transform(num_train_raw)
    )
    X_num_test = numerical_scaler.transform(
        numerical_imputer.transform(num_test_raw)
    )

    processed_features.append(('numerical', X_num_train, X_num_test))
    feature_names += numerical_features

# Categories
# Columns not classified as numerical only get missing values filled with the
# most frequent training value; no encoding step follows in this cell.
# NOTE(review): the imputed arrays are later passed to csr_matrix(), which
# presumably fails for string-valued columns — confirm and add an encoder
# before any categorical column reaches this branch.
if not X_categorical.empty:
    print("Processing categorical features...")
    categorical_imputer = SimpleImputer(strategy='most_frequent')

    # Fit on training rows only, then reuse the fitted imputer on test rows
    X_cat_train = categorical_imputer.fit_transform(X_categorical.iloc[X_train_idx])
    X_cat_test = categorical_imputer.transform(X_categorical.iloc[X_test_idx])

    processed_features.append(('categorical', X_cat_train, X_cat_test))
    feature_names.extend(categorical_features)

# Text: stays None when there is no text column, so later cells can store it as-is
tfidf = None
if text_feature and text_feature in df_enhanced.columns:
    print("Processing text features...")
    # Missing descriptions become empty strings; everything is coerced to str
    X_text_clean = X_text.fillna('').astype(str)

    # TF-IDF over unigrams and bigrams, capped at 100 terms
    tfidf = TfidfVectorizer(
        max_features=100, stop_words='english', ngram_range=(1, 2),
        min_df=2, max_df=0.95
    )

    train_docs = X_text_clean.iloc[X_train_idx]
    test_docs = X_text_clean.iloc[X_test_idx]
    # Vocabulary is learned from training rows only
    X_text_train = tfidf.fit_transform(train_docs)
    X_text_test = tfidf.transform(test_docs)

    processed_features.append(('text', X_text_train, X_text_test))
    # Prefix the terms so they are recognisable among the other column names
    text_feature_names = [f'text_{term}' for term in tfidf.get_feature_names_out()]
    feature_names.extend(text_feature_names)

# Combine features
if not processed_features:
    print("✗ Error: No features to process!")
    # Raise instead of exit(): exit() is an interactive helper and is not a
    # dependable way to stop a notebook run.
    raise ValueError("No features to process!")

# Wrap every family as CSR so dense and sparse blocks can be stacked together
train_matrices = [csr_matrix(train_data) for _, train_data, _ in processed_features]
test_matrices = [csr_matrix(test_data) for _, _, test_data in processed_features]

# Column order follows processed_features, matching feature_names
X_train_combined = hstack(train_matrices)
X_test_combined = hstack(test_matrices)
y_train = y_encoded[X_train_idx]
y_test = y_encoded[X_test_idx]

# Feature selection: keep at most MAX_SELECTED_FEATURES columns (ANOVA F-score)
MAX_SELECTED_FEATURES = 50
# Bind `selector` on every run so a selector left over from an earlier
# execution cannot be picked up when selection is skipped this time.
selector = None
if X_train_combined.shape[1] > MAX_SELECTED_FEATURES:
    print("Applying feature selection...")
    selector = SelectKBest(f_classif, k=MAX_SELECTED_FEATURES)
    # Scores are computed on training rows only; test rows reuse the mask
    X_train_final = selector.fit_transform(X_train_combined, y_train)
    X_test_final = selector.transform(X_test_combined)

    # zip stops at the shorter sequence, so a length mismatch between the
    # names and the mask is tolerated in the same way as before.
    selected_mask = selector.get_support()
    final_feature_names = [name for name, keep in zip(feature_names, selected_mask) if keep]
    print(f"  ✓ Selected {X_train_final.shape[1]} features")
else:
    X_train_final = X_train_combined
    X_test_final = X_test_combined
    final_feature_names = feature_names

print(f"Final feature matrix: {X_train_final.shape}")

print("\n4. MODEL TRAINING")
print("-" * 40)

Processing numerical features...
Processing text features...
Applying feature selection...
  ✓ Selected 50 features
Final feature matrix: (409316, 50)

4. MODEL TRAINING
----------------------------------------


In [14]:
# Class weights: 'balanced' weight for each encoded class seen in training
train_classes = np.unique(y_train)
class_weights = compute_class_weight('balanced', classes=train_classes, y=y_train)
class_weight_dict = dict(zip(train_classes, class_weights))
# NOTE(review): class_weight_dict is not referenced by the training cell in
# this notebook — confirm whether the weights were meant to be applied.

In [15]:
# Train model
print("Training Gradient Boosting model...")
start_time = time.time()

# Gradient Boosting hyper-parameters, kept in one place
gb_params = {
    'n_estimators': 80,
    'learning_rate': 0.1,
    'max_depth': 8,
    'min_samples_split': 5,
    'min_samples_leaf': 2,
    'subsample': 0.8,
    'random_state': 42,
    'validation_fraction': 0.1,
    'n_iter_no_change': 10,
}
gb_model = GradientBoostingClassifier(**gb_params)

# Fit on the selected training matrix; the elapsed time covers model
# construction and fitting.
gb_model.fit(X_train_final, y_train)
training_time = time.time() - start_time

Training Gradient Boosting model...


In [16]:
# Test model: score the held-out rows
y_pred = gb_model.predict(X_test_final)
accuracy = accuracy_score(y_test, y_pred)
# Support-weighted averages across classes; the fourth element (support) is unused
weighted_scores = precision_recall_fscore_support(y_test, y_pred, average='weighted')
precision, recall, f1 = weighted_scores[:3]

print(f"✓ Training completed in {training_time:.2f}s")
for metric_label, metric_value in (
    ("Test Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1-Score", f1),
):
    print(f"✓ {metric_label}: {metric_value:.3f}")

✓ Training completed in 203.23s
✓ Test Accuracy: 0.915
✓ Precision: 0.918
✓ Recall: 0.915
✓ F1-Score: 0.912


In [17]:
# The single trained model, its predictions and its accuracy are the final ones
final_model, final_predictions, final_accuracy = gb_model, y_pred, accuracy

print("\n5. EVALUATION AND SAVING")
print("-" * 40)

# Per-class report, labelled with the original (decoded) class names
target_names = le_target.classes_
class_report = classification_report(
    y_test,
    final_predictions,
    target_names=target_names,
)
print("Classification Report:", class_report, sep="\n")


5. EVALUATION AND SAVING
----------------------------------------
Classification Report:
              precision    recall  f1-score   support

  air_system       0.90      0.82      0.86      5336
        body       0.81      0.75      0.78      1001
brake_system       0.92      0.79      0.85      8818
    cleaning       1.00      1.00      1.00     26404
  electrical       0.97      0.89      0.93      4370
      engine       0.82      0.80      0.81      4515
   hydraulic       0.06      0.01      0.02       170
  mechanical       0.88      0.80      0.84      4301
       other       0.79      0.95      0.86     20618
     service       0.97      0.83      0.90      1715
        tire       0.98      0.99      0.99     21840
     unknown       0.85      0.49      0.62      3241

    accuracy                           0.91    102329
   macro avg       0.83      0.76      0.79    102329
weighted avg       0.92      0.91      0.91    102329



In [18]:
# Confusion matrix, drawn on an explicit figure/axes pair and saved to disk
cm = confusion_matrix(y_test, final_predictions)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=target_names, yticklabels=target_names, ax=ax)
ax.set_title('Confusion Matrix - Gradient Boosting')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.setp(ax.get_xticklabels(), rotation=45)
plt.setp(ax.get_yticklabels(), rotation=0)
fig.tight_layout()
fig.savefig(os.path.join(OUTPUT_DIR, 'confusion_matrix.png'), dpi=300, bbox_inches='tight')
# Close the figure so it is not rendered inline and does not linger in memory
plt.close(fig)
print("✓ Saved confusion matrix")

✓ Saved confusion matrix


In [19]:
# Feature importance: table to CSV, top 15 to a horizontal bar chart
if hasattr(final_model, 'feature_importances_'):
    importances = final_model.feature_importances_
    importance_table = pd.DataFrame({
        'feature': final_feature_names[:len(importances)],
        'importance': importances
    })
    feature_importance_df = importance_table.sort_values('importance', ascending=False)

    importance_csv = os.path.join(OUTPUT_DIR, 'feature_importance.csv')
    feature_importance_df.to_csv(importance_csv, index=False)

    # Plot top 15 features, most important at the top
    top_features = feature_importance_df.head(15)
    bar_positions = range(len(top_features))
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.barh(bar_positions, top_features['importance'].values)
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(top_features['feature'].values)
    ax.set_title('Top 15 Feature Importances')
    ax.set_xlabel('Importance')
    ax.invert_yaxis()
    fig.tight_layout()
    fig.savefig(os.path.join(OUTPUT_DIR, 'feature_importance.png'), dpi=300, bbox_inches='tight')
    plt.close(fig)
    print("✓ Saved feature importance")

✓ Saved feature importance


In [20]:
# Save model: bundle the estimator with every fitted preprocessing object
has_numerical = not X_numerical.empty
has_categorical = not X_categorical.empty

# Headline metrics from the held-out evaluation
model_performance = {
    'accuracy': final_accuracy,
    'precision': precision,
    'recall': recall,
    'f1': f1,
    'training_time': training_time
}

# Bookkeeping about this training run
training_metadata = {
    'training_date': datetime.now().isoformat(),
    'training_samples': len(X_train_idx),
    'test_samples': len(X_test_idx),
    'n_features': X_train_final.shape[1],
    'n_classes': len(target_names),
    'optimization_applied': False
}

model_objects = {
    'final_model': final_model,
    'model_type': 'Gradient Boosting',
    'numerical_features': numerical_features,
    'categorical_features': categorical_features,
    'text_feature': text_feature,
    'feature_names': final_feature_names,
    # Transformers exist only when their feature family was processed
    'numerical_imputer': numerical_imputer if has_numerical else None,
    'numerical_scaler': numerical_scaler if has_numerical else None,
    'categorical_imputer': categorical_imputer if has_categorical else None,
    'tfidf': tfidf,
    # The selector name is bound only if the selection step created it
    'feature_selector': selector if 'selector' in locals() else None,
    'label_encoder': le_target,
    'classes': target_names,
    'model_performance': model_performance,
    'training_metadata': training_metadata
}

# Save model file
model_filename = os.path.join(OUTPUT_DIR, 'maintenance_prediction_model.pkl')
with open(model_filename, 'wb') as model_file:
    pickle.dump(model_objects, model_file)

print(f"✓ Saved model to: {model_filename}")

✓ Saved model to: model_training_output\maintenance_prediction_model.pkl


In [21]:
# Create summary report: plain-text recap of the run, written next to the model
summary = f"""
VMS MODEL TRAINING SUMMARY
=========================

Training Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
Training Time: {training_time:.2f} seconds
Final Accuracy: {final_accuracy:.3f}

Dataset:
- Training samples: {len(X_train_idx):,}
- Test samples: {len(X_test_idx):,}
- Features used: {X_train_final.shape[1]}
- Classes: {len(target_names)}

Performance:
- Accuracy: {final_accuracy:.1%}
- Precision: {precision:.1%}
- Recall: {recall:.1%}
- F1-Score: {f1:.1%}

Top 5 Features:
"""

# The importance table only exists when the model exposes feature_importances_
if hasattr(final_model, 'feature_importances_'):
    top_five = feature_importance_df.head(5)
    for rank, (_, row) in enumerate(top_five.iterrows(), start=1):
        summary += f"  {rank}. {row['feature']}: {row['importance']:.4f}\n"

summary += """
Files Generated:
- maintenance_prediction_model.pkl
- confusion_matrix.png
- feature_importance.csv
- feature_importance.png
"""

# Explicit UTF-8: the listed feature names include tokens taken from the
# free-text column, which may not be representable in the platform's default
# text encoding.
with open(os.path.join(OUTPUT_DIR, 'model_summary.txt'), 'w', encoding='utf-8') as f:
    f.write(summary)

print("\n" + "="*80)
print("MODEL TRAINING COMPLETED! 🚀")
print("="*80)
print(f"🎯 Model: Gradient Boosting")
print(f"🎯 Parameters: n_estimators=80, learning_rate=0.1, max_depth=8")
print(f"🎯 Accuracy: {final_accuracy:.1%}")
print(f"🎯 Training Time: {training_time:.2f} seconds")
print(f"🎯 Features: {X_train_final.shape[1]}")
print(f"🎯 Samples: {len(X_train_idx):,}")

print(f"\n⏱️  TOTAL TRAINING TIME: {training_time:.2f} seconds")
print(f"🚀 Model ready for deployment!")
print(f"📁 Saved as: {model_filename}")


MODEL TRAINING COMPLETED! 🚀
🎯 Model: Gradient Boosting
🎯 Parameters: n_estimators=80, learning_rate=0.1, max_depth=8
🎯 Accuracy: 91.5%
🎯 Training Time: 203.23 seconds
🎯 Features: 50
🎯 Samples: 409,316

⏱️  TOTAL TRAINING TIME: 203.23 seconds
🚀 Model ready for deployment!
📁 Saved as: model_training_output\maintenance_prediction_model.pkl
