In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

from sklearn.model_selection import GridSearchCV


# 1. IMPORT DATASET AS PANDAS DATAFRAME

In [2]:
dataset_df = pd.read_csv('csv_datasets/merged_dataset.csv')
dataset_df.columns

Index(['route_code', 'cost', 'start_location', 'end_location',
       'estimated_travel_time', 'distance', 'avg_traffic_density',
       'avg_fuel_consumption', 'historical_delays', 'weather_risk_factor',
       'time_of_day_efficiency', 'day_of_week_efficiency', 'seasonal_factors',
       'complexity_score', 'historical_incidents', 'id_x', 'vehicle_id',
       'driver_id', 'route_id', 'start_time', 'end_time', 'actual_duration',
       'expected_duration', 'start_fuel_level', 'end_fuel_level',
       'fuel_consumed', 'actual_distance', 'planned_distance', 'average_speed',
       'max_speed', 'idle_time', 'weather_conditions', 'traffic_conditions',
       'on_time_status', 'cargo_weight', 'cargo_type',
       'maintenance_issues_reported', 'driver_fatigue_score', 'id_y', 'name',
       'email', 'license_number', 'total_trips_driver', 'total_earnings',
       'safety_score', 'on_time_delivery_rate', 'experience_years',
       'rest_compliance_rate', 'avg_speed_profile', 'harsh_braking_e

# 2. FEATURE SELCTION

In [3]:
# Select relevant columns for route optimization
features = [
    'estimated_travel_time', 'distance', 'avg_traffic_density', 'avg_fuel_consumption',
    'historical_delays', 'weather_risk_factor', 'time_of_day_efficiency', 'day_of_week_efficiency',
    'seasonal_factors', 'complexity_score', 'historical_incidents'
]

# Target: optimal route could be a binary classification (optimal or not) based on real-world performance.
# Example target could be if the route was on-time or not (route optimization goal)
dataset_df['route_optimization'] = np.where(dataset_df['on_time_status'] == 1, 1, 0)

# Features and target variable
X = dataset_df[features]
y = dataset_df['route_optimization']


# 3. TRAINING - TESTING DATASET SPLITTING AND FEATURE SCALING

In [5]:
# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

ValueError: could not convert string to float: '3 hours'

# 4. MODEL TRAINING

In [None]:
# Initialize the RandomForestClassifier model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# 5. MODEL EVALUATION

In [None]:
# Make predictions on the test set
y_pred = rf_model.predict(X_test)

# Print classification report
print(classification_report(y_test, y_pred))

# Print confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['No Maintenance', 'Maintenance'], yticklabels=['No Maintenance', 'Maintenance'])
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()


# 6. HYPERPARAMETER TUNING

In [None]:
# Hyperparameter tuning using GridSearchCV (optional)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)

# Re-train the model with best parameters
best_rf_model = grid_search.best_estimator_
y_pred_best = best_rf_model.predict(X_test)

# Evaluate the tuned model
print(classification_report(y_test, y_pred_best))


# 7. SAVE THE MODEL

In [None]:
import joblib

# Save the model to a file
joblib.dump(best_rf_model, 'models/maintenance_model.pkl')