In [None]:
# Comprehensive, error-free, and best-practice code ready to run in Google Colab

# Step 1: Install necessary libraries
!pip install pandas numpy scikit-learn xgboost

# Step 2: Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import xgboost as xgb

# Load your dataset
# Everything below expects a DataFrame named `df`. If an earlier step (for
# example a Colab upload cell) has already created it, it is used as-is;
# otherwise it is read from DATA_PATH so that a fresh kernel does not stop
# with a bare NameError on `df`.
DATA_PATH = "your_dataset.csv"  # Replace with the path of the CSV uploaded to Colab
if "df" not in globals():
    df = pd.read_csv(DATA_PATH)

# Feature selection
features = ['Brand', 'Model', 'Year', 'Damaged_Part', 'Damage_Severity']
target = 'Damage_Cost_MAD'

# Fail early, with the offending names, if the dataset lacks an expected column
missing_columns = [col for col in features + [target] if col not in df.columns]
if missing_columns:
    raise KeyError(f"Dataset is missing required columns: {missing_columns}")

# Split dataset into features and target
X = df[features]
y = df[target]

# Train-test split (80/20, seeded so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing: one-hot encode the categorical columns; the remaining
# selected columns are passed through unchanged (remainder='passthrough').
categorical_features = ['Brand', 'Model', 'Damaged_Part']
one_hot_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
preprocessor = ColumnTransformer(transformers=[one_hot_step], remainder='passthrough')

# Fit the transformer on the training split only, then apply it to the test split
X_train_encoded = preprocessor.fit_transform(X_train)
X_test_encoded = preprocessor.transform(X_test)

# Train an XGBoost regressor with a fixed set of hyperparameters
xgb_params = dict(
    n_estimators=200,
    max_depth=8,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)
model = xgb.XGBRegressor(**xgb_params)

model.fit(X_train_encoded, y_train)

# Predict on the held-out split
y_pred = model.predict(X_test_encoded)

# Compute each metric once, then report (RMSE is the square root of MSE)
xgb_mae = mean_absolute_error(y_test, y_pred)
xgb_mse = mean_squared_error(y_test, y_pred)
xgb_rmse = np.sqrt(xgb_mse)
xgb_r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {xgb_mae:.2f}")
print(f"Mean Squared Error (MSE): {xgb_mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {xgb_rmse:.2f}")
print(f"R² Score: {xgb_r2:.2f}")

# Example prediction: a single-row frame carrying the same feature columns
# that the preprocessor was fitted on.
example_input = pd.DataFrame([
    {
        'Brand': 'Peugeot',
        'Model': '208',
        'Year': 2019,
        'Damaged_Part': 'Pare-brise arriere',
        'Damage_Severity': 2,
    }
])

example_input_encoded = preprocessor.transform(example_input)
example_pred = model.predict(example_input_encoded)

example_part = example_input['Damaged_Part'].values[0]
example_severity = example_input['Damage_Severity'].iloc[0]
print(f"Predicted Damage Cost (MAD) for {example_part} with severity {example_severity}: {example_pred[0]:.2f} MAD")


Mean Absolute Error (MAE): 134.07
Mean Squared Error (MSE): 51768.48
Root Mean Squared Error (RMSE): 227.53
R² Score: 0.97
Predicted Damage Cost (MAD) for Pare-brise arriere with severity 2: 1459.44 MAD


In [20]:
# Import libraries
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Random Forest: construct and fit in one expression (fit returns the estimator)
rf_model = RandomForestRegressor(
    n_estimators=200,
    max_depth=10,
    random_state=42,
).fit(X_train_encoded, y_train)
rf_pred = rf_model.predict(X_test_encoded)

# Gradient Boosting: same pattern
gb_model = GradientBoostingRegressor(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    random_state=42,
).fit(X_train_encoded, y_train)
gb_pred = gb_model.predict(X_test_encoded)

# Function to evaluate and display performance
def evaluate_model(name, y_true, y_pred):
    """Print MAE, RMSE and R² for one set of predictions.

    Args:
        name: Label printed in the header line.
        y_true: Ground-truth target values.
        y_pred: Predicted values to score against ``y_true``.

    Returns:
        None. The metrics are only printed.
    """
    print(f"\n{name} Performance:")
    mae_value = mean_absolute_error(y_true, y_pred)
    rmse_value = np.sqrt(mean_squared_error(y_true, y_pred))
    r2_value = r2_score(y_true, y_pred)
    print(f"MAE: {mae_value:.2f}")
    print(f"RMSE: {rmse_value:.2f}")
    print(f"R²: {r2_value:.4f}")

# Evaluate all three models on the same held-out targets.
# NOTE: `y_pred` is also rebound by the grid-search cell further down; if this
# cell is re-run after that one, the "XGBoost" row will report the tuned
# Gradient Boosting predictions instead.
for model_label, model_predictions in (
    ("XGBoost", y_pred),
    ("Random Forest", rf_pred),
    ("Gradient Boosting", gb_pred),
):
    evaluate_model(model_label, y_test, model_predictions)



XGBoost Performance:
MAE: 134.07
RMSE: 227.53
R²: 0.9740

Random Forest Performance:
MAE: 174.05
RMSE: 286.06
R²: 0.9589

Gradient Boosting Performance:
MAE: 132.24
RMSE: 226.71
R²: 0.9742


In [22]:
# Install packages if necessary
!pip install scikit-learn numpy pandas

# Import libraries
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Hyperparameter grid: 3 * 3 * 3 * 2 = 54 candidate combinations
param_grid = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.05, 0.1, 0.2],
    max_depth=[4, 6, 8],
    subsample=[0.8, 1.0],
)

# Base estimator handed to the search.
# NOTE: this rebinds `gb_model`, replacing the fitted model from the earlier
# comparison cell with a fresh, unfitted estimator.
gb_model = GradientBoostingRegressor(random_state=42)

# 5-fold cross-validated search scored by (negated) mean absolute error,
# using all available cores.
search_options = dict(
    cv=5,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    verbose=2,
)
grid_search = GridSearchCV(estimator=gb_model, param_grid=param_grid, **search_options)

# Run the search on the encoded training split
grid_search.fit(X_train_encoded, y_train)

# Best parameters and estimator
print("Best parameters found:", grid_search.best_params_)
best_model = grid_search.best_estimator_

# Evaluate on test data: predict once and reuse the result for every metric
# (previously the metrics were computed, left unused, and the model was asked
# to predict three more times inside the print statements).
# NOTE: this rebinds `y_pred`, which the earlier cells use for the XGBoost
# predictions.
y_pred = best_model.predict(X_test_encoded)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("\nOptimized Gradient Boosting Performance:")
print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R²: {r2:.4f}")


Fitting 5 folds for each of 54 candidates, totalling 270 fits
Best parameters found: {'learning_rate': 0.05, 'max_depth': 6, 'n_estimators': 200, 'subsample': 0.8}

Optimized Gradient Boosting Performance:
MAE: 128.91
RMSE: 219.93
R²: 0.9757


In [None]:
import joblib

# Persist the tuned model and the fitted preprocessor (paths are relative to
# the current working directory)
artifacts_to_save = {
    'gradient_boosting_model.pkl': best_model,
    'preprocessor.pkl': preprocessor,
}
for artifact_path, artifact in artifacts_to_save.items():
    joblib.dump(artifact, artifact_path)

print("✅ Model and preprocessor saved successfully!")


In [24]:
import joblib

# Reload the saved model and preprocessor, in that order.
# joblib.load unpickles the file, so only load artifacts from a trusted source.
artifact_paths = ('gradient_boosting_model.pkl', 'preprocessor.pkl')
best_model, preprocessor = [joblib.load(artifact_path) for artifact_path in artifact_paths]

print("✅ Model and preprocessor loaded successfully!")


✅ Model and preprocessor loaded successfully!


In [29]:
import pandas as pd

# Example: one hand-written case with the same feature columns used in training
custom_case_record = {
    'Brand': 'Mercedes',
    'Model': 'C-Class',
    'Year': 2018,
    'Damaged_Part': 'Malle',
    'Damage_Severity': 1,
}
custom_test_case = pd.DataFrame([custom_case_record])

# Encode with the fitted preprocessor, then predict the cost
custom_test_encoded = preprocessor.transform(custom_test_case)
predicted_cost = best_model.predict(custom_test_encoded)

# Pull the display fields out of the frame before formatting the message
case_brand = custom_test_case['Brand'].values[0]
case_model = custom_test_case['Model'].values[0]
case_part = custom_test_case['Damaged_Part'].values[0]
case_severity = custom_test_case['Damage_Severity'].values[0]
print(f"🔹 Predicted Repair Cost for {case_brand} {case_model} ({case_part}, Severity {case_severity}): {predicted_cost[0]:.2f} MAD")


🔹 Predicted Repair Cost for Mercedes C-Class (Malle, Severity 1): 993.28 MAD


In [30]:
# Hand-written scenarios used to spot-check the model's predictions
test_case_records = [
    {'Brand': 'Dacia', 'Model': 'Logan', 'Year': 2016, 'Damaged_Part': 'Pare-brise arriere', 'Damage_Severity': 2},
    {'Brand': 'Peugeot', 'Model': '208', 'Year': 2020, 'Damaged_Part': 'Porte arriere gauche', 'Damage_Severity': 3},
    {'Brand': 'Renault', 'Model': 'Clio', 'Year': 2015, 'Damaged_Part': 'Aile arriere droit', 'Damage_Severity': 1},
    {'Brand': 'Toyota', 'Model': 'Corolla', 'Year': 2019, 'Damaged_Part': 'Pare-choc arriere', 'Damage_Severity': 3},
    {'Brand': 'Volkswagen', 'Model': 'Golf', 'Year': 2018, 'Damaged_Part': 'Feu arriere gauche', 'Damage_Severity': 2},
    {'Brand': 'Ford', 'Model': 'Fiesta', 'Year': 2017, 'Damaged_Part': 'Malle', 'Damage_Severity': 1},
    {'Brand': 'Mercedes', 'Model': 'E-Class', 'Year': 2021, 'Damaged_Part': 'Plaque immatriculation arriere', 'Damage_Severity': 3},
    {'Brand': 'BMW', 'Model': 'Series 3', 'Year': 2022, 'Damaged_Part': 'Porte arriere gauche', 'Damage_Severity': 2},
    {'Brand': 'Hyundai', 'Model': 'Tucson', 'Year': 2020, 'Damaged_Part': 'Aile arriere gauche', 'Damage_Severity': 1},
    {'Brand': 'Fiat', 'Model': 'Tipo', 'Year': 2015, 'Damaged_Part': 'Feu arriere droit', 'Damage_Severity': 3},
]
test_cases = pd.DataFrame(test_case_records)

# Encode with the fitted preprocessor and predict one cost per row
test_cases_encoded = preprocessor.transform(test_cases)
predicted_costs = best_model.predict(test_cases_encoded)

# Attach the predictions as a new column (done after encoding, so the extra
# column never reaches the preprocessor)
test_cases['Predicted_Damage_Cost_MAD'] = predicted_costs

# Display results: one short report per test case, iterating rows as tuples
# rather than indexing the frame repeatedly by position
for case in test_cases.itertuples(index=False):
    print(f"🚗 {case.Brand} {case.Model} ({case.Year})")
    print(f"   🔹 Damaged Part: {case.Damaged_Part}")
    print(f"   🔹 Severity Level: {case.Damage_Severity}")
    print(f"   🔹 Predicted Repair Cost: {case.Predicted_Damage_Cost_MAD:.2f} MAD\n")


🚗 Dacia Logan (2016)
   🔹 Damaged Part: Pare-brise arriere
   🔹 Severity Level: 2
   🔹 Predicted Repair Cost: 979.29 MAD

🚗 Peugeot 208 (2020)
   🔹 Damaged Part: Porte arriere gauche
   🔹 Severity Level: 3
   🔹 Predicted Repair Cost: 2677.68 MAD

🚗 Renault Clio (2015)
   🔹 Damaged Part: Aile arriere droit
   🔹 Severity Level: 1
   🔹 Predicted Repair Cost: 430.30 MAD

🚗 Toyota Corolla (2019)
   🔹 Damaged Part: Pare-choc arriere
   🔹 Severity Level: 3
   🔹 Predicted Repair Cost: 3682.97 MAD

🚗 Volkswagen Golf (2018)
   🔹 Damaged Part: Feu arriere gauche
   🔹 Severity Level: 2
   🔹 Predicted Repair Cost: 1049.92 MAD

🚗 Ford Fiesta (2017)
   🔹 Damaged Part: Malle
   🔹 Severity Level: 1
   🔹 Predicted Repair Cost: 563.91 MAD

🚗 Mercedes E-Class (2021)
   🔹 Damaged Part: Plaque immatriculation arriere
   🔹 Severity Level: 3
   🔹 Predicted Repair Cost: 2046.82 MAD

🚗 BMW Series 3 (2022)
   🔹 Damaged Part: Porte arriere gauche
   🔹 Severity Level: 2
   🔹 Predicted Repair Cost: 2692.89 MAD

🚗 H