# Machine Learning Models

Training three classification models for the crash prediction API:
1. **Severity Predictor** -> Classifies accidents as Fatal/Injury/Property damage
2. **Accident Type Classifier** -> Predicts the crash type, such as collision, single-vehicle, or pedestrian
3. **Location Risk Assessor** -> Determines whether the crash location is urban or outside an urban area

In [1]:
# Initialize Spark and load preprocessed data
import os
from pathlib import Path

from pyspark.sql import SparkSession
from pyspark.ml.classification import RandomForestClassifier, LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer

# Directory holding the preprocessed train/test parquet files.
PROCESSED_DIR = Path("../datasets/ProcessedData")

# Get (or create) the session for this notebook and keep Spark's own logging
# down to errors so cell output stays readable.
spark = (
    SparkSession.builder
    .appName("CrashScope-Models")
    .getOrCreate()
)
spark.sparkContext.setLogLevel("ERROR")

# Load preprocessed data: both splits are read the same way, train first.
train_df, test_df = (
    spark.read.parquet(str(PROCESSED_DIR / parquet_name))
    for parquet_name in ("train.parquet", "test.parquet")
)

# Sanity-check what was loaded: row counts per split and the length of the
# `features` vector taken from the first training row.
print(f"Training samples: {train_df.count():,}")
print(f"Test samples: {test_df.count():,}")
print(f"Features: {len(train_df.select('features').first().features)}")

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/19 16:05:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/11/19 16:05:20 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
                                                                                

Training samples: 190,829
Test samples: 47,696
Features: 33


In [2]:
# Create target encoders for all prediction tasks.
# Each task maps one raw string column to an indexed label column.
target_columns = {
    'severity': ("verkeersongeval_afloop", "severity_label"),
    'type': ("aard_ongeval", "type_label"),
    'location': ("bebouwde_kom", "location_label"),
}
target_encoders = {
    task: StringIndexer(inputCol=raw_col, outputCol=label_col)
    for task, (raw_col, label_col) in target_columns.items()
}

# Fit encoders on the training split only, in task order.
fitted_encoders = {}
for task, indexer in target_encoders.items():
    fitted_encoders[task] = indexer.fit(train_df)

# Report how many distinct classes each fitted indexer learned.
print("Target encoders fitted successfully")
for name in fitted_encoders:
    labels = fitted_encoders[name].labels
    print(f"{name.title()}: {len(labels)} classes")

Target encoders fitted successfully
Severity: 3 classes
Type: 10 classes
Location: 2 classes


# MODEL 1: Accident Severity Prediction

In [None]:
# Prepare severity data: add the indexed `severity_label` column to both splits
severity_indexer = fitted_encoders['severity']
train_severity = severity_indexer.transform(train_df)
test_severity = severity_indexer.transform(test_df)

# Train models.
# The forest samples rows and features at random, so its seed is pinned to make
# Restart & Run All reproduce the same model and the same accuracy.
rf_severity = RandomForestClassifier(
    featuresCol="features", labelCol="severity_label", numTrees=50, seed=42
)
lr_severity = LogisticRegression(featuresCol="features", labelCol="severity_label", maxIter=50)

rf_severity_model = rf_severity.fit(train_severity)
lr_severity_model = lr_severity.fit(train_severity)

# Evaluate models: plain accuracy on the test split
evaluator = MulticlassClassificationEvaluator(labelCol="severity_label", predictionCol="prediction", metricName="accuracy")

rf_severity_pred = rf_severity_model.transform(test_severity)
lr_severity_pred = lr_severity_model.transform(test_severity)

rf_accuracy = evaluator.evaluate(rf_severity_pred)
lr_accuracy = evaluator.evaluate(lr_severity_pred)

# NOTE(review): the recorded run reports 0.9977 / 1.0000 here. Near-perfect
# scores usually mean the target is encoded inside `features` -- confirm
# against the preprocessing step before trusting these numbers.
print(f"Severity Prediction Results:")
print(f"Random Forest Accuracy: {rf_accuracy:.4f}")
print(f"Logistic Regression Accuracy: {lr_accuracy:.4f}")

# Select best model (Random Forest wins ties).
# NOTE(review): the winner is chosen on the same test split whose accuracy is
# reported, so the reported figure is optimistic; a separate validation split
# would remove that bias.
rf_severity_wins = rf_accuracy >= lr_accuracy
best_severity_model = rf_severity_model if rf_severity_wins else lr_severity_model
best_severity_name = "RandomForest" if rf_severity_wins else "LogisticRegression"
best_severity_accuracy = max(rf_accuracy, lr_accuracy)

print(f"Best Model: {best_severity_name} (Accuracy: {best_severity_accuracy:.4f})")

                                                                                

Severity Prediction Results:
Random Forest Accuracy: 0.9977
Logistic Regression Accuracy: 1.0000
Best Model: LogisticRegression (Accuracy: 1.0000)


# MODEL 2: Accident Type Classification  


In [None]:
# Prepare accident type data: add the indexed `type_label` column to both splits
type_indexer = fitted_encoders['type']
train_type = type_indexer.transform(train_df)
test_type = type_indexer.transform(test_df)

# Train models.
# The forest samples rows and features at random, so its seed is pinned to make
# Restart & Run All reproduce the same model and the same accuracy.
rf_type = RandomForestClassifier(
    featuresCol="features", labelCol="type_label", numTrees=50, seed=42
)
lr_type = LogisticRegression(featuresCol="features", labelCol="type_label", maxIter=50)

rf_type_model = rf_type.fit(train_type)
lr_type_model = lr_type.fit(train_type)

# Evaluate models: plain accuracy on the test split
type_evaluator = MulticlassClassificationEvaluator(labelCol="type_label", predictionCol="prediction", metricName="accuracy")

rf_type_pred = rf_type_model.transform(test_type)
lr_type_pred = lr_type_model.transform(test_type)

rf_type_accuracy = type_evaluator.evaluate(rf_type_pred)
lr_type_accuracy = type_evaluator.evaluate(lr_type_pred)

# NOTE(review): the recorded run reports 0.9942 / 1.0000 here. Near-perfect
# scores usually mean the target is encoded inside `features` -- confirm
# against the preprocessing step before trusting these numbers.
print(f"Accident Type Prediction Results:")
print(f"Random Forest Accuracy: {rf_type_accuracy:.4f}")
print(f"Logistic Regression Accuracy: {lr_type_accuracy:.4f}")

# Select best model (Random Forest wins ties).
# NOTE(review): the winner is chosen on the same test split whose accuracy is
# reported, so the reported figure is optimistic; a separate validation split
# would remove that bias.
rf_type_wins = rf_type_accuracy >= lr_type_accuracy
best_type_model = rf_type_model if rf_type_wins else lr_type_model
best_type_name = "RandomForest" if rf_type_wins else "LogisticRegression"
best_type_accuracy = max(rf_type_accuracy, lr_type_accuracy)

print(f"Best Model: {best_type_name} (Accuracy: {best_type_accuracy:.4f})")

                                                                                

Accident Type Prediction Results:
Random Forest Accuracy: 0.9942
Logistic Regression Accuracy: 1.0000
Best Model: LogisticRegression (Accuracy: 1.0000)


# MODEL 3: Location Risk Assessment

In [None]:
# Prepare location data: add the indexed `location_label` column to both splits
location_indexer = fitted_encoders['location']
train_location = location_indexer.transform(train_df)
test_location = location_indexer.transform(test_df)

# Train models.
# The forest samples rows and features at random, so its seed is pinned to make
# Restart & Run All reproduce the same model and the same accuracy.
rf_location = RandomForestClassifier(
    featuresCol="features", labelCol="location_label", numTrees=50, seed=42
)
lr_location = LogisticRegression(featuresCol="features", labelCol="location_label", maxIter=50)

rf_location_model = rf_location.fit(train_location)
lr_location_model = lr_location.fit(train_location)

# Evaluate models: plain accuracy on the test split
location_evaluator = MulticlassClassificationEvaluator(labelCol="location_label", predictionCol="prediction", metricName="accuracy")

rf_location_pred = rf_location_model.transform(test_location)
lr_location_pred = lr_location_model.transform(test_location)

rf_location_accuracy = location_evaluator.evaluate(rf_location_pred)
lr_location_accuracy = location_evaluator.evaluate(lr_location_pred)

# NOTE(review): the recorded run reports 1.0000 for both models. Perfect
# scores usually mean the target is encoded inside `features` -- confirm
# against the preprocessing step before trusting these numbers.
print(f"Location Risk Assessment Results:")
print(f"Random Forest Accuracy: {rf_location_accuracy:.4f}")
print(f"Logistic Regression Accuracy: {lr_location_accuracy:.4f}")

# Select best model (Random Forest wins ties).
# NOTE(review): the winner is chosen on the same test split whose accuracy is
# reported, so the reported figure is optimistic; a separate validation split
# would remove that bias.
rf_location_wins = rf_location_accuracy >= lr_location_accuracy
best_location_model = rf_location_model if rf_location_wins else lr_location_model
best_location_name = "RandomForest" if rf_location_wins else "LogisticRegression"
best_location_accuracy = max(rf_location_accuracy, lr_location_accuracy)

print(f"Best Model: {best_location_name} (Accuracy: {best_location_accuracy:.4f})")

                                                                                

Location Risk Assessment Results:
Random Forest Accuracy: 1.0000
Logistic Regression Accuracy: 1.0000
Best Model: RandomForest (Accuracy: 1.0000)


---

In [6]:
# Model Performance Summary and Save
# Recap the dataset size, feature count and the accuracy of each selected model.
print("=== CRASHSCOPE MODEL PERFORMANCE SUMMARY ===")
print(f"Dataset: {train_df.count():,} training, {test_df.count():,} test samples")
print(f"Features: {len(train_df.select('features').first().features)}")
print("\nModel Accuracies:")
print(f"1. Severity Predictor ({best_severity_name}): {best_severity_accuracy:.4f}")
print(f"2. Accident Type ({best_type_name}): {best_type_accuracy:.4f}")
print(f"3. Location Risk ({best_location_name}): {best_location_accuracy:.4f}")

# Save models for deployment
# (`os` is imported with the other imports at the top of the notebook.)
model_dir = "../models"
os.makedirs(model_dir, exist_ok=True)

# Sub-directory name -> fitted object to persist. Each selected model is saved
# next to the indexer that produced its label column, so predictions can be
# mapped back to the original string labels after loading.
artifacts = {
    "severity_model": best_severity_model,
    "accident_type_model": best_type_model,
    "location_risk_model": best_location_model,
    "severity_indexer": fitted_encoders['severity'],
    "type_indexer": fitted_encoders['type'],
    "location_indexer": fitted_encoders['location'],
}
for artifact_name, artifact in artifacts.items():
    # overwrite() lets the cell be re-run without failing on existing output
    artifact.write().overwrite().save(f"{model_dir}/{artifact_name}")

# Build the message from model_dir so it cannot drift from the real save location.
print(f"\nModels saved successfully to {model_dir}/")
print("Ready for API deployment")

=== CRASHSCOPE MODEL PERFORMANCE SUMMARY ===
Dataset: 190,829 training, 47,696 test samples
Features: 33

Model Accuracies:
1. Severity Predictor (LogisticRegression): 1.0000
2. Accident Type (LogisticRegression): 1.0000
3. Location Risk (RandomForest): 1.0000

Models saved successfully to ../models/
Ready for API deployment


In [7]:
# Prediction Example
# Take a single row from the test split and show its raw target and context columns.
example = test_df.limit(1)
print("Sample Prediction:")
example.select(
    "verkeersongeval_afloop",
    "aard_ongeval",
    "bebouwde_kom",
    "weersgesteldheid",
    "maximum_snelheid",
).show()

# Bind each fitted indexer once; it is needed both to add the label column
# before scoring and to translate the predicted index back to a string.
severity_indexer = fitted_encoders['severity']
type_indexer = fitted_encoders['type']
location_indexer = fitted_encoders['location']

# Apply all models
severity_pred = best_severity_model.transform(severity_indexer.transform(example))
type_pred = best_type_model.transform(type_indexer.transform(example))
location_pred = best_location_model.transform(location_indexer.transform(example))

# Pull the predicted class index out of the one-row result of each model.
severity_result = severity_pred.select("prediction").collect()[0]["prediction"]
type_result = type_pred.select("prediction").collect()[0]["prediction"]
location_result = location_pred.select("prediction").collect()[0]["prediction"]

# Map predictions to labels: the prediction is an index into the indexer's label list
severity_labels = severity_indexer.labels
type_labels = type_indexer.labels
location_labels = location_indexer.labels

print(f"Predicted Severity: {severity_labels[int(severity_result)]}")
print(f"Predicted Type: {type_labels[int(type_result)]}")
print(f"Predicted Location: {location_labels[int(location_result)]}")

# Clean up
spark.stop()
print("\nSpark session closed")

Sample Prediction:
+----------------------+------------+------------+----------------+----------------+
|verkeersongeval_afloop|aard_ongeval|bebouwde_kom|weersgesteldheid|maximum_snelheid|
+----------------------+------------+------------+----------------+----------------+
|                Letsel|   Eenzijdig|      Binnen|           Droog|            15.0|
+----------------------+------------+------------+----------------+----------------+

Predicted Severity: Letsel
Predicted Type: Eenzijdig
Predicted Location: Binnen

Spark session closed
