Import Main Libraries

In [11]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from geopy.distance import geodesic
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, f1_score, precision_score
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, RobustScaler, StandardScaler
from tensorflow.keras import regularizers
from tensorflow.keras.layers import Concatenate, Dense, Dropout, Input
from tensorflow.keras.losses import Huber
from tensorflow.keras.metrics import MeanAbsoluteError
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam




Load the data and prep the labels

In [6]:


# Load the filtered 2019-2021 crash records.
data = pd.read_csv("2019-2021_filtered_data.csv")

# Expand the crash timestamp into numeric calendar features, then drop the
# raw datetime column so only numeric columns remain for imputation/scaling.
data['crash_datetime_y'] = pd.to_datetime(data['crash_datetime_y'])
data['year'] = data['crash_datetime_y'].dt.year
data['month'] = data['crash_datetime_y'].dt.month
data['day'] = data['crash_datetime_y'].dt.day
data['hour'] = data['crash_datetime_y'].dt.hour
data = data.drop(columns=['crash_datetime_y'])

# Remove constant columns: they carry no information for the model.
zero_variance_cols = data.var(axis=0) == 0
data = data.loc[:, ~zero_variance_cols]

numeric_columns = ['lat', 'long']  # regression targets
categorical_columns = [
    'crash_severity_id_x', 'vehicle_maneuver_id', 'vehicle_contrib_circum_id',
    'extent_deformity_id', 'most_damaged_area_id', 'area_init_impact_id',
    'most_harmful_event_id', 'event_sequence_1_id', 'event_sequence_2_id',
    'event_sequence_3_id', 'event_sequence_4_id', 'crash_severity_id_y',
    'first_harmful_event_id', 'roadway_contrib_circum_id'
]
# A categorical column may have been removed by the zero-variance filter above;
# keep only the ones still present so the encoding loop cannot raise KeyError.
categorical_columns = [col for col in categorical_columns if col in data.columns]

# Rows without a known location cannot be used as supervised examples.
# Imputing the *targets* with the mean would invent labels, so drop them instead.
data = data.dropna(subset=numeric_columns).reset_index(drop=True)

# Categorical ID codes are imputed with the most frequent code: a mean of
# category IDs is a fractional value that is not a real category and would
# become a spurious extra class in the LabelEncoder below.
if categorical_columns:
    categorical_imputer = SimpleImputer(strategy="most_frequent")
    data[categorical_columns] = categorical_imputer.fit_transform(data[categorical_columns])

# Every remaining column is treated as continuous and imputed with its mean.
continuous_columns = [col for col in data.columns if col not in categorical_columns]
imputer = SimpleImputer(strategy="mean")
data[continuous_columns] = imputer.fit_transform(data[continuous_columns])

scaler = RobustScaler()

lat_long_targets = data[numeric_columns]
X_data = data.drop(columns=numeric_columns)

# Spatial clusters of the crash coordinates, stored on `data` for analysis only.
# The cluster id is derived from the lat/long targets themselves, so it is
# deliberately NOT added to the feature matrix: doing so leaks the target into
# the inputs and cannot be reproduced at prediction time.
coords = lat_long_targets[['lat', 'long']]
kmeans = KMeans(n_clusters=10, random_state=42)  # Adjust n_clusters as needed
data['cluster_id'] = kmeans.fit_predict(coords)

# Integer-encode the categorical ID columns; encoders are kept for later decoding.
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    X_data[col] = le.fit_transform(X_data[col])
    label_encoders[col] = le

# NOTE(review): both scalers are fitted on the full dataset before the
# cross-validation split; fit them inside each fold if a strictly
# leakage-free evaluation is required.
X_scaled = scaler.fit_transform(X_data)
X = pd.DataFrame(X_scaled, columns=X_data.columns)

lat_long_scaler = RobustScaler()
lat_long_scaled = lat_long_scaler.fit_transform(lat_long_targets)

# Prepare final datasets
X_data = X
y_labels = {col: lat_long_scaled[:, i] for i, col in enumerate(['lat', 'long'])}

# Debugging information
print("NaNs in features:", X_data.isna().sum().sum())
print("NaNs in targets:", pd.DataFrame(lat_long_scaled).isna().sum().sum())
print("Infinities in features:", np.isinf(X_data).sum().sum())
print("Infinities in targets:", np.isinf(lat_long_scaled).sum().sum())


NaNs in features: 0
NaNs in targets: 0
Infinities in features: 0
Infinities in targets: 0


Split the data into training and test data

In [7]:

# Attach the scaled targets to the feature frame so one split keeps
# features and labels row-aligned.
combined_data = X.copy()
for key, value in y_labels.items():
    combined_data[key] = value

# Single shuffled 80/20 hold-out split. The previous KFold loop overwrote the
# train/test variables on every iteration, so only the last of five splits
# survived; a direct split yields the same proportions without the wasted work.
train_data, test_data = train_test_split(
    combined_data, test_size=0.2, shuffle=True, random_state=42
)

X_train = train_data[X.columns]
X_test = test_data[X.columns]
# Targets are kept as dicts of NumPy arrays keyed by output name ('lat', 'long').
y_train = {key: train_data[key].values for key in y_labels.keys()}
y_test = {key: test_data[key].values for key in y_labels.keys()}

Model for latitude/longitude (5-fold cross-validation)

In [12]:

n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# Per-fold regression errors, measured on the scaled targets (same units as
# the training loss/MAE curves). Each entry is the mean over the two outputs.
fold_mse = []
fold_mae = []
fold_results = []


def build_lat_long_model(n_features):
    """Build and compile the two-output regression network.

    A shared, L2-regularised dense trunk feeds two separate heads, one per
    coordinate, each ending in a linear unit named 'lat' / 'long'.

    Parameters
    ----------
    n_features : int
        Number of input feature columns.

    Returns
    -------
    tensorflow.keras.Model
        Compiled model with Huber loss and MAE metric on both outputs.
    """
    inputs = Input(shape=(n_features,))
    x = Dense(150, activation='relu', kernel_regularizer=regularizers.l2(0.001))(inputs)
    x = Dropout(0.2)(x)
    x = Dense(100, activation='relu', kernel_regularizer=regularizers.l2(0.001))(x)
    x = Dropout(0.2)(x)
    x = Dense(50, activation='relu', kernel_regularizer=regularizers.l2(0.001))(x)

    # Separate branches for latitude and longitude
    lat_branch = Dense(25, activation='relu')(x)
    long_branch = Dense(25, activation='relu')(x)

    lat_output = Dense(1, activation='linear', name='lat')(lat_branch)
    long_output = Dense(1, activation='linear', name='long')(long_branch)

    model = Model(inputs=inputs, outputs=[lat_output, long_output])
    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss={'lat': Huber(delta=1.0), 'long': Huber(delta=1.0)},
        metrics={'lat': 'mae', 'long': 'mae'}
    )
    return model


# Loop over each fold
for fold, (train_idx, val_idx) in enumerate(kf.split(X_data)):
    print(f"Training on fold {fold + 1}/{n_splits}...")

    # Split data into training and validation sets
    X_train_fold, X_val_fold = X_data.iloc[train_idx], X_data.iloc[val_idx]
    y_train_fold = {key: y_labels[key][train_idx] for key in y_labels.keys()}
    y_val_fold = {key: y_labels[key][val_idx] for key in y_labels.keys()}

    # Release the previous fold's graph/state so memory does not grow per fold,
    # then start from a freshly initialised model.
    tf.keras.backend.clear_session()
    lat_long_pred_model = build_lat_long_model(X_data.shape[1])

    # Set callbacks for training
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3)

    # Train the model on the current fold
    history = lat_long_pred_model.fit(
        X_train_fold,
        [y_train_fold['lat'], y_train_fold['long']],
        validation_data=(X_val_fold, [y_val_fold['lat'], y_val_fold['long']]),
        epochs=10,
        batch_size=64,
        callbacks=[early_stopping, lr_scheduler],
        verbose=1
    )

    y_pred_lat, y_pred_long = lat_long_pred_model.predict(X_val_fold)
    # Each head returns shape (n, 1); flatten so arithmetic against the (n,)
    # targets is element-wise instead of broadcasting to an (n, n) matrix.
    y_pred_lat = np.ravel(y_pred_lat)
    y_pred_long = np.ravel(y_pred_long)

    # Regression errors for this fold (previously never recorded, which left
    # fold_mse / fold_mae empty and broke the summary below).
    lat_error = y_val_fold['lat'] - y_pred_lat
    long_error = y_val_fold['long'] - y_pred_long
    lat_mse = float(np.mean(lat_error ** 2))
    long_mse = float(np.mean(long_error ** 2))
    lat_mae = float(np.mean(np.abs(lat_error)))
    long_mae = float(np.mean(np.abs(long_error)))
    fold_mse.append((lat_mse + long_mse) / 2)
    fold_mae.append((lat_mae + long_mae) / 2)

    # Convert predictions to binary (or discrete categories as needed)
    # NOTE(review): thresholding scaled coordinates at 0.5 is a placeholder;
    # these classification scores are only a coarse indicator for a regressor.
    y_pred_lat_binary = (y_pred_lat > 0.5).astype(int)  # Example threshold
    y_pred_long_binary = (y_pred_long > 0.5).astype(int)

    y_val_lat_binary = (y_val_fold['lat'] > 0.5).astype(int)  # Assuming thresholding logic for labels
    y_val_long_binary = (y_val_fold['long'] > 0.5).astype(int)

    # Calculate metrics for latitude predictions
    accuracy_lat = accuracy_score(y_val_lat_binary, y_pred_lat_binary)
    precision_lat = precision_score(y_val_lat_binary, y_pred_lat_binary)
    f1_lat = f1_score(y_val_lat_binary, y_pred_lat_binary)

    # Calculate metrics for longitude predictions
    accuracy_long = accuracy_score(y_val_long_binary, y_pred_long_binary)
    precision_long = precision_score(y_val_long_binary, y_pred_long_binary)
    f1_long = f1_score(y_val_long_binary, y_pred_long_binary)

    print(f"Fold {fold + 1} Metrics:")
    print(f"  Latitude: Accuracy={accuracy_lat:.4f}, Precision={precision_lat:.4f}, F1 Score={f1_lat:.4f}")
    print(f"  Longitude: Accuracy={accuracy_long:.4f}, Precision={precision_long:.4f}, F1 Score={f1_long:.4f}")
    print(f"  Latitude: MSE={lat_mse:.4f}, MAE={lat_mae:.4f}")
    print(f"  Longitude: MSE={long_mse:.4f}, MAE={long_mae:.4f}")

    # Store fold results (extend this as needed for final averages)
    fold_results.append({
        'fold': fold + 1,
        'lat_accuracy': accuracy_lat,
        'lat_precision': precision_lat,
        'lat_f1': f1_lat,
        'long_accuracy': accuracy_long,
        'long_precision': precision_long,
        'long_f1': f1_long,
        'lat_mse': lat_mse,
        'lat_mae': lat_mae,
        'long_mse': long_mse,
        'long_mae': long_mae,
    })

    # Plot training and validation loss
    plt.figure(figsize=(10, 5))
    plt.plot(history.history['loss'], label='Training Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.title(f'Fold {fold + 1} Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

    # Plot MAE for latitude and longitude
    plt.figure(figsize=(10, 5))
    plt.plot(history.history['lat_mae'], label='Training MAE (Lat)')
    plt.plot(history.history['val_lat_mae'], label='Validation MAE (Lat)')
    plt.plot(history.history['long_mae'], label='Training MAE (Long)')
    plt.plot(history.history['val_long_mae'], label='Validation MAE (Long)')
    plt.title(f'Fold {fold + 1} MAE')
    plt.xlabel('Epochs')
    plt.ylabel('MAE')
    plt.legend()
    plt.show()

# Compute average metrics across all folds
mean_mse = np.mean(fold_mse)
mean_mae = np.mean(fold_mae)

print(f"Average MSE across {n_splits} folds: {mean_mse:.4f}")
print(f"Average MAE across {n_splits} folds: {mean_mae:.4f}")

# Boxplot for metrics across folds
metrics_df = pd.DataFrame({
    'Fold': range(1, n_splits + 1),
    'MSE': fold_mse,
    'MAE': fold_mae
})

metrics_df.boxplot(column=['MSE', 'MAE'], grid=False, figsize=(8, 6))
plt.title('Model Performance Across Folds')
plt.ylabel('Error')
plt.show()

# Print metrics table (the summary row is appended after plotting so it does
# not distort the boxplot).
metrics_df.loc['Average'] = ['Average', mean_mse, mean_mae]
print(metrics_df)


Training on fold 1/5...
Epoch 1/10
[1m 495/4109[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m40s[0m 11ms/step - lat_loss: 1.0835 - lat_mae: 1.4213 - long_loss: 1.4315 - long_mae: 1.7706 - loss: 2.7095

KeyboardInterrupt: 