In [8]:
# ==========================================
# 1. CÀI ĐẶT & IMPORT
# ==========================================
!pip install -q osmnx torch_geometric

import os
import shutil
import gc
import numpy as np
import pandas as pd
import tensorflow as tf
import torch
import torch.nn.functional as F
from torch_geometric.data import InMemoryDataset, download_url, Data
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report, f1_score, roc_auc_score, average_precision_score, mean_absolute_error, mean_squared_error
from scipy.stats import pearsonr
from tensorflow.keras import Model, layers, Input
from tensorflow.keras.layers import Dense, Dropout, Lambda, Reshape, BatchNormalization, Concatenate, Layer, Multiply

# Thiết lập Random Seed
seed = 7
np.random.seed(seed)
tf.random.set_seed(seed)
torch.manual_seed(seed)

print(f"TensorFlow Version: {tf.__version__}")

# ==========================================
# 2. DATASET LOADER
# ==========================================
def parse_npz(f):
    crash_time = f['crash_time']
    x = torch.from_numpy(f['x']).to(torch.float)
    coords = torch.from_numpy(f['coordinates']).to(torch.float)
    edge_attr = torch.from_numpy(f['edge_attr']).to(torch.float)
    cnt_labels = torch.from_numpy(f['cnt_labels']).to(torch.long)
    occur_labels = torch.from_numpy(f['occur_labels']).to(torch.long)
    edge_attr_dir = torch.from_numpy(f['edge_attr_dir']).to(torch.float)
    edge_attr_ang = torch.from_numpy(f['edge_attr_ang']).to(torch.float)
    severity_labels = torch.from_numpy(f['severity_8labels']).to(torch.long)
    edge_index = torch.from_numpy(f['edge_index']).to(torch.long).t().contiguous()
    return Data(x=x, y=occur_labels, severity_labels=severity_labels, edge_index=edge_index,
                edge_attr=edge_attr, edge_attr_dir=edge_attr_dir, edge_attr_ang=edge_attr_ang,
                coords=coords, cnt_labels=cnt_labels, crash_time=crash_time)

def read_npz(path):
    with np.load(path, allow_pickle=True) as f:
        return parse_npz(f)

class TRAVELDataset(InMemoryDataset):
    url = 'https://github.com/baixianghuang/travel/raw/main/TAP-city/{}.npz'
    def __init__(self, root, name, transform=None, pre_transform=None):
        self.name = name.lower()
        super().__init__(root, transform, pre_transform)
        self.data, self.slices = torch.load(self.processed_paths[0], weights_only=False)
    @property
    def raw_dir(self): return os.path.join(self.root, self.name, 'raw')
    @property
    def processed_dir(self): return os.path.join(self.root, self.name, 'processed')
    @property
    def raw_file_names(self): return f'{self.name}.npz'
    @property
    def processed_file_names(self): return 'data.pt'
    def download(self): download_url(self.url.format(self.name), self.raw_dir)
    def process(self):
        data = read_npz(self.raw_paths[0])
        data, slices = self.collate([data])
        torch.save((data, slices), self.processed_paths[0])

# ==========================================
# 3. MODEL MG-TAR OPTIMIZED
# ==========================================
class SparseGraphConvolution(Layer):
    def __init__(self, units, activation=None, use_bias=True, **kwargs):
        super(SparseGraphConvolution, self).__init__(**kwargs)
        self.units = units
        self.activation = tf.keras.activations.get(activation)
        self.use_bias = use_bias
    def build(self, input_shape):
        feature_shape = input_shape[0]
        input_dim = feature_shape[-1]
        self.kernel = self.add_weight(shape=(input_dim, self.units), initializer='glorot_uniform', name='kernel')
        if self.use_bias: self.bias = self.add_weight(shape=(self.units,), initializer='zeros', name='bias')
        super(SparseGraphConvolution, self).build(input_shape)
    def call(self, inputs):
        features, adj_sparse = inputs
        features_2d = tf.squeeze(features, axis=0)
        shape = tf.shape(adj_sparse); N = shape[1]
        adj_2d = tf.sparse.reshape(adj_sparse, [N, N])
        support = tf.matmul(features_2d, self.kernel)
        output = tf.sparse.sparse_dense_matmul(adj_2d, support)
        output = tf.expand_dims(output, axis=0)
        if self.use_bias: output = tf.nn.bias_add(output, self.bias)
        if self.activation is not None: output = self.activation(output)
        return output

class GatedFusionLayer(Layer):
    def __init__(self, units, **kwargs):
        super(GatedFusionLayer, self).__init__(**kwargs)
        self.units = units
        self.gate_proj = Dense(units, activation='sigmoid')
    def call(self, inputs):
        H_view, z_global = inputs
        gate = self.gate_proj(z_global)
        return Multiply()([H_view, gate])

def MG_TAR_Optimized(input_shape, n_districts, configs, length=1):
    tf.keras.backend.clear_session()
    gru_h, gcn_f, fc_h, n_layers, bn, d = configs
    n_features = input_shape[-1]
    inputs_adj = [Input(shape=(n_districts, n_districts), sparse=True, name=f'A_{i}') for i in range(5)]
    F = Input(shape=[length, n_districts, n_features], name='Features')
    Ft = Lambda(lambda f: f[:,0,:,:])(F)
    states = [Ft for _ in range(5)]
    final_embeds = []
    pooled_embeds = []
    gcn_layer_shared = SparseGraphConvolution(gcn_f, activation='relu')
    for j in range(5):
        x = gcn_layer_shared([states[j], inputs_adj[j]])
        if bn: x = BatchNormalization()(x)
        x = Dropout(0.1)(x)
        final_embeds.append(x)
        pooled_embeds.append(Lambda(lambda t: tf.reduce_mean(t, axis=1, keepdims=True))(x))
    z_global = Concatenate(axis=-1)(pooled_embeds)
    H_list = []
    for j in range(5):
        fusion_layer = GatedFusionLayer(units=gcn_f)
        H_list.append(fusion_layer([final_embeds[j], z_global]))
    H = Concatenate(axis=-1)(H_list)
    H = Concatenate(axis=-1)([H, Ft])
    H = Dense(fc_h, activation='relu')(H)
    H = Dropout(0.3)(H)
    H = Dense(32, activation='relu')(H)
    y = Dense(1, activation='sigmoid')(H)
    y = Reshape([n_districts])(y)
    return Model(inputs=inputs_adj + [F], outputs=y)

# ==========================================
# 4. UTILS & LOSS
# ==========================================
def masked_weighted_bce(mask, pos_weight):
    def loss(y_true, y_pred):
        mask_cast = tf.expand_dims(tf.cast(mask, dtype=tf.float32), 0)
        weights = y_true * (pos_weight - 1.0) + 1.0
        bce = tf.keras.losses.binary_crossentropy(y_true, y_pred)
        return tf.reduce_sum(bce * weights * mask_cast) / (tf.reduce_sum(mask_cast) + 1e-9)
    return loss

def get_sparse_adj_expanded(edge_index, num_nodes):
    indices = edge_index.t().numpy()
    values = np.ones(indices.shape[0], dtype=np.float32)
    st = tf.sparse.SparseTensor(indices=indices, values=values, dense_shape=[num_nodes, num_nodes])
    return tf.sparse.expand_dims(tf.sparse.reorder(st), 0)

def clean_data_leakage(X, y, threshold=0.85):
    n_features = X.shape[1]
    drop_indices = [i for i in range(n_features) if not np.isnan(X[:, i]).any() and abs(np.corrcoef(X[:, i], y)[0, 1]) > threshold]
    if drop_indices:
        print(f"Removed leakage columns: {drop_indices}")
        return np.delete(X, drop_indices, axis=1)
    return X

class ValidationCallback(tf.keras.callbacks.Callback):
    def __init__(self, val_mask, inputs, y_true):
        self.val_mask, self.inputs, self.y_true = val_mask, inputs, y_true
    def on_epoch_end(self, epoch, logs=None):
        if epoch % 5 == 0:
            y_pred = self.model.predict(self.inputs, verbose=0)
            y_true_val = self.y_true[0][self.val_mask == 1]
            y_pred_val = (y_pred[0][self.val_mask == 1] > 0.5).astype(int)
            print(f" — val_f1: {f1_score(y_true_val, y_pred_val, zero_division=0):.4f}")

# ==========================================
# 5. MAIN EXPERIMENT
# ==========================================
def run_experiment_full_integration():
    city_name = 'los_angeles_ca'
    file_path = 'exp/'
    if os.path.exists(file_path+city_name+'/processed'): shutil.rmtree(file_path+city_name+'/processed')

    print(f"Loading {city_name}...")
    dataset = TRAVELDataset(file_path, city_name)
    data = dataset[0]

    X_clean = clean_data_leakage(data.x.numpy(), data.y.numpy())
    X_scaled = MinMaxScaler().fit_transform(X_clean)
    
    adj_sparse = get_sparse_adj_expanded(data.edge_index, data.num_nodes)
    inputs_adj = [adj_sparse for _ in range(5)]
    F_input = np.expand_dims(np.expand_dims(X_scaled, 0), 1)
    y_true = np.expand_dims(data.y.numpy(), 0)

    # Split
    from sklearn.model_selection import train_test_split
    train_idx, temp_idx = train_test_split(np.arange(data.num_nodes), test_size=0.4, stratify=data.y.numpy(), random_state=seed)
    val_idx, test_idx = train_test_split(temp_idx, test_size=0.5, stratify=data.y.numpy()[temp_idx], random_state=seed)
    
    masks = [np.zeros(data.num_nodes) for _ in range(3)]
    for i, idx in enumerate([train_idx, val_idx, test_idx]): masks[i][idx] = 1
    train_mask, val_mask, test_mask = masks

    # Weight Calculation
    n_pos = np.sum(data.y.numpy()[train_idx] == 1)
    pos_weight = np.sum(data.y.numpy()[train_idx] == 0) / (n_pos + 1e-5)
    print(f"Pos Weight: {pos_weight:.2f}")

    # Model & Train
    model = MG_TAR_Optimized(X_scaled.shape, data.num_nodes, [64, 32, 64, 2, False, 1.0])
    model.compile(optimizer=tf.keras.optimizers.Adam(0.001), loss=masked_weighted_bce(train_mask, pos_weight), jit_compile=False)
    
    inputs_full = inputs_adj + [F_input]
    print("\nStarting Training...")
    model.fit(inputs_full, y_true, epochs=100, batch_size=1, verbose=1, 
              callbacks=[ValidationCallback(val_mask, inputs_full, y_true), 
                         tf.keras.callbacks.EarlyStopping(monitor='loss', patience=15, restore_best_weights=True)])

    # --- EVALUATION ---
    print("\n=== FINAL EVALUATION ===")
    y_pred_prob = model.predict(inputs_full)[0]
    y_test = data.y.numpy()[test_mask == 1]
    y_prob = y_pred_prob[test_mask == 1]

    # Find Best Threshold
    best_f1, best_thresh = 0, 0.5
    for thresh in np.arange(0.1, 0.9, 0.05):
        f1 = f1_score(y_test, (y_prob > thresh).astype(int), zero_division=0)
        if f1 > best_f1: best_f1, best_thresh = f1, thresh
    
    y_pred = (y_prob > best_thresh).astype(int)
    print(f"Best Threshold: {best_thresh:.2f}")
    print(classification_report(y_test, y_pred, digits=4, zero_division=0))
    print(f"AUC: {roc_auc_score(y_test, y_prob):.4f} | MAP: {average_precision_score(y_test, y_prob):.4f}")

    # --- MAE, RMSE, PCC ---
    print("\n=== ADDITIONAL METRICS (MAE, RMSE, PCC) ===")
    
    # 1. Binary Metrics (Dựa trên nhãn 0/1 sau khi threshold)
    mae_bin = mean_absolute_error(y_test, y_pred)
    rmse_bin = np.sqrt(mean_squared_error(y_test, y_pred))
    pcc_bin, _ = pearsonr(y_test, y_pred)
    print(f"[Binary Prediction] MAE: {mae_bin:.4f} | RMSE: {rmse_bin:.4f} | PCC: {pcc_bin:.4f}")

    # 2. Probability Metrics (Dựa trên xác suất gốc)
    mae_prob = mean_absolute_error(y_test, y_prob)
    rmse_prob = np.sqrt(mean_squared_error(y_test, y_prob))
    pcc_prob, _ = pearsonr(y_test, y_prob)
    print(f"[Risk Probability]  MAE: {mae_prob:.4f} | RMSE: {rmse_prob:.4f} | PCC: {pcc_prob:.4f}")

    del model, inputs_full; gc.collect()

if __name__ == "__main__":
    run_experiment_full_integration()

TensorFlow Version: 2.19.0
Loading los_angeles_ca...


Processing...
Done!


Pos Weight: 6.69

Starting Training...
Epoch 1/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4s/step - loss: 1.5306 — val_f1: 0.2259
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step - loss: 1.5306
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 255ms/step - loss: 1.4301
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 256ms/step - loss: 1.3448
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 259ms/step - loss: 1.2747
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 257ms/step - loss: 1.2130
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 142ms/step - loss: 1.1612 — val_f1: 0.0000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 911ms/step - loss: 1.1612
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 255ms/step - loss: 1.1176
Epoch 8/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37