In [5]:
!pip install torch_geometric osmnx scipy scikit-learn numpy pandas



In [9]:
import os
import urllib.parse
import numpy as np
import pandas as pd
import os.path as osp
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from scipy.stats import pearsonr
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, mean_absolute_error, mean_squared_error

from torch_geometric.data import Data, InMemoryDataset, download_url
from torch_geometric.nn import GCNConv
from torch_geometric.utils import to_networkx

# Thiết lập Device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# ---------------------------------------------------------
# 1. DATASET LOADER (ĐÃ SỬA LỖI URL)
# ---------------------------------------------------------
def read_npz(path):
    with np.load(path, allow_pickle=True) as f:
        return parse_npz(f)

def parse_npz(f):
    x = torch.from_numpy(f['x']).to(torch.float)
    occur_labels = torch.from_numpy(f['occur_labels']).to(torch.long)
    edge_index = torch.from_numpy(f['edge_index']).to(torch.long).t().contiguous()
    return Data(x=x, y=occur_labels, edge_index=edge_index)

class TRAVELDataset(InMemoryDataset):
    url = 'https://github.com/baixianghuang/travel/raw/main/TAP-city/{}.npz'

    def __init__(self, root: str, name: str, transform=None, pre_transform=None):
        self.name = name
        super().__init__(root, transform, pre_transform)
        self.data, self.slices = torch.load(self.processed_paths[0], weights_only=False)

    @property
    def raw_dir(self) -> str:
        return osp.join(self.root, self.name, 'raw')

    @property
    def processed_dir(self) -> str:
        return osp.join(self.root, self.name, 'processed')

    @property
    def raw_file_names(self) -> str:
        return f'{self.name}.npz'

    @property
    def processed_file_names(self) -> str:
        return 'data.pt'

    def download(self):
        # SỬA LỖI: Sử dụng safe='()' để giữ nguyên dấu ngoặc đơn, chỉ mã hóa khoảng trắng
        encoded_name = urllib.parse.quote(self.name, safe='()')
        print(f"Downloading from: {self.url.format(encoded_name)}")
        download_url(self.url.format(encoded_name), self.raw_dir, filename=f'{self.name}.npz')

    def process(self):
        data = read_npz(self.raw_paths[0])
        if self.pre_transform is not None:
            data = self.pre_transform(data)
        torch.save(self.collate([data]), self.processed_paths[0])

def train_test_split_stratify(dataset, train_ratio=0.6, val_ratio=0.2):
    data = dataset[0]
    num_nodes = data.num_nodes
    indices = np.arange(num_nodes)
    np.random.shuffle(indices)
    
    train_end = int(train_ratio * num_nodes)
    val_end = int((train_ratio + val_ratio) * num_nodes)
    
    train_mask = torch.zeros(num_nodes, dtype=torch.bool)
    val_mask = torch.zeros(num_nodes, dtype=torch.bool)
    test_mask = torch.zeros(num_nodes, dtype=torch.bool)
    
    train_mask[indices[:train_end]] = True
    val_mask[indices[train_end:val_end]] = True
    test_mask[indices[val_end:]] = True
    
    data.train_mask = train_mask
    data.val_mask = val_mask
    data.test_mask = test_mask
    return data

# ---------------------------------------------------------
# 2. MODEL DSTGCN (Adapted for PyG)
# ---------------------------------------------------------
class STBlock_PyG(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size=3):
        super(STBlock_PyG, self).__init__()
        self.gcn = GCNConv(in_channels, out_channels)
        self.conv1d = nn.Conv1d(out_channels, out_channels, kernel_size, padding=1)
        self.relu = nn.ReLU()
        self.bn = nn.BatchNorm1d(out_channels)

    def forward(self, x, edge_index):
        # 1. Spatial GCN
        x_s = self.gcn(x, edge_index)
        x_s = self.relu(x_s)
        x_s = self.bn(x_s)
        
        # 2. Pseudo-Temporal Conv (treat features as sequence)
        x_t = x_s.unsqueeze(0).permute(0, 2, 1) # [1, Features, Num_nodes]
        x_t = self.conv1d(x_t)
        x_t = x_t.permute(0, 2, 1).squeeze(0)   # [Num_nodes, Features]
        return x_t

class DSTGCN_PyG(nn.Module):
    def __init__(self, num_features, num_classes, hidden_dim=64):
        super(DSTGCN_PyG, self).__init__()
        self.spatial_emb = nn.Linear(num_features, hidden_dim)
        self.st_block1 = STBlock_PyG(hidden_dim, hidden_dim)
        self.st_block2 = STBlock_PyG(hidden_dim, hidden_dim)
        self.agg = nn.Linear(hidden_dim * 2, hidden_dim)
        self.external_emb = nn.Linear(num_features, hidden_dim)
        
        self.output_layer = nn.Sequential(
            nn.Linear(hidden_dim + hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, num_classes)
        )

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        s_emb = F.relu(self.spatial_emb(x))
        e_emb = F.relu(self.external_emb(x))
        
        st_out1 = self.st_block1(s_emb, edge_index)
        st_out2 = self.st_block2(st_out1, edge_index)
        
        st_cat = torch.cat([st_out1, st_out2], dim=1)
        t_out = self.agg(st_cat)
        
        final_emb = torch.cat([t_out, e_emb], dim=1)
        out = self.output_layer(final_emb)
        return F.log_softmax(out, dim=1)

# ---------------------------------------------------------
# 3. TRAINING & EVALUATION (MAE, RMSE, PCC)
# ---------------------------------------------------------
def train(model, data, optimizer):
    model.train()
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    return loss.item()

@torch.no_grad()
def evaluate_metrics(model, data, mask):
    model.eval()
    out = model(data)
    
    # Lấy xác suất của class 1 (Positive)
    probs = torch.exp(out)[mask][:, 1].cpu().numpy()
    preds = out[mask].max(1)[1].cpu().numpy()
    y_true = data.y[mask].cpu().numpy()
    
    # Classification Metrics
    acc = accuracy_score(y_true, preds)
    f1 = f1_score(y_true, preds, average='macro')
    try:
        auc = roc_auc_score(y_true, probs)
    except:
        auc = 0.0
        
    # Regression-style Metrics (MAE, RMSE, PCC)
    mae = mean_absolute_error(y_true, probs)
    rmse = np.sqrt(mean_squared_error(y_true, probs))
    pcc, _ = pearsonr(y_true, probs) if len(y_true) > 1 else (0, 0)
    
    return acc, f1, auc, mae, rmse, pcc

# ---------------------------------------------------------
# 4. MAIN
# ---------------------------------------------------------
def main():
    city_name = 'los_angeles_ca' 
    print(f"Loading dataset: {city_name}...")
    
    try:
        dataset = TRAVELDataset(root='data/Travel', name=city_name)
        data = train_test_split_stratify(dataset)
        data = data.to(device)
        print("Dataset loaded successfully!")
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return

    print(f"Nodes: {data.num_nodes}, Edges: {data.num_edges}, Features: {dataset.num_features}")
    
    model = DSTGCN_PyG(num_features=dataset.num_features, num_classes=dataset.num_classes).to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)
    
    epochs = 100
    print("\nStarting training...")
    print(f"{'Epoch':<5} | {'Loss':<10} | {'MAE':<10} | {'RMSE':<10} | {'PCC':<10} | {'ACC':<10}")
    print("-" * 65)
    
    for epoch in range(1, epochs + 1):
        loss = train(model, data, optimizer)
        if epoch % 10 == 0:
            acc, f1, auc, mae, rmse, pcc = evaluate_metrics(model, data, data.val_mask)
            print(f"{epoch:<5} | {loss:.4f}     | {mae:.4f}     | {rmse:.4f}     | {pcc:.4f}     | {acc:.4f}")

    print("\n" + "="*30)
    print("FINAL TEST RESULTS (DSTGCN)")
    print("="*30)
    acc, f1, auc, mae, rmse, pcc = evaluate_metrics(model, data, data.test_mask)
    
    print(f"Accuracy : {acc:.4f}")
    print(f"F1-Score : {f1:.4f}")
    print(f"AUC      : {auc:.4f}")
    print("-" * 20)
    print(f"MAE      : {mae:.4f}")
    print(f"RMSE     : {rmse:.4f}")
    print(f"PCC      : {pcc:.4f}")
    print("="*30)

if __name__ == "__main__":
    main()

Using device: cuda
Loading dataset: los_angeles_ca...
Downloading from: https://github.com/baixianghuang/travel/raw/main/TAP-city/los_angeles_ca.npz


Downloading https://github.com/baixianghuang/travel/raw/main/TAP-city/los_angeles_ca.npz
Processing...
Done!


Dataset loaded successfully!
Nodes: 49251, Edges: 135547, Features: 10

Starting training...
Epoch | Loss       | MAE        | RMSE       | PCC        | ACC       
-----------------------------------------------------------------
10    | 0.3091     | 0.2174     | 0.3258     | 0.2935     | 0.8724
20    | 0.2886     | 0.1958     | 0.3117     | 0.4260     | 0.8787
30    | 0.2830     | 0.1948     | 0.2985     | 0.4833     | 0.8920
40    | 0.2795     | 0.2821     | 0.3299     | 0.4499     | 0.8765
50    | 0.2771     | 0.2160     | 0.2988     | 0.4829     | 0.8889
60    | 0.2750     | 0.1357     | 0.2949     | 0.4895     | 0.8948
70    | 0.2758     | 0.1137     | 0.3162     | 0.4251     | 0.8931
80    | 0.2714     | 0.1270     | 0.2954     | 0.4915     | 0.8957
90    | 0.2779     | 0.1355     | 0.2934     | 0.4966     | 0.8961
100   | 0.2703     | 0.1801     | 0.2974     | 0.4798     | 0.8846

FINAL TEST RESULTS (DSTGCN)
Accuracy : 0.8847
F1-Score : 0.6937
AUC      : 0.8024
-----------------