In [1]:
import numpy as np

data = np.load('p01_done.npy', allow_pickle=True)
print(data)

[[[     nan      nan      nan]
  [7.00e+00 4.30e-03 6.50e+01]
  [5.90e+01 1.48e-02 6.93e+01]
  ...
  [     nan      nan      nan]
  [1.10e+01 6.30e-03 6.73e+01]
  [2.50e+01 1.15e-02      nan]]

 [[     nan      nan      nan]
  [1.00e+01 7.60e-03 6.49e+01]
  [5.40e+01 1.23e-02 6.94e+01]
  ...
  [     nan      nan      nan]
  [5.00e+00 2.40e-03 6.63e+01]
  [1.60e+01 6.90e-03      nan]]

 [[     nan      nan      nan]
  [1.10e+01 7.60e-03 6.50e+01]
  [5.10e+01 1.34e-02 6.83e+01]
  ...
  [     nan      nan      nan]
  [1.40e+01 7.00e-03 6.63e+01]
  [2.40e+01 9.90e-03      nan]]

 ...

 [[1.10e+01      nan      nan]
  [7.00e+00 3.40e-03 6.50e+01]
  [1.28e+02 3.18e-02 7.18e+01]
  ...
  [     nan      nan      nan]
  [2.30e+01 1.26e-02 6.71e+01]
  [2.40e+01 1.12e-02      nan]]

 [[7.00e+00      nan      nan]
  [7.00e+00 3.30e-03 6.50e+01]
  [1.48e+02 4.55e-02 6.88e+01]
  ...
  [     nan      nan      nan]
  [2.30e+01 1.18e-02 6.73e+01]
  [2.00e+01 9.80e-03      nan]]

 [[8.00e+00      nan    

In [2]:
import pandas as pd

num_time_slots, num_nodes, _ = data.shape

time_idx, node_idx = np.meshgrid(
    np.arange(num_time_slots), np.arange(num_nodes), indexing="ij"
)

# Flatten data
df = pd.DataFrame({
    "node_id": node_idx.ravel(),
    "time_slot": time_idx.ravel(),
    "traffic_volume": data[:, :, 0].ravel(),
    "occupancy_rate": data[:, :, 1].ravel(),
    "speed": data[:, :, 2].ravel()
})

print(df) # show all data


           node_id  time_slot  traffic_volume  occupancy_rate  speed
0                0          0             NaN             NaN    NaN
1                1          0             7.0          0.0043   65.0
2                2          0            59.0          0.0148   69.3
3                3          0           237.0          0.1463   69.2
4                4          0            43.0          0.0205   65.0
...            ...        ...             ...             ...    ...
151526011    16967       8927            46.0          0.0118   68.6
151526012    16968       8927             4.0             NaN    NaN
151526013    16969       8927             NaN             NaN    NaN
151526014    16970       8927            26.0          0.0136   67.0
151526015    16971       8927            24.0          0.0090    NaN

[151526016 rows x 5 columns]


In [3]:
# count rows have NaN of each column
nan_count = df.isna().sum()
nan_percentage = (nan_count / len(df)) * 100

nan_info = pd.DataFrame({
    'NaN Count': nan_count,
    'NaN Percentage (%)': nan_percentage
})

print(nan_info)

                NaN Count  NaN Percentage (%)
node_id                 0            0.000000
time_slot               0            0.000000
traffic_volume   26459359           17.461925
occupancy_rate   33404359           22.045296
speed            56761959           37.460207


In [4]:
# Drop NaN
df = df.dropna()
print(df)

           node_id  time_slot  traffic_volume  occupancy_rate  speed
1                1          0             7.0          0.0043   65.0
2                2          0            59.0          0.0148   69.3
3                3          0           237.0          0.1463   69.2
4                4          0            43.0          0.0205   65.0
5                5          0            27.0          0.0141   65.0
...            ...        ...             ...             ...    ...
151526007    16963       8927            17.0          0.0081   65.2
151526009    16965       8927           113.0          0.0193   68.4
151526010    16966       8927            92.0          0.0321   66.9
151526011    16967       8927            46.0          0.0118   68.6
151526014    16970       8927            26.0          0.0136   67.0

[94764057 rows x 5 columns]


In [6]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Extract features
features = ['traffic_volume', 'occupancy_rate', 'speed']
X = df[features]

# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Use elbow method to find the optimal number of clusters
inertia = []
k_range = range(1, 10)
for k in k_range:
    km = KMeans(n_clusters=k, random_state=42)
    km.fit(X_scaled)
    inertia.append(km.inertia_)
    print(f"k = {k}, inertia = {km.inertia_}")

plt.plot(k_range, inertia, marker='o')
plt.xlabel("Number of clusters (k)")
plt.ylabel("Inertia (Distortion)")
plt.title("Elbow Method for Optimal k")
plt.show()

In [7]:
# After finding the optimal number of clusters (k=3) use it to cluster the data
k = 3
kmeans = KMeans(n_clusters=k, random_state=42)
df['cluster'] = kmeans.fit_predict(X_scaled)

print("Done")

Done


In [None]:
# Save the labelled data to a parquet file
df.to_parquet("traffic_data_labelled.parquet", index=False)

In [1]:
import pandas as pd
df = pd.read_parquet("traffic_data_labelled.parquet")

print(df)

          node_id  time_slot  traffic_volume  occupancy_rate  speed  cluster
0               1          0             7.0          0.0043   65.0        2
1               2          0            59.0          0.0148   69.3        2
2               3          0           237.0          0.1463   69.2        0
3               4          0            43.0          0.0205   65.0        2
4               5          0            27.0          0.0141   65.0        2
...           ...        ...             ...             ...    ...      ...
94764052    16963       8927            17.0          0.0081   65.2        2
94764053    16965       8927           113.0          0.0193   68.4        2
94764054    16966       8927            92.0          0.0321   66.9        2
94764055    16967       8927            46.0          0.0118   68.6        2
94764056    16970       8927            26.0          0.0136   67.0        2

[94764057 rows x 6 columns]


In [13]:
import seaborn as sns

# count number of samples in each cluster
print(df['cluster'].value_counts())

# Choose randomly 100000 samples to plot, because plot all the dataset is too slow
random_samples = df.sample(n=100000)

sns.pairplot(random_samples[features + ['cluster']], hue='cluster', palette='Set2')
plt.suptitle("KMeans Clustering Result", y=1.02)
plt.show()


cluster
2    64390657
0    26897335
1     3476065
Name: count, dtype: int64


In [13]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split the dataset into features and target variable
X = df[['traffic_volume', 'occupancy_rate', 'speed']]
y = df['cluster']

# Split dataset into 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [14]:
import numpy as np

# Save the split data to a numpy file

np.save("X_train.npy", X_train_scaled)
np.save("X_test.npy", X_test_scaled)
np.save("y_train.npy", y_train)
np.save("y_test.npy", y_test)
np.save("X_train_scaled.npy", X_train_scaled)
np.save("X_test_scaled.npy", X_test_scaled)

In [1]:
import numpy as np

X_train = np.load("X_train.npy")
X_test = np.load("X_test.npy")
y_train = np.load("y_train.npy")
y_test = np.load("y_test.npy")
X_train_scaled = np.load("X_train_scaled.npy")
X_test_scaled = np.load("X_test_scaled.npy")

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Create a logistic regression model for multi-class classification
model = LogisticRegression(solver='lbfgs', max_iter=1000)
model.fit(X_train_scaled, y_train)

# Predict on the test set
y_pred = model.predict(X_test_scaled)

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Confusion Matrix:
 [[ 5360644     7402     9432]
 [    3029   690002     3453]
 [   12470      235 12866145]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00   5377478
           1       0.99      0.99      0.99    696484
           2       1.00      1.00      1.00  12878850

    accuracy                           1.00  18952812
   macro avg       1.00      1.00      1.00  18952812
weighted avg       1.00      1.00      1.00  18952812



In [22]:
from sklearn.metrics import classification_report, confusion_matrix
from lightgbm import LGBMClassifier, early_stopping, log_evaluation

# Create a LightGBM model for multi-class classification
model = LGBMClassifier(
    objective='multiclass',
    num_class=3,
    n_estimators=100,
    random_state=42
)

# Train the model with early stopping and logging, use original data, not scaled data
model.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='multi_logloss',
    callbacks=[early_stopping(stopping_rounds=10), log_evaluation(10)]
)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.232426 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 75811245, number of used features: 3
[LightGBM] [Info] Start training from score -1.259271
[LightGBM] [Info] Start training from score -3.305946
[LightGBM] [Info] Start training from score -0.386436
Training until validation scores don't improve for 10 rounds
[10]	valid_0's multi_logloss: 0.151036
[20]	valid_0's multi_logloss: 0.0503812
[30]	valid_0's multi_logloss: 0.0226845
[40]	valid_0's multi_logloss: 0.0137958
[50]	valid_0's multi_logloss: 0.010383
[60]	valid_0's multi_logloss: 0.00875281
[70]	valid_0's multi_logloss: 0.00778902
[80]	valid_0's multi_logloss: 0.00713582
[90]	valid_0's multi_logloss: 0.00666997
[100]	valid_0's multi_logloss: 0.00630132
Did not meet early 

In [4]:
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Create SGDClassifier model for multi-class classification
sgd_svm_model = SGDClassifier(loss='hinge', max_iter=1000, tol=1e-3, random_state=42)

# Train the model with scaled data
sgd_svm_model.fit(X_train_scaled, y_train)

# Predict on the test set
y_pred_sgd = sgd_svm_model.predict(X_test_scaled)

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_sgd))
print("\nClassification Report:\n", classification_report(y_test, y_pred_sgd))

Confusion Matrix:
 [[ 5213289    18533   145656]
 [   42273   651458     2753]
 [   36369        0 12842481]]

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.97      0.98   5377478
           1       0.97      0.94      0.95    696484
           2       0.99      1.00      0.99  12878850

    accuracy                           0.99  18952812
   macro avg       0.98      0.97      0.97  18952812
weighted avg       0.99      0.99      0.99  18952812



In [6]:
import tensorflow as tf
from tensorflow.keras import Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report, confusion_matrix

# Convert y_train, y_test to one-hot encode
y_train_cat = to_categorical(y_train, num_classes=3)
y_test_cat = to_categorical(y_test, num_classes=3)

# Initial ANN model
model = Sequential([
    Input(shape=(X_train_scaled.shape[1],)),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(3, activation='softmax')
])

# Compile model
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# Train model
history = model.fit(
    X_train_scaled, y_train_cat,
    epochs=10,
    batch_size=1024,
    validation_data=(X_test_scaled, y_test_cat),
    verbose=2
)

# Predict on the test set
y_pred_prob = model.predict(X_test_scaled)
y_pred = tf.argmax(y_pred_prob, axis=1).numpy()


print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Epoch 1/10
74035/74035 - 192s - 3ms/step - accuracy: 0.9992 - loss: 0.0030 - val_accuracy: 0.9997 - val_loss: 9.1170e-04
Epoch 2/10
74035/74035 - 186s - 3ms/step - accuracy: 0.9997 - loss: 0.0015 - val_accuracy: 0.9996 - val_loss: 0.0010
Epoch 3/10
74035/74035 - 182s - 2ms/step - accuracy: 0.9997 - loss: 0.0014 - val_accuracy: 0.9996 - val_loss: 8.3631e-04
Epoch 4/10
74035/74035 - 185s - 3ms/step - accuracy: 0.9998 - loss: 0.0013 - val_accuracy: 0.9998 - val_loss: 7.6260e-04
Epoch 5/10
74035/74035 - 187s - 3ms/step - accuracy: 0.9998 - loss: 0.0012 - val_accuracy: 0.9998 - val_loss: 5.7382e-04
Epoch 6/10
74035/74035 - 181s - 2ms/step - accuracy: 0.9998 - loss: 0.0012 - val_accuracy: 0.9998 - val_loss: 6.3858e-04
Epoch 7/10
74035/74035 - 185s - 3ms/step - accuracy: 0.9998 - loss: 0.0011 - val_accuracy: 0.9998 - val_loss: 6.6533e-04
Epoch 8/10
74035/74035 - 186s - 3ms/step - accuracy: 0.9998 - loss: 0.0011 - val_accuracy: 0.9998 - val_loss: 6.0947e-04
Epoch 9/10
74035/74035 - 188s - 3ms/