In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

In [2]:
# 1) Load data (the CSV produced after the earlier df processing step).
# Read the file and, in the same expression, discard the unnamed index column;
# errors="ignore" makes the drop a no-op when that column is absent.
df = pd.read_csv("hanoi_weather_data.csv").drop(columns=["Unnamed: 0"], errors="ignore")

# Parse the timestamp column when the file has one; values that cannot be
# parsed are coerced to NaT instead of raising.
if "time" in df:
    df["time"] = pd.to_datetime(df["time"], errors="coerce")

FileNotFoundError: [Errno 2] No such file or directory: 'hanoi_weather_data.csv'

In [None]:
# 2) Derive an AQI category from PM2.5 (following the US EPA breakpoints)

def pm25_to_aqi_cat(x):
    """Map a single PM2.5 reading to an AQI category label.

    Parameters
    ----------
    x : number or missing value
        PM2.5 reading. ``None``, NaN and ``pd.NA`` are treated as missing.

    Returns
    -------
    str or None
        ``"Good"`` for x <= 12, ``"Moderate"`` for x <= 35.4,
        ``"Unhealthy for SG"`` for x <= 55.4, ``"Unhealthy"`` for x <= 150.4,
        and ``"Very Unhealthy"`` for anything larger. ``None`` when the
        reading is missing.
    """
    # NaN compares False against every threshold, so without this guard a
    # missing reading would fall through to the final branch and be labelled
    # "Very Unhealthy" (and None would raise TypeError on the first compare).
    if pd.isna(x):
        return None
    if x <= 12:
        return "Good"
    if x <= 35.4:
        return "Moderate"
    if x <= 55.4:
        return "Unhealthy for SG"
    if x <= 150.4:
        return "Unhealthy"
    # NOTE(review): every value above 150.4 lands here; the EPA table has a
    # further "Hazardous" band above 250.4 — confirm the merge is intended.
    return "Very Unhealthy"

# target
# Coerce to numeric before labelling: the feature columns are only converted
# with pd.to_numeric further down, so a stray non-numeric cell in pm2_5 would
# otherwise raise TypeError inside the threshold comparisons. Unparseable
# readings become NaN here; their rows are removed by the later dropna on pm2_5.
df["AQI_cat"] = pd.to_numeric(df["pm2_5"], errors="coerce").apply(pm25_to_aqi_cat)

In [None]:
# 3) Select the feature columns — explicit list
feature_cols = [
    "pm10", "pm2_5", "carbon_monoxide", "nitrogen_dioxide", "ozone", "sulphur_dioxide"
]
# NOTE(review): AQI_cat is computed directly from pm2_5, so keeping pm2_5 as an
# input lets the classifier recover the label from a single feature (target
# leakage). Confirm whether this is intended before trusting the accuracy.

# keep only the columns that actually exist in the data
keep_cols = [c for c in feature_cols if c in df.columns]
# "time" is optional (the loading cell only parses it when present), so carry
# it along only if it exists; AQI_cat is always required.
extra_cols = (["time"] if "time" in df.columns else []) + ["AQI_cat"]
df = df[keep_cols + extra_cols].copy()

# convert to numeric and drop rows with missing values
df[keep_cols] = df[keep_cols].apply(pd.to_numeric, errors="coerce")
df = df.dropna(subset=["pm2_5"])  # pm2_5 is required to build the label
df = df.dropna(subset=keep_cols)

# build X, y
X = df[keep_cols].values
y = df["AQI_cat"].values

# encode the string labels as integer class ids
le = LabelEncoder()
y = le.fit_transform(y)

# train/test split — done BEFORE scaling so the test rows never influence
# the scaler's mean/variance
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# scale features: fit on the training split only, then apply the same
# transform to the test split
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# keep X as the scaled full matrix (as before), now using train-only statistics
X = scaler.transform(X)

In [None]:
# 4) Build the MLP classifier
# Input width comes from the training matrix and the output width from the
# number of classes seen by the label encoder.
n_features = X_train.shape[1]
n_classes = len(le.classes_)

model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(n_features,)))
model.add(Dense(32, activation='relu'))
model.add(Dense(n_classes, activation='softmax'))  # one probability per class

# Labels are integer class ids, hence the sparse variant of the loss.
optimizer = Adam(learning_rate=0.001)
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)

# 5) Train, with 20% of the training data held back for validation
history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    epochs=30,
    batch_size=32,
    verbose=1,
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [None]:
# 6) Evaluate on the test split.
# evaluate() returns the loss followed by the compiled metrics (accuracy here).
test_metrics = model.evaluate(x=X_test, y=y_test, verbose=0)
loss, acc = test_metrics
print("Test accuracy:", acc)

Test accuracy: 0.9827315807342529


In [None]:
# ---- show predicted AQI category on TEST set ----

# Softmax scores for every test row -> index of the highest-scoring class
# -> original string label via the fitted LabelEncoder.
y_pred = model.predict(X_test)
pred_class_idx = y_pred.argmax(axis=1)
y_pred_label = le.inverse_transform(pred_class_idx)

# Wrap the labels in a one-column DataFrame so they are easy to inspect.
df_test_result = pd.Series(y_pred_label, name="Predicted_AQI_Category").to_frame()

# Print the first rows of the predictions.
print("Sample prediction on TEST set")
print(df_test_result.head(20))

# Quick accuracy check on the same test split.
test_loss, test_acc = model.evaluate(X_test, y_test, verbose=0)
print(f"\nTest Accuracy: {test_acc:.4f}")


Sample prediction on TEST set
   Predicted_AQI_Category
0               Unhealthy
1        Unhealthy for SG
2               Unhealthy
3        Unhealthy for SG
4                Moderate
5                Moderate
6                Moderate
7                Moderate
8                Moderate
9        Unhealthy for SG
10               Moderate
11                   Good
12               Moderate
13              Unhealthy
14              Unhealthy
15              Unhealthy
16       Unhealthy for SG
17       Unhealthy for SG
18               Moderate
19               Moderate

Test Accuracy: 0.9827
