In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.neural_network import MLPClassifier
from sklearn import set_config
set_config(display='diagram') 
import numpy as np

In [2]:
data_df = pd.read_csv('data_final.csv')
data_df

Unnamed: 0.1,Unnamed: 0,City,Level,PM2.5,PM10,O3,NO2,SO2,CO
0,0,Dhaka,Very Unhealthy,227.5,,,,,
1,1,Delhi,Very Unhealthy,137.7,201.2,9.8,35.0,12.6,1181.7
2,2,Bishkek,Very Unhealthy,157.4,2.2,,,,
3,3,Kathmandu,Unhealthy,112.2,198.5,14.8,,,
4,4,Lahore,Unhealthy,104.5,,,,,
...,...,...,...,...,...,...,...,...,...
428,428,"Puertollano, Castilla-La Mancha",Moderate,23.0,15.0,34.0,14.0,5.0,
429,429,"Beasain, Basque Country",Moderate,22.0,,,,2.0,0.0
430,430,"Ciutat Meridiana, Catalunya",Moderate,22.0,32.0,,,,
431,431,"Arwad, Tartus",Moderate,28.7,46.3,,18.4,7.9,


In [3]:
data_df.index.duplicated().sum()

0

In [4]:
data_df['Level'].dtype

dtype('O')

In [5]:
data_df['Level'].isna().sum()

2

In [6]:
#loai bo cac dong khong co du lieu Level
data_df = data_df[data_df['Level'].notna()]
data_df['Level'].isna().sum()

0

In [7]:
temp = data_df['Level'].value_counts(normalize=True) * 100
temp

Moderate                          39.675174
Good                              27.378190
Unhealthy For Sensitive Groups    16.473318
Unhealthy                         10.672854
Very Unhealthy                     4.176334
Hazardous                          1.624130
Name: Level, dtype: float64

In [8]:
# Tách X và y
y_sr = data_df["Level"]
X_df = data_df.drop("Level", axis=1)


In [9]:
y_sr

0      Very Unhealthy
1      Very Unhealthy
2      Very Unhealthy
3           Unhealthy
4           Unhealthy
            ...      
428          Moderate
429          Moderate
430          Moderate
431          Moderate
432          Moderate
Name: Level, Length: 431, dtype: object

In [10]:
X_df

Unnamed: 0.1,Unnamed: 0,City,PM2.5,PM10,O3,NO2,SO2,CO
0,0,Dhaka,227.5,,,,,
1,1,Delhi,137.7,201.2,9.8,35.0,12.6,1181.7
2,2,Bishkek,157.4,2.2,,,,
3,3,Kathmandu,112.2,198.5,14.8,,,
4,4,Lahore,104.5,,,,,
...,...,...,...,...,...,...,...,...
428,428,"Puertollano, Castilla-La Mancha",23.0,15.0,34.0,14.0,5.0,
429,429,"Beasain, Basque Country",22.0,,,,2.0,0.0
430,430,"Ciutat Meridiana, Catalunya",22.0,32.0,,,,
431,431,"Arwad, Tartus",28.7,46.3,,18.4,7.9,


In [11]:
#Tách tập train, test và validation
trainval_X_df, test_X_df, trainval_y_sr, test_y_sr = train_test_split(X_df, y_sr, test_size=0.3, random_state=None, stratify = y_sr)
train_X_df, val_X_df, train_y_sr, val_y_sr = train_test_split(trainval_X_df, trainval_y_sr, test_size=0.3, random_state=None, stratify = trainval_y_sr)

In [12]:
train_X_df.shape

(210, 8)

In [13]:
train_y_sr.shape

(210,)

In [14]:
val_X_df.shape

(91, 8)

In [15]:
val_y_sr.shape

(91,)

In [16]:
train_X_df.index

Int64Index([398, 391, 256, 117, 149,  95, 215, 339, 230,  80,
            ...
            131, 334,  39, 192, 427,  44, 293, 122, 236, 419],
           dtype='int64', length=210)

In [17]:
train_X_df.dtypes

Unnamed: 0      int64
City           object
PM2.5         float64
PM10          float64
O3            float64
NO2           float64
SO2           float64
CO            float64
dtype: object

In [18]:
train_X_df.dtypes[train_X_df.dtypes != object]

Unnamed: 0      int64
PM2.5         float64
PM10          float64
O3            float64
NO2           float64
SO2           float64
CO            float64
dtype: object

In [19]:
#quan sát phân bố của các cột số
num_cols = ['PM2.5', 'PM10', 'O3', 'NO2', 'SO2', 'CO']
df = train_X_df[num_cols]
def missing_ratio(df):
    return (df.isna().mean() * 100).round(1)
def lower_quartile(df):
    return df.quantile(0.25).round(1)
def median(df):
    return df.quantile(0.5).round(1)
def upper_quartile(df):
    return df.quantile(0.75).round(1)
df.agg([missing_ratio, 'min', lower_quartile, median, upper_quartile, 'max'])



Unnamed: 0,PM2.5,PM10,O3,NO2,SO2,CO
missing_ratio,0.0,44.3,63.3,53.8,62.9,73.3
min,0.0,2.2,0.0,2.7,0.0,0.0
lower_quartile,11.0,15.8,8.0,16.0,2.6,7.4
median,25.0,30.0,20.0,28.9,5.0,345.0
upper_quartile,42.1,55.6,36.0,47.0,11.9,629.8
max,770.0,446.6,81.0,470.0,101.9,2900.0


In [20]:
#hàm tiền xử lý
def fill_missing(X_df):
    num_cols = ['PM2.5', 'PM10', 'O3', 'NO2', 'SO2', 'CO']
    df = X_df.copy()
    #thay 'nan' bằng 0.0
    df[num_cols] = X_df[num_cols].fillna(0)
    #xóa cột City vì là cột định danh không cần thiết cho mô hình
    df = df.drop(columns=['City'])
    return df

In [21]:
#sử dụng FunctionTransformer để transform tập dữ liệu
transform = FunctionTransformer(fill_missing)


In [22]:
#tạo pipeline cho tiền XL
preprocess_pipeline = make_pipeline(FunctionTransformer(fill_missing), StandardScaler())


In [23]:
#preprocess tập train
preprocess_train_X = preprocess_pipeline.fit_transform(train_X_df)
preprocess_train_X

array([[ 1.48562404e+00, -5.61258401e-01, -4.87249485e-01, ...,
        -4.06971315e-01, -3.69101540e-01, -3.45467836e-01],
       [ 1.42980064e+00, -2.07287131e-01, -4.87249485e-01, ...,
        -4.06971315e-01,  5.25849962e-01, -3.42947146e-01],
       [ 3.53206442e-01, -2.15455699e-01, -4.87249485e-01, ...,
        -4.06971315e-01, -3.69101540e-01, -3.45467836e-01],
       ...,
       [-7.15412984e-01,  3.62785543e+00,  6.74892733e+00, ...,
         4.57801550e-01,  1.33556799e+00, -3.45467836e-01],
       [ 1.93711006e-01, -4.97271287e-01, -3.73829786e-01, ...,
         4.35419193e-01,  7.41125371e-02, -3.45467836e-01],
       [ 1.65309425e+00, -2.11371415e-01, -1.16505950e-03, ...,
        -4.06971315e-01, -3.69101540e-01, -3.45467836e-01]])

In [24]:
#preprocess tập validation
preprocessed_val_X = preprocess_pipeline.transform(val_X_df)
preprocessed_val_X.shape

(91, 7)

In [25]:
train_y_sr.value_counts()

Moderate                          83
Good                              57
Unhealthy For Sensitive Groups    35
Unhealthy                         22
Very Unhealthy                     9
Hazardous                          4
Name: Level, dtype: int64

In [26]:
#chuyển ouput thành các cấp độ để dễ nhiều 
def transform_y(train_y_sr):
    new_y = []
    for index, value in train_y_sr.items():
        if value == 'Good':
            new_y.append(0)
        elif value == 'Moderate':
            new_y.append(1)
        elif value =='Unhealthy For Sensitive Groups':
            new_y.append(2)
        elif value == 'Unhealthy':
            new_y.append(3)
        elif value == 'Very Unhealthy':
            new_y.append(4)
        else:
            new_y.append(5)
    new_y_sr = pd.Series(new_y)
    return new_y_sr

In [27]:
new_train_y_sr = transform_y(train_y_sr)
new_val_y_sr = transform_y(val_y_sr)
new_test_y_sr = transform_y(test_y_sr)

In [28]:
neural_net = MLPClassifier(hidden_layer_sizes=(20,), activation='tanh', solver='lbfgs', max_iter=2500, random_state = 0)
full_pipeline = make_pipeline(preprocess_pipeline, neural_net)

In [29]:
# Thử nghiệm với các giá trị khác nhau của các siêu tham số
# và chọn ra các giá trị tốt nhất
train_errs = []
val_errs = []
alphas = [0.00001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
best_val_err = float('inf'); 
best_alpha = None;
li_hidden_layer_sizes = [5, 10, 15, 20, 25, 30]
best_hidden_layer_size = None
li = []

for alpha in alphas:
    for x in li_hidden_layer_sizes:
        full_pipeline.set_params(mlpclassifier__alpha=alpha, mlpclassifier__hidden_layer_sizes=x)
        full_pipeline.fit(train_X_df, new_train_y_sr)
        train_err = (1-full_pipeline.score(train_X_df, new_train_y_sr))*100
        train_errs.append(train_err)
        val_err = (1-full_pipeline.score(val_X_df, new_val_y_sr))*100
        val_errs.append(val_err)
        li.append([alpha, x])
    
best_val_err = min(val_errs)
best_alpha = li[val_errs.index(best_val_err)][0]
best_hidden_layer_size = li[val_errs.index(best_val_err)][1]
'Finish!'

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


'Finish!'

In [30]:
best_alpha

0.1

In [31]:
best_hidden_layer_size

20

In [32]:
#sử dụng best_alpha cho mô hình
full_pipeline.set_params(mlpclassifier__alpha=best_alpha, mlpclassifier__hidden_layer_sizes=best_hidden_layer_size)

In [33]:
full_pipeline.fit(train_X_df, new_train_y_sr)

In [34]:
full_pipeline.score(test_X_df, new_test_y_sr)

0.8846153846153846

In [35]:
#độ lỗi
(1 - full_pipeline.score(test_X_df, new_test_y_sr))*100

11.538461538461542